// SPDX-License-Identifier: GPL-2.0-only /* * Partial Parity Log for closing the RAID5 write hole * Copyright (c) 2017, Intel Corporation. */ #include <linux/kernel.h> #include <linux/blkdev.h> #include <linux/slab.h> #include <linux/crc32c.h> #include <linux/async_tx.h> #include <linux/raid/md_p.h> #include "md.h" #include "raid5.h" #include "raid5-log.h" /* * PPL consists of a 4KB header (struct ppl_header) and at least 128KB for * partial parity data. The header contains an array of entries * (struct ppl_header_entry) which describe the logged write requests. * Partial parity for the entries comes after the header, written in the same * sequence as the entries: * * Header * entry0 * ... * entryN * PP data * PP for entry0 * ... * PP for entryN * * An entry describes one or more consecutive stripe_heads, up to a full * stripe. The modified raid data chunks form an m-by-n matrix, where m is the * number of stripe_heads in the entry and n is the number of modified data * disks. Every stripe_head in the entry must write to the same data disks. * An example of a valid case described by a single entry (writes to the first * stripe of a 4 disk array, 16k chunk size): * * sh->sector dd0 dd1 dd2 ppl * +-----+-----+-----+ * 0 | --- | --- | --- | +----+ * 8 | -W- | -W- | --- | | pp | data_sector = 8 * 16 | -W- | -W- | --- | | pp | data_size = 3 * 2 * 4k * 24 | -W- | -W- | --- | | pp | pp_size = 3 * 4k * +-----+-----+-----+ +----+ * * data_sector is the first raid sector of the modified data, data_size is the * total size of modified data and pp_size is the size of partial parity for * this entry. Entries for full stripe writes contain no partial parity * (pp_size = 0), they only mark the stripes for which parity should be * recalculated after an unclean shutdown. Every entry holds a checksum of its * partial parity, the header also has a checksum of the header itself. 
* * A write request is always logged to the PPL instance stored on the parity * disk of the corresponding stripe. For each member disk there is one ppl_log * used to handle logging for this disk, independently from others. They are * grouped in the child_logs array in struct ppl_conf, which is assigned to * r5conf->log_private. * * ppl_io_unit represents a full PPL write, header_page contains the ppl_header. * PPL entries for logged stripes are added in ppl_log_stripe(). A stripe_head * can be appended to the last entry if it meets the conditions for a valid * entry described above, otherwise a new entry is added. Checksums of entries * are calculated incrementally as stripes containing partial parity are being * added. ppl_submit_iounit() calculates the checksum of the header and submits * a bio containing the header page and partial parity pages (sh->ppl_page) for * all stripes of the io_unit. When the PPL write completes, the stripes * associated with the io_unit are released and raid5d starts writing their data * and parity. When all stripes are written, the io_unit is freed and the next * can be submitted. * * An io_unit is used to gather stripes until it is submitted or becomes full * (if the maximum number of entries or size of PPL is reached). Another io_unit * can't be submitted until the previous has completed (PPL and stripe * data+parity are written). The log->io_list tracks all io_units of a log * (for a single member disk). New io_units are added to the end of the list * and the first io_unit is submitted, if it is not submitted already. * The current io_unit accepting new stripes is always at the end of the list. * * If write-back cache is enabled for any of the disks in the array, its data * must be flushed before the next io_unit is submitted. 
*/ #define PPL_SPACE_SIZE … struct ppl_conf { … }; struct ppl_log { … }; #define PPL_IO_INLINE_BVECS … struct ppl_io_unit { … }; struct dma_async_tx_descriptor * ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu, struct dma_async_tx_descriptor *tx) { … } static void *ppl_io_pool_alloc(gfp_t gfp_mask, void *pool_data) { … } static void ppl_io_pool_free(void *element, void *pool_data) { … } static struct ppl_io_unit *ppl_new_iounit(struct ppl_log *log, struct stripe_head *sh) { … } static int ppl_log_stripe(struct ppl_log *log, struct stripe_head *sh) { … } int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh) { … } static void ppl_log_endio(struct bio *bio) { … } static void ppl_submit_iounit_bio(struct ppl_io_unit *io, struct bio *bio) { … } static void ppl_submit_iounit(struct ppl_io_unit *io) { … } static void ppl_submit_current_io(struct ppl_log *log) { … } void ppl_write_stripe_run(struct r5conf *conf) { … } static void ppl_io_unit_finished(struct ppl_io_unit *io) { … } static void ppl_flush_endio(struct bio *bio) { … } static void ppl_do_flush(struct ppl_io_unit *io) { … } static inline bool ppl_no_io_unit_submitted(struct r5conf *conf, struct ppl_log *log) { … } void ppl_quiesce(struct r5conf *conf, int quiesce) { … } int ppl_handle_flush_request(struct bio *bio) { … } void ppl_stripe_write_finished(struct stripe_head *sh) { … } static void ppl_xor(int size, struct page *page1, struct page *page2) { … } /* * PPL recovery strategy: xor partial parity and data from all modified data * disks within a stripe and write the result as the new stripe parity. If all * stripe data disks are modified (full stripe write), no partial parity is * available, so just xor the data disks. * * Recovery of a PPL entry shall occur only if all modified data disks are * available and read from all of them succeeds. * * A PPL entry applies to a stripe, partial parity size for an entry is at most * the size of the chunk. 
Examples of possible cases for a single entry: * * case 0: single data disk write: * data0 data1 data2 ppl parity * +--------+--------+--------+ +--------------------+ * | ------ | ------ | ------ | +----+ | (no change) | * | ------ | -data- | ------ | | pp | -> | data1 ^ pp | * | ------ | -data- | ------ | | pp | -> | data1 ^ pp | * | ------ | ------ | ------ | +----+ | (no change) | * +--------+--------+--------+ +--------------------+ * pp_size = data_size * * case 1: more than one data disk write: * data0 data1 data2 ppl parity * +--------+--------+--------+ +--------------------+ * | ------ | ------ | ------ | +----+ | (no change) | * | -data- | -data- | ------ | | pp | -> | data0 ^ data1 ^ pp | * | -data- | -data- | ------ | | pp | -> | data0 ^ data1 ^ pp | * | ------ | ------ | ------ | +----+ | (no change) | * +--------+--------+--------+ +--------------------+ * pp_size = data_size / modified_data_disks * * case 2: write to all data disks (also full stripe write): * data0 data1 data2 parity * +--------+--------+--------+ +--------------------+ * | ------ | ------ | ------ | | (no change) | * | -data- | -data- | -data- | --------> | xor all data | * | ------ | ------ | ------ | --------> | (no change) | * | ------ | ------ | ------ | | (no change) | * +--------+--------+--------+ +--------------------+ * pp_size = 0 * * The following cases are possible only in other implementations. 
The recovery * code can handle them, but they are not generated at runtime because they can * be reduced to cases 0, 1 and 2: * * case 3: * data0 data1 data2 ppl parity * +--------+--------+--------+ +----+ +--------------------+ * | ------ | -data- | -data- | | pp | | data1 ^ data2 ^ pp | * | ------ | -data- | -data- | | pp | -> | data1 ^ data2 ^ pp | * | -data- | -data- | -data- | | -- | -> | xor all data | * | -data- | -data- | ------ | | pp | | data0 ^ data1 ^ pp | * +--------+--------+--------+ +----+ +--------------------+ * pp_size = chunk_size * * case 4: * data0 data1 data2 ppl parity * +--------+--------+--------+ +----+ +--------------------+ * | ------ | -data- | ------ | | pp | | data1 ^ pp | * | ------ | ------ | ------ | | -- | -> | (no change) | * | ------ | ------ | ------ | | -- | -> | (no change) | * | -data- | ------ | ------ | | pp | | data0 ^ pp | * +--------+--------+--------+ +----+ +--------------------+ * pp_size = chunk_size */ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e, sector_t ppl_sector) { … } static int ppl_recover(struct ppl_log *log, struct ppl_header *pplhdr, sector_t offset) { … } static int ppl_write_empty_header(struct ppl_log *log) { … } static int ppl_load_distributed(struct ppl_log *log) { … } static int ppl_load(struct ppl_conf *ppl_conf) { … } static void __ppl_exit_log(struct ppl_conf *ppl_conf) { … } void ppl_exit_log(struct r5conf *conf) { … } static int ppl_validate_rdev(struct md_rdev *rdev) { … } static void ppl_init_child_log(struct ppl_log *log, struct md_rdev *rdev) { … } int ppl_init_log(struct r5conf *conf) { … } int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add) { … } static ssize_t ppl_write_hint_show(struct mddev *mddev, char *buf) { … } static ssize_t ppl_write_hint_store(struct mddev *mddev, const char *page, size_t len) { … } struct md_sysfs_entry ppl_write_hint = …;