linux/drivers/md/raid5-ppl.c

// SPDX-License-Identifier: GPL-2.0-only
/*
 * Partial Parity Log for closing the RAID5 write hole
 * Copyright (c) 2017, Intel Corporation.
 */

#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/crc32c.h>
#include <linux/async_tx.h>
#include <linux/raid/md_p.h>
#include "md.h"
#include "raid5.h"
#include "raid5-log.h"

/*
 * PPL consists of a 4KB header (struct ppl_header) and at least 128KB for
 * partial parity data. The header contains an array of entries
 * (struct ppl_header_entry) which describe the logged write requests.
 * Partial parity for the entries comes after the header, written in the same
 * sequence as the entries:
 *
 * Header
 *   entry0
 *   ...
 *   entryN
 * PP data
 *   PP for entry0
 *   ...
 *   PP for entryN
 *
 * An entry describes one or more consecutive stripe_heads, up to a full
 * stripe. The modifed raid data chunks form an m-by-n matrix, where m is the
 * number of stripe_heads in the entry and n is the number of modified data
 * disks. Every stripe_head in the entry must write to the same data disks.
 * An example of a valid case described by a single entry (writes to the first
 * stripe of a 4 disk array, 16k chunk size):
 *
 * sh->sector   dd0   dd1   dd2    ppl
 *            +-----+-----+-----+
 * 0          | --- | --- | --- | +----+
 * 8          | -W- | -W- | --- | | pp |   data_sector = 8
 * 16         | -W- | -W- | --- | | pp |   data_size = 3 * 2 * 4k
 * 24         | -W- | -W- | --- | | pp |   pp_size = 3 * 4k
 *            +-----+-----+-----+ +----+
 *
 * data_sector is the first raid sector of the modified data, data_size is the
 * total size of modified data and pp_size is the size of partial parity for
 * this entry. Entries for full stripe writes contain no partial parity
 * (pp_size = 0), they only mark the stripes for which parity should be
 * recalculated after an unclean shutdown. Every entry holds a checksum of its
 * partial parity, the header also has a checksum of the header itself.
 *
 * A write request is always logged to the PPL instance stored on the parity
 * disk of the corresponding stripe. For each member disk there is one ppl_log
 * used to handle logging for this disk, independently from others. They are
 * grouped in child_logs array in struct ppl_conf, which is assigned to
 * r5conf->log_private.
 *
 * ppl_io_unit represents a full PPL write, header_page contains the ppl_header.
 * PPL entries for logged stripes are added in ppl_log_stripe(). A stripe_head
 * can be appended to the last entry if it meets the conditions for a valid
 * entry described above, otherwise a new entry is added. Checksums of entries
 * are calculated incrementally as stripes containing partial parity are being
 * added. ppl_submit_iounit() calculates the checksum of the header and submits
 * a bio containing the header page and partial parity pages (sh->ppl_page) for
 * all stripes of the io_unit. When the PPL write completes, the stripes
 * associated with the io_unit are released and raid5d starts writing their data
 * and parity. When all stripes are written, the io_unit is freed and the next
 * can be submitted.
 *
 * An io_unit is used to gather stripes until it is submitted or becomes full
 * (if the maximum number of entries or size of PPL is reached). Another io_unit
 * can't be submitted until the previous has completed (PPL and stripe
 * data+parity is written). The log->io_list tracks all io_units of a log
 * (for a single member disk). New io_units are added to the end of the list
 * and the first io_unit is submitted, if it is not submitted already.
 * The current io_unit accepting new stripes is always at the end of the list.
 *
 * If write-back cache is enabled for any of the disks in the array, its data
 * must be flushed before next io_unit is submitted.
 */

#define PPL_SPACE_SIZE

struct ppl_conf {};

struct ppl_log {};

#define PPL_IO_INLINE_BVECS

struct ppl_io_unit {};

struct dma_async_tx_descriptor *
ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
		       struct dma_async_tx_descriptor *tx)
{}

static void *ppl_io_pool_alloc(gfp_t gfp_mask, void *pool_data)
{}

static void ppl_io_pool_free(void *element, void *pool_data)
{}

static struct ppl_io_unit *ppl_new_iounit(struct ppl_log *log,
					  struct stripe_head *sh)
{}

static int ppl_log_stripe(struct ppl_log *log, struct stripe_head *sh)
{}

int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh)
{}

static void ppl_log_endio(struct bio *bio)
{}

static void ppl_submit_iounit_bio(struct ppl_io_unit *io, struct bio *bio)
{}

static void ppl_submit_iounit(struct ppl_io_unit *io)
{}

static void ppl_submit_current_io(struct ppl_log *log)
{}

void ppl_write_stripe_run(struct r5conf *conf)
{}

static void ppl_io_unit_finished(struct ppl_io_unit *io)
{}

static void ppl_flush_endio(struct bio *bio)
{}

static void ppl_do_flush(struct ppl_io_unit *io)
{}

static inline bool ppl_no_io_unit_submitted(struct r5conf *conf,
					    struct ppl_log *log)
{}

void ppl_quiesce(struct r5conf *conf, int quiesce)
{}

int ppl_handle_flush_request(struct bio *bio)
{}

void ppl_stripe_write_finished(struct stripe_head *sh)
{}

static void ppl_xor(int size, struct page *page1, struct page *page2)
{}

/*
 * PPL recovery strategy: xor partial parity and data from all modified data
 * disks within a stripe and write the result as the new stripe parity. If all
 * stripe data disks are modified (full stripe write), no partial parity is
 * available, so just xor the data disks.
 *
 * Recovery of a PPL entry shall occur only if all modified data disks are
 * available and read from all of them succeeds.
 *
 * A PPL entry applies to a stripe, partial parity size for an entry is at most
 * the size of the chunk. Examples of possible cases for a single entry:
 *
 * case 0: single data disk write:
 *   data0    data1    data2     ppl        parity
 * +--------+--------+--------+           +--------------------+
 * | ------ | ------ | ------ | +----+    | (no change)        |
 * | ------ | -data- | ------ | | pp | -> | data1 ^ pp         |
 * | ------ | -data- | ------ | | pp | -> | data1 ^ pp         |
 * | ------ | ------ | ------ | +----+    | (no change)        |
 * +--------+--------+--------+           +--------------------+
 * pp_size = data_size
 *
 * case 1: more than one data disk write:
 *   data0    data1    data2     ppl        parity
 * +--------+--------+--------+           +--------------------+
 * | ------ | ------ | ------ | +----+    | (no change)        |
 * | -data- | -data- | ------ | | pp | -> | data0 ^ data1 ^ pp |
 * | -data- | -data- | ------ | | pp | -> | data0 ^ data1 ^ pp |
 * | ------ | ------ | ------ | +----+    | (no change)        |
 * +--------+--------+--------+           +--------------------+
 * pp_size = data_size / modified_data_disks
 *
 * case 2: write to all data disks (also full stripe write):
 *   data0    data1    data2                parity
 * +--------+--------+--------+           +--------------------+
 * | ------ | ------ | ------ |           | (no change)        |
 * | -data- | -data- | -data- | --------> | xor all data       |
 * | ------ | ------ | ------ | --------> | (no change)        |
 * | ------ | ------ | ------ |           | (no change)        |
 * +--------+--------+--------+           +--------------------+
 * pp_size = 0
 *
 * The following cases are possible only in other implementations. The recovery
 * code can handle them, but they are not generated at runtime because they can
 * be reduced to cases 0, 1 and 2:
 *
 * case 3:
 *   data0    data1    data2     ppl        parity
 * +--------+--------+--------+ +----+    +--------------------+
 * | ------ | -data- | -data- | | pp |    | data1 ^ data2 ^ pp |
 * | ------ | -data- | -data- | | pp | -> | data1 ^ data2 ^ pp |
 * | -data- | -data- | -data- | | -- | -> | xor all data       |
 * | -data- | -data- | ------ | | pp |    | data0 ^ data1 ^ pp |
 * +--------+--------+--------+ +----+    +--------------------+
 * pp_size = chunk_size
 *
 * case 4:
 *   data0    data1    data2     ppl        parity
 * +--------+--------+--------+ +----+    +--------------------+
 * | ------ | -data- | ------ | | pp |    | data1 ^ pp         |
 * | ------ | ------ | ------ | | -- | -> | (no change)        |
 * | ------ | ------ | ------ | | -- | -> | (no change)        |
 * | -data- | ------ | ------ | | pp |    | data0 ^ pp         |
 * +--------+--------+--------+ +----+    +--------------------+
 * pp_size = chunk_size
 */
static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e,
			     sector_t ppl_sector)
{}

static int ppl_recover(struct ppl_log *log, struct ppl_header *pplhdr,
		       sector_t offset)
{}

static int ppl_write_empty_header(struct ppl_log *log)
{}

static int ppl_load_distributed(struct ppl_log *log)
{}

static int ppl_load(struct ppl_conf *ppl_conf)
{}

static void __ppl_exit_log(struct ppl_conf *ppl_conf)
{}

void ppl_exit_log(struct r5conf *conf)
{}

static int ppl_validate_rdev(struct md_rdev *rdev)
{}

static void ppl_init_child_log(struct ppl_log *log, struct md_rdev *rdev)
{}

int ppl_init_log(struct r5conf *conf)
{}

int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add)
{}

static ssize_t
ppl_write_hint_show(struct mddev *mddev, char *buf)
{}

static ssize_t
ppl_write_hint_store(struct mddev *mddev, const char *page, size_t len)
{}

struct md_sysfs_entry
ppl_write_hint =;