// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2015 Shaohua Li <[email protected]>
 * Copyright (C) 2016 Song Liu <[email protected]>
 */
#include <linux/kernel.h>
#include <linux/wait.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/raid/md_p.h>
#include <linux/crc32c.h>
#include <linux/random.h>
#include <linux/kthread.h>
#include <linux/types.h>
#include "md.h"
#include "raid5.h"
#include "md-bitmap.h"
#include "raid5-log.h"

/*
 * Metadata/data are stored on disk in 4k units (blocks) regardless of the
 * underlying hardware sector size. This only works with PAGE_SIZE == 4096.
 */
#define BLOCK_SECTORS …
#define BLOCK_SECTOR_SHIFT …

/*
 * log->max_free_space is min(1/4 disk size, 10G reclaimable space).
 *
 * In write-through mode, the reclaim runs every log->max_free_space,
 * which keeps recovery from having to scan too much of the log.
 */
#define RECLAIM_MAX_FREE_SPACE …
#define RECLAIM_MAX_FREE_SPACE_SHIFT …

/* wake up reclaim thread periodically */
#define R5C_RECLAIM_WAKEUP_INTERVAL …

/* start flush with these full stripes */
#define R5C_FULL_STRIPE_FLUSH_BATCH(conf) …

/* reclaim stripes in groups */
#define R5C_RECLAIM_STRIPE_GROUP …

/*
 * We only need 2 bios per I/O unit to make progress, but ensure we
 * have a few more available so things don't get too tight.
 */
#define R5L_POOL_SIZE …

static char *r5c_journal_mode_str[] = …;

/*
 * raid5 cache state machine
 *
 * With the RAID cache, each stripe works in two phases:
 *	- caching phase
 *	- writing-out phase
 *
 * These two phases are controlled by bit STRIPE_R5C_CACHING:
 *	if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase
 *	if STRIPE_R5C_CACHING == 1, the stripe is in caching phase
 *
 * When there is no journal, or the journal is in write-through mode,
 * the stripe is always in writing-out phase.
 *
 * For write-back journal, the stripe is sent to caching phase on write
 * (r5c_try_caching_write). r5c_make_stripe_write_out() kicks off
 * the write-out phase by clearing STRIPE_R5C_CACHING.
 *
 * Stripes in caching phase do not write the raid disks. Instead, all
 * writes are committed from the log device. Therefore, a stripe in
 * caching phase handles writes as:
 *	- write to log device
 *	- return IO
 *
 * Stripes in writing-out phase handle writes as:
 *	- calculate parity
 *	- write pending data and parity to journal
 *	- write data and parity to raid disks
 *	- return IO for pending writes
 */
struct r5l_log { … };

/*
 * Enable chunk_aligned_read() with write back cache.
 *
 * Each chunk may contain more than one stripe (for example, a 256kB
 * chunk contains 64 4kB pages, so this chunk contains 64 stripes). For
 * chunk_aligned_read, these stripes are grouped into one "big_stripe".
 * For each big_stripe, we count how many stripes of this big_stripe
 * are in the write back cache. This count is tracked in a radix tree
 * (big_stripe_tree). We use the radix_tree item pointer as the counter.
 * r5c_tree_index() is used to calculate keys for the radix tree.
 *
 * chunk_aligned_read() calls r5c_big_stripe_cached() to look up the
 * big_stripe of each chunk in the tree. If this big_stripe is in the
 * tree, chunk_aligned_read() aborts. This lookup is protected by
 * rcu_read_lock().
 *
 * It is necessary to remember whether a stripe is counted in
 * big_stripe_tree. Instead of adding a new flag, we reuse existing flags:
 * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE. If either of these
 * two flags is set, the stripe is counted in big_stripe_tree. This
 * requires moving set_bit(STRIPE_R5C_PARTIAL_STRIPE) to
 * r5c_try_caching_write(), and moving the clear_bit of
 * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE to
 * r5c_finish_stripe_write_out().
 */
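/*
 * Usage sketch only (hypothetical helper, not part of the driver): how a
 * read path can consult the big_stripe accounting described above before
 * bypassing the stripe cache. r5c_big_stripe_cached() (defined later in
 * this file) returns true while any stripe of the covering big_stripe is
 * still in the write-back cache, in which case the read has to go through
 * the stripe cache to observe the cached data.
 */
static inline bool example_may_bypass_stripe_cache(struct r5conf *conf,
						   sector_t read_sector)
{
	return !r5c_big_stripe_cached(conf, read_sector);
}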
/*
 * The radix tree requires the lowest 2 bits of the data pointer to be 2b'00,
 * so it is necessary to left shift the counter by 2 bits before using it
 * as the data pointer of the tree.
 */
#define R5C_RADIX_COUNT_SHIFT …

/*
 * calculate key for big_stripe_tree
 *
 * sect: align_bi->bi_iter.bi_sector or sh->sector
 */
static inline sector_t r5c_tree_index(struct r5conf *conf, sector_t sect) { … }

/*
 * An IO range starts at a meta data block and ends at the next meta data
 * block. The io_unit's meta data block tracks the data/parity that follows
 * it. The io_unit is written to the log disk with a normal write; since we
 * always flush the log disk first and only then start moving data to the
 * raid disks, there is no requirement to write the io_unit with FLUSH/FUA.
 */
struct r5l_io_unit { … };

/* r5l_io_unit state */
enum r5l_io_unit_state { … };

bool r5c_is_writeback(struct r5l_log *log) { … }

static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc) { … }

static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start, sector_t end) { … }

static bool r5l_has_free_space(struct r5l_log *log, sector_t size) { … }

static void __r5l_set_io_unit_state(struct r5l_io_unit *io, enum r5l_io_unit_state state) { … }

static void r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev) { … }

void r5c_handle_cached_data_endio(struct r5conf *conf, struct stripe_head *sh, int disks) { … }

void r5l_wake_reclaim(struct r5l_log *log, sector_t space);

/* Check whether we should flush some stripes to free up stripe cache */
void r5c_check_stripe_cache_usage(struct r5conf *conf) { … }

/*
 * flush cache when there are R5C_FULL_STRIPE_FLUSH_BATCH or more full
 * stripes in the cache
 */
void r5c_check_cached_full_stripe(struct r5conf *conf) { … }

/*
 * Total log space (in sectors) needed to flush all data in cache
 *
 * To avoid deadlock due to log space, it is necessary to reserve log
 * space to flush critical stripes (stripes occupying log space near
 * last_checkpoint). This function helps check how much log space is
 * required to flush all cached stripes.
 *
 * To reduce log space requirements, two mechanisms are used to give cache
 * flushes higher priority:
 *    1. In handle_stripe_dirtying() and schedule_reconstruction(),
 *       stripes ALREADY in journal can be flushed w/o pending writes;
 *    2. In r5l_write_stripe() and r5c_cache_data(), stripes NOT in journal
 *       can be delayed (r5l_add_no_space_stripe).
 *
 * In cache flush, the stripe goes through 1 and then 2. For a stripe that
 * has already passed 1, flushing it requires at most (conf->max_degraded + 1)
 * pages of journal space. For a stripe that has not passed 1, flushing it
 * requires (conf->raid_disks + 1) pages of journal space. At most
 * (conf->group_cnt + 1) stripes have not yet passed 1. So the total journal
 * space required to flush all cached stripes (in pages) is:
 *
 *     (stripe_in_journal_count - group_cnt - 1) * (max_degraded + 1) +
 *     (group_cnt + 1) * (raid_disks + 1)
 * or
 *     (stripe_in_journal_count) * (max_degraded + 1) +
 *     (group_cnt + 1) * (raid_disks - max_degraded)
 */
static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf) { … }
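/*
 * Worked form of the bound above as a helper (illustrative only; the name
 * and the "pages" unit are assumptions made here, and the real function
 * returns sectors computed from the live per-array counters).
 */
static inline unsigned long example_flush_space_pages(unsigned long stripes_in_journal,
						      int group_cnt,
						      int raid_disks,
						      int max_degraded)
{
	return stripes_in_journal * (max_degraded + 1) +
	       (unsigned long)(group_cnt + 1) * (raid_disks - max_degraded);
}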
/*
 * evaluate log space usage and update R5C_LOG_TIGHT and R5C_LOG_CRITICAL
 *
 * R5C_LOG_TIGHT is set when free space on the log device is less than 3x of
 * reclaim_required_space. R5C_LOG_CRITICAL is set when free space on the log
 * device is less than 2x of reclaim_required_space.
 */
static inline void r5c_update_log_state(struct r5l_log *log) { … }

/*
 * Put the stripe into writing-out phase by clearing STRIPE_R5C_CACHING.
 * This function should only be called in write-back mode.
 */
void r5c_make_stripe_write_out(struct stripe_head *sh) { … }

static void r5c_handle_data_cached(struct stripe_head *sh) { … }

/*
 * this journal write must contain full parity,
 * it may also contain some data pages
 */
static void r5c_handle_parity_cached(struct stripe_head *sh) { … }

/*
 * Setting proper flags after writing (or flushing) data and/or parity to the
 * log device. This is called from r5l_log_endio() or r5l_log_flush_endio().
 */
static void r5c_finish_cache_stripe(struct stripe_head *sh) { … }

static void r5l_io_run_stripes(struct r5l_io_unit *io) { … }

static void r5l_log_run_stripes(struct r5l_log *log) { … }

static void r5l_move_to_end_ios(struct r5l_log *log) { … }

static void __r5l_stripe_write_finished(struct r5l_io_unit *io);

static void r5l_log_endio(struct bio *bio) { … }

static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io) { … }

/* deferred io_unit will be dispatched here */
static void r5l_submit_io_async(struct work_struct *work) { … }

static void r5c_disable_writeback_async(struct work_struct *work) { … }

static void r5l_submit_current_io(struct r5l_log *log) { … }

static struct bio *r5l_bio_alloc(struct r5l_log *log) { … }

static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io) { … }

static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log) { … }

static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size) { … }

static void r5l_append_payload_meta(struct r5l_log *log, u16 type, sector_t location, u32 checksum1, u32 checksum2, bool checksum2_valid) { … }

static void r5l_append_payload_page(struct r5l_log *log, struct page *page) { … }

static void r5l_append_flush_payload(struct r5l_log *log, sector_t sect) { … }

static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh, int data_pages, int parity_pages) { … }

/* add stripe to no_space_stripes, and then wake up reclaim */
static inline void r5l_add_no_space_stripe(struct r5l_log *log, struct stripe_head *sh) { … }

/*
 * running in raid5d, where reclaim could wait for raid5d too (when it flushes
 * data from log to raid disks), so we shouldn't wait for reclaim here
 */
int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh) { … }

void r5l_write_stripe_run(struct r5l_log *log) { … }

int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio) { … }

/* This will run after log space is reclaimed */
static void r5l_run_no_space_stripes(struct r5l_log *log) { … }

/*
 * calculate new last_checkpoint
 * for write through mode, returns log->next_checkpoint
 * for write back, returns log_start of first sh in stripe_in_journal_list
 */
static sector_t r5c_calculate_new_cp(struct r5conf *conf) { … }

static sector_t r5l_reclaimable_space(struct r5l_log *log) { … }

static void r5l_run_no_mem_stripe(struct r5l_log *log) { … }

static bool r5l_complete_finished_ios(struct r5l_log *log) { … }

static void __r5l_stripe_write_finished(struct r5l_io_unit *io) { … }

void r5l_stripe_write_finished(struct stripe_head *sh) { … }

static void r5l_log_flush_endio(struct bio *bio) { … }
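/*
 * Illustrative sketch only (hypothetical helpers): the circular-log
 * arithmetic that the checkpoint/reclaimable-space bookkeeping above boils
 * down to. "device_size" stands in for log->device_size; the driver itself
 * uses r5l_ring_add()/r5l_ring_distance() on the log.
 */
static inline sector_t example_ring_add(sector_t device_size, sector_t start,
					sector_t inc)
{
	start += inc;
	if (start >= device_size)
		start -= device_size;		/* wrap around the ring */
	return start;
}

static inline sector_t example_ring_distance(sector_t device_size,
					     sector_t start, sector_t end)
{
	/* forward distance from "start" to "end" around the ring */
	return end >= start ? end - start : end + device_size - start;
}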
/*
 * Start dispatching IO to raid.
 * A log consists of io_units, each headed by a meta block. There is one
 * situation we want to avoid: a broken meta block in the middle of the log
 * prevents recovery from finding the meta blocks that follow it toward the
 * head of the log. So if an operation requires a meta block near the head
 * to be persistent in the log, we must make sure every meta block before it
 * is persistent in the log too. A case is:
 *
 * stripe data/parity is in the log and we start writing the stripe to the
 * raid disks. The stripe's data/parity must be persistent in the log before
 * we do the write to the raid disks.
 *
 * The solution is to strictly maintain io_unit list order. In this case, we
 * only write stripes of an io_unit to the raid disks until that io_unit is
 * the first one whose data/parity is in the log.
 */
void r5l_flush_stripe_to_raid(struct r5l_log *log) { … }

static void r5l_write_super(struct r5l_log *log, sector_t cp);

static void r5l_write_super_and_discard_space(struct r5l_log *log, sector_t end) { … }

/*
 * r5c_flush_stripe moves stripe from cached list to handle_list. When called,
 * the stripe must be on r5c_cached_full_stripes or r5c_cached_partial_stripes.
 *
 * must hold conf->device_lock
 */
static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh) { … }

/*
 * if num == 0, flush all full stripes
 * if num > 0, flush all full stripes. If fewer than num full stripes are
 *     flushed, flush some partial stripes until a total of num stripes are
 *     flushed or there are no more cached stripes.
 */
void r5c_flush_cache(struct r5conf *conf, int num) { … }

static void r5c_do_reclaim(struct r5conf *conf) { … }

static void r5l_do_reclaim(struct r5l_log *log) { … }

static void r5l_reclaim_thread(struct md_thread *thread) { … }

void r5l_wake_reclaim(struct r5l_log *log, sector_t space) { … }

void r5l_quiesce(struct r5l_log *log, int quiesce) { … }

bool r5l_log_disk_error(struct r5conf *conf) { … }

#define R5L_RECOVERY_PAGE_POOL_SIZE …

struct r5l_recovery_ctx { … };

static int r5l_recovery_allocate_ra_pool(struct r5l_log *log, struct r5l_recovery_ctx *ctx) { … }

static void r5l_recovery_free_ra_pool(struct r5l_log *log, struct r5l_recovery_ctx *ctx) { … }

/*
 * fetch ctx->valid_pages pages starting from offset
 * In normal cases, ctx->valid_pages == ctx->total_pages after the call.
 * However, if the offset is close to the end of the journal device,
 * ctx->valid_pages could be smaller than ctx->total_pages.
 */
static int r5l_recovery_fetch_ra_pool(struct r5l_log *log, struct r5l_recovery_ctx *ctx, sector_t offset) { … }

/*
 * Try to read a page from the read-ahead page pool; if the page is not in
 * the pool, call r5l_recovery_fetch_ra_pool.
 */
static int r5l_recovery_read_page(struct r5l_log *log, struct r5l_recovery_ctx *ctx, struct page *page, sector_t offset) { … }

static int r5l_recovery_read_meta_block(struct r5l_log *log, struct r5l_recovery_ctx *ctx) { … }

static void r5l_recovery_create_empty_meta_block(struct r5l_log *log, struct page *page, sector_t pos, u64 seq) { … }

static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos, u64 seq) { … }
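/*
 * Sketch only (assumed field usage, simplified): roughly what an "empty"
 * meta block contains, using struct r5l_meta_block from
 * <linux/raid/md_p.h>. The real code fills a whole page and also stores a
 * crc32c checksum of the block seeded with the log's UUID checksum.
 */
static inline void example_fill_empty_meta(struct r5l_meta_block *mb,
					   sector_t pos, u64 seq)
{
	memset(mb, 0, sizeof(*mb));
	mb->magic = cpu_to_le32(R5LOG_MAGIC);
	mb->version = R5LOG_VERSION;
	mb->meta_size = cpu_to_le32(sizeof(*mb));
	mb->seq = cpu_to_le64(seq);		/* monotonically increasing */
	mb->position = cpu_to_le64(pos);	/* sector on the log device */
}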
/*
 * r5l_recovery_load_data and r5l_recovery_load_parity use the R5_Wantwrite
 * flag to mark valid (potentially not flushed) data in the journal.
 *
 * We already verified the checksums in r5l_recovery_verify_data_checksum_for_mb,
 * so there should not be any mismatch here.
 */
static void r5l_recovery_load_data(struct r5l_log *log, struct stripe_head *sh, struct r5l_recovery_ctx *ctx, struct r5l_payload_data_parity *payload, sector_t log_offset) { … }

static void r5l_recovery_load_parity(struct r5l_log *log, struct stripe_head *sh, struct r5l_recovery_ctx *ctx, struct r5l_payload_data_parity *payload, sector_t log_offset) { … }

static void r5l_recovery_reset_stripe(struct stripe_head *sh) { … }

static void r5l_recovery_replay_one_stripe(struct r5conf *conf, struct stripe_head *sh, struct r5l_recovery_ctx *ctx) { … }

static struct stripe_head *r5c_recovery_alloc_stripe(struct r5conf *conf, sector_t stripe_sect, int noblock) { … }

static struct stripe_head *r5c_recovery_lookup_stripe(struct list_head *list, sector_t sect) { … }

static void r5c_recovery_drop_stripes(struct list_head *cached_stripe_list, struct r5l_recovery_ctx *ctx) { … }

static void r5c_recovery_replay_stripes(struct list_head *cached_stripe_list, struct r5l_recovery_ctx *ctx) { … }

/* if it matches, return 0; otherwise return -EINVAL */
static int r5l_recovery_verify_data_checksum(struct r5l_log *log, struct r5l_recovery_ctx *ctx, struct page *page, sector_t log_offset, __le32 log_checksum) { … }

/*
 * Before loading data into the stripe cache, we need to verify the checksum
 * of all data; if there is a mismatch for any data page, we drop all data in
 * the meta block.
 */
static int r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log, struct r5l_recovery_ctx *ctx) { … }

/*
 * Analyze all data/parity pages in one meta block
 * Returns:
 * 0 for success
 * -EINVAL for an unknown payload type
 * -EAGAIN for a checksum mismatch of a data page
 * -ENOMEM when out of memory (alloc_page failed or no free stripes)
 */
static int r5c_recovery_analyze_meta_block(struct r5l_log *log, struct r5l_recovery_ctx *ctx, struct list_head *cached_stripe_list) { … }

/*
 * Load the stripe into cache. The stripe will be written out later by
 * the stripe cache state machine.
 */
static void r5c_recovery_load_one_stripe(struct r5l_log *log, struct stripe_head *sh) { … }

/*
 * Scan through the log for all to-be-flushed data
 *
 * For stripes with data and parity, namely Data-Parity stripes
 * (STRIPE_R5C_CACHING == 0), we simply replay all the writes.
 *
 * For stripes with only data, namely Data-Only stripes
 * (STRIPE_R5C_CACHING == 1), we load them into the stripe cache state machine.
 *
 * For a stripe, if we see data after parity, we should discard all previous
 * data and parity for this stripe, as that data has already been flushed to
 * the array.
 *
 * At the end of the scan, we return the new journal_tail, which points to the
 * first data-only stripe on the journal device, or to the next invalid meta
 * block.
 */
static int r5c_recovery_flush_log(struct r5l_log *log, struct r5l_recovery_ctx *ctx) { … }

/*
 * We did a recovery. Now ctx.pos points to an invalid meta block. The new
 * log will start here, but we can't let the superblock point to the last
 * valid meta block. The log might look like:
 * | meta 1 | meta 2 | meta 3 |
 * meta 1 is valid, meta 2 is invalid, and meta 3 could be valid. If the
 * superblock points to meta 1 and we write a new valid meta 2n, then if a
 * crash happens again, the new recovery will start from meta 1. Since
 * meta 2n is valid now, recovery will think meta 3 is valid as well, which
 * is wrong.
 * The solution is to create a new meta block at meta 2's position with
 * seq == meta 1's seq + 10000 and let the superblock point to it. The same
 * recovery will then not treat meta 3 as a valid meta block, because its
 * seq doesn't match.
 */
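/*
 * Sketch of why the seq bump works (assumed checks, simplified): recovery
 * only accepts a meta block whose magic, position and sequence number all
 * match what it expects at that point of the scan, so stale blocks that
 * follow a rewritten checkpoint fail the seq check and end the scan. The
 * helper name is hypothetical; the real code additionally verifies the
 * crc32c checksum and the version field.
 */
static inline bool example_meta_block_looks_valid(struct r5l_meta_block *mb,
						  sector_t expected_pos,
						  u64 expected_seq)
{
	return le32_to_cpu(mb->magic) == R5LOG_MAGIC &&
	       le64_to_cpu(mb->seq) == expected_seq &&
	       le64_to_cpu(mb->position) == expected_pos;
}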
/*
 * Before recovery, the log looks like the following
 *
 *   ---------------------------------------------
 *   |           valid log        | invalid log  |
 *   ---------------------------------------------
 *   ^
 *   |- log->last_checkpoint
 *   |- log->last_cp_seq
 *
 * Now we scan through the log until we see an invalid entry
 *
 *   ---------------------------------------------
 *   |           valid log        | invalid log  |
 *   ---------------------------------------------
 *   ^                            ^
 *   |- log->last_checkpoint      |- ctx->pos
 *   |- log->last_cp_seq          |- ctx->seq
 *
 * From this point, we need to increase the seq number by 10000 to avoid
 * confusing the next recovery.
 *
 *   ---------------------------------------------
 *   |           valid log        | invalid log  |
 *   ---------------------------------------------
 *   ^                            ^
 *   |- log->last_checkpoint      |- ctx->pos+1
 *   |- log->last_cp_seq          |- ctx->seq+10001
 *
 * However, it is not safe to start the state machine yet, because data-only
 * stripes are not yet secured in RAID. To save these data-only stripes, we
 * rewrite them starting from seq+10001.
 *
 *   -----------------------------------------------------------------
 *   |           valid log        | data only stripes | invalid log  |
 *   -----------------------------------------------------------------
 *   ^                                                ^
 *   |- log->last_checkpoint                          |- ctx->pos+n
 *   |- log->last_cp_seq                              |- ctx->seq+10000+n
 *
 * If a failure happens again during this process, the recovery can safely
 * start again from log->last_checkpoint.
 *
 * Once the data-only stripes are rewritten to the journal, we move the
 * log tail:
 *
 *   -----------------------------------------------------------------
 *   |       old log        |    data only stripes    | invalid log  |
 *   -----------------------------------------------------------------
 *                          ^                         ^
 *                          |- log->last_checkpoint   |- ctx->pos+n
 *                          |- log->last_cp_seq       |- ctx->seq+10000+n
 *
 * Then we can safely start the state machine. If a failure happens from this
 * point on, the recovery will start from the new log->last_checkpoint.
 */
static int r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log, struct r5l_recovery_ctx *ctx) { … }

static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log, struct r5l_recovery_ctx *ctx) { … }

static int r5l_recovery_log(struct r5l_log *log) { … }

static void r5l_write_super(struct r5l_log *log, sector_t cp) { … }

static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page) { … }

/*
 * Set journal cache mode on @mddev (external API initially needed by dm-raid).
 *
 * @mode as defined in 'enum r5c_journal_mode'.
 */
int r5c_journal_mode_set(struct mddev *mddev, int mode) { … }
EXPORT_SYMBOL(…);

static ssize_t r5c_journal_mode_store(struct mddev *mddev, const char *page, size_t length) { … }

struct md_sysfs_entry r5c_journal_mode = …;

/*
 * Try to handle a write operation in the caching phase. This function should
 * only be called in write-back mode.
 *
 * If all outstanding writes can be handled in the caching phase, return 0.
 * If the writes require the write-out phase, call r5c_make_stripe_write_out()
 * and return -EAGAIN.
 */
int r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh, struct stripe_head_state *s, int disks) { … }

/*
 * free extra pages (orig_page) we allocated for prexor
 */
void r5c_release_extra_page(struct stripe_head *sh) { … }

void r5c_use_extra_page(struct stripe_head *sh) { … }
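/*
 * Usage sketch (hypothetical caller, error handling elided): switching a
 * journaled array to write-back caching through the exported helper above,
 * the way an external user such as dm-raid is expected to. The modes come
 * from enum r5c_journal_mode in raid5.h.
 */
static inline int example_enable_writeback_cache(struct mddev *mddev)
{
	return r5c_journal_mode_set(mddev, R5C_JOURNAL_MODE_WRITE_BACK);
}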
/*
 * clean up the stripe (clear R5_InJournal for dev[pd_idx] etc.) after the
 * stripe is committed to RAID disks.
 */
void r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh, struct stripe_head_state *s) { … }

int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh) { … }

/* check whether this big stripe is in write back cache. */
bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect) { … }

static int r5l_load_log(struct r5l_log *log) { … }

int r5l_start(struct r5l_log *log) { … }

void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev) { … }

int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) { … }

void r5l_exit_log(struct r5conf *conf) { … }