// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2015 Shaohua Li <[email protected]>
 * Copyright (C) 2016 Song Liu <[email protected]>
 */
#include <linux/kernel.h>
#include <linux/wait.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/raid/md_p.h>
#include <linux/crc32c.h>
#include <linux/random.h>
#include <linux/kthread.h>
#include <linux/types.h>
#include "md.h"
#include "raid5.h"
#include "md-bitmap.h"
#include "raid5-log.h"

/*
 * Metadata/data are stored on disk in 4k units (blocks) regardless of the
 * underlying hardware sector size. This only works with PAGE_SIZE == 4096.
 */
#define BLOCK_SECTORS …
#define BLOCK_SECTOR_SHIFT …

/*
 * log->max_free_space is min(1/4 disk size, 10G reclaimable space).
 *
 * In write-through mode, the reclaim runs every log->max_free_space,
 * which keeps recovery from having to scan too much of the log.
 */
#define RECLAIM_MAX_FREE_SPACE …
#define RECLAIM_MAX_FREE_SPACE_SHIFT …

/* wake up reclaim thread periodically */
#define R5C_RECLAIM_WAKEUP_INTERVAL …

/* start flush with these full stripes */
#define R5C_FULL_STRIPE_FLUSH_BATCH(conf) …

/* reclaim stripes in groups */
#define R5C_RECLAIM_STRIPE_GROUP …

/*
 * We only need 2 bios per I/O unit to make progress, but ensure we
 * have a few more available so things don't get too tight.
 */
#define R5L_POOL_SIZE …

static char *r5c_journal_mode_str[] = …;

/*
 * raid5 cache state machine
 *
 * With the RAID cache, each stripe works in two phases:
 *	- caching phase
 *	- writing-out phase
 *
 * These two phases are controlled by bit STRIPE_R5C_CACHING:
 *	if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase
 *	if STRIPE_R5C_CACHING == 1, the stripe is in caching phase
 *
 * When there is no journal, or the journal is in write-through mode,
 * the stripe is always in writing-out phase.
 *
 * For write-back journal, the stripe is sent to caching phase on write
 * (r5c_try_caching_write). r5c_make_stripe_write_out() kicks off
 * the write-out phase by clearing STRIPE_R5C_CACHING.
 *
 * Stripes in caching phase do not write the raid disks. Instead, all
 * writes are committed from the log device. Therefore, a stripe in
 * caching phase handles writes as:
 *	- write to log device
 *	- return IO
 *
 * Stripes in writing-out phase handle writes as:
 *	- calculate parity
 *	- write pending data and parity to journal
 *	- write data and parity to raid disks
 *	- return IO for pending writes
 */
struct r5l_log { … };

/*
 * Enable chunk_aligned_read() with write back cache.
 *
 * Each chunk may contain more than one stripe (for example, a 256kB
 * chunk contains 64 4kB pages, so this chunk contains 64 stripes). For
 * chunk_aligned_read, these stripes are grouped into one "big_stripe".
 * For each big_stripe, we count how many stripes of this big_stripe
 * are in the write back cache. This count is tracked in a radix tree
 * (big_stripe_tree). We use the radix_tree item pointer as the counter.
 * r5c_tree_index() is used to calculate keys for the radix tree.
 *
 * chunk_aligned_read() calls r5c_big_stripe_cached() to look up the
 * big_stripe of each chunk in the tree. If this big_stripe is in the
 * tree, chunk_aligned_read() aborts. This lookup is protected by
 * rcu_read_lock().
 *
 * It is necessary to remember whether a stripe is counted in
 * big_stripe_tree. Instead of adding a new flag, we reuse existing flags:
 * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE. If either of these
 * two flags is set, the stripe is counted in big_stripe_tree. This
 * requires moving set_bit(STRIPE_R5C_PARTIAL_STRIPE) to
 * r5c_try_caching_write(), and moving the clear_bit of
 * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE to
 * r5c_finish_stripe_write_out().
 */
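/*
 * Usage sketch only (hypothetical helper, not part of the driver): how a
 * read path can consult the big_stripe accounting described above before
 * bypassing the stripe cache. r5c_big_stripe_cached() (defined later in
 * this file) returns true while any stripe of the covering big_stripe is
 * still in the write-back cache, in which case the read has to go through
 * the stripe cache to observe the cached data.
 */
static inline bool example_may_bypass_stripe_cache(struct r5conf *conf,
						   sector_t read_sector)
{
	return !r5c_big_stripe_cached(conf, read_sector);
}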
/*
 * The radix tree requires the lowest 2 bits of the data pointer to be 2b'00,
 * so it is necessary to left shift the counter by 2 bits before using it
 * as the data pointer of the tree.
 */
#define R5C_RADIX_COUNT_SHIFT …

/*
 * calculate key for big_stripe_tree
 *
 * sect: align_bi->bi_iter.bi_sector or sh->sector
 */
static inline sector_t r5c_tree_index(struct r5conf *conf, sector_t sect) { … }

/*
 * An IO range starts at a meta data block and ends at the next meta data
 * block. The io_unit's meta data block tracks the data/parity that follows
 * it. The io_unit is written to the log disk with a normal write; since we
 * always flush the log disk first and only then start moving data to the
 * raid disks, there is no requirement to write the io_unit with FLUSH/FUA.
 */
struct r5l_io_unit { … };

/* r5l_io_unit state */
enum r5l_io_unit_state { … };

bool r5c_is_writeback(struct r5l_log *log) { … }

static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc) { … }

static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start, sector_t end) { … }

static bool r5l_has_free_space(struct r5l_log *log, sector_t size) { … }

static void __r5l_set_io_unit_state(struct r5l_io_unit *io, enum r5l_io_unit_state state) { … }

static void r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev) { … }

void r5c_handle_cached_data_endio(struct r5conf *conf, struct stripe_head *sh, int disks) { … }

void r5l_wake_reclaim(struct r5l_log *log, sector_t space);

/* Check whether we should flush some stripes to free up stripe cache */
void r5c_check_stripe_cache_usage(struct r5conf *conf) { … }

/*
 * flush cache when there are R5C_FULL_STRIPE_FLUSH_BATCH or more full
 * stripes in the cache
 */
void r5c_check_cached_full_stripe(struct r5conf *conf) { … }

/*
 * Total log space (in sectors) needed to flush all data in cache
 *
 * To avoid deadlock due to log space, it is necessary to reserve log
 * space to flush critical stripes (stripes occupying log space near
 * last_checkpoint). This function helps check how much log space is
 * required to flush all cached stripes.
 *
 * To reduce log space requirements, two mechanisms are used to give cache
 * flushes higher priority:
 *    1. In handle_stripe_dirtying() and schedule_reconstruction(),
 *       stripes ALREADY in journal can be flushed w/o pending writes;
 *    2. In r5l_write_stripe() and r5c_cache_data(), stripes NOT in journal
 *       can be delayed (r5l_add_no_space_stripe).
 *
 * In cache flush, the stripe goes through 1 and then 2. For a stripe that
 * has already passed 1, flushing it requires at most (conf->max_degraded + 1)
 * pages of journal space. For a stripe that has not passed 1, flushing it
 * requires (conf->raid_disks + 1) pages of journal space. At most
 * (conf->group_cnt + 1) stripes have not yet passed 1. So the total journal
 * space required to flush all cached stripes (in pages) is:
 *
 *     (stripe_in_journal_count - group_cnt - 1) * (max_degraded + 1) +
 *     (group_cnt + 1) * (raid_disks + 1)
 * or
 *     (stripe_in_journal_count) * (max_degraded + 1) +
 *     (group_cnt + 1) * (raid_disks - max_degraded)
 */
static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf) { … }
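/*
 * Worked form of the bound above as a helper (illustrative only; the name
 * and the "pages" unit are assumptions made here, and the real function
 * returns sectors computed from the live per-array counters).
 */
static inline unsigned long example_flush_space_pages(unsigned long stripes_in_journal,
						      int group_cnt,
						      int raid_disks,
						      int max_degraded)
{
	return stripes_in_journal * (max_degraded + 1) +
	       (unsigned long)(group_cnt + 1) * (raid_disks - max_degraded);
}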
/*
 * evaluate log space usage and update R5C_LOG_TIGHT and R5C_LOG_CRITICAL
 *
 * R5C_LOG_TIGHT is set when free space on the log device is less than 3x of
 * reclaim_required_space. R5C_LOG_CRITICAL is set when free space on the log
 * device is less than 2x of reclaim_required_space.
 */
static inline void r5c_update_log_state(struct r5l_log *log) { … }

/*
 * Put the stripe into writing-out phase by clearing STRIPE_R5C_CACHING.
 * This function should only be called in write-back mode.
 */
void r5c_make_stripe_write_out(struct stripe_head *sh) { … }

static void r5c_handle_data_cached(struct stripe_head *sh) { … }

/*
 * this journal write must contain full parity,
 * it may also contain some data pages
 */
static void r5c_handle_parity_cached(struct stripe_head *sh) { … }

/*
 * Setting proper flags after writing (or flushing) data and/or parity to the
 * log device. This is called from r5l_log_endio() or r5l_log_flush_endio().
 */
static void r5c_finish_cache_stripe(struct stripe_head *sh) { … }

static void r5l_io_run_stripes(struct r5l_io_unit *io) { … }

static void r5l_log_run_stripes(struct r5l_log *log) { … }

static void r5l_move_to_end_ios(struct r5l_log *log) { … }

static void __r5l_stripe_write_finished(struct r5l_io_unit *io);

static void r5l_log_endio(struct bio *bio) { … }

static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io) { … }

/* deferred io_unit will be dispatched here */
static void r5l_submit_io_async(struct work_struct *work) { … }

static void r5c_disable_writeback_async(struct work_struct *work) { … }

static void r5l_submit_current_io(struct r5l_log *log) { … }

static struct bio *r5l_bio_alloc(struct r5l_log *log) { … }

static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io) { … }

static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log) { … }

static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size) { … }

static void r5l_append_payload_meta(struct r5l_log *log, u16 type, sector_t location, u32 checksum1, u32 checksum2, bool checksum2_valid) { … }

static void r5l_append_payload_page(struct r5l_log *log, struct page *page) { … }

static void r5l_append_flush_payload(struct r5l_log *log, sector_t sect) { … }

static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh, int data_pages, int parity_pages) { … }

/* add stripe to no_space_stripes, and then wake up reclaim */
static inline void r5l_add_no_space_stripe(struct r5l_log *log, struct stripe_head *sh) { … }

/*
 * running in raid5d, where reclaim could wait for raid5d too (when it flushes
 * data from log to raid disks), so we shouldn't wait for reclaim here
 */
int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh) { … }

void r5l_write_stripe_run(struct r5l_log *log) { … }

int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio) { … }

/* This will run after log space is reclaimed */
static void r5l_run_no_space_stripes(struct r5l_log *log) { … }

/*
 * calculate new last_checkpoint
 * for write through mode, returns log->next_checkpoint
 * for write back, returns log_start of first sh in stripe_in_journal_list
 */
static sector_t r5c_calculate_new_cp(struct r5conf *conf) { … }

static sector_t r5l_reclaimable_space(struct r5l_log *log) { … }

static void r5l_run_no_mem_stripe(struct r5l_log *log) { … }

static bool r5l_complete_finished_ios(struct r5l_log *log) { … }

static void __r5l_stripe_write_finished(struct r5l_io_unit *io) { … }

void r5l_stripe_write_finished(struct stripe_head *sh) { … }

static void r5l_log_flush_endio(struct bio *bio) { … }
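/*
 * Illustrative sketch only (hypothetical helpers): the circular-log
 * arithmetic that the checkpoint/reclaimable-space bookkeeping above boils
 * down to. "device_size" stands in for log->device_size; the driver itself
 * uses r5l_ring_add()/r5l_ring_distance() on the log.
 */
static inline sector_t example_ring_add(sector_t device_size, sector_t start,
					sector_t inc)
{
	start += inc;
	if (start >= device_size)
		start -= device_size;		/* wrap around the ring */
	return start;
}

static inline sector_t example_ring_distance(sector_t device_size,
					     sector_t start, sector_t end)
{
	/* forward distance from "start" to "end" around the ring */
	return end >= start ? end - start : end + device_size - start;
}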
/*
 * Start dispatching IO to raid.
 * A log consists of io_units, each headed by a meta block. There is one
 * situation we want to avoid: a broken meta block in the middle of the log
 * prevents recovery from finding the meta blocks that follow it toward the
 * head of the log. So if an operation requires a meta block near the head
 * to be persistent in the log, we must make sure every meta block before it
 * is persistent in the log too. A case is:
 *
 * stripe data/parity is in the log and we start writing the stripe to the
 * raid disks. The stripe's data/parity must be persistent in the log before
 * we do the write to the raid disks.
 *
 * The solution is to strictly maintain io_unit list order. In this case, we
 * only write stripes of an io_unit to the raid disks until that io_unit is
 * the first one whose data/parity is in the log.
 */
void r5l_flush_stripe_to_raid(struct r5l_log *log) { … }

static void r5l_write_super(struct r5l_log *log, sector_t cp);

static void r5l_write_super_and_discard_space(struct r5l_log *log, sector_t end) { … }

/*
 * r5c_flush_stripe moves stripe from cached list to handle_list. When called,
 * the stripe must be on r5c_cached_full_stripes or r5c_cached_partial_stripes.
 *
 * must hold conf->device_lock
 */
static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh) { … }

/*
 * if num == 0, flush all full stripes
 * if num > 0, flush all full stripes. If fewer than num full stripes are
 *     flushed, flush some partial stripes until a total of num stripes are
 *     flushed or there are no more cached stripes.
 */
void r5c_flush_cache(struct r5conf *conf, int num) { … }

static void r5c_do_reclaim(struct r5conf *conf) { … }

static void r5l_do_reclaim(struct r5l_log *log) { … }

static void r5l_reclaim_thread(struct md_thread *thread) { … }

void r5l_wake_reclaim(struct r5l_log *log, sector_t space) { … }

void r5l_quiesce(struct r5l_log *log, int quiesce) { … }

bool r5l_log_disk_error(struct r5conf *conf) { … }

#define R5L_RECOVERY_PAGE_POOL_SIZE …

struct r5l_recovery_ctx { … };

static int r5l_recovery_allocate_ra_pool(struct r5l_log *log, struct r5l_recovery_ctx *ctx) { … }

static void r5l_recovery_free_ra_pool(struct r5l_log *log, struct r5l_recovery_ctx *ctx) { … }

/*
 * fetch ctx->valid_pages pages starting from offset
 * In normal cases, ctx->valid_pages == ctx->total_pages after the call.
 * However, if the offset is close to the end of the journal device,
 * ctx->valid_pages could be smaller than ctx->total_pages.
 */
static int r5l_recovery_fetch_ra_pool(struct r5l_log *log, struct r5l_recovery_ctx *ctx, sector_t offset) { … }

/*
 * Try to read a page from the read-ahead page pool; if the page is not in
 * the pool, call r5l_recovery_fetch_ra_pool.
 */
static int r5l_recovery_read_page(struct r5l_log *log, struct r5l_recovery_ctx *ctx, struct page *page, sector_t offset) { … }

static int r5l_recovery_read_meta_block(struct r5l_log *log, struct r5l_recovery_ctx *ctx) { … }

static void r5l_recovery_create_empty_meta_block(struct r5l_log *log, struct page *page, sector_t pos, u64 seq) { … }

static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos, u64 seq) { … }
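/*
 * Sketch only (assumed field usage, simplified): roughly what an "empty"
 * meta block contains, using struct r5l_meta_block from
 * <linux/raid/md_p.h>. The real code fills a whole page and also stores a
 * crc32c checksum of the block seeded with the log's UUID checksum.
 */
static inline void example_fill_empty_meta(struct r5l_meta_block *mb,
					   sector_t pos, u64 seq)
{
	memset(mb, 0, sizeof(*mb));
	mb->magic = cpu_to_le32(R5LOG_MAGIC);
	mb->version = R5LOG_VERSION;
	mb->meta_size = cpu_to_le32(sizeof(*mb));
	mb->seq = cpu_to_le64(seq);		/* monotonically increasing */
	mb->position = cpu_to_le64(pos);	/* sector on the log device */
}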
/*
 * r5l_recovery_load_data and r5l_recovery_load_parity use the R5_Wantwrite
 * flag to mark valid (potentially not flushed) data in the journal.
 *
 * We already verified the checksums in r5l_recovery_verify_data_checksum_for_mb,
 * so there should not be any mismatch here.
 */
static void r5l_recovery_load_data(struct r5l_log *log, struct stripe_head *sh, struct r5l_recovery_ctx *ctx, struct r5l_payload_data_parity *payload, sector_t log_offset) { … }

static void r5l_recovery_load_parity(struct r5l_log *log, struct stripe_head *sh, struct r5l_recovery_ctx *ctx, struct r5l_payload_data_parity *payload, sector_t log_offset) { … }

static void r5l_recovery_reset_stripe(struct stripe_head *sh) { … }

static void r5l_recovery_replay_one_stripe(struct r5conf *conf, struct stripe_head *sh, struct r5l_recovery_ctx *ctx) { … }

static struct stripe_head *r5c_recovery_alloc_stripe(struct r5conf *conf, sector_t stripe_sect, int noblock) { … }

static struct stripe_head *r5c_recovery_lookup_stripe(struct list_head *list, sector_t sect) { … }

static void r5c_recovery_drop_stripes(struct list_head *cached_stripe_list, struct r5l_recovery_ctx *ctx) { … }

static void r5c_recovery_replay_stripes(struct list_head *cached_stripe_list, struct r5l_recovery_ctx *ctx) { … }

/* if it matches, return 0; otherwise return -EINVAL */
static int r5l_recovery_verify_data_checksum(struct r5l_log *log, struct r5l_recovery_ctx *ctx, struct page *page, sector_t log_offset, __le32 log_checksum) { … }

/*
 * Before loading data into the stripe cache, we need to verify the checksum
 * of all data; if there is a mismatch for any data page, we drop all data in
 * the meta block.
 */
static int r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log, struct r5l_recovery_ctx *ctx) { … }

/*
 * Analyze all data/parity pages in one meta block
 * Returns:
 * 0 for success
 * -EINVAL for an unknown payload type
 * -EAGAIN for a checksum mismatch of a data page
 * -ENOMEM when out of memory (alloc_page failed or no free stripes)
 */
static int r5c_recovery_analyze_meta_block(struct r5l_log *log, struct r5l_recovery_ctx *ctx, struct list_head *cached_stripe_list) { … }

/*
 * Load the stripe into cache. The stripe will be written out later by
 * the stripe cache state machine.
 */
static void r5c_recovery_load_one_stripe(struct r5l_log *log, struct stripe_head *sh) { … }

/*
 * Scan through the log for all to-be-flushed data
 *
 * For stripes with data and parity, namely Data-Parity stripes
 * (STRIPE_R5C_CACHING == 0), we simply replay all the writes.
 *
 * For stripes with only data, namely Data-Only stripes
 * (STRIPE_R5C_CACHING == 1), we load them into the stripe cache state machine.
 *
 * For a stripe, if we see data after parity, we should discard all previous
 * data and parity for this stripe, as that data has already been flushed to
 * the array.
 *
 * At the end of the scan, we return the new journal_tail, which points to the
 * first data-only stripe on the journal device, or to the next invalid meta
 * block.
 */
static int r5c_recovery_flush_log(struct r5l_log *log, struct r5l_recovery_ctx *ctx) { … }

/*
 * We did a recovery. Now ctx.pos points to an invalid meta block. The new
 * log will start here, but we can't let the superblock point to the last
 * valid meta block. The log might look like:
 * | meta 1 | meta 2 | meta 3 |
 * meta 1 is valid, meta 2 is invalid, and meta 3 could be valid. If the
 * superblock points to meta 1 and we write a new valid meta 2n, then if a
 * crash happens again, the new recovery will start from meta 1. Since
 * meta 2n is valid now, recovery will think meta 3 is valid as well, which
 * is wrong.
 * The solution is to create a new meta block at meta 2's position with
 * seq == meta 1's seq + 10000 and let the superblock point to it. The same
 * recovery will then not treat meta 3 as a valid meta block, because its
 * seq doesn't match.
 */
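/*
 * Sketch of why the seq bump works (assumed checks, simplified): recovery
 * only accepts a meta block whose magic, position and sequence number all
 * match what it expects at that point of the scan, so stale blocks that
 * follow a rewritten checkpoint fail the seq check and end the scan. The
 * helper name is hypothetical; the real code additionally verifies the
 * crc32c checksum and the version field.
 */
static inline bool example_meta_block_looks_valid(struct r5l_meta_block *mb,
						  sector_t expected_pos,
						  u64 expected_seq)
{
	return le32_to_cpu(mb->magic) == R5LOG_MAGIC &&
	       le64_to_cpu(mb->seq) == expected_seq &&
	       le64_to_cpu(mb->position) == expected_pos;
}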
/*
 * Before recovery, the log looks like the following
 *
 *   ---------------------------------------------
 *   |           valid log        | invalid log  |
 *   ---------------------------------------------
 *   ^
 *   |- log->last_checkpoint
 *   |- log->last_cp_seq
 *
 * Now we scan through the log until we see an invalid entry
 *
 *   ---------------------------------------------
 *   |           valid log        | invalid log  |
 *   ---------------------------------------------
 *   ^                            ^
 *   |- log->last_checkpoint      |- ctx->pos
 *   |- log->last_cp_seq          |- ctx->seq
 *
 * From this point, we need to increase the seq number by 10000 to avoid
 * confusing the next recovery.
 *
 *   ---------------------------------------------
 *   |           valid log        | invalid log  |
 *   ---------------------------------------------
 *   ^                            ^
 *   |- log->last_checkpoint      |- ctx->pos+1
 *   |- log->last_cp_seq          |- ctx->seq+10001
 *
 * However, it is not safe to start the state machine yet, because data-only
 * stripes are not yet secured in RAID. To save these data-only stripes, we
 * rewrite them starting from seq+10001.
 *
 *   -----------------------------------------------------------------
 *   |           valid log        | data only stripes | invalid log  |
 *   -----------------------------------------------------------------
 *   ^                                                ^
 *   |- log->last_checkpoint                          |- ctx->pos+n
 *   |- log->last_cp_seq                              |- ctx->seq+10000+n
 *
 * If a failure happens again during this process, the recovery can safely
 * start again from log->last_checkpoint.
 *
 * Once the data-only stripes are rewritten to the journal, we move the
 * log tail:
 *
 *   -----------------------------------------------------------------
 *   |       old log        |    data only stripes    | invalid log  |
 *   -----------------------------------------------------------------
 *                          ^                         ^
 *                          |- log->last_checkpoint   |- ctx->pos+n
 *                          |- log->last_cp_seq       |- ctx->seq+10000+n
 *
 * Then we can safely start the state machine. If a failure happens from this
 * point on, the recovery will start from the new log->last_checkpoint.
 */
static int r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log, struct r5l_recovery_ctx *ctx) { … }

static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log, struct r5l_recovery_ctx *ctx) { … }

static int r5l_recovery_log(struct r5l_log *log) { … }

static void r5l_write_super(struct r5l_log *log, sector_t cp) { … }

static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page) { … }

/*
 * Set journal cache mode on @mddev (external API initially needed by dm-raid).
 *
 * @mode as defined in 'enum r5c_journal_mode'.
 */
int r5c_journal_mode_set(struct mddev *mddev, int mode) { … }
EXPORT_SYMBOL(…);

static ssize_t r5c_journal_mode_store(struct mddev *mddev, const char *page, size_t length) { … }

struct md_sysfs_entry r5c_journal_mode = …;

/*
 * Try to handle a write operation in the caching phase. This function should
 * only be called in write-back mode.
 *
 * If all outstanding writes can be handled in the caching phase, return 0.
 * If the writes require the write-out phase, call r5c_make_stripe_write_out()
 * and return -EAGAIN.
 */
int r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh, struct stripe_head_state *s, int disks) { … }

/*
 * free extra pages (orig_page) we allocated for prexor
 */
void r5c_release_extra_page(struct stripe_head *sh) { … }

void r5c_use_extra_page(struct stripe_head *sh) { … }
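/*
 * Usage sketch (hypothetical caller, error handling elided): switching a
 * journaled array to write-back caching through the exported helper above,
 * the way an external user such as dm-raid is expected to. The modes come
 * from enum r5c_journal_mode in raid5.h.
 */
static inline int example_enable_writeback_cache(struct mddev *mddev)
{
	return r5c_journal_mode_set(mddev, R5C_JOURNAL_MODE_WRITE_BACK);
}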
/*
 * clean up the stripe (clear R5_InJournal for dev[pd_idx] etc.) after the
 * stripe is committed to RAID disks.
 */
void r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh, struct stripe_head_state *s) { … }

int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh) { … }

/* check whether this big stripe is in write back cache. */
bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect) { … }

static int r5l_load_log(struct r5l_log *log) { … }

int r5l_start(struct r5l_log *log) { … }

void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev) { … }

int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) { … }

void r5l_exit_log(struct r5conf *conf) { … }