// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * raid10.c : Multiple Devices driver for Linux
 *
 * Copyright (C) 2000-2004 Neil Brown
 *
 * RAID-10 support for md.
 *
 * Based on code in raid1.c.  See raid1.c for further copyright information.
 */

#include <linux/slab.h>
#include <linux/delay.h>
#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/md_p.h>
#include <trace/events/block.h>
#include "md.h"

#define RAID_1_10_NAME …

#include "raid10.h"
#include "raid0.h"
#include "md-bitmap.h"

/*
 * RAID10 provides a combination of RAID0 and RAID1 functionality.
 * The layout of data is defined by
 *    chunk_size
 *    raid_disks
 *    near_copies (stored in low byte of layout)
 *    far_copies (stored in second byte of layout)
 *    far_offset (stored in bit 16 of layout)
 *    use_far_sets (stored in bit 17 of layout)
 *    use_far_sets_bugfixed (stored in bit 18 of layout)
 *
 * The data to be stored is divided into chunks using chunksize.  Each device
 * is divided into far_copies sections.  In each section, chunks are laid out
 * in a style similar to raid0, but near_copies copies of each chunk are stored
 * (each on a different drive).  The starting device for each section is offset
 * near_copies from the starting device of the previous section.  Thus there
 * are (near_copies * far_copies) copies of each chunk, and each is on a
 * different drive.  near_copies and far_copies must be at least one, and
 * their product is at most raid_disks.
 *
 * If far_offset is true, then the far_copies are handled a bit differently.
 * The copies are still in different stripes, but instead of being very far
 * apart on disk, they are in adjacent stripes.
 *
 * The far and offset algorithms are handled slightly differently if
 * 'use_far_sets' is true.  In this case, the array's devices are grouped into
 * sets that are (near_copies * far_copies) in size.  The far copied stripes
 * are still shifted by 'near_copies' devices, but this shifting stays confined
 * to the set rather than the entire array.  This is done to improve the number
 * of device combinations that can fail without causing the array to fail.
 * Example 'far' algorithm w/o 'use_far_sets' (each letter represents a chunk
 * on a device):
 *    A B C D    A B C D E
 *      ...         ...
 *    D A B C    E A B C D
 * Example 'far' algorithm w/ 'use_far_sets' enabled (sets illustrated w/ []'s):
 *    [A B] [C D]    [A B] [C D E]
 *    |...| |...|    |...| | ... |
 *    [B A] [D C]    [B A] [E C D]
 */
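
/*
 * Illustration (not part of the driver): unpacking the layout dword
 * described above.  The bit positions follow the comment; the helper name
 * and out-parameters are hypothetical, for exposition only.  E.g. the
 * common 'n2' layout is 0x102: far_copies = 1, near_copies = 2.
 */
static inline void layout_decode_sketch(int layout, int *near_copies,
					int *far_copies, bool *far_offset,
					bool *use_far_sets,
					bool *use_far_sets_bugfixed)
{
	*near_copies = layout & 0xff;			/* low byte */
	*far_copies = (layout >> 8) & 0xff;		/* second byte */
	*far_offset = layout & (1 << 16);		/* bit 16 */
	*use_far_sets = layout & (1 << 17);		/* bit 17 */
	*use_far_sets_bugfixed = layout & (1 << 18);	/* bit 18 */
}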

static void allow_barrier(struct r10conf *conf);
static void lower_barrier(struct r10conf *conf);
static int _enough(struct r10conf *conf, int previous, int ignore);
static int enough(struct r10conf *conf, int ignore);
static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
				int *skipped);
static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
static void end_reshape_write(struct bio *bio);
static void end_reshape(struct r10conf *conf);

#include "raid1-10.c"

#define NULL_CMD
#define cmd_before(conf, cmd) …
#define cmd_after(conf) …

#define wait_event_barrier_cmd(conf, cond, cmd) …

#define wait_event_barrier(conf, cond) …

/*
 * for resync bio, r10bio pointer can be retrieved from the per-bio
 * 'struct resync_pages'.
 */
static inline struct r10bio *get_resync_r10bio(struct bio *bio)
{ … }

static void *r10bio_pool_alloc(gfp_t gfp_flags, void *data)
{ … }

#define RESYNC_SECTORS …

/* amount of memory to reserve for resync requests */
#define RESYNC_WINDOW …

/* maximum number of concurrent requests, memory permitting */
#define RESYNC_DEPTH …

#define CLUSTER_RESYNC_WINDOW …
#define CLUSTER_RESYNC_WINDOW_SECTORS …

/*
 * When performing a resync, we need to read and compare, so
 * we need as many pages as there are copies.
 * When performing a recovery, we need 2 bios, one for read,
 * one for write (we recover only one drive per r10buf).
 */
static void *r10buf_pool_alloc(gfp_t gfp_flags, void *data)
{ … }

static void r10buf_pool_free(void *__r10_bio, void *data)
{ … }

static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
{ … }

static void free_r10bio(struct r10bio *r10_bio)
{ … }

static void put_buf(struct r10bio *r10_bio)
{ … }

static void wake_up_barrier(struct r10conf *conf)
{ … }

static void reschedule_retry(struct r10bio *r10_bio)
{ … }

/*
 * raid_end_bio_io() is called when we have finished servicing a mirrored
 * operation and are ready to return a success/failure code to the buffer
 * cache layer.
 */
static void raid_end_bio_io(struct r10bio *r10_bio)
{ … }

/*
 * Update disk head position estimator based on IRQ completion info.
 */
static inline void update_head_pos(int slot, struct r10bio *r10_bio)
{ … }

/*
 * Find the disk number which triggered the given bio
 */
static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
			 struct bio *bio, int *slotp, int *replp)
{ … }

static void raid10_end_read_request(struct bio *bio)
{ … }

static void close_write(struct r10bio *r10_bio)
{ … }

static void one_write_done(struct r10bio *r10_bio)
{ … }

static void raid10_end_write_request(struct bio *bio)
{ … }

/*
 * RAID10 layout manager
 * As well as the chunksize and raid_disks count, there are two
 * parameters: near_copies and far_copies.
 * near_copies * far_copies must be <= raid_disks.
 * Normally one of these will be 1.
 * If both are 1, we get raid0.
 * If near_copies == raid_disks, we get raid1.
 *
 * Chunks are laid out in raid0 style with near_copies copies of the
 * first chunk, followed by near_copies copies of the next chunk and
 * so on.
 * If far_copies > 1, then after 1/far_copies of the array has been assigned
 * as described above, we start again with a device offset of near_copies.
 * So we effectively have another copy of the whole array further down all
 * the drives, but with blocks on different drives.
 * With this layout, a block is never stored twice on the same device.
 * (A simplified sketch of this mapping follows raid10_find_virt() below.)
 *
 * raid10_find_phys finds the sector offset of a given virtual sector
 * on each device that it is on.
 *
 * raid10_find_virt does the reverse mapping, from a device and a
 * sector offset to a virtual address.
 */
static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
{ … }

static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)
{ … }

static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
{ … }
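
/*
 * Illustrative sketch of the mapping described above for the simple case:
 * no far sets, no far_offset, and near_copies dividing raid_disks (so a
 * chunk's near copies never wrap past the last device).  The structure and
 * helper are hypothetical stand-ins for struct geom and
 * __raid10_find_phys(), which also handle the wrap and far-set cases.
 */
struct geom_sketch {
	int raid_disks;
	int near_copies;
	int far_copies;
	int chunk_shift;	/* log2 of chunk size, in sectors */
	sector_t chunk_mask;	/* (1 << chunk_shift) - 1 */
	sector_t stride;	/* sectors in one far section of a device */
};

static void find_phys_sketch(struct geom_sketch *geo, sector_t virt)
{
	sector_t chunk = virt >> geo->chunk_shift;	/* virtual chunk */
	sector_t offset = virt & geo->chunk_mask;	/* offset in chunk */
	sector_t stripe = chunk * geo->near_copies;	/* raid0-style slot */
	int dev = sector_div(stripe, geo->raid_disks);	/* first device */
	sector_t base = (stripe << geo->chunk_shift) + offset;
	int n, f;

	for (n = 0; n < geo->near_copies; n++) {
		int d = (dev + n) % geo->raid_disks;

		/* each far copy sits one section further down the device,
		 * on a device shifted by near_copies */
		for (f = 0; f < geo->far_copies; f++)
			pr_info("copy %d/%d: dev %d, sector %llu\n", n, f,
				(d + f * geo->near_copies) % geo->raid_disks,
				(unsigned long long)(base + f * geo->stride));
	}
}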

/*
 * This routine returns the disk from which the requested read should
 * be done.  There is a per-array 'next expected sequential IO' sector
 * number - if this matches on the next IO then we use the last disk.
 * There is also a per-disk 'last known head position' sector that is
 * maintained from IRQ contexts; both the normal and the resync IO
 * completion handlers update this position correctly.  If there is no
 * perfect sequential match then we pick the disk whose head is closest.
 *
 * If there are 2 mirrors in the same 2 devices, performance degrades
 * because the position is tracked per mirror, not per device.
 *
 * The rdev for the device selected will have nr_pending incremented.
 */
/*
 * FIXME: possibly should rethink readbalancing and do it differently
 * depending on near_copies / far_copies geometry.
 */
static struct md_rdev *read_balance(struct r10conf *conf,
				    struct r10bio *r10_bio,
				    int *max_sectors)
{ … }

static void flush_pending_writes(struct r10conf *conf)
{ … }

/* Barriers....
 * Sometimes we need to suspend IO while we do something else,
 * either some resync/recovery, or reconfigure the array.
 * To do this we raise a 'barrier'.
 * The 'barrier' is a counter that can be raised multiple times
 * to count how many activities are happening which preclude
 * normal IO.
 * We can only raise the barrier if there is no pending IO,
 * i.e. if nr_pending == 0.
 * We choose only to raise the barrier if no-one is waiting for the
 * barrier to go down.  This means that as soon as an IO request
 * is ready, no other operations which require a barrier will start
 * until the IO request has had a chance.
 *
 * So: regular IO calls 'wait_barrier'.  When that returns there
 * is no background IO happening.  It must arrange to call
 * allow_barrier when it has finished its IO.
 * Background IO calls must call raise_barrier.  Once that returns
 * there is no normal IO happening.  It must arrange to call
 * lower_barrier when the particular background IO completes.
 * (A simplified model of this scheme is sketched after
 * unfreeze_array() below.)
 */
static void raise_barrier(struct r10conf *conf, int force)
{ … }

static void lower_barrier(struct r10conf *conf)
{ … }

static bool stop_waiting_barrier(struct r10conf *conf)
{ … }

static bool wait_barrier_nolock(struct r10conf *conf)
{ … }

static bool wait_barrier(struct r10conf *conf, bool nowait)
{ … }

static void allow_barrier(struct r10conf *conf)
{ … }

static void freeze_array(struct r10conf *conf, int extra)
{ … }

static void unfreeze_array(struct r10conf *conf)
{ … }
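
/*
 * A minimal model of the barrier scheme described above (see the comment
 * before raise_barrier()).  The structure and both helpers are hypothetical
 * simplifications: the real driver builds the same logic from the
 * wait_event_barrier*() macros plus a lock-free fast path
 * (wait_barrier_nolock()/stop_waiting_barrier()), and also bounds the
 * barrier depth during resync.
 */
struct barrier_sketch {
	spinlock_t lock;
	wait_queue_head_t wait;
	int barrier;		/* activities that exclude normal IO */
	int nr_pending;		/* normal IO in flight */
	int nr_waiting;		/* normal IO queued behind the barrier */
};

/* background IO (resync/recovery/reshape) enters here */
static void raise_barrier_sketch(struct barrier_sketch *b)
{
	spin_lock_irq(&b->lock);
	/* don't raise while regular IO is already queued up waiting */
	wait_event_lock_irq(b->wait, !b->nr_waiting, b->lock);
	b->barrier++;			/* block new regular IO */
	/* wait for in-flight regular IO to drain */
	wait_event_lock_irq(b->wait, !b->nr_pending, b->lock);
	spin_unlock_irq(&b->lock);
}

/* regular IO enters here; it pairs with a later decrement of nr_pending
 * (allow_barrier) that also wakes the queue */
static void wait_barrier_sketch(struct barrier_sketch *b)
{
	spin_lock_irq(&b->lock);
	if (b->barrier) {
		b->nr_waiting++;
		wait_event_lock_irq(b->wait, !b->barrier, b->lock);
		b->nr_waiting--;
	}
	b->nr_pending++;
	spin_unlock_irq(&b->lock);
}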

static sector_t choose_data_offset(struct r10bio *r10_bio,
				   struct md_rdev *rdev)
{ … }

static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
{ … }

/*
 * 1. Register the new request and wait if the reconstruction thread has put
 *    up a bar for new requests.  Continue immediately if no resync is active
 *    currently.
 * 2. If the IO spans the reshape position, wait for the reshape to pass.
 */
static bool regular_request_wait(struct mddev *mddev, struct r10conf *conf,
				 struct bio *bio, sector_t sectors)
{ … }

static void raid10_read_request(struct mddev *mddev, struct bio *bio,
				struct r10bio *r10_bio, bool io_accounting)
{ … }

static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
				  struct bio *bio, bool replacement,
				  int n_copy)
{ … }

static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio)
{ … }

static void raid10_write_request(struct mddev *mddev, struct bio *bio,
				 struct r10bio *r10_bio)
{ … }

static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
{ … }

static void raid_end_discard_bio(struct r10bio *r10bio)
{ … }

static void raid10_end_discard_request(struct bio *bio)
{ … }

/*
 * There are some limitations to handling a discard bio:
 * 1st, the discard size must be bigger than stripe_size * 2.
 * 2nd, if the discard bio spans the reshape position, we use the old way to
 *      handle the discard bio.
 */
static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
{ … }

static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
{ … }

static void raid10_status(struct seq_file *seq, struct mddev *mddev)
{ … }

/* check if there are enough drives for
 * every block to appear on at least one.
 * Don't consider the device numbered 'ignore'
 * as we might be about to remove it.
 * (A simplified sketch of this check follows enough() below.)
 */
static int _enough(struct r10conf *conf, int previous, int ignore)
{ … }

static int enough(struct r10conf *conf, int ignore)
{ … }
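
/*
 * A sketch of the test _enough() implements, for the common geometry
 * without far sets.  Each group of 'copies' (near_copies * far_copies)
 * consecutive devices, stepped through at near_copies intervals, holds all
 * copies of some range of blocks, so every such group must keep at least
 * one usable member.  'is_usable' is a hypothetical predicate standing in
 * for the rdev presence/In_sync/ignore checks the driver really performs.
 */
static int enough_sketch(int raid_disks, int near_copies, int copies,
			 bool (*is_usable)(int devnum))
{
	int first = 0;

	do {
		int n = copies, cnt = 0, this = first;

		/* count usable devices among the copies of this block range */
		while (n--) {
			if (is_usable(this))
				cnt++;
			this = (this + 1) % raid_disks;
		}
		if (cnt == 0)
			return 0;	/* some blocks have no live copy */
		first = (first + near_copies) % raid_disks;
	} while (first != 0);

	return 1;
}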

/**
 * raid10_error() - RAID10 error handler.
 * @mddev: affected md device.
 * @rdev: member device to fail.
 *
 * The routine acknowledges &rdev failure and determines new @mddev state.
 * If it failed, then:
 *	- &MD_BROKEN flag is set in &mddev->flags.
 * Otherwise, it must be degraded:
 *	- recovery is interrupted.
 *	- &mddev->degraded is bumped.
 *
 * @rdev is marked as &Faulty excluding case when array is failed and
 * &mddev->fail_last_dev is off.
 */
static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
{ … }

static void print_conf(struct r10conf *conf)
{ … }

static void close_sync(struct r10conf *conf)
{ … }

static int raid10_spare_active(struct mddev *mddev)
{ … }

static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
{ … }

static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
{ … }

static void __end_sync_read(struct r10bio *r10_bio, struct bio *bio, int d)
{ … }

static void end_sync_read(struct bio *bio)
{ … }

static void end_reshape_read(struct bio *bio)
{ … }

static void end_sync_request(struct r10bio *r10_bio)
{ … }

static void end_sync_write(struct bio *bio)
{ … }

/*
 * Note: sync and recovery are handled very differently for raid10.
 * This code is for resync.
 * For resync, we read through virtual addresses and read all blocks.
 * If there is any error, we schedule a write.  The lowest numbered
 * drive is authoritative.
 * However, requests come in for physical addresses, so we need to map.
 * For every physical address there are raid_disks/copies virtual addresses,
 * which is always at least one, but is not necessarily an integer.
 * This means that a physical address can span multiple chunks, so we may
 * have to submit multiple io requests for a single sync request.
 */
/*
 * We check if all blocks are in-sync and only write to blocks that
 * aren't in sync
 */
static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
{ … }

/*
 * Now for the recovery code.
 * Recovery happens across physical sectors.
 * We recover all non-in_sync drives by finding the virtual address of
 * each, and then choosing a working drive that also has that virt address.
 * There is a separate r10_bio for each non-in_sync drive.
 * Only the first two slots are in use.  The first for reading,
 * the second for writing.
 */
static void fix_recovery_read_error(struct r10bio *r10_bio)
{ … }

static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
{ … }

static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector, int sectors,
			    struct page *page, enum req_op op)
{ … }

/*
 * This is a kernel thread which:
 *
 *	1.	Retries failed read operations on working mirrors.
 *	2.	Updates the raid superblock when problems are encountered.
 *	3.	Performs writes following reads for array synchronising.
 */
static void fix_read_error(struct r10conf *conf, struct mddev *mddev,
			   struct r10bio *r10_bio)
{ … }

static int narrow_write_error(struct r10bio *r10_bio, int i)
{ … }

static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
{ … }

static void handle_write_completed(struct r10conf *conf,
				   struct r10bio *r10_bio)
{ … }

static void raid10d(struct md_thread *thread)
{ … }

static int init_resync(struct r10conf *conf)
{ … }

static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf)
{ … }

/*
 * Set cluster_sync_high since we need other nodes to add the
 * range [cluster_sync_low, cluster_sync_high] to the suspend list.
 */
static void raid10_set_cluster_sync_high(struct r10conf *conf)
{ … }

/*
 * perform a "sync" on one "block"
 *
 * We need to make sure that no normal I/O request - particularly write
 * requests - conflict with active sync requests.
 *
 * This is achieved by tracking pending requests and a 'barrier' concept
 * that can be installed to exclude normal IO requests.
 *
 * Resync and recovery are handled very differently.
 * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery.
 *
 * For resync, we iterate over virtual addresses, read all copies,
 * and update if there are differences.  If only one copy is live,
 * skip it.
 * For recovery, we iterate over physical addresses, read a good
 * value for each non-in_sync drive, and over-write.
 *
 * So, for recovery we may have several outstanding complex requests for a
 * given address, one for each out-of-sync device.  We model this by allocating
 * a number of r10_bio structures, one for each out-of-sync device.
 * As we set up these structures, we collect all bio's together into a list
 * which we then process collectively to add pages, and then process again
 * to pass to submit_bio_noacct.
 *
 * The r10_bio structures are linked using a borrowed master_bio pointer.
 * This link is counted in ->remaining.  When the r10_bio that points to NULL
 * has its remaining count decremented to 0, the whole complex operation
 * is complete.
 */
static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
				    sector_t max_sector, int *skipped)
{ … }

static sector_t raid10_size(struct mddev *mddev, sector_t sectors,
			    int raid_disks)
{ … }

static void calc_sectors(struct r10conf *conf, sector_t size)
{ … }
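
/*
 * Worked example of the capacity arithmetic behind raid10_size(): each
 * device is split into far_copies sections and every chunk is stored
 * near_copies times, so in chunks the usable size is
 * (device_chunks / far_copies) * raid_disks / near_copies.
 * E.g. 4 devices of 1024 chunks with near=2, far=2 give
 * (1024 / 2) * 4 / 2 = 1024 chunks of user data.  The helper below is an
 * illustration with hypothetical parameters, not the driver's code.
 */
static sector_t array_size_sketch(sector_t dev_sectors, int raid_disks,
				  int near_copies, int far_copies,
				  int chunk_shift)
{
	sector_t chunks = dev_sectors >> chunk_shift;

	sector_div(chunks, far_copies);		/* one section per far copy */
	chunks *= raid_disks;
	sector_div(chunks, near_copies);	/* near copies share a stripe */
	return chunks << chunk_shift;		/* back to sectors */
}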

enum geo_type { … };

static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
{ … }

static void raid10_free_conf(struct r10conf *conf)
{ … }

static struct r10conf *setup_conf(struct mddev *mddev)
{ … }

static unsigned int raid10_nr_stripes(struct r10conf *conf)
{ … }

static int raid10_set_queue_limits(struct mddev *mddev)
{ … }

static int raid10_run(struct mddev *mddev)
{ … }

static void raid10_free(struct mddev *mddev, void *priv)
{ … }

static void raid10_quiesce(struct mddev *mddev, int quiesce)
{ … }

static int raid10_resize(struct mddev *mddev, sector_t sectors)
{ … }

static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size,
				   int devs)
{ … }

static void *raid10_takeover(struct mddev *mddev)
{ … }

static int raid10_check_reshape(struct mddev *mddev)
{ … }

/*
 * Need to check if array has failed when deciding whether to:
 *  - start an array
 *  - remove non-faulty devices
 *  - add a spare
 *  - allow a reshape
 * This determination is simple when no reshape is happening.
 * However if there is a reshape, we need to carefully check
 * both the before and after sections.
 * This is because some failed devices may only affect one
 * of the two sections, and some non-in_sync devices may
 * be in_sync in the section most affected by failed devices.
 */
static int calc_degraded(struct r10conf *conf)
{ … }

static int raid10_start_reshape(struct mddev *mddev)
{ … }

/* Calculate the last device-address that could contain
 * any block from the chunk that includes the array-address 's'
 * and report the next address.
 * i.e. the address returned will be chunk-aligned and after
 * any data that is in the chunk containing 's'.
 */
static sector_t last_dev_address(sector_t s, struct geom *geo)
{ … }

/* Calculate the first device-address that could contain
 * any block from the chunk that includes the array-address 's'.
 * This too will be the start of a chunk.
 */
static sector_t first_dev_address(sector_t s, struct geom *geo)
{ … }

static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
				int *skipped)
{ … }

static void end_reshape_request(struct r10bio *r10_bio);
static int handle_reshape_read_error(struct mddev *mddev,
				     struct r10bio *r10_bio);
static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
{ … }

static void end_reshape(struct r10conf *conf)
{ … }

static void raid10_update_reshape_pos(struct mddev *mddev)
{ … }

static int handle_reshape_read_error(struct mddev *mddev,
				     struct r10bio *r10_bio)
{ … }

static void end_reshape_write(struct bio *bio)
{ … }

static void end_reshape_request(struct r10bio *r10_bio)
{ … }

static void raid10_finish_reshape(struct mddev *mddev)
{ … }

static struct md_personality raid10_personality = …;

static int __init raid_init(void)
{ … }

static void raid_exit(void)
{ … }

module_init(…);
module_exit(raid_exit);
MODULE_LICENSE(…);
MODULE_DESCRIPTION(…);
MODULE_ALIAS(…); /* RAID10 */
MODULE_ALIAS(…);
MODULE_ALIAS(…);