// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * raid10.c : Multiple Devices driver for Linux
 *
 * Copyright (C) 2000-2004 Neil Brown
 *
 * RAID-10 support for md.
 *
 * Based on code in raid1.c.  See raid1.c for further copyright information.
 */

#include <linux/slab.h>
#include <linux/delay.h>
#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/md_p.h>
#include <trace/events/block.h>
#include "md.h"

#define RAID_1_10_NAME …

#include "raid10.h"
#include "raid0.h"
#include "md-bitmap.h"

/*
 * RAID10 provides a combination of RAID0 and RAID1 functionality.
 * The layout of data is defined by
 *    chunk_size
 *    raid_disks
 *    near_copies (stored in low byte of layout)
 *    far_copies (stored in second byte of layout)
 *    far_offset (stored in bit 16 of layout)
 *    use_far_sets (stored in bit 17 of layout)
 *    use_far_sets_bugfixed (stored in bit 18 of layout)
 *
 * The data to be stored is divided into chunks using chunksize.  Each device
 * is divided into far_copies sections.  In each section, chunks are laid out
 * in a style similar to raid0, but near_copies copies of each chunk are stored
 * (each on a different drive).  The starting device for each section is offset
 * near_copies from the starting device of the previous section.  Thus there
 * are (near_copies * far_copies) copies of each chunk, and each is on a
 * different drive.  near_copies and far_copies must be at least one, and
 * their product is at most raid_disks.
 *
 * If far_offset is true, then the far_copies are handled a bit differently.
 * The copies are still in different stripes, but instead of being very far
 * apart on disk, they are in adjacent stripes.
 *
 * The far and offset algorithms are handled slightly differently if
 * 'use_far_sets' is true.  In this case, the array's devices are grouped into
 * sets that are (near_copies * far_copies) in size.  The far copied stripes
 * are still shifted by 'near_copies' devices, but this shifting stays confined
 * to the set rather than the entire array.  This is done to improve the number
 * of device combinations that can fail without causing the array to fail.
 * Example 'far' algorithm w/o 'use_far_sets' (each letter represents a chunk
 * on a device):
 *    A B C D    A B C D E
 *      ...         ...
 *    D A B C    E A B C D
 * Example 'far' algorithm w/ 'use_far_sets' enabled (sets illustrated w/ []'s):
 *    [A B] [C D]    [A B] [C D E]
 *    |...| |...|    |...| | ... |
 *    [B A] [D C]    [B A] [E C D]
 */
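
/*
 * Illustration (not part of the driver): unpacking the layout dword
 * described above.  The bit positions follow the comment; the helper name
 * and out-parameters are hypothetical, for exposition only.  E.g. the
 * common 'n2' layout is 0x102: far_copies = 1, near_copies = 2.
 */
static inline void layout_decode_sketch(int layout, int *near_copies,
					int *far_copies, bool *far_offset,
					bool *use_far_sets,
					bool *use_far_sets_bugfixed)
{
	*near_copies = layout & 0xff;			/* low byte */
	*far_copies = (layout >> 8) & 0xff;		/* second byte */
	*far_offset = layout & (1 << 16);		/* bit 16 */
	*use_far_sets = layout & (1 << 17);		/* bit 17 */
	*use_far_sets_bugfixed = layout & (1 << 18);	/* bit 18 */
}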

static void allow_barrier(struct r10conf *conf);
static void lower_barrier(struct r10conf *conf);
static int _enough(struct r10conf *conf, int previous, int ignore);
static int enough(struct r10conf *conf, int ignore);
static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
				int *skipped);
static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
static void end_reshape_write(struct bio *bio);
static void end_reshape(struct r10conf *conf);

#include "raid1-10.c"

#define NULL_CMD
#define cmd_before(conf, cmd) …
#define cmd_after(conf) …

#define wait_event_barrier_cmd(conf, cond, cmd) …

#define wait_event_barrier(conf, cond) …

/*
 * for resync bio, r10bio pointer can be retrieved from the per-bio
 * 'struct resync_pages'.
 */
static inline struct r10bio *get_resync_r10bio(struct bio *bio)
{ … }

static void *r10bio_pool_alloc(gfp_t gfp_flags, void *data)
{ … }

#define RESYNC_SECTORS …

/* amount of memory to reserve for resync requests */
#define RESYNC_WINDOW …

/* maximum number of concurrent requests, memory permitting */
#define RESYNC_DEPTH …

#define CLUSTER_RESYNC_WINDOW …
#define CLUSTER_RESYNC_WINDOW_SECTORS …

/*
 * When performing a resync, we need to read and compare, so
 * we need as many pages as there are copies.
 * When performing a recovery, we need 2 bios, one for read,
 * one for write (we recover only one drive per r10buf).
 */
static void *r10buf_pool_alloc(gfp_t gfp_flags, void *data)
{ … }

static void r10buf_pool_free(void *__r10_bio, void *data)
{ … }

static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
{ … }

static void free_r10bio(struct r10bio *r10_bio)
{ … }

static void put_buf(struct r10bio *r10_bio)
{ … }

static void wake_up_barrier(struct r10conf *conf)
{ … }

static void reschedule_retry(struct r10bio *r10_bio)
{ … }

/*
 * raid_end_bio_io() is called when we have finished servicing a mirrored
 * operation and are ready to return a success/failure code to the buffer
 * cache layer.
 */
static void raid_end_bio_io(struct r10bio *r10_bio)
{ … }

/*
 * Update disk head position estimator based on IRQ completion info.
 */
static inline void update_head_pos(int slot, struct r10bio *r10_bio)
{ … }

/*
 * Find the disk number which triggered the given bio
 */
static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
			 struct bio *bio, int *slotp, int *replp)
{ … }

static void raid10_end_read_request(struct bio *bio)
{ … }

static void close_write(struct r10bio *r10_bio)
{ … }

static void one_write_done(struct r10bio *r10_bio)
{ … }

static void raid10_end_write_request(struct bio *bio)
{ … }

/*
 * RAID10 layout manager
 * As well as the chunksize and raid_disks count, there are two
 * parameters: near_copies and far_copies.
 * near_copies * far_copies must be <= raid_disks.
 * Normally one of these will be 1.
 * If both are 1, we get raid0.
 * If near_copies == raid_disks, we get raid1.
 *
 * Chunks are laid out in raid0 style with near_copies copies of the
 * first chunk, followed by near_copies copies of the next chunk and
 * so on.
 * If far_copies > 1, then after 1/far_copies of the array has been assigned
 * as described above, we start again with a device offset of near_copies.
 * So we effectively have another copy of the whole array further down all
 * the drives, but with blocks on different drives.
 * With this layout, a block is never stored twice on the same device.
 * (A simplified sketch of this mapping follows raid10_find_virt() below.)
 *
 * raid10_find_phys finds the sector offset of a given virtual sector
 * on each device that it is on.
 *
 * raid10_find_virt does the reverse mapping, from a device and a
 * sector offset to a virtual address.
 */
static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
{ … }

static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)
{ … }

static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
{ … }
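
/*
 * Illustrative sketch of the mapping described above for the simple case:
 * no far sets, no far_offset, and near_copies dividing raid_disks (so a
 * chunk's near copies never wrap past the last device).  The structure and
 * helper are hypothetical stand-ins for struct geom and
 * __raid10_find_phys(), which also handle the wrap and far-set cases.
 */
struct geom_sketch {
	int raid_disks;
	int near_copies;
	int far_copies;
	int chunk_shift;	/* log2 of chunk size, in sectors */
	sector_t chunk_mask;	/* (1 << chunk_shift) - 1 */
	sector_t stride;	/* sectors in one far section of a device */
};

static void find_phys_sketch(struct geom_sketch *geo, sector_t virt)
{
	sector_t chunk = virt >> geo->chunk_shift;	/* virtual chunk */
	sector_t offset = virt & geo->chunk_mask;	/* offset in chunk */
	sector_t stripe = chunk * geo->near_copies;	/* raid0-style slot */
	int dev = sector_div(stripe, geo->raid_disks);	/* first device */
	sector_t base = (stripe << geo->chunk_shift) + offset;
	int n, f;

	for (n = 0; n < geo->near_copies; n++) {
		int d = (dev + n) % geo->raid_disks;

		/* each far copy sits one section further down the device,
		 * on a device shifted by near_copies */
		for (f = 0; f < geo->far_copies; f++)
			pr_info("copy %d/%d: dev %d, sector %llu\n", n, f,
				(d + f * geo->near_copies) % geo->raid_disks,
				(unsigned long long)(base + f * geo->stride));
	}
}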

/*
 * This routine returns the disk from which the requested read should
 * be done.  There is a per-array 'next expected sequential IO' sector
 * number - if this matches on the next IO then we use the last disk.
 * There is also a per-disk 'last known head position' sector that is
 * maintained from IRQ contexts; both the normal and the resync IO
 * completion handlers update this position correctly.  If there is no
 * perfect sequential match then we pick the disk whose head is closest.
 *
 * If there are 2 mirrors in the same 2 devices, performance degrades
 * because the position is tracked per mirror, not per device.
 *
 * The rdev for the device selected will have nr_pending incremented.
 */
/*
 * FIXME: possibly should rethink readbalancing and do it differently
 * depending on near_copies / far_copies geometry.
 */
static struct md_rdev *read_balance(struct r10conf *conf,
				    struct r10bio *r10_bio,
				    int *max_sectors)
{ … }

static void flush_pending_writes(struct r10conf *conf)
{ … }

/* Barriers....
 * Sometimes we need to suspend IO while we do something else,
 * either some resync/recovery, or reconfigure the array.
 * To do this we raise a 'barrier'.
 * The 'barrier' is a counter that can be raised multiple times
 * to count how many activities are happening which preclude
 * normal IO.
 * We can only raise the barrier if there is no pending IO,
 * i.e. if nr_pending == 0.
 * We choose only to raise the barrier if no-one is waiting for the
 * barrier to go down.  This means that as soon as an IO request
 * is ready, no other operations which require a barrier will start
 * until the IO request has had a chance.
 *
 * So: regular IO calls 'wait_barrier'.  When that returns there
 * is no background IO happening.  It must arrange to call
 * allow_barrier when it has finished its IO.
 * Background IO calls must call raise_barrier.  Once that returns
 * there is no normal IO happening.  It must arrange to call
 * lower_barrier when the particular background IO completes.
 * (A simplified model of this scheme is sketched after
 * unfreeze_array() below.)
 */
static void raise_barrier(struct r10conf *conf, int force)
{ … }

static void lower_barrier(struct r10conf *conf)
{ … }

static bool stop_waiting_barrier(struct r10conf *conf)
{ … }

static bool wait_barrier_nolock(struct r10conf *conf)
{ … }

static bool wait_barrier(struct r10conf *conf, bool nowait)
{ … }

static void allow_barrier(struct r10conf *conf)
{ … }

static void freeze_array(struct r10conf *conf, int extra)
{ … }

static void unfreeze_array(struct r10conf *conf)
{ … }
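
/*
 * A minimal model of the barrier scheme described above (see the comment
 * before raise_barrier()).  The structure and both helpers are hypothetical
 * simplifications: the real driver builds the same logic from the
 * wait_event_barrier*() macros plus a lock-free fast path
 * (wait_barrier_nolock()/stop_waiting_barrier()), and also bounds the
 * barrier depth during resync.
 */
struct barrier_sketch {
	spinlock_t lock;
	wait_queue_head_t wait;
	int barrier;		/* activities that exclude normal IO */
	int nr_pending;		/* normal IO in flight */
	int nr_waiting;		/* normal IO queued behind the barrier */
};

/* background IO (resync/recovery/reshape) enters here */
static void raise_barrier_sketch(struct barrier_sketch *b)
{
	spin_lock_irq(&b->lock);
	/* don't raise while regular IO is already queued up waiting */
	wait_event_lock_irq(b->wait, !b->nr_waiting, b->lock);
	b->barrier++;			/* block new regular IO */
	/* wait for in-flight regular IO to drain */
	wait_event_lock_irq(b->wait, !b->nr_pending, b->lock);
	spin_unlock_irq(&b->lock);
}

/* regular IO enters here; it pairs with a later decrement of nr_pending
 * (allow_barrier) that also wakes the queue */
static void wait_barrier_sketch(struct barrier_sketch *b)
{
	spin_lock_irq(&b->lock);
	if (b->barrier) {
		b->nr_waiting++;
		wait_event_lock_irq(b->wait, !b->barrier, b->lock);
		b->nr_waiting--;
	}
	b->nr_pending++;
	spin_unlock_irq(&b->lock);
}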

static sector_t choose_data_offset(struct r10bio *r10_bio,
				   struct md_rdev *rdev)
{ … }

static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
{ … }

/*
 * 1. Register the new request and wait if the reconstruction thread has put
 *    up a bar for new requests.  Continue immediately if no resync is active
 *    currently.
 * 2. If the IO spans the reshape position, wait for the reshape to pass.
 */
static bool regular_request_wait(struct mddev *mddev, struct r10conf *conf,
				 struct bio *bio, sector_t sectors)
{ … }

static void raid10_read_request(struct mddev *mddev, struct bio *bio,
				struct r10bio *r10_bio, bool io_accounting)
{ … }

static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
				  struct bio *bio, bool replacement,
				  int n_copy)
{ … }

static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio)
{ … }

static void raid10_write_request(struct mddev *mddev, struct bio *bio,
				 struct r10bio *r10_bio)
{ … }

static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
{ … }

static void raid_end_discard_bio(struct r10bio *r10bio)
{ … }

static void raid10_end_discard_request(struct bio *bio)
{ … }

/*
 * There are some limitations to handling a discard bio:
 * 1st, the discard size must be bigger than stripe_size * 2.
 * 2nd, if the discard bio spans the reshape position, we use the old way to
 *      handle the discard bio.
 */
static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
{ … }

static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
{ … }

static void raid10_status(struct seq_file *seq, struct mddev *mddev)
{ … }

/* check if there are enough drives for
 * every block to appear on at least one.
 * Don't consider the device numbered 'ignore'
 * as we might be about to remove it.
 * (A simplified sketch of this check follows enough() below.)
 */
static int _enough(struct r10conf *conf, int previous, int ignore)
{ … }

static int enough(struct r10conf *conf, int ignore)
{ … }
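
/*
 * A sketch of the test _enough() implements, for the common geometry
 * without far sets.  Each group of 'copies' (near_copies * far_copies)
 * consecutive devices, stepped through at near_copies intervals, holds all
 * copies of some range of blocks, so every such group must keep at least
 * one usable member.  'is_usable' is a hypothetical predicate standing in
 * for the rdev presence/In_sync/ignore checks the driver really performs.
 */
static int enough_sketch(int raid_disks, int near_copies, int copies,
			 bool (*is_usable)(int devnum))
{
	int first = 0;

	do {
		int n = copies, cnt = 0, this = first;

		/* count usable devices among the copies of this block range */
		while (n--) {
			if (is_usable(this))
				cnt++;
			this = (this + 1) % raid_disks;
		}
		if (cnt == 0)
			return 0;	/* some blocks have no live copy */
		first = (first + near_copies) % raid_disks;
	} while (first != 0);

	return 1;
}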

/**
 * raid10_error() - RAID10 error handler.
 * @mddev: affected md device.
 * @rdev: member device to fail.
 *
 * The routine acknowledges &rdev failure and determines new @mddev state.
 * If it failed, then:
 *	- &MD_BROKEN flag is set in &mddev->flags.
 * Otherwise, it must be degraded:
 *	- recovery is interrupted.
 *	- &mddev->degraded is bumped.
 *
 * @rdev is marked as &Faulty excluding case when array is failed and
 * &mddev->fail_last_dev is off.
 */
static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
{ … }

static void print_conf(struct r10conf *conf)
{ … }

static void close_sync(struct r10conf *conf)
{ … }

static int raid10_spare_active(struct mddev *mddev)
{ … }

static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
{ … }

static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
{ … }

static void __end_sync_read(struct r10bio *r10_bio, struct bio *bio, int d)
{ … }

static void end_sync_read(struct bio *bio)
{ … }

static void end_reshape_read(struct bio *bio)
{ … }

static void end_sync_request(struct r10bio *r10_bio)
{ … }

static void end_sync_write(struct bio *bio)
{ … }

/*
 * Note: sync and recovery are handled very differently for raid10.
 * This code is for resync.
 * For resync, we read through virtual addresses and read all blocks.
 * If there is any error, we schedule a write.  The lowest numbered
 * drive is authoritative.
 * However, requests come in for physical addresses, so we need to map.
 * For every physical address there are raid_disks/copies virtual addresses,
 * which is always at least one, but is not necessarily an integer.
 * This means that a physical address can span multiple chunks, so we may
 * have to submit multiple io requests for a single sync request.
 */
/*
 * We check if all blocks are in-sync and only write to blocks that
 * aren't in sync
 */
static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
{ … }

/*
 * Now for the recovery code.
 * Recovery happens across physical sectors.
 * We recover all non-in_sync drives by finding the virtual address of
 * each, and then choosing a working drive that also has that virt address.
 * There is a separate r10_bio for each non-in_sync drive.
 * Only the first two slots are in use.  The first for reading,
 * the second for writing.
 */
static void fix_recovery_read_error(struct r10bio *r10_bio)
{ … }

static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
{ … }

static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector, int sectors,
			    struct page *page, enum req_op op)
{ … }

/*
 * This is a kernel thread which:
 *
 *	1.	Retries failed read operations on working mirrors.
 *	2.	Updates the raid superblock when problems are encountered.
 *	3.	Performs writes following reads for array synchronising.
 */
static void fix_read_error(struct r10conf *conf, struct mddev *mddev,
			   struct r10bio *r10_bio)
{ … }

static int narrow_write_error(struct r10bio *r10_bio, int i)
{ … }

static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
{ … }

static void handle_write_completed(struct r10conf *conf,
				   struct r10bio *r10_bio)
{ … }

static void raid10d(struct md_thread *thread)
{ … }

static int init_resync(struct r10conf *conf)
{ … }

static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf)
{ … }

/*
 * Set cluster_sync_high since we need other nodes to add the
 * range [cluster_sync_low, cluster_sync_high] to the suspend list.
 */
static void raid10_set_cluster_sync_high(struct r10conf *conf)
{ … }

/*
 * perform a "sync" on one "block"
 *
 * We need to make sure that no normal I/O request - particularly write
 * requests - conflict with active sync requests.
 *
 * This is achieved by tracking pending requests and a 'barrier' concept
 * that can be installed to exclude normal IO requests.
 *
 * Resync and recovery are handled very differently.
 * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery.
 *
 * For resync, we iterate over virtual addresses, read all copies,
 * and update if there are differences.  If only one copy is live,
 * skip it.
 * For recovery, we iterate over physical addresses, read a good
 * value for each non-in_sync drive, and over-write.
 *
 * So, for recovery we may have several outstanding complex requests for a
 * given address, one for each out-of-sync device.  We model this by allocating
 * a number of r10_bio structures, one for each out-of-sync device.
 * As we set up these structures, we collect all bio's together into a list
 * which we then process collectively to add pages, and then process again
 * to pass to submit_bio_noacct.
 *
 * The r10_bio structures are linked using a borrowed master_bio pointer.
 * This link is counted in ->remaining.  When the r10_bio that points to NULL
 * has its remaining count decremented to 0, the whole complex operation
 * is complete.
 */
static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
				    sector_t max_sector, int *skipped)
{ … }

static sector_t raid10_size(struct mddev *mddev, sector_t sectors,
			    int raid_disks)
{ … }

static void calc_sectors(struct r10conf *conf, sector_t size)
{ … }
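
/*
 * Worked example of the capacity arithmetic behind raid10_size(): each
 * device is split into far_copies sections and every chunk is stored
 * near_copies times, so in chunks the usable size is
 * (device_chunks / far_copies) * raid_disks / near_copies.
 * E.g. 4 devices of 1024 chunks with near=2, far=2 give
 * (1024 / 2) * 4 / 2 = 1024 chunks of user data.  The helper below is an
 * illustration with hypothetical parameters, not the driver's code.
 */
static sector_t array_size_sketch(sector_t dev_sectors, int raid_disks,
				  int near_copies, int far_copies,
				  int chunk_shift)
{
	sector_t chunks = dev_sectors >> chunk_shift;

	sector_div(chunks, far_copies);		/* one section per far copy */
	chunks *= raid_disks;
	sector_div(chunks, near_copies);	/* near copies share a stripe */
	return chunks << chunk_shift;		/* back to sectors */
}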

enum geo_type { … };

static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
{ … }

static void raid10_free_conf(struct r10conf *conf)
{ … }

static struct r10conf *setup_conf(struct mddev *mddev)
{ … }

static unsigned int raid10_nr_stripes(struct r10conf *conf)
{ … }

static int raid10_set_queue_limits(struct mddev *mddev)
{ … }

static int raid10_run(struct mddev *mddev)
{ … }

static void raid10_free(struct mddev *mddev, void *priv)
{ … }

static void raid10_quiesce(struct mddev *mddev, int quiesce)
{ … }

static int raid10_resize(struct mddev *mddev, sector_t sectors)
{ … }

static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size,
				   int devs)
{ … }

static void *raid10_takeover(struct mddev *mddev)
{ … }

static int raid10_check_reshape(struct mddev *mddev)
{ … }

/*
 * Need to check if array has failed when deciding whether to:
 *  - start an array
 *  - remove non-faulty devices
 *  - add a spare
 *  - allow a reshape
 * This determination is simple when no reshape is happening.
 * However if there is a reshape, we need to carefully check
 * both the before and after sections.
 * This is because some failed devices may only affect one
 * of the two sections, and some non-in_sync devices may
 * be in_sync in the section most affected by failed devices.
 */
static int calc_degraded(struct r10conf *conf)
{ … }

static int raid10_start_reshape(struct mddev *mddev)
{ … }

/* Calculate the last device-address that could contain
 * any block from the chunk that includes the array-address 's'
 * and report the next address.
 * i.e. the address returned will be chunk-aligned and after
 * any data that is in the chunk containing 's'.
 */
static sector_t last_dev_address(sector_t s, struct geom *geo)
{ … }

/* Calculate the first device-address that could contain
 * any block from the chunk that includes the array-address 's'.
 * This too will be the start of a chunk.
 */
static sector_t first_dev_address(sector_t s, struct geom *geo)
{ … }

static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
				int *skipped)
{ … }

static void end_reshape_request(struct r10bio *r10_bio);
static int handle_reshape_read_error(struct mddev *mddev,
				     struct r10bio *r10_bio);
static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
{ … }

static void end_reshape(struct r10conf *conf)
{ … }

static void raid10_update_reshape_pos(struct mddev *mddev)
{ … }

static int handle_reshape_read_error(struct mddev *mddev,
				     struct r10bio *r10_bio)
{ … }

static void end_reshape_write(struct bio *bio)
{ … }

static void end_reshape_request(struct r10bio *r10_bio)
{ … }

static void raid10_finish_reshape(struct mddev *mddev)
{ … }

static struct md_personality raid10_personality = …;

static int __init raid_init(void)
{ … }

static void raid_exit(void)
{ … }

module_init(…);
module_exit(raid_exit);
MODULE_LICENSE(…);
MODULE_DESCRIPTION(…);
MODULE_ALIAS(…); /* RAID10 */
MODULE_ALIAS(…);
MODULE_ALIAS(…);