linux/drivers/md/raid5.c

// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * raid5.c : Multiple Devices driver for Linux
 *	   Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *	   Copyright (C) 1999, 2000 Ingo Molnar
 *	   Copyright (C) 2002, 2003 H. Peter Anvin
 *
 * RAID-4/5/6 management functions.
 * Thanks to Penguin Computing for making the RAID-6 development possible
 * by donating a test server!
 */

/*
 * BITMAP UNPLUGGING:
 *
 * The sequencing for updating the bitmap reliably is a little
 * subtle (and I got it wrong the first time) so it deserves some
 * explanation.
 *
 * We group bitmap updates into batches.  Each batch has a number.
 * We may write out several batches at once, but that isn't very important.
 * conf->seq_write is the number of the last batch successfully written.
 * conf->seq_flush is the number of the last batch that was closed to
 *    new additions.
 * When we discover that we will need to write to any block in a stripe
 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
 * the number of the batch it will be in. This is seq_flush+1.
 * When we are ready to do a write, if that batch hasn't been written yet,
 *   we plug the array and queue the stripe for later.
 * When an unplug happens, we increment seq_flush, thus closing the current
 *   batch.
 * When we notice that seq_flush > seq_write, we write out all pending updates
 * to the bitmap, and advance seq_write to where seq_flush was.
 * This may occasionally write a bit out twice, but is sure never to
 * miss any bits.
 */
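
/*
 * Illustrative sketch only (example_batch_written() is a hypothetical helper,
 * not part of the driver): the sequencing above reduces to comparing two
 * monotonically increasing batch numbers.  A stripe recorded under batch
 * sh->bm_seq may issue its writes once that batch has been written out,
 * i.e. once seq_write has caught up with it.
 */
static inline int example_batch_written(unsigned int seq_write,
					unsigned int bm_seq)
{
	return bm_seq <= seq_write;
}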

#include <linux/blkdev.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/async_tx.h>
#include <linux/module.h>
#include <linux/async.h>
#include <linux/seq_file.h>
#include <linux/cpu.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/nodemask.h>

#include <trace/events/block.h>
#include <linux/list_sort.h>

#include "md.h"
#include "raid5.h"
#include "raid0.h"
#include "md-bitmap.h"
#include "raid5-log.h"

#define UNSUPPORTED_MDDEV_FLAGS

#define cpu_to_group(cpu)
#define ANY_GROUP

#define RAID5_MAX_REQ_STRIPES

static bool devices_handle_discard_safely =;
module_param(devices_handle_discard_safely, bool, 0644);
MODULE_PARM_DESC();
static struct workqueue_struct *raid5_wq;

static void raid5_quiesce(struct mddev *mddev, int quiesce);

static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
{}

static inline int stripe_hash_locks_hash(struct r5conf *conf, sector_t sect)
{}

static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
	__acquires(&conf->device_lock)
{}

static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
	__releases(&conf->device_lock)
{}

static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
	__acquires(&conf->device_lock)
{}

static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
	__releases(&conf->device_lock)
{}

/* Find first data disk in a raid6 stripe */
static inline int raid6_d0(struct stripe_head *sh)
{}
static inline int raid6_next_disk(int disk, int raid_disks)
{}

/* When walking through the disks in a raid6 stripe, starting at raid6_d0,
 * we need to map each disk to a 'slot', where the data disks are slot
 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
 * is raid_disks-1.  This helper does that mapping.
 */
static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
			     int *count, int syndrome_disks)
{}
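
/*
 * Illustrative sketch (hypothetical helper, ignoring layout variants such as
 * DDF): walking the data disks in order and special-casing P and Q yields
 * the slot numbering described above, with syndrome_disks == raid_disks - 2.
 */
static inline int example_slot_for(int idx, int pd_idx, int qd_idx,
				   int *data_count, int syndrome_disks)
{
	if (idx == pd_idx)
		return syndrome_disks;		/* P parity -> raid_disks - 2 */
	if (idx == qd_idx)
		return syndrome_disks + 1;	/* Q syndrome -> raid_disks - 1 */
	return (*data_count)++;			/* data disks -> 0 .. raid_disks - 3 */
}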

static void print_raid5_conf(struct r5conf *conf);

static int stripe_operations_active(struct stripe_head *sh)
{}

static bool stripe_is_lowprio(struct stripe_head *sh)
{}

static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
	__must_hold(&sh->raid_conf->device_lock)
{}

static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
			      struct list_head *temp_inactive_list)
	__must_hold(&conf->device_lock)
{}

static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
			     struct list_head *temp_inactive_list)
	__must_hold(&conf->device_lock)
{}

/*
 * If @hash is NR_STRIPE_HASH_LOCKS, @temp_inactive_list is an array of
 * inactive lists, one per hash lock.
 *
 * Be careful: only one task can add/delete stripes from temp_inactive_list at
 * a given time. Adding stripes only takes the device lock, while deleting
 * stripes only takes the hash lock.
 */
static void release_inactive_stripe_list(struct r5conf *conf,
					 struct list_head *temp_inactive_list,
					 int hash)
{}

static int release_stripe_list(struct r5conf *conf,
			       struct list_head *temp_inactive_list)
	__must_hold(&conf->device_lock)
{}

void raid5_release_stripe(struct stripe_head *sh)
{}

static inline void remove_hash(struct stripe_head *sh)
{}

static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
{}

/* find an idle stripe, make sure it is unhashed, and return it. */
static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash)
{}

#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
static void free_stripe_pages(struct stripe_head *sh)
{
	int i;
	struct page *p;

	/* The page pool has not been allocated yet */
	if (!sh->pages)
		return;

	for (i = 0; i < sh->nr_pages; i++) {
		p = sh->pages[i];
		if (p)
			put_page(p);
		sh->pages[i] = NULL;
	}
}

static int alloc_stripe_pages(struct stripe_head *sh, gfp_t gfp)
{
	int i;
	struct page *p;

	for (i = 0; i < sh->nr_pages; i++) {
		/* This page has already been allocated. */
		if (sh->pages[i])
			continue;

		p = alloc_page(gfp);
		if (!p) {
			free_stripe_pages(sh);
			return -ENOMEM;
		}
		sh->pages[i] = p;
	}
	return 0;
}

static int
init_stripe_shared_pages(struct stripe_head *sh, struct r5conf *conf, int disks)
{
	int nr_pages, cnt;

	if (sh->pages)
		return 0;

	/* Each of the sh->dev[i] needs one conf->stripe_size */
	cnt = PAGE_SIZE / conf->stripe_size;
	nr_pages = (disks + cnt - 1) / cnt;
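	/*
	 * e.g. with a 64K PAGE_SIZE and a 4K stripe_size, cnt is 16, so a
	 * 20-disk stripe shares ceil(20 / 16) = 2 pages.
	 */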

	sh->pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
	if (!sh->pages)
		return -ENOMEM;
	sh->nr_pages = nr_pages;
	sh->stripes_per_page = cnt;
	return 0;
}
#endif

static void shrink_buffers(struct stripe_head *sh)
{}

static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
{}

static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
			    struct stripe_head *sh);

static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
{}

static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
					 short generation)
{}

static struct stripe_head *find_get_stripe(struct r5conf *conf,
		sector_t sector, short generation, int hash)
{}

/*
 * Need to check if array has failed when deciding whether to:
 *  - start an array
 *  - remove non-faulty devices
 *  - add a spare
 *  - allow a reshape
 * This determination is simple when no reshape is happening.
 * However if there is a reshape, we need to carefully check
 * both the before and after sections.
 * This is because some failed devices may only affect one
 * of the two sections, and some non-in_sync devices may
 * be in_sync in the section most affected by failed devices.
 *
 * Most calls to this function hold &conf->device_lock. Calls
 * in raid5_run() do not require the lock as no other threads
 * have been started yet.
 */
int raid5_calc_degraded(struct r5conf *conf)
{}
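
/*
 * Illustrative sketch (hypothetical helper): while a reshape is in flight the
 * degraded count has to be evaluated against both the "before" and "after"
 * geometries and the worse of the two reported, since a device can be in_sync
 * for one section of the array but not the other.
 */
static inline int example_worst_degraded(int degraded_before, int degraded_after)
{
	return degraded_after > degraded_before ? degraded_after : degraded_before;
}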

static bool has_failed(struct r5conf *conf)
{}

enum stripe_result {};

struct stripe_request_ctx {};

/*
 * Block until another thread clears R5_INACTIVE_BLOCKED or
 * there are fewer than 3/4 of the maximum number of active stripes
 * and there is an inactive stripe available.
 */
static bool is_inactive_blocked(struct r5conf *conf, int hash)
{}
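
/*
 * Illustrative sketch (hypothetical helper): the "fewer than 3/4 of the
 * maximum number of active stripes" test described above.
 */
static inline bool example_below_three_quarters(int active, int max_nr_stripes)
{
	return active < max_nr_stripes * 3 / 4;
}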

struct stripe_head *raid5_get_active_stripe(struct r5conf *conf,
		struct stripe_request_ctx *ctx, sector_t sector,
		unsigned int flags)
{}

static bool is_full_stripe_write(struct stripe_head *sh)
{}

static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
		__acquires(&sh1->stripe_lock)
		__acquires(&sh2->stripe_lock)
{}

static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
		__releases(&sh1->stripe_lock)
		__releases(&sh2->stripe_lock)
{}

/* Only a freshly created, full-stripe, normal-write stripe can be added to a batch list */
static bool stripe_can_batch(struct stripe_head *sh)
{}

/* we only search backwards */
static void stripe_add_to_batch_list(struct r5conf *conf,
		struct stripe_head *sh, struct stripe_head *last_sh)
{}

/* Determine if 'data_offset' or 'new_data_offset' should be used
 * in this stripe_head.
 */
static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
{}

static void dispatch_bio_list(struct bio_list *tmp)
{}

static int cmp_stripe(void *priv, const struct list_head *a,
		      const struct list_head *b)
{}

static void dispatch_defer_bios(struct r5conf *conf, int target,
				struct bio_list *list)
{}

static void flush_deferred_bios(struct r5conf *conf)
{}

static void defer_issue_bios(struct r5conf *conf, sector_t sector,
				struct bio_list *bios)
{}

static void
raid5_end_read_request(struct bio *bi);
static void
raid5_end_write_request(struct bio *bi);

static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
{}

static struct dma_async_tx_descriptor *
async_copy_data(int frombio, struct bio *bio, struct page **page,
	unsigned int poff, sector_t sector, struct dma_async_tx_descriptor *tx,
	struct stripe_head *sh, int no_skipcopy)
{}

static void ops_complete_biofill(void *stripe_head_ref)
{}

static void ops_run_biofill(struct stripe_head *sh)
{}

static void mark_target_uptodate(struct stripe_head *sh, int target)
{}

static void ops_complete_compute(void *stripe_head_ref)
{}

/* return a pointer to the address conversion region of the scribble buffer */
static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
{}

/* return a pointer to the address conversion region of the scribble buffer */
static addr_conv_t *to_addr_conv(struct stripe_head *sh,
				 struct raid5_percpu *percpu, int i)
{}

/*
 * Return a pointer to the region of the scribble buffer that records page offsets.
 */
static unsigned int *
to_addr_offs(struct stripe_head *sh, struct raid5_percpu *percpu)
{}

static struct dma_async_tx_descriptor *
ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
{}

/* set_syndrome_sources - populate source buffers for gen_syndrome
 * @srcs - (struct page *) array of size sh->disks
 * @offs - (unsigned int) array of offset for each page
 * @sh - stripe_head to parse
 *
 * Populates srcs in proper layout order for the stripe and returns the
 * 'count' of sources to be used in a call to async_gen_syndrome.  The P
 * destination buffer is recorded in srcs[count] and the Q destination
 * is recorded in srcs[count+1].
 */
static int set_syndrome_sources(struct page **srcs,
				unsigned int *offs,
				struct stripe_head *sh,
				int srctype)
{}

static struct dma_async_tx_descriptor *
ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
{}

static struct dma_async_tx_descriptor *
ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
{}

static void ops_complete_prexor(void *stripe_head_ref)
{}

static struct dma_async_tx_descriptor *
ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
		struct dma_async_tx_descriptor *tx)
{}

static struct dma_async_tx_descriptor *
ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
		struct dma_async_tx_descriptor *tx)
{}

static struct dma_async_tx_descriptor *
ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
{}

static void ops_complete_reconstruct(void *stripe_head_ref)
{}

static void
ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
		     struct dma_async_tx_descriptor *tx)
{}

static void
ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
		     struct dma_async_tx_descriptor *tx)
{}

static void ops_complete_check(void *stripe_head_ref)
{}

static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
{}

static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
{}

static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
{}

static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh)
{}

static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
	int disks, struct r5conf *conf)
{}
static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
{}

static int grow_stripes(struct r5conf *conf, int num)
{}

/**
 * scribble_alloc - allocate percpu scribble buffer for required size
 *		    of the scribble region
 * @percpu: from for_each_present_cpu() of the caller
 * @num: total number of disks in the array
 * @cnt: scribble objs count for required size of the scribble region
 *
 * The scribble buffer size must be enough to contain:
 * 1/ a struct page pointer for each device in the array +2
 * 2/ room to convert each entry in (1) to its corresponding dma
 *    (dma_map_page()) or page (page_address()) address.
 *
 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
 * calculate over all devices (not just the data blocks), using zeros in place
 * of the P and Q blocks.
 */
static int scribble_alloc(struct raid5_percpu *percpu,
			  int num, int cnt)
{}
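
/*
 * Illustrative sketch (hypothetical helper): the per-object scribble size
 * implied by the layout described above -- (num + 2) page pointers plus room
 * to convert each of them into a DMA or page address.
 */
static inline size_t example_scribble_obj_size(int num)
{
	return (num + 2) * (sizeof(struct page *) + sizeof(addr_conv_t));
}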

static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
{}

static int resize_stripes(struct r5conf *conf, int newsize)
{}

static int drop_one_stripe(struct r5conf *conf)
{}

static void shrink_stripes(struct r5conf *conf)
{}

static void raid5_end_read_request(struct bio * bi)
{}

static void raid5_end_write_request(struct bio *bi)
{}

static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
{}

/*
 * Input: a 'big' sector number,
 * Output: index of the data and parity disk, and the sector # in them.
 */
sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
			      int previous, int *dd_idx,
			      struct stripe_head *sh)
{}

sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
{}

/*
 * There are cases where we want handle_stripe_dirtying() and
 * schedule_reconstruction() to delay towrite to some dev of a stripe.
 *
 * This function checks whether we want to delay the towrite. Specifically,
 * we delay the towrite when:
 *
 *   1. degraded stripe has a non-overwrite to the missing dev, AND this
 *      stripe has data in journal (for other devices).
 *
 *      In this case, when reading data for the non-overwrite dev, it is
 *      necessary to handle complex rmw of write back cache (prexor with
 *      orig_page, and xor with page). To keep read path simple, we would
 *      like to flush data in journal to RAID disks first, so complex rmw
 *      is handled in the write path (handle_stripe_dirtying).
 *
 *   2. when journal space is critical (R5C_LOG_CRITICAL=1)
 *
 *      It is important to be able to flush all stripes in raid5-cache.
 *      Therefore, we need to reserve some space on the journal device for
 *      these flushes. If flush operation includes pending writes to the
 *      stripe, we need to reserve (conf->raid_disks + 1) pages per stripe
 *      for the flush out. If we exclude these pending writes from flush
 *      operation, we only need (conf->max_degraded + 1) pages per stripe.
 *      Therefore, excluding pending writes in these cases enables more
 *      efficient use of the journal device.
 *
 *      Note: To make sure the stripe makes progress, we only delay
 *      towrite for stripes with data already in journal (injournal > 0).
 *      When LOG_CRITICAL, stripes with injournal == 0 will be sent to
 *      no_space_stripes list.
 *
 *   3. during journal failure
 *      During journal failure, we try to flush all cached data to raid disks
 *      based on the data in the stripe cache. The array is read-only to upper
 *      layers, so we skip all pending writes.
 *
 */
static inline bool delay_towrite(struct r5conf *conf,
				 struct r5dev *dev,
				 struct stripe_head_state *s)
{}
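
/*
 * Illustrative sketch (hypothetical helper, plain booleans in place of the
 * real flag and state tests): the three cases above collapse to one
 * predicate.
 */
static inline bool example_delay_towrite(bool non_overwrite_to_failed_dev,
					 bool log_critical, bool log_failed,
					 int injournal)
{
	if (non_overwrite_to_failed_dev && injournal)	/* case 1 */
		return true;
	if (log_critical && injournal > 0)		/* case 2 */
		return true;
	if (log_failed && injournal)			/* case 3 */
		return true;
	return false;
}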

static void
schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
			 int rcw, int expand)
{}

static bool stripe_bio_overlaps(struct stripe_head *sh, struct bio *bi,
				int dd_idx, int forwrite)
{}

static void __add_stripe_bio(struct stripe_head *sh, struct bio *bi,
			     int dd_idx, int forwrite, int previous)
{}

/*
 * Each stripe/dev can have one or more bios attached.
 * toread/towrite point to the first in a chain.
 * The bi_next chain must be in order.
 */
static bool add_stripe_bio(struct stripe_head *sh, struct bio *bi,
			   int dd_idx, int forwrite, int previous)
{}
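
/*
 * Illustrative sketch (hypothetical helper): keeping a bi_next chain sorted
 * by starting sector, as required above.  The real code additionally checks
 * for overlap and distinguishes the read and write chains.
 */
static inline void example_chain_insert_sorted(struct bio **head, struct bio *bi)
{
	struct bio **bip = head;

	while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector)
		bip = &(*bip)->bi_next;
	bi->bi_next = *bip;
	*bip = bi;
}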

static void end_reshape(struct r5conf *conf);

static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
			    struct stripe_head *sh)
{}

static void
handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
		     struct stripe_head_state *s, int disks)
{}

static void
handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
		   struct stripe_head_state *s)
{}

static int want_replace(struct stripe_head *sh, int disk_idx)
{}

static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
			   int disk_idx, int disks)
{}

/* fetch_block - checks the given member device to see if its data needs
 * to be read or computed to satisfy a request.
 *
 * Returns 1 when no more member devices need to be checked, otherwise returns
 * 0 to tell the loop in handle_stripe_fill to continue
 */
static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
		       int disk_idx, int disks)
{}

/*
 * handle_stripe_fill - read or compute data to satisfy pending requests.
 */
static void handle_stripe_fill(struct stripe_head *sh,
			       struct stripe_head_state *s,
			       int disks)
{}

static void break_stripe_batch_list(struct stripe_head *head_sh,
				    unsigned long handle_flags);
/* handle_stripe_clean_event
 * any written block on an uptodate or failed drive can be returned.
 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
 * never LOCKED, so we don't need to test 'failed' directly.
 */
static void handle_stripe_clean_event(struct r5conf *conf,
	struct stripe_head *sh, int disks)
{}

/*
 * For RMW in write back cache, we need extra page in prexor to store the
 * old data. This page is stored in dev->orig_page.
 *
 * This function checks whether we have data for prexor. The exact logic
 * is:
 *       R5_UPTODATE && (!R5_InJournal || R5_OrigPageUPTDODATE)
 */
static inline bool uptodate_for_rmw(struct r5dev *dev)
{}
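
/*
 * Illustrative sketch (hypothetical helper): a direct transcription of the
 * formula in the comment above, using the flags in dev->flags.
 */
static inline bool example_uptodate_for_rmw(struct r5dev *dev)
{
	return test_bit(R5_UPTODATE, &dev->flags) &&
	       (!test_bit(R5_InJournal, &dev->flags) ||
		test_bit(R5_OrigPageUPTDODATE, &dev->flags));
}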

static int handle_stripe_dirtying(struct r5conf *conf,
				  struct stripe_head *sh,
				  struct stripe_head_state *s,
				  int disks)
{}

static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
				struct stripe_head_state *s, int disks)
{}

static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
				  struct stripe_head_state *s,
				  int disks)
{}

static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
{}

/*
 * handle_stripe - do things to a stripe.
 *
 * We lock the stripe by setting STRIPE_ACTIVE and then examine the
 * state of various bits to see what needs to be done.
 * Possible results:
 *    return some read requests which now have data
 *    return some write requests which are safely on storage
 *    schedule a read on some buffers
 *    schedule a write of some buffers
 *    return confirmation of parity correctness
 *
 */

static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
{}

/*
 * Return '1' if this is a member of a batch, or '0' if it is a lone stripe or
 * a head which can now be handled.
 */
static int clear_batch_ready(struct stripe_head *sh)
{}

static void break_stripe_batch_list(struct stripe_head *head_sh,
				    unsigned long handle_flags)
{}

static void handle_stripe(struct stripe_head *sh)
{}

static void raid5_activate_delayed(struct r5conf *conf)
	__must_hold(&conf->device_lock)
{}

static void activate_bit_delay(struct r5conf *conf,
		struct list_head *temp_inactive_list)
	__must_hold(&conf->device_lock)
{}

static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
{}

/*
 *  Add the bio to the retry LIFO (in O(1); we are in interrupt context);
 *  it is later sampled by raid5d.
 */
static void add_bio_to_retry(struct bio *bi,struct r5conf *conf)
{}

static struct bio *remove_bio_from_retry(struct r5conf *conf,
					 unsigned int *offset)
{}

/*
 *  raid5_align_endio() checks whether the read succeeded and, if it did,
 *  calls bio_endio on the original bio (having bio_put the new bio first).
 *  If the read failed, the original bio is instead queued for retry through
 *  the stripe cache.
 */
static void raid5_align_endio(struct bio *bi)
{}

static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
{}

static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
{}

/* __get_priority_stripe - get the next stripe to process
 *
 * Full stripe writes are allowed to pass preread active stripes up until
 * the bypass_threshold is exceeded.  In general the bypass_count
 * increments when the handle_list is handled before the hold_list; however, it
 * will not be incremented when STRIPE_IO_STARTED is sampled as set, signifying
 * a stripe with in-flight I/O.  The bypass_count will be reset when the
 * head of the hold_list has changed, i.e. the head was promoted to the
 * handle_list.
 */
static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
	__must_hold(&conf->device_lock)
{}
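
/*
 * Illustrative sketch (hypothetical helper): the bypass accounting described
 * above promotes the head of the hold_list once enough full-stripe writes
 * have been handled ahead of it.
 */
static inline bool example_promote_hold_list(int bypass_count, int bypass_threshold)
{
	return bypass_count > bypass_threshold;
}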

struct raid5_plug_cb {};

static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
{}

static void release_stripe_plug(struct mddev *mddev,
				struct stripe_head *sh)
{}

static void make_discard_request(struct mddev *mddev, struct bio *bi)
{}

static bool ahead_of_reshape(struct mddev *mddev, sector_t sector,
			     sector_t reshape_sector)
{}

static bool range_ahead_of_reshape(struct mddev *mddev, sector_t min,
				   sector_t max, sector_t reshape_sector)
{}

static bool stripe_ahead_of_reshape(struct mddev *mddev, struct r5conf *conf,
				    struct stripe_head *sh)
{}

static int add_all_stripe_bios(struct r5conf *conf,
		struct stripe_request_ctx *ctx, struct stripe_head *sh,
		struct bio *bi, int forwrite, int previous)
{}

enum reshape_loc {};

static enum reshape_loc get_reshape_loc(struct mddev *mddev,
		struct r5conf *conf, sector_t logical_sector)
{}

static enum stripe_result make_stripe_request(struct mddev *mddev,
		struct r5conf *conf, struct stripe_request_ctx *ctx,
		sector_t logical_sector, struct bio *bi)
{}

/*
 * If the bio covers multiple data disks, find the sector within the bio that has
 * the lowest chunk offset in the first chunk.
 */
static sector_t raid5_bio_lowest_chunk_sector(struct r5conf *conf,
					      struct bio *bi)
{}

static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
{}

static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks);

static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
{}

static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_nr,
					  sector_t max_sector, int *skipped)
{}

static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
			       unsigned int offset)
{}

static int handle_active_stripes(struct r5conf *conf, int group,
				 struct r5worker *worker,
				 struct list_head *temp_inactive_list)
		__must_hold(&conf->device_lock)
{}

static void raid5_do_work(struct work_struct *work)
{}

/*
 * This is our raid5 kernel thread.
 *
 * We scan the hash table for stripes which can be handled now.
 * During the scan, completed stripes are saved for us by the interrupt
 * handler, so that they will not have to wait for our next wakeup.
 */
static void raid5d(struct md_thread *thread)
{}

static ssize_t
raid5_show_stripe_cache_size(struct mddev *mddev, char *page)
{}

int
raid5_set_cache_size(struct mddev *mddev, int size)
{}
EXPORT_SYMBOL();

static ssize_t
raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len)
{}

static struct md_sysfs_entry
raid5_stripecache_size =;

static ssize_t
raid5_show_rmw_level(struct mddev  *mddev, char *page)
{}

static ssize_t
raid5_store_rmw_level(struct mddev  *mddev, const char *page, size_t len)
{}

static struct md_sysfs_entry
raid5_rmw_level =;

static ssize_t
raid5_show_stripe_size(struct mddev  *mddev, char *page)
{}

#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
static ssize_t
raid5_store_stripe_size(struct mddev  *mddev, const char *page, size_t len)
{
	struct r5conf *conf;
	unsigned long new;
	int err;
	int size;

	if (len >= PAGE_SIZE)
		return -EINVAL;
	if (kstrtoul(page, 10, &new))
		return -EINVAL;

	/*
	 * The value must not be bigger than PAGE_SIZE, must be a multiple
	 * of DEFAULT_STRIPE_SIZE, and must be a power of two.
	 */
	if (new % DEFAULT_STRIPE_SIZE != 0 ||
			new > PAGE_SIZE || new == 0 ||
			new != roundup_pow_of_two(new))
		return -EINVAL;

	err = mddev_suspend_and_lock(mddev);
	if (err)
		return err;

	conf = mddev->private;
	if (!conf) {
		err = -ENODEV;
		goto out_unlock;
	}

	if (new == conf->stripe_size)
		goto out_unlock;

	pr_debug("md/raid: change stripe_size from %lu to %lu\n",
			conf->stripe_size, new);

	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
	    mddev->reshape_position != MaxSector || mddev->sysfs_active) {
		err = -EBUSY;
		goto out_unlock;
	}

	mutex_lock(&conf->cache_size_mutex);
	size = conf->max_nr_stripes;

	shrink_stripes(conf);

	conf->stripe_size = new;
	conf->stripe_shift = ilog2(new) - 9;
	conf->stripe_sectors = new >> 9;
	if (grow_stripes(conf, size)) {
		pr_warn("md/raid:%s: couldn't allocate buffers\n",
				mdname(mddev));
		err = -ENOMEM;
	}
	mutex_unlock(&conf->cache_size_mutex);

out_unlock:
	mddev_unlock_and_resume(mddev);
	return err ?: len;
}

static struct md_sysfs_entry
raid5_stripe_size = __ATTR(stripe_size, 0644,
			 raid5_show_stripe_size,
			 raid5_store_stripe_size);
#else
static struct md_sysfs_entry
raid5_stripe_size =;
#endif

static ssize_t
raid5_show_preread_threshold(struct mddev *mddev, char *page)
{}

static ssize_t
raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len)
{}

static struct md_sysfs_entry
raid5_preread_bypass_threshold =;

static ssize_t
raid5_show_skip_copy(struct mddev *mddev, char *page)
{}

static ssize_t
raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
{}

static struct md_sysfs_entry
raid5_skip_copy =;

static ssize_t
stripe_cache_active_show(struct mddev *mddev, char *page)
{}

static struct md_sysfs_entry
raid5_stripecache_active =;

static ssize_t
raid5_show_group_thread_cnt(struct mddev *mddev, char *page)
{}

static int alloc_thread_groups(struct r5conf *conf, int cnt,
			       int *group_cnt,
			       struct r5worker_group **worker_groups);
static ssize_t
raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
{}

static struct md_sysfs_entry
raid5_group_thread_cnt =;

static struct attribute *raid5_attrs[] =;
static const struct attribute_group raid5_attrs_group =;

static int alloc_thread_groups(struct r5conf *conf, int cnt, int *group_cnt,
			       struct r5worker_group **worker_groups)
{}

static void free_thread_groups(struct r5conf *conf)
{}

static sector_t
raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
{}

static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
{}

static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
{}

static int raid456_cpu_dead(unsigned int cpu, struct hlist_node *node)
{}

static void raid5_free_percpu(struct r5conf *conf)
{}

static void free_conf(struct r5conf *conf)
{}

static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node)
{}

static int raid5_alloc_percpu(struct r5conf *conf)
{}

static unsigned long raid5_cache_scan(struct shrinker *shrink,
				      struct shrink_control *sc)
{}

static unsigned long raid5_cache_count(struct shrinker *shrink,
				       struct shrink_control *sc)
{}

static struct r5conf *setup_conf(struct mddev *mddev)
{}

static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded)
{}

static int raid5_set_limits(struct mddev *mddev)
{}

static int raid5_run(struct mddev *mddev)
{}

static void raid5_free(struct mddev *mddev, void *priv)
{}

static void raid5_status(struct seq_file *seq, struct mddev *mddev)
{}

static void print_raid5_conf(struct r5conf *conf)
{}

static int raid5_spare_active(struct mddev *mddev)
{}

static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
{}

static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
{}

static int raid5_resize(struct mddev *mddev, sector_t sectors)
{}

static int check_stripe_cache(struct mddev *mddev)
{}

static int check_reshape(struct mddev *mddev)
{}

static int raid5_start_reshape(struct mddev *mddev)
{}

/* This is called from the reshape thread and should make any
 * changes needed in 'conf'
 */
static void end_reshape(struct r5conf *conf)
{}

/* This is called from the raid5d thread with mddev_lock held.
 * It makes config changes to the device.
 */
static void raid5_finish_reshape(struct mddev *mddev)
{}

static void raid5_quiesce(struct mddev *mddev, int quiesce)
{}

static void *raid45_takeover_raid0(struct mddev *mddev, int level)
{}

static void *raid5_takeover_raid1(struct mddev *mddev)
{}

static void *raid5_takeover_raid6(struct mddev *mddev)
{}

static int raid5_check_reshape(struct mddev *mddev)
{}

static int raid6_check_reshape(struct mddev *mddev)
{}

static void *raid5_takeover(struct mddev *mddev)
{}

static void *raid4_takeover(struct mddev *mddev)
{}

static struct md_personality raid5_personality;

static void *raid6_takeover(struct mddev *mddev)
{}

static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
{}

static int raid5_start(struct mddev *mddev)
{}

/*
 * This is only used for dm-raid456; the caller has already frozen the
 * sync_thread. Hence, if a reshape is still in progress, IO that is waiting
 * for the reshape can never complete now, so wake up and handle that IO.
 */
static void raid5_prepare_suspend(struct mddev *mddev)
{}

static struct md_personality raid6_personality =;
static struct md_personality raid5_personality =;

static struct md_personality raid4_personality =;

static int __init raid5_init(void)
{}

static void raid5_exit(void)
{}

module_init();
module_exit(raid5_exit);
MODULE_LICENSE();
MODULE_DESCRIPTION();
MODULE_ALIAS(); /* RAID5 */
MODULE_ALIAS();
MODULE_ALIAS();
MODULE_ALIAS();
MODULE_ALIAS();
MODULE_ALIAS(); /* RAID6 */
MODULE_ALIAS();
MODULE_ALIAS();

/* This used to be two separate modules; they were: */
MODULE_ALIAS();
MODULE_ALIAS();