linux/fs/btrfs/raid56.c

// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2012 Fusion-io  All rights reserved.
 * Copyright (C) 2012 Intel Corp. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/raid/pq.h>
#include <linux/hash.h>
#include <linux/list_sort.h>
#include <linux/raid/xor.h>
#include <linux/mm.h>
#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "file-item.h"
#include "btrfs_inode.h"

/* set when additional merges to this rbio are not allowed */
#define RBIO_RMW_LOCKED_BIT

/*
 * set when this rbio is sitting in the hash, but it is just a cache
 * of past RMW
 */
#define RBIO_CACHE_BIT

/*
 * set when it is safe to trust the stripe_pages for caching
 */
#define RBIO_CACHE_READY_BIT

#define RBIO_CACHE_SIZE

#define BTRFS_STRIPE_HASH_TABLE_BITS
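
/*
 * Illustrative sketch, not the in-tree implementation: the RBIO_*_BIT
 * values above (elided here) are bit numbers in a per-rbio flags word,
 * manipulated with the usual atomic bitops.  The "unsigned long flags"
 * member is an assumption.
 */
static inline void mark_rbio_cached_sketch(struct btrfs_raid_bio *rbio)
{
	/* Publish the stripe pages as a trustworthy cache of the last RMW. */
	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
	set_bit(RBIO_CACHE_BIT, &rbio->flags);
}

static inline bool rbio_is_cached_sketch(const struct btrfs_raid_bio *rbio)
{
	return test_bit(RBIO_CACHE_BIT, &rbio->flags);
}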

static void dump_bioc(const struct btrfs_fs_info *fs_info, const struct btrfs_io_context *bioc)
{}

static void btrfs_dump_rbio(const struct btrfs_fs_info *fs_info,
			    const struct btrfs_raid_bio *rbio)
{}

#define ASSERT_RBIO(expr, rbio)

#define ASSERT_RBIO_STRIPE(expr, rbio, stripe_nr)

#define ASSERT_RBIO_SECTOR(expr, rbio, sector_nr)

#define ASSERT_RBIO_LOGICAL(expr, rbio, logical)

/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash {};

/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash_table {};

/*
 * A bvec-like structure to represent a sector inside a page.
 *
 * Unlike a bvec we don't need bv_len, as it's fixed to the sectorsize.
 */
struct sector_ptr {};

static void rmw_rbio_work(struct work_struct *work);
static void rmw_rbio_work_locked(struct work_struct *work);
static void index_rbio_pages(struct btrfs_raid_bio *rbio);
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);

static int finish_parity_scrub(struct btrfs_raid_bio *rbio);
static void scrub_rbio_work_locked(struct work_struct *work);

static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio)
{}

static void free_raid_bio(struct btrfs_raid_bio *rbio)
{}

static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func)
{}

/*
 * the stripe hash table is used for locking, and to collect
 * bios in hopes of making a full stripe
 */
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
{}

/*
 * caching an rbio means copying anything from the bio_sectors array
 * into the stripe_pages array.  We use the page uptodate bit in the
 * stripe cache array to indicate whether it has valid data.
 *
 * once the caching is done, we set the cache ready bit.
 */
static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
{}

/*
 * we hash on the first logical address of the stripe
 */
static int rbio_bucket(struct btrfs_raid_bio *rbio)
{}
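
/*
 * Illustrative sketch, not the in-tree implementation: one way to pick a
 * bucket is to hash the logical address of the full stripe's first byte,
 * which every bio landing in that stripe shares.  The bioc member name is
 * an assumption; BTRFS_STRIPE_HASH_TABLE_BITS is elided above.
 */
static int rbio_bucket_sketch(const struct btrfs_raid_bio *rbio)
{
	u64 num = rbio->bioc->full_stripe_logical;

	/*
	 * Shift out the low bits first: stripe-aligned addresses share
	 * zeros there, which would otherwise waste hash input.
	 */
	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
}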

static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio,
				       unsigned int page_nr)
{}

/*
 * Update the stripe_sectors[] array to use the correct page and pgoff.
 *
 * Should be called every time any page pointer in stripe_pages[] is modified.
 */
static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
{}

static void steal_rbio_page(struct btrfs_raid_bio *src,
			    struct btrfs_raid_bio *dest, int page_nr)
{}

static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr)
{}

/*
 * Stealing an rbio means taking all the uptodate pages from the stripe array
 * in the source rbio and putting them into the destination rbio.
 *
 * This will also update the involved stripe_sectors[] which are referring to
 * the old pages.
 */
static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
{}

/*
 * merging means we take the bio_list from the victim and
 * splice it into the destination.  The victim should
 * be discarded afterwards.
 *
 * must be called with dest->rbio_list_lock held
 */
static void merge_rbio(struct btrfs_raid_bio *dest,
		       struct btrfs_raid_bio *victim)
{}
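
/*
 * Illustrative sketch, not the in-tree implementation: merging splices the
 * victim's queued bios onto the destination and empties the victim so it
 * can be freed.  The bio_list and bio_list_bytes members are assumptions.
 */
static void merge_rbio_sketch(struct btrfs_raid_bio *dest,
			      struct btrfs_raid_bio *victim)
{
	/* Move every queued bio from the victim onto the destination. */
	bio_list_merge(&dest->bio_list, &victim->bio_list);
	dest->bio_list_bytes += victim->bio_list_bytes;

	/* The victim must no longer reference bios it does not own. */
	bio_list_init(&victim->bio_list);
}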

/*
 * used to prune items that are in the cache.  The caller
 * must hold the hash table lock.
 */
static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{}

/*
 * prune a given rbio from the cache
 */
static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{}

/*
 * remove everything in the cache
 */
static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
{}

/*
 * remove all cached entries and free the hash table
 * used by unmount
 */
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
{}

/*
 * insert an rbio into the stripe cache.  It
 * must have already been prepared by calling
 * cache_rbio_pages
 *
 * If this rbio was already cached, it gets
 * moved to the front of the lru.
 *
 * If the size of the rbio cache is too big, we
 * prune an item.
 */
static void cache_rbio(struct btrfs_raid_bio *rbio)
{}

/*
 * helper function to run the xor_blocks api.  It is only
 * able to do MAX_XOR_BLOCKS at a time, so we need to
 * loop through.
 */
static void run_xor(void **pages, int src_cnt, ssize_t len)
{}
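
/*
 * Illustrative sketch, not the in-tree implementation: xor_blocks() from
 * <linux/raid/xor.h> accepts at most MAX_XOR_BLOCKS sources per call, so
 * the sources are consumed in chunks.  Treating pages[src_cnt] as the
 * destination buffer is an assumption mirroring the helper above.
 */
static void run_xor_sketch(void **pages, int src_cnt, ssize_t len)
{
	int src_off = 0;
	void *dest = pages[src_cnt];

	while (src_cnt > 0) {
		int xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);

		/* dest ^= each of the next xor_src_cnt source buffers. */
		xor_blocks(xor_src_cnt, len, dest, pages + src_off);

		src_cnt -= xor_src_cnt;
		src_off += xor_src_cnt;
	}
}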

/*
 * Returns true if the bio list inside this rbio covers an entire stripe (no
 * rmw required).
 */
static int rbio_is_full(struct btrfs_raid_bio *rbio)
{}
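
/*
 * Illustrative sketch, not the in-tree implementation: the rbio covers a
 * full stripe when the bytes queued in its bio list add up to one stripe
 * length per data stripe.  The bio_list, bio_list_lock and nr_data members
 * are assumptions.
 */
static bool rbio_is_full_sketch(struct btrfs_raid_bio *rbio)
{
	struct bio *bio;
	u64 size = 0;

	spin_lock(&rbio->bio_list_lock);
	bio_list_for_each(bio, &rbio->bio_list)
		size += bio->bi_iter.bi_size;
	spin_unlock(&rbio->bio_list_lock);

	return size == (u64)rbio->nr_data * BTRFS_STRIPE_LEN;
}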

/*
 * returns 1 if it is safe to merge two rbios together.
 * The merging is safe if the two rbios correspond to
 * the same stripe and if they are both going in the same
 * direction (read vs write), and if neither one is
 * locked for final IO
 *
 * The caller is responsible for locking such that
 * rmw_locked is safe to test
 */
static int rbio_can_merge(struct btrfs_raid_bio *last,
			  struct btrfs_raid_bio *cur)
{}

static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio,
					     unsigned int stripe_nr,
					     unsigned int sector_nr)
{}

/* Return a sector from rbio->stripe_sectors, not from the bio list */
static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio,
					     unsigned int stripe_nr,
					     unsigned int sector_nr)
{}

/* Grab a sector inside P stripe */
static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio,
					      unsigned int sector_nr)
{}

/* Grab a sector inside Q stripe, return NULL if not RAID6 */
static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio,
					      unsigned int sector_nr)
{}
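
/*
 * Illustrative sketch, not the in-tree implementation: stripe_sectors[] can
 * be viewed as a 2D array flattened per stripe, with P stored right after
 * the data stripes and Q (RAID6 only) right after P.  The stripe_nsectors
 * and nr_data members are assumptions.
 */
static unsigned int stripe_sector_index_sketch(const struct btrfs_raid_bio *rbio,
					       unsigned int stripe_nr,
					       unsigned int sector_nr)
{
	return stripe_nr * rbio->stripe_nsectors + sector_nr;
}

static unsigned int pstripe_sector_index_sketch(const struct btrfs_raid_bio *rbio,
						unsigned int sector_nr)
{
	/* P always lives in the first stripe after the data stripes. */
	return stripe_sector_index_sketch(rbio, rbio->nr_data, sector_nr);
}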

/*
 * The first stripe in the table for a logical address
 * has the lock.  rbios are added in one of three ways:
 *
 * 1) Nobody has the stripe locked yet.  The rbio is given
 * the lock and 0 is returned.  The caller must start the IO
 * themselves.
 *
 * 2) Someone has the stripe locked, but we're able to merge
 * with the lock owner.  The rbio is freed and the IO will
 * start automatically along with the existing rbio.  1 is returned.
 *
 * 3) Someone has the stripe locked, but we're not able to merge.
 * The rbio is added to the lock owner's plug list, or merged into
 * an rbio already on the plug list.  When the lock owner unlocks,
 * the next rbio on the list is run and the IO is started automatically.
 * 1 is returned
 *
 * If we return 0, the caller still owns the rbio and must continue with
 * IO submission.  If we return 1, the caller must assume the rbio has
 * already been freed.
 */
static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
{}

static void recover_rbio_work_locked(struct work_struct *work);

/*
 * called as rmw or parity rebuild is completed.  If the plug list has more
 * rbios waiting for this stripe, the next one on the list will be started
 */
static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
{}

static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
{}

/*
 * this frees the rbio and runs through all the bios in the
 * bio_list and calls end_io on them
 */
static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
{}

/*
 * Get a sector pointer specified by its @stripe_nr and @sector_nr.
 *
 * @rbio:               The raid bio
 * @stripe_nr:          Stripe number, valid range [0, real_stripe)
 * @sector_nr:		Sector number inside the stripe,
 *			valid range [0, stripe_nsectors)
 * @bio_list_only:      Whether to use sectors inside the bio list only.
 *
 * The read/modify/write code wants to reuse the original bio page as much
 * as possible, and only use stripe_sectors as fallback.
 */
static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
					 int stripe_nr, int sector_nr,
					 bool bio_list_only)
{}
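
/*
 * Illustrative sketch, not the in-tree implementation: prefer the sector
 * that arrived from the higher layer (bio_sectors[]) and fall back to the
 * rbio's own stripe_sectors[] only when allowed.  The two arrays and the
 * "page != NULL means populated" convention are assumptions, and a real
 * version would also have to serialize against concurrent bio additions.
 */
static struct sector_ptr *sector_in_rbio_sketch(struct btrfs_raid_bio *rbio,
						int stripe_nr, int sector_nr,
						bool bio_list_only)
{
	int index = stripe_nr * rbio->stripe_nsectors + sector_nr;
	struct sector_ptr *sector = &rbio->bio_sectors[index];

	if (sector->page)
		return sector;
	if (bio_list_only)
		return NULL;
	return &rbio->stripe_sectors[index];
}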

/*
 * allocation and initial setup for the btrfs_raid_bio.  Note that this
 * does not allocate any pages for rbio->pages.
 */
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
					 struct btrfs_io_context *bioc)
{}

/* allocate pages for all the stripes in the bio, including parity */
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
{}

/* only allocate pages for p/q stripes */
static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
{}

/*
 * Return the total number of errors found in the vertical stripe of @sector_nr.
 *
 * @faila and @failb will also be updated to the first and second stripe
 * number of the errors.
 */
static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr,
				     int *faila, int *failb)
{}
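
/*
 * Illustrative sketch, not the in-tree implementation: walk one vertical
 * stripe and count the stripes whose sector is marked bad in a per-rbio
 * error bitmap indexed as [stripe_nr * stripe_nsectors + sector_nr].  The
 * error_bitmap, real_stripes and stripe_nsectors members are assumptions.
 */
static int count_vertical_errors_sketch(struct btrfs_raid_bio *rbio,
					int sector_nr, int *faila, int *failb)
{
	int found = 0;
	int stripe_nr;

	if (faila)
		*faila = -1;
	if (failb)
		*failb = -1;

	for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
		int bit = stripe_nr * rbio->stripe_nsectors + sector_nr;

		if (!test_bit(bit, rbio->error_bitmap))
			continue;
		if (found == 0 && faila)
			*faila = stripe_nr;
		else if (found == 1 && failb)
			*failb = stripe_nr;
		found++;
	}
	return found;
}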

/*
 * Add a single sector @sector into our list of bios for IO.
 *
 * Return 0 if everything went well.
 * Return <0 for error.
 */
static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
			      struct bio_list *bio_list,
			      struct sector_ptr *sector,
			      unsigned int stripe_nr,
			      unsigned int sector_nr,
			      enum req_op op)
{}

static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
{}

/*
 * helper function to walk our bio list and populate the bio_sectors array
 * with the result.  This seems expensive, but it is faster than constantly
 * searching through the bio list as we set up the IO for RMW or stripe
 * reconstruction.
 *
 * This must be called before you trust the answers from sector_in_rbio
 */
static void index_rbio_pages(struct btrfs_raid_bio *rbio)
{}

static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio,
			       struct raid56_bio_trace_info *trace_info)
{}

static inline void bio_list_put(struct bio_list *bio_list)
{}

static void assert_rbio(struct btrfs_raid_bio *rbio)
{}

/* Generate PQ for one vertical stripe. */
static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
{}
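
/*
 * Illustrative sketch, not the in-tree implementation: given kmapped sector
 * buffers laid out as [data 0 .. nr_data-1][P][Q], compute the parity for
 * one vertical stripe.  raid6_call.gen_syndrome() is the real
 * <linux/raid/pq.h> interface; the buffer layout and run_xor_sketch()
 * (see above) are assumptions.
 */
static void generate_pq_sketch(void **pointers, int nr_data, bool has_qstripe,
			       u32 sectorsize)
{
	if (has_qstripe) {
		/* RAID6: the syndrome generator fills both P and Q at once. */
		raid6_call.gen_syndrome(nr_data + 2, sectorsize, pointers);
	} else {
		/* RAID5: P is the XOR of all the data sectors. */
		memcpy(pointers[nr_data], pointers[0], sectorsize);
		run_xor_sketch(pointers + 1, nr_data - 1, sectorsize);
	}
}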

static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
				   struct bio_list *bio_list)
{}

static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio)
{}

/*
 * For the subpage case, we can no longer set the page Uptodate directly for
 * stripe_pages[], thus we need to locate the sector.
 */
static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio,
					     struct page *page,
					     unsigned int pgoff)
{}

/*
 * this sets each page in the bio uptodate.  It should only be used on private
 * rbio pages, nothing that comes in from the higher layers
 */
static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
{}

static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio)
{}

static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bio)
{}

/* Verify the data sectors at read time. */
static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio,
				    struct bio *bio)
{}

static void raid_wait_read_end_io(struct bio *bio)
{}

static void submit_read_wait_bio_list(struct btrfs_raid_bio *rbio,
			     struct bio_list *bio_list)
{}

static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio)
{}

/*
 * We use plugging callbacks to collect full stripes.
 * Any time we get a partial stripe write while plugged
 * we collect it into a list.  When the unplug comes down,
 * we sort the list by logical block number and merge
 * everything we can into the same rbios
 */
struct btrfs_plug_cb {};

/*
 * rbios on the plug list are sorted for easier merging.
 */
static int plug_cmp(void *priv, const struct list_head *a,
		    const struct list_head *b)
{}
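
/*
 * Illustrative sketch, not the in-tree implementation: order two plugged
 * rbios by the starting sector of the first bio each one carries.  The
 * plug_list and bio_list members are assumptions, and every rbio on the
 * plug list is assumed to hold at least one bio.
 */
static int plug_cmp_sketch(void *priv, const struct list_head *a,
			   const struct list_head *b)
{
	const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
						       plug_list);
	const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
						       plug_list);
	u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
	u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;

	if (a_sector < b_sector)
		return -1;
	if (a_sector > b_sector)
		return 1;
	return 0;
}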

static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
{}

/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
{}

/*
 * our main entry point for writes from the rest of the FS.
 */
void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
{}

static int verify_one_sector(struct btrfs_raid_bio *rbio,
			     int stripe_nr, int sector_nr)
{}

/*
 * Recover a vertical stripe specified by @sector_nr.
 * @*pointers are the pre-allocated pointers by the caller, so we don't
 * need to allocate/free the pointers again and again.
 */
static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
			    void **pointers, void **unmap_array)
{}
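
/*
 * Illustrative sketch, not the in-tree implementation: how one vertical
 * stripe could be rebuilt with the <linux/raid/pq.h> helpers, assuming a
 * RAID6 pointer layout [data 0 .. nr_data-1][P][Q] and that @faila/@failb
 * use the same numbering (failb < 0 means a single failure, otherwise
 * faila < failb).  A RAID5 rbio only ever takes the single-failure path.
 */
static void recover_vertical_sketch(void **pointers, int nr_data,
				    size_t sectorsize, int faila, int failb)
{
	const int disks = nr_data + 2;	/* data stripes + P + Q */

	if (failb == nr_data) {
		/* One data stripe and P failed: rebuild both from Q. */
		raid6_datap_recov(disks, sectorsize, faila, pointers);
	} else if (failb > 0 && failb < nr_data) {
		/* Two data stripes failed: double recovery from P and Q. */
		raid6_2data_recov(disks, sectorsize, faila, failb, pointers);
	} else if (faila < nr_data) {
		/*
		 * Single data failure (or data + Q): seed the missing sector
		 * with P, then XOR in every surviving data sector.
		 */
		void *p = pointers[faila];
		int i;

		memcpy(p, pointers[nr_data], sectorsize);
		for (i = 0; i < nr_data; i++) {
			if (i != faila)
				xor_blocks(1, sectorsize, p, &pointers[i]);
		}
	}
	/* Only P and/or Q failed: data is intact, parity is regenerated. */
}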

static int recover_sectors(struct btrfs_raid_bio *rbio)
{}

static void recover_rbio(struct btrfs_raid_bio *rbio)
{}

static void recover_rbio_work(struct work_struct *work)
{}

static void recover_rbio_work_locked(struct work_struct *work)
{}

static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num)
{}

/*
 * the main entry point for reads from the higher layers.  This
 * is really only called when the normal read path had a failure,
 * so we assume the bio they send down corresponds to a failed part
 * of the drive.
 */
void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
			   int mirror_num)
{}

static void fill_data_csums(struct btrfs_raid_bio *rbio)
{}

static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio)
{}

static void raid_wait_write_end_io(struct bio *bio)
{}

static void submit_write_bios(struct btrfs_raid_bio *rbio,
			      struct bio_list *bio_list)
{}

/*
 * Determine whether we need to read any sector from the disk.
 * Should only be used in the RMW path, to skip cached rbios.
 */
static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio)
{}

static void rmw_rbio(struct btrfs_raid_bio *rbio)
{}

static void rmw_rbio_work(struct work_struct *work)
{}

static void rmw_rbio_work_locked(struct work_struct *work)
{}

/*
 * The following code is used to scrub/replace the parity stripe.
 *
 * Caller must have already increased bio_counter for getting @bioc.
 *
 * Note: We need to make sure all the pages that are added into the
 * scrub/replace raid bio are correct and will not be changed during the
 * scrub/replace, i.e. those pages only hold metadata or file data with
 * checksums.
 */

struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
				struct btrfs_io_context *bioc,
				struct btrfs_device *scrub_dev,
				unsigned long *dbitmap, int stripe_nsectors)
{}

/*
 * We only scrub the parity for which we have correct data on the same
 * horizontal stripe, so we don't need to allocate pages for all the stripes.
 */
static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
{}

static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
{}

static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
{}

static int recover_scrub_rbio(struct btrfs_raid_bio *rbio)
{}

static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio)
{}

static void scrub_rbio(struct btrfs_raid_bio *rbio)
{}

static void scrub_rbio_work_locked(struct work_struct *work)
{}

void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
{}

/*
 * This is for scrub call sites where we already have the correct data
 * contents.  This allows us to avoid reading the data stripes again.
 *
 * Unfortunately we have to copy the pages here instead of reusing them,
 * because the rbio does its own page management for its cache.
 */
void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio,
				    struct page **data_pages, u64 data_logical)
{}