linux/fs/btrfs/extent_io.c

// SPDX-License-Identifier: GPL-2.0

#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/bio.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/page-flags.h>
#include <linux/sched/mm.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/fsverity.h>
#include "extent_io.h"
#include "extent-io-tree.h"
#include "extent_map.h"
#include "ctree.h"
#include "btrfs_inode.h"
#include "bio.h"
#include "locking.h"
#include "backref.h"
#include "disk-io.h"
#include "subpage.h"
#include "zoned.h"
#include "block-group.h"
#include "compression.h"
#include "fs.h"
#include "accessors.h"
#include "file-item.h"
#include "file.h"
#include "dev-replace.h"
#include "super.h"
#include "transaction.h"

static struct kmem_cache *extent_buffer_cache;

#ifdef CONFIG_BTRFS_DEBUG
static inline void btrfs_leak_debug_add_eb(struct extent_buffer *eb)
{}

static inline void btrfs_leak_debug_del_eb(struct extent_buffer *eb)
{}

void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
{}
#else
#define btrfs_leak_debug_add_eb(eb)			do {} while (0)
#define btrfs_leak_debug_del_eb(eb)			do {} while (0)
#endif

/*
 * Structure to record info about the bio being assembled, and other info like
 * how many bytes remain before the stripe/ordered extent boundary.
 */
 */
struct btrfs_bio_ctrl {};
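
/*
 * Illustrative sketch only (not the authoritative definition): a plausible
 * shape for this structure, inferred from how the helpers below use it.
 * The field names and types here are assumptions.
 */
#if 0
struct btrfs_bio_ctrl {
	struct btrfs_bio *bbio;			/* bio under assembly */
	enum btrfs_compression_type compress_type;
	u32 len_to_oe_boundary;			/* bytes left before the ordered extent boundary */
	blk_opf_t opf;				/* REQ_OP_READ/WRITE plus flags */
	btrfs_bio_end_io_t end_io_func;		/* completion callback */
	struct writeback_control *wbc;		/* non-NULL for writeback */
};
#endif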

static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
{}

/*
 * Submit or fail the current bio in the bio_ctrl structure.
 */
static void submit_write_bio(struct btrfs_bio_ctrl *bio_ctrl, int ret)
{}

int __init extent_buffer_init_cachep(void)
{}

void __cold extent_buffer_free_cachep(void)
{}

static void process_one_page(struct btrfs_fs_info *fs_info,
			     struct page *page, const struct page *locked_page,
			     unsigned long page_ops, u64 start, u64 end)
{}

static void __process_pages_contig(struct address_space *mapping,
				   const struct page *locked_page, u64 start, u64 end,
				   unsigned long page_ops)
{}

static noinline void __unlock_for_delalloc(const struct inode *inode,
					   const struct page *locked_page,
					   u64 start, u64 end)
{}

static noinline int lock_delalloc_pages(struct inode *inode,
					const struct page *locked_page,
					u64 start,
					u64 end)
{}

/*
 * Find and lock a contiguous range of bytes in the file marked as delalloc, no
 * more than @max_bytes.
 *
 * @start:	The original start bytenr to search.
 *		Will store the extent range start bytenr.
 * @end:	The original end bytenr of the search range
 *		Will store the extent range end bytenr.
 *
 * Return true if we find a delalloc range which starts inside the original
 * range, and @start/@end will store the delalloc range start/end.
 *
 * Return false if we can't find any delalloc range which starts inside the
 * original range, and @start/@end will be the non-delalloc range start/end.
 */
EXPORT_FOR_TESTS
noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
				    struct page *locked_page, u64 *start,
				    u64 *end)
{}
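
/*
 * Sketch of the control flow above (simplified; assumes the shrink-and-retry
 * path on page lock failure is omitted):
 */
#if 0
	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
	struct extent_state *cached_state = NULL;
	u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
	u64 delalloc_start = *start;
	u64 delalloc_end = 0;
	bool found;
	int ret;

	/* 1) Ask the io tree for the next delalloc range at/after *start. */
	found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
					  max_bytes, &cached_state);
	if (!found || delalloc_end <= *start)
		return false;	/* caller gets the non-delalloc range back */

	/* 2) Lock every page in the range (may shrink the range and retry). */
	ret = lock_delalloc_pages(inode, locked_page, delalloc_start, delalloc_end);

	/* 3) Lock the extent range itself and report it back. */
	lock_extent(tree, delalloc_start, delalloc_end, &cached_state);
	*start = delalloc_start;
	*end = delalloc_end;
	return true;
#endif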

void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
				  const struct page *locked_page,
				  struct extent_state **cached,
				  u32 clear_bits, unsigned long page_ops)
{}

static bool btrfs_verify_page(struct page *page, u64 start)
{}

static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
{}

/*
 * After a write IO is done, we need to:
 *
 * - clear the uptodate bits on error
 * - clear the writeback bits in the extent tree for the range
 * - folio_end_writeback()  if there is no more pending io for the folio
 *
 * Scheduling is not allowed, so the extent state tree is expected
 * to have one and only one object corresponding to this IO.
 */
static void end_bbio_data_write(struct btrfs_bio *bbio)
{}

/*
 * Record a previously processed extent range.
 *
 * This lets endio_readpage_release_extent() handle a full extent range,
 * reducing the number of extent io operations.
 */
struct processed_extent {};
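
/*
 * Sketch (assumption): the record only needs enough state to decide whether
 * the next completed range can be merged with the previous one.
 */
#if 0
struct processed_extent {
	struct btrfs_inode *inode;	/* inode the range belongs to */
	u64 start;			/* start of the processed range */
	u64 end;			/* end of the processed range */
	bool uptodate;			/* whether the range was uptodate */
};
#endif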

/*
 * Try to release the processed extent range.
 *
 * May not release the extent range right now if the current range is
 * contiguous with the processed extent.
 *
 * Will release the processed extent when @inode or @uptodate differs from the
 * recorded values, or when the range is no longer contiguous with the
 * processed range.
 *
 * Passing @inode == NULL will force the processed extent to be released.
 */
static void endio_readpage_release_extent(struct processed_extent *processed,
			      struct btrfs_inode *inode, u64 start, u64 end,
			      bool uptodate)
{}
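
/*
 * Sketch of the merge-or-release decision above (simplified; the empty-record
 * and NULL-inode cases are omitted):
 */
#if 0
	if (processed->inode == inode && processed->uptodate == uptodate &&
	    processed->end + 1 == start) {
		/* Contiguous and compatible: just extend the record. */
		processed->end = end;
		return;
	}
	/* Unlock the previously accumulated range ... */
	unlock_extent(&processed->inode->io_tree, processed->start,
		      processed->end, NULL);
	/* ... and start tracking the new one. */
	processed->inode = inode;
	processed->start = start;
	processed->end = end;
	processed->uptodate = uptodate;
#endif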

static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page)
{}

/*
 * After a data read IO is done, we need to:
 *
 * - clear the uptodate bits on error
 * - set the uptodate bits if things worked
 * - set the folio up to date if all extents in the tree are uptodate
 * - clear the lock bit in the extent tree
 * - unlock the folio if there are no other extents locked for it
 *
 * Scheduling is not allowed, so the extent state tree is expected
 * to have one and only one object corresponding to this IO.
 */
static void end_bbio_data_read(struct btrfs_bio *bbio)
{}

/*
 * Populate every free slot in a provided array with folios using GFP_NOFS.
 *
 * @nr_folios:   number of folios to allocate
 * @folio_array: the array to fill with folios; any existing non-NULL entries in
 *		 the array will be skipped
 *
 * Return: 0        if all folios were able to be allocated;
 *         -ENOMEM  otherwise, the partially allocated folios are freed and
 *                  the array slots zeroed
 */
int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array)
{}

/*
 * Populate every free slot in a provided array with pages, using GFP_NOFS.
 *
 * @nr_pages:   number of pages to allocate
 * @page_array: the array to fill with pages; any existing non-NULL entries in
 *		the array will be skipped
 * @nofail:	whether to use the __GFP_NOFAIL flag
 *
 * Return: 0        if all pages were able to be allocated;
 *         -ENOMEM  otherwise, the partially allocated pages are freed and
 *                  the array slots zeroed
 */
int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array,
			   bool nofail)
{}
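
/*
 * Sketch of the contract above (assumption: the real version may batch with
 * a bulk allocator; this shows the simple one-at-a-time shape):
 */
#if 0
	gfp_t gfp = nofail ? (GFP_NOFS | __GFP_NOFAIL) : GFP_NOFS;
	unsigned int i;

	for (i = 0; i < nr_pages; i++) {
		if (page_array[i])
			continue;	/* slot already populated, skip it */
		page_array[i] = alloc_page(gfp);
		if (!page_array[i])
			goto enomem;
	}
	return 0;
enomem:
	for (i = 0; i < nr_pages; i++) {
		if (page_array[i]) {
			__free_page(page_array[i]);
			page_array[i] = NULL;
		}
	}
	return -ENOMEM;
#endif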

/*
 * Populate needed folios for the extent buffer.
 *
 * For now, the folios populated are always in order 0 (aka, single page).
 */
static int alloc_eb_folio_array(struct extent_buffer *eb, bool nofail)
{}

static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl,
				struct page *page, u64 disk_bytenr,
				unsigned int pg_offset)
{}

static void alloc_new_bio(struct btrfs_inode *inode,
			  struct btrfs_bio_ctrl *bio_ctrl,
			  u64 disk_bytenr, u64 file_offset)
{}

/*
 * @disk_bytenr: logical bytenr where the write will be
 * @page:	page to add to the bio
 * @size:	portion of page that we want to write to
 * @pg_offset:	offset inside @page; also used to check whether we are adding
 *              a contiguous page to the previous one
 *
 * This will either add the page into the existing @bio_ctrl->bbio, or allocate
 * a new one in @bio_ctrl->bbio.
 * The mirror number for this IO should already be initialized in
 * @bio_ctrl->mirror_num.
 */
static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl,
			       u64 disk_bytenr, struct page *page,
			       size_t size, unsigned long pg_offset)
{}
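
/*
 * Sketch of the add-or-allocate logic above (simplified; the ordered extent
 * boundary handling is omitted, and how the inode is derived is an
 * assumption):
 */
#if 0
	struct btrfs_inode *inode = BTRFS_I(page->mapping->host);

	/* Flush the current bio if the new range is not contiguous with it. */
	if (bio_ctrl->bbio &&
	    !btrfs_bio_is_contig(bio_ctrl, page, disk_bytenr, pg_offset))
		submit_one_bio(bio_ctrl);

	while (size) {
		int added;

		if (!bio_ctrl->bbio)
			alloc_new_bio(inode, bio_ctrl, disk_bytenr,
				      page_offset(page) + pg_offset);
		added = bio_add_page(&bio_ctrl->bbio->bio, page, size, pg_offset);
		if (added < size)
			submit_one_bio(bio_ctrl);	/* bio full, retry the rest */
		size -= added;
		pg_offset += added;
		disk_bytenr += added;
	}
#endif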

static int attach_extent_buffer_folio(struct extent_buffer *eb,
				      struct folio *folio,
				      struct btrfs_subpage *prealloc)
{}

int set_page_extent_mapped(struct page *page)
{}

int set_folio_extent_mapped(struct folio *folio)
{}

void clear_page_extent_mapped(struct page *page)
{}

static struct extent_map *__get_extent_map(struct inode *inode, struct page *page,
		 u64 start, u64 len, struct extent_map **em_cached)
{}

/*
 * Basic readpage implementation.  Locked extent state structs are inserted
 * into the tree and removed when the IO is done (by the end_io handlers).
 *
 * XXX JDM: This needs looking at to ensure proper page locking.
 *
 * Return 0 on success, otherwise return an error.
 */
static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
		      struct btrfs_bio_ctrl *bio_ctrl, u64 *prev_em_start)
{}

int btrfs_read_folio(struct file *file, struct folio *folio)
{}

static inline void contiguous_readpages(struct page *pages[], int nr_pages,
					u64 start, u64 end,
					struct extent_map **em_cached,
					struct btrfs_bio_ctrl *bio_ctrl,
					u64 *prev_em_start)
{}

/*
 * helper for __extent_writepage, doing all of the delayed allocation setup.
 *
 * This returns 1 if btrfs_run_delalloc_range() did all the work required
 * to write the page (copy into inline extent).  In this case the IO has
 * been started and the page is already unlocked.
 *
 * This returns 0 if all went well (page still locked)
 * This returns < 0 if there were errors (page still locked)
 */
static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
		struct page *page, struct writeback_control *wbc)
{}
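
/*
 * Sketch of how a caller reacts to the three outcomes (simplified from what
 * __extent_writepage() would do):
 */
#if 0
	ret = writepage_delalloc(inode, page, wbc);
	if (ret == 1)
		return 0;	/* all done inline, IO started, page unlocked */
	if (ret < 0)
		goto done;	/* error; page is still locked, clean up */
	/* ret == 0: page still locked, carry on to __extent_writepage_io() */
#endif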

/*
 * Find the first byte we need to write.
 *
 * For subpage, one page can contain several sectors, and
 * __extent_writepage_io() will just grab all extent maps in the page
 * range and try to submit all non-inline/non-compressed extents.
 *
 * This is a big problem for subpage: we shouldn't re-submit already written
 * data at all.
 * This function will look up the subpage dirty bitmap to find which range we
 * really need to submit.
 *
 * Return the next dirty range in [@start, @end).
 * If no dirty range is found, @start will be page_offset(page) + PAGE_SIZE.
 */
static void find_next_dirty_byte(const struct btrfs_fs_info *fs_info,
				 struct page *page, u64 *start, u64 *end)
{}
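
/*
 * Sketch of the regular vs subpage split (assumption: the subpage bitmap
 * walk is abbreviated):
 */
#if 0
	if (fs_info->sectorsize == PAGE_SIZE) {
		/* One sector per page: the page is dirty as a whole. */
		*start = page_offset(page);
		*end = page_offset(page) + PAGE_SIZE;
		return;
	}
	/*
	 * Subpage: consult the per-page dirty bitmap and narrow [*start, *end)
	 * down to the next run of dirty sectors.
	 */
	...
#endif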

/*
 * helper for __extent_writepage.  This calls the writepage start hooks,
 * and does the loop to map the page into extents and bios.
 *
 * We return 1 if the IO is started and the page is unlocked,
 * 0 if all went well (page still locked)
 * < 0 if there were errors (page still locked)
 */
static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
				 struct page *page, u64 start, u32 len,
				 struct btrfs_bio_ctrl *bio_ctrl,
				 loff_t i_size,
				 int *nr_ret)
{}

/*
 * The writepage semantics are similar to regular writepage.  Extent records
 * are inserted to lock ranges in the tree, and as dirty areas are found, they
 * are marked writeback.  Then the lock bits are removed and the end_io
 * handler clears the writeback ranges.
 *
 * Return 0 if everything goes well.
 * Return <0 for error.
 */
static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl)
{}

void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
{}

/*
 * Lock extent buffer status and pages for writeback.
 *
 * Return %false if the extent buffer doesn't need to be submitted (e.g. the
 * extent buffer is not dirty).
 * Return %true if the extent buffer is submitted to bio.
 */
static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *eb,
			  struct writeback_control *wbc)
{}

static void set_btree_ioerr(struct extent_buffer *eb)
{}

/*
 * The endio specific version which won't touch any unsafe spinlock in endio
 * context.
 */
static struct extent_buffer *find_extent_buffer_nolock(
		const struct btrfs_fs_info *fs_info, u64 start)
{}

static void end_bbio_meta_write(struct btrfs_bio *bbio)
{}

static void prepare_eb_write(struct extent_buffer *eb)
{}

static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
					    struct writeback_control *wbc)
{}

/*
 * Submit one subpage btree page.
 *
 * The main differences from submit_eb_page() are:
 * - Page locking
 *   For subpage, we don't rely on page locking at all.
 *
 * - Flush write bio
 *   We only flush the bio if we may be unable to fit the current extent
 *   buffer into the current bio.
 *
 * Return >=0 for the number of submitted extent buffers.
 * Return <0 for fatal error.
 */
static int submit_eb_subpage(struct page *page, struct writeback_control *wbc)
{}

/*
 * Submit all page(s) of one extent buffer.
 *
 * @page:	the page of one extent buffer
 * @ctx:	the write context; tracks which extent buffer is currently being
 *		submitted, so we can tell whether this page still belongs to it
 *
 * The caller should pass each page in their bytenr order, and here we use
 * @ctx to determine if we have already submitted the pages of one extent
 * buffer.
 *
 * If we have, we just skip until we hit a new page that doesn't belong to the
 * current extent buffer.
 *
 * If not, we submit all the page(s) of the extent buffer.
 *
 * Return >0 if we have submitted the extent buffer successfully.
 * Return 0 if we don't need to submit the page, as it's already submitted by
 * a previous call.
 * Return <0 for fatal error.
 */
static int submit_eb_page(struct page *page, struct btrfs_eb_write_context *ctx)
{}
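
/*
 * Sketch of the skip-or-submit decision (simplified; assumes the context
 * struct carries an eb pointer and the wbc, and subpage handling plus eb
 * refcounting are omitted):
 */
#if 0
	struct folio *folio = page_folio(page);
	struct extent_buffer *eb = folio_get_private(folio);

	if (!eb)
		return 0;
	if (eb == ctx->eb)
		return 0;	/* page of the eb we already submitted */
	if (!lock_extent_buffer_for_io(eb, ctx->wbc))
		return 0;	/* eb is clean, nothing to write */
	ctx->eb = eb;
	write_one_eb(eb, ctx->wbc);
	return 1;
#endif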

int btree_write_cache_pages(struct address_space *mapping,
				   struct writeback_control *wbc)
{}

/*
 * Walk the list of dirty pages of the given address space and write all of them.
 *
 * @mapping:   address space structure to write
 * @wbc:       subtract the number of written pages from *@wbc->nr_to_write
 * @bio_ctrl:  holds context for the write, namely the bio
 *
 * If a page is already under I/O, write_cache_pages() skips it, even
 * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
 * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
 * and msync() need to guarantee that all the data which was dirty at the time
 * the call was made get new I/O started against them.  If wbc->sync_mode is
 * WB_SYNC_ALL then we were called for data integrity and we must wait for
 * existing IO to complete.
 */
static int extent_write_cache_pages(struct address_space *mapping,
			     struct btrfs_bio_ctrl *bio_ctrl)
{}

/*
 * Submit the pages in the range to bio for call sites where the delalloc
 * range has already been run (aka, ordered extent inserted) and all pages are
 * still locked.
 */
void extent_write_locked_range(struct inode *inode, const struct page *locked_page,
			       u64 start, u64 end, struct writeback_control *wbc,
			       bool pages_dirty)
{}

int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
{}

void btrfs_readahead(struct readahead_control *rac)
{}

/*
 * Basic invalidate_folio code.  This waits on any locked or writeback ranges
 * corresponding to the folio, and then deletes any extent state records from
 * the tree.
 */
int extent_invalidate_folio(struct extent_io_tree *tree,
			  struct folio *folio, size_t offset)
{}

/*
 * A helper for release_folio.  This tests for areas of the page that are
 * locked or under IO and drops the related state bits if it is safe to drop
 * the page.
 */
static bool try_release_extent_state(struct extent_io_tree *tree,
				    struct page *page, gfp_t mask)
{}

/*
 * A helper for release_folio.  As long as there are no locked extents in the
 * range corresponding to the page, both state records and extent map records
 * are removed.
 */
bool try_release_extent_mapping(struct page *page, gfp_t mask)
{}

static void __free_extent_buffer(struct extent_buffer *eb)
{}

static int extent_buffer_under_io(const struct extent_buffer *eb)
{}

static bool folio_range_has_eb(struct btrfs_fs_info *fs_info, struct folio *folio)
{}

static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct folio *folio)
{}

/* Release all pages attached to the extent buffer */
static void btrfs_release_extent_buffer_pages(const struct extent_buffer *eb)
{}

/*
 * Helper for releasing the extent buffer.
 */
static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
{}

static struct extent_buffer *
__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
		      unsigned long len)
{}

struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
{}

struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
						  u64 start, unsigned long len)
{}

struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
						u64 start)
{}

static void check_buffer_tree_ref(struct extent_buffer *eb)
{}

static void mark_extent_buffer_accessed(struct extent_buffer *eb)
{}

struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
					 u64 start)
{}

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
					u64 start)
{}
#endif

static struct extent_buffer *grab_extent_buffer(
		struct btrfs_fs_info *fs_info, struct page *page)
{}

static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
{}

/*
 * Return 0 if eb->folios[i] is attached to btree inode successfully.
 * Return >0 if there is already another extent buffer for the range,
 * and @found_eb_ret would be updated.
 * Return -EAGAIN if the filemap has an existing folio with a different size
 * than @eb.
 * The caller needs to free the existing folios and retry using the same order.
 */
static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i,
				      struct btrfs_subpage *prealloc,
				      struct extent_buffer **found_eb_ret)
{}

struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
					  u64 start, u64 owner_root, int level)
{}

static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
{}

static int release_extent_buffer(struct extent_buffer *eb)
	__releases(&eb->refs_lock)
{}

void free_extent_buffer(struct extent_buffer *eb)
{}

void free_extent_buffer_stale(struct extent_buffer *eb)
{}

static void btree_clear_folio_dirty(struct folio *folio)
{}

static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb)
{}

void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
			      struct extent_buffer *eb)
{}

void set_extent_buffer_dirty(struct extent_buffer *eb)
{}

void clear_extent_buffer_uptodate(struct extent_buffer *eb)
{}

void set_extent_buffer_uptodate(struct extent_buffer *eb)
{}

static void clear_extent_buffer_reading(struct extent_buffer *eb)
{}

static void end_bbio_meta_read(struct btrfs_bio *bbio)
{}

int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
			     const struct btrfs_tree_parent_check *check)
{}

static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
			    unsigned long len)
{}

/*
 * Check if the [start, start + len) range is valid before reading/writing
 * the eb.
 * NOTE: @start and @len are offset inside the eb, not logical address.
 *
 * Caller should not touch the dst/src memory if this function returns error.
 */
static inline int check_eb_range(const struct extent_buffer *eb,
				 unsigned long start, unsigned long len)
{}
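
/*
 * Sketch (assumption: the usual overflow-safe bounds test):
 */
#if 0
	unsigned long end;

	/* start and start + len must both stay within the eb */
	if (unlikely(check_add_overflow(start, len, &end) || end > eb->len))
		return report_eb_range(eb, start, len);
	return 0;
#endif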

void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
			unsigned long start, unsigned long len)
{}

int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
				       void __user *dstv,
				       unsigned long start, unsigned long len)
{}

int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
			 unsigned long start, unsigned long len)
{}

/*
 * Check that the extent buffer is uptodate.
 *
 * For the regular sector size == PAGE_SIZE case, check if the folio is
 * uptodate.
 * For the subpage case, check if the range covered by the eb has
 * EXTENT_UPTODATE.
 */
static void assert_eb_folio_uptodate(const struct extent_buffer *eb, int i)
{}

static void __write_extent_buffer(const struct extent_buffer *eb,
				  const void *srcv, unsigned long start,
				  unsigned long len, bool use_memmove)
{}

void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
			 unsigned long start, unsigned long len)
{}

static void memset_extent_buffer(const struct extent_buffer *eb, int c,
				 unsigned long start, unsigned long len)
{}

void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
			   unsigned long len)
{}

void copy_extent_buffer_full(const struct extent_buffer *dst,
			     const struct extent_buffer *src)
{}

void copy_extent_buffer(const struct extent_buffer *dst,
			const struct extent_buffer *src,
			unsigned long dst_offset, unsigned long src_offset,
			unsigned long len)
{}

/*
 * Calculate the folio and offset of the byte containing the given bit number.
 *
 * @eb:           the extent buffer
 * @start:        offset of the bitmap item in the extent buffer
 * @nr:           bit number
 * @folio_index:  return index of the folio in the extent buffer that contains
 *                the given bit number
 * @folio_offset: return offset into the folio given by folio_index
 *
 * This helper hides the ugliness of finding the byte in an extent buffer which
 * contains a given bit.
 */
static inline void eb_bitmap_offset(const struct extent_buffer *eb,
				    unsigned long start, unsigned long nr,
				    unsigned long *folio_index,
				    size_t *folio_offset)
{}

/*
 * Determine whether a bit in a bitmap item is set.
 *
 * @eb:     the extent buffer
 * @start:  offset of the bitmap item in the extent buffer
 * @nr:     bit number to test
 */
int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
			   unsigned long nr)
{}
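
/*
 * Sketch (simplified; bounds checking omitted):
 */
#if 0
	unsigned long i;
	size_t offset;
	const u8 *kaddr;

	eb_bitmap_offset(eb, start, nr, &i, &offset);
	kaddr = folio_address(eb->folios[i]);
	return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
#endif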

static u8 *extent_buffer_get_byte(const struct extent_buffer *eb, unsigned long bytenr)
{}

/*
 * Set an area of a bitmap to 1.
 *
 * @eb:     the extent buffer
 * @start:  offset of the bitmap item in the extent buffer
 * @pos:    bit number of the first bit
 * @len:    number of bits to set
 */
void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start,
			      unsigned long pos, unsigned long len)
{}

/*
 * Clear an area of a bitmap.
 *
 * @eb:     the extent buffer
 * @start:  offset of the bitmap item in the extent buffer
 * @pos:    bit number of the first bit
 * @len:    number of bits to clear
 */
void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
				unsigned long start, unsigned long pos,
				unsigned long len)
{}

static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
{}
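
/*
 * Sketch: two ranges of equal length @len overlap iff each one starts before
 * the other one ends (the standard interval test).  memcpy_extent_buffer()
 * below can use this to decide when it must fall back to memmove semantics.
 */
#if 0
	return src < dst + len && dst < src + len;
#endif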

void memcpy_extent_buffer(const struct extent_buffer *dst,
			  unsigned long dst_offset, unsigned long src_offset,
			  unsigned long len)
{}

void memmove_extent_buffer(const struct extent_buffer *dst,
			   unsigned long dst_offset, unsigned long src_offset,
			   unsigned long len)
{}

#define GANG_LOOKUP_SIZE 16
static struct extent_buffer *get_next_extent_buffer(
		const struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
{}

static int try_release_subpage_extent_buffer(struct page *page)
{}

int try_release_extent_buffer(struct page *page)
{}

/*
 * Attempt to readahead a child block.
 *
 * @fs_info:	the fs_info
 * @bytenr:	bytenr to read
 * @owner_root: objectid of the root that owns this eb
 * @gen:	generation for the uptodate check, can be 0
 * @level:	level for the eb
 *
 * Attempt to readahead a tree block at @bytenr.  If @gen is 0 then we do a
 * normal uptodate check of the eb, without checking the generation.  If we have
 * to read the block we will not block on anything.
 */
void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
				u64 bytenr, u64 owner_root, u64 gen, int level)
{}

/*
 * Readahead a node's child block.
 *
 * @node:	parent node we're reading from
 * @slot:	slot in the parent node for the child we want to read
 *
 * A helper for btrfs_readahead_tree_block, we simply read the bytenr pointed at
 * the slot in the node provided.
 */
void btrfs_readahead_node_child(struct extent_buffer *node, int slot)
{}
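
/*
 * Sketch (assumption: the usual one-liner, pulling the child bytenr, owner
 * and generation out of the parent with the item accessors):
 */
#if 0
	btrfs_readahead_tree_block(node->fs_info,
				   btrfs_node_blockptr(node, slot),
				   btrfs_header_owner(node),
				   btrfs_node_ptr_generation(node, slot),
				   btrfs_header_level(node) - 1);
#endif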