// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2010 Red Hat, Inc.
 * Copyright (C) 2016-2023 Christoph Hellwig.
 */
#include <linux/module.h>
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/pagemap.h>
#include <linux/uio.h>
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/writeback.h>
#include <linux/list_sort.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/sched/signal.h>
#include <linux/migrate.h>
#include "trace.h"

#include "../internal.h"

/*
 * Upper bound on the number of folios completed per ioend completion batch,
 * so that merged ioend processing does not cause long scheduler holdoffs.
 */
#define IOEND_BATCH_SIZE	4096

/*
 * Punch callback used to remove delalloc extents over a byte range; see
 * iomap_file_buffered_write_punch_delalloc() for the calling contract.
 */
typedef int (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length);
/*
 * Structure allocated for each folio to track per-block uptodate, dirty state
 * and I/O completions.
 */
struct iomap_folio_state {};
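
/*
 * Illustrative sketch only, not part of this file: one plausible layout for
 * the per-folio state, using a single bitmap that carries an uptodate bit and
 * a dirty bit for every block in the folio. All names below are assumptions
 * made for illustration.
 */
#if 0	/* example sketch */
struct iomap_folio_state_example {
	spinlock_t		state_lock;		/* protects the bitmap */
	unsigned int		read_bytes_pending;	/* bytes still under read I/O */
	atomic_t		write_bytes_pending;	/* bytes still under write I/O */

	/*
	 * Bits [0, blocks_per_folio) track per-block uptodate state, bits
	 * [blocks_per_folio, 2 * blocks_per_folio) track per-block dirty
	 * state.
	 */
	unsigned long		state[];
};
#endif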

static struct bio_set iomap_ioend_bioset;

static inline bool ifs_is_fully_uptodate(struct folio *folio,
		struct iomap_folio_state *ifs)
{}

static inline bool ifs_block_is_uptodate(struct iomap_folio_state *ifs,
		unsigned int block)
{}

static bool ifs_set_range_uptodate(struct folio *folio,
		struct iomap_folio_state *ifs, size_t off, size_t len)
{}

static void iomap_set_range_uptodate(struct folio *folio, size_t off,
		size_t len)
{}

static inline bool ifs_block_is_dirty(struct folio *folio,
		struct iomap_folio_state *ifs, int block)
{}

static unsigned ifs_find_dirty_range(struct folio *folio,
		struct iomap_folio_state *ifs, u64 *range_start, u64 range_end)
{}

static unsigned iomap_find_dirty_range(struct folio *folio, u64 *range_start,
		u64 range_end)
{}

static void ifs_clear_range_dirty(struct folio *folio,
		struct iomap_folio_state *ifs, size_t off, size_t len)
{}

static void iomap_clear_range_dirty(struct folio *folio, size_t off, size_t len)
{}

static void ifs_set_range_dirty(struct folio *folio,
		struct iomap_folio_state *ifs, size_t off, size_t len)
{}

static void iomap_set_range_dirty(struct folio *folio, size_t off, size_t len)
{}

static struct iomap_folio_state *ifs_alloc(struct inode *inode,
		struct folio *folio, unsigned int flags)
{}

static void ifs_free(struct folio *folio)
{}

/*
 * Calculate the range inside the folio that we actually need to read.
 */
static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
		loff_t *pos, loff_t length, size_t *offp, size_t *lenp)
{}

static void iomap_finish_folio_read(struct folio *folio, size_t off,
		size_t len, int error)
{}

static void iomap_read_end_io(struct bio *bio)
{}

struct iomap_readpage_ctx {};

/**
 * iomap_read_inline_data - copy inline data into the page cache
 * @iter: iteration structure
 * @folio: folio to copy to
 *
 * Copy the inline data in @iter into @folio and zero out the rest of the folio.
 * Only a single IOMAP_INLINE extent is allowed at the end of each file.
 * Returns zero for success to complete the read, or the usual negative errno.
 */
static int iomap_read_inline_data(const struct iomap_iter *iter,
		struct folio *folio)
{}
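
/*
 * Illustrative sketch only: a filesystem's ->iomap_begin would describe an
 * inline extent for iomap_read_inline_data() to consume roughly as below.
 * The example_* names and the inline data lookup are assumptions made for
 * illustration.
 */
#if 0	/* example sketch */
static int example_iomap_begin_inline(struct inode *inode, loff_t pos,
		loff_t length, unsigned flags, struct iomap *iomap,
		struct iomap *srcmap)
{
	/* a fully inline ("stuffed") file: one extent covering all the data */
	iomap->type = IOMAP_INLINE;
	iomap->offset = 0;
	iomap->length = i_size_read(inode);
	iomap->inline_data = example_inode_inline_buffer(inode); /* hypothetical */
	return 0;
}
#endif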

static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter,
		loff_t pos)
{}

static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
		struct iomap_readpage_ctx *ctx, loff_t offset)
{}

static loff_t iomap_read_folio_iter(const struct iomap_iter *iter,
		struct iomap_readpage_ctx *ctx)
{}

int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
{}
EXPORT_SYMBOL_GPL(iomap_read_folio);
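
/*
 * Illustrative sketch only: a filesystem's ->read_folio address_space
 * operation is normally a thin wrapper around iomap_read_folio().
 * example_read_iomap_ops is assumed for illustration.
 */
#if 0	/* example sketch */
static int example_read_folio(struct file *file, struct folio *folio)
{
	return iomap_read_folio(folio, &example_read_iomap_ops);
}
#endif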

static loff_t iomap_readahead_iter(const struct iomap_iter *iter,
		struct iomap_readpage_ctx *ctx)
{}

/**
 * iomap_readahead - Attempt to read pages from a file.
 * @rac: Describes the pages to be read.
 * @ops: The operations vector for the filesystem.
 *
 * This function is for filesystems to call to implement their readahead
 * address_space operation.
 *
 * Context: The @ops callbacks may submit I/O (eg to read the addresses of
 * blocks from disc), and may wait for it.  The caller may be trying to
 * access a different page, and so sleeping excessively should be avoided.
 * It may allocate memory, but should avoid costly allocations.  This
 * function is called with memalloc_nofs set, so allocations will not cause
 * the filesystem to be reentered.
 */
void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
{}
EXPORT_SYMBOL_GPL(iomap_readahead);
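
/*
 * Illustrative sketch only: the matching ->readahead wrapper, again assuming
 * a hypothetical example_read_iomap_ops.
 */
#if 0	/* example sketch */
static void example_readahead(struct readahead_control *rac)
{
	iomap_readahead(rac, &example_read_iomap_ops);
}
#endif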

/*
 * iomap_is_partially_uptodate checks whether blocks within a folio are
 * uptodate or not.
 *
 * Returns true if all blocks which correspond to the specified part
 * of the folio are uptodate.
 */
bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
{}
EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);

/**
 * iomap_get_folio - get a folio reference for writing
 * @iter: iteration structure
 * @pos: start offset of write
 * @len: Suggested size of folio to create.
 *
 * Returns a locked reference to the folio at @pos, or an error pointer if the
 * folio could not be obtained.
 */
struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len)
{}
EXPORT_SYMBOL_GPL(iomap_get_folio);
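
/*
 * Illustrative sketch only: a filesystem that provides a ->get_folio hook in
 * its iomap_folio_ops would typically do its own preparation and then fall
 * back to iomap_get_folio() for the page cache lookup. The hook name and the
 * preparation step are assumptions for illustration.
 */
#if 0	/* example sketch */
static struct folio *example_get_folio(struct iomap_iter *iter, loff_t pos,
		unsigned len)
{
	/* e.g. take a filesystem-internal lock or reservation here */
	return iomap_get_folio(iter, pos, len);
}
#endif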

bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags)
{}
EXPORT_SYMBOL_GPL(iomap_release_folio);

void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len)
{}
EXPORT_SYMBOL_GPL(iomap_invalidate_folio);

bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio)
{}
EXPORT_SYMBOL_GPL(iomap_dirty_folio);
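
/*
 * Illustrative sketch only: the folio helpers above are normally wired
 * directly into a filesystem's address_space_operations, together with the
 * hypothetical read wrappers sketched earlier.
 */
#if 0	/* example sketch */
static const struct address_space_operations example_aops = {
	.read_folio		= example_read_folio,
	.readahead		= example_readahead,
	.is_partially_uptodate	= iomap_is_partially_uptodate,
	.release_folio		= iomap_release_folio,
	.invalidate_folio	= iomap_invalidate_folio,
	.dirty_folio		= iomap_dirty_folio,
};
#endif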

static void
iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
{}

static int iomap_read_folio_sync(loff_t block_start, struct folio *folio,
		size_t poff, size_t plen, const struct iomap *iomap)
{}

static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
		size_t len, struct folio *folio)
{}

static struct folio *__iomap_get_folio(struct iomap_iter *iter, loff_t pos,
		size_t len)
{}

static void __iomap_put_folio(struct iomap_iter *iter, loff_t pos, size_t ret,
		struct folio *folio)
{}

static int iomap_write_begin_inline(const struct iomap_iter *iter,
		struct folio *folio)
{}

static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
		size_t len, struct folio **foliop)
{}

static bool __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
		size_t copied, struct folio *folio)
{}

static void iomap_write_end_inline(const struct iomap_iter *iter,
		struct folio *folio, loff_t pos, size_t copied)
{}

/*
 * Returns true if all copied bytes have been written to the pagecache,
 * otherwise returns false.
 */
static bool iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
		size_t copied, struct folio *folio)
{}

static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
{}

ssize_t
iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
		const struct iomap_ops *ops)
{}
EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
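
/*
 * Illustrative sketch only: a typical ->write_iter implementation takes the
 * inode lock, runs the generic checks and then hands the iterator to
 * iomap_file_buffered_write(). example_buffered_write_iomap_ops is assumed
 * for illustration.
 */
#if 0	/* example sketch */
static ssize_t example_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	inode_lock(inode);
	ret = generic_write_checks(iocb, from);
	if (ret > 0)
		ret = iomap_file_buffered_write(iocb, from,
				&example_buffered_write_iomap_ops);
	inode_unlock(inode);

	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
	return ret;
}
#endif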

static int iomap_write_delalloc_ifs_punch(struct inode *inode,
		struct folio *folio, loff_t start_byte, loff_t end_byte,
		iomap_punch_t punch)
{}

static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
		loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte,
		iomap_punch_t punch)
{}

/*
 * Scan the data range passed to us for dirty page cache folios. If we find a
 * dirty folio, punch out the preceding range and update the offset from which
 * the next punch will start.
 *
 * We can punch out storage reservations under clean pages because they either
 * contain data that has been written back - in which case the delalloc punch
 * over that range is a no-op - or they have been read faulted in, in which
 * case they contain zeroes and we can remove the delalloc backing range and
 * any new writes to those pages will do the normal hole filling operation...
 *
 * This makes the logic simple: we only need to keep the delalloc extents over
 * the dirty ranges of the page cache.
 *
 * This function uses [start_byte, end_byte) intervals (i.e. open ended) to
 * simplify range iterations.
 */
static int iomap_write_delalloc_scan(struct inode *inode,
		loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte,
		iomap_punch_t punch)
{}

/*
 * Punch out all the delalloc blocks in the range given except for those that
 * have dirty data still pending in the page cache - those are going to be
 * written and so must still retain the delalloc backing for writeback.
 *
 * As we are scanning the page cache for data, we don't need to reimplement the
 * wheel - mapping_seek_hole_data() does exactly what we need to identify the
 * start and end of data ranges correctly even for sub-folio block sizes. This
 * byte range based iteration is especially convenient because it means we
 * don't have to care about variable size folios, nor where the start or end of
 * the data range lies within a folio, if they lie within the same folio or even
 * if there are multiple discontiguous data ranges within the folio.
 *
 * It should be noted that mapping_seek_hole_data() is not aware of EOF, and so
 * can return data ranges that exist in the cache beyond EOF. e.g. a page fault
 * spanning EOF will initialise the post-EOF data to zeroes and mark it up to
 * date. A write page fault can then mark it dirty. If we then fail a write()
 * beyond EOF into that up to date cached range, we allocate a delalloc block
 * beyond EOF and then have to punch it out. Because the range is up to date,
 * mapping_seek_hole_data() will return it, and we will skip the punch because
 * the folio is dirty. This is incorrect - we always need to punch out delalloc
 * beyond EOF in this case as writeback will never write back and convert that
 * delalloc block beyond EOF. Hence we limit the cached data scan range to EOF,
 * resulting in always punching out the range from the EOF to the end of the
 * range the iomap spans.
 *
 * Intervals are of the form [start_byte, end_byte) (i.e. open ended) because it
 * matches the intervals returned by mapping_seek_hole_data(). i.e. SEEK_DATA
 * returns the start of a data range (start_byte), and SEEK_HOLE(start_byte)
 * returns the end of the data range (data_end). Using closed intervals would
 * require sprinkling this code with magic "+ 1" and "- 1" arithmetic and expose
 * the code to subtle off-by-one bugs....
 */
static int iomap_write_delalloc_release(struct inode *inode,
		loff_t start_byte, loff_t end_byte, iomap_punch_t punch)
{}
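
/*
 * Illustrative sketch only: the core of the scan described above is a
 * SEEK_DATA/SEEK_HOLE loop over the page cache; the per-folio dirty walk and
 * the punch of the preceding clean range are elided into a comment. This is a
 * simplified restatement of the logic, not the implementation itself.
 */
#if 0	/* example sketch */
static void example_delalloc_scan_shape(struct inode *inode,
		loff_t start_byte, loff_t scan_end_byte)
{
	while (start_byte < scan_end_byte) {
		loff_t data_end;

		start_byte = mapping_seek_hole_data(inode->i_mapping,
				start_byte, scan_end_byte, SEEK_DATA);
		if (start_byte == -ENXIO || start_byte == scan_end_byte)
			break;	/* no more cached data; punch the remainder */

		data_end = mapping_seek_hole_data(inode->i_mapping, start_byte,
				scan_end_byte, SEEK_HOLE);

		/*
		 * Walk the dirty folios in [start_byte, data_end), punching
		 * out the delalloc backing of any clean gaps between them,
		 * then continue the scan from the end of this data range.
		 */
		start_byte = data_end;
	}
}
#endif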

/*
 * When a short write occurs, the filesystem may need to remove reserved space
 * that was allocated in ->iomap_begin from its ->iomap_end method. For
 * filesystems that use delayed allocation, we need to punch out delalloc
 * extents from the range that are not dirty in the page cache. As the write can
 * race with page faults, there can be dirty pages over the delalloc extent
 * outside the range of a short write but still within the delalloc extent
 * allocated for this iomap.
 *
 * This function uses [start_byte, end_byte) intervals (i.e. open ended) to
 * simplify range iterations.
 *
 * The punch() callback *must* only punch delalloc extents in the range passed
 * to it. It must skip over all other types of extents in the range and leave
 * them completely unchanged. It must do this punch atomically with respect to
 * other extent modifications.
 *
 * The punch() callback may be called with a folio locked to prevent writeback
 * extent allocation racing at the edge of the range we are currently punching.
 * The locked folio may or may not cover the range being punched, so it is not
 * safe for the punch() callback to lock folios itself.
 *
 * Lock order is:
 *
 * inode->i_rwsem (shared or exclusive)
 *   inode->i_mapping->invalidate_lock (exclusive)
 *     folio_lock()
 *       ->punch
 *         internal filesystem allocation lock
 */
int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
		struct iomap *iomap, loff_t pos, loff_t length,
		ssize_t written, iomap_punch_t punch)
{}
EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc);
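
/*
 * Illustrative sketch only: a delalloc-based filesystem would typically call
 * this helper from its ->iomap_end method when a buffered write came up
 * short. example_punch_delalloc_range() is a hypothetical iomap_punch_t
 * callback that removes delalloc extents over the given byte range.
 */
#if 0	/* example sketch */
static int example_buffered_write_iomap_end(struct inode *inode, loff_t pos,
		loff_t length, ssize_t written, unsigned flags,
		struct iomap *iomap)
{
	if (iomap->type != IOMAP_DELALLOC)
		return 0;

	return iomap_file_buffered_write_punch_delalloc(inode, iomap, pos,
			length, written, &example_punch_delalloc_range);
}
#endif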

static loff_t iomap_unshare_iter(struct iomap_iter *iter)
{}

int
iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
		const struct iomap_ops *ops)
{}
EXPORT_SYMBOL_GPL(iomap_file_unshare);

static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
{}

int
iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
		const struct iomap_ops *ops)
{}
EXPORT_SYMBOL_GPL(iomap_zero_range);

int
iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
		const struct iomap_ops *ops)
{}
EXPORT_SYMBOL_GPL(iomap_truncate_page);
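
/*
 * Illustrative sketch only: zeroing the partial block at the new EOF during a
 * truncate is usually a direct call into iomap_truncate_page() with the new
 * size; example_iomap_ops is assumed for illustration.
 */
#if 0	/* example sketch */
static int example_truncate_page(struct inode *inode, loff_t newsize,
		bool *did_zero)
{
	return iomap_truncate_page(inode, newsize, did_zero, &example_iomap_ops);
}
#endif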

static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter,
		struct folio *folio)
{}

vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
{}
EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
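
/*
 * Illustrative sketch only: the usual ->page_mkwrite wrapper brackets the
 * call with sb_start_pagefault()/sb_end_pagefault() and a timestamp update;
 * example_iomap_ops is assumed for illustration.
 */
#if 0	/* example sketch */
static vm_fault_t example_page_mkwrite(struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vmf->vma->vm_file);
	vm_fault_t ret;

	sb_start_pagefault(inode->i_sb);
	file_update_time(vmf->vma->vm_file);
	ret = iomap_page_mkwrite(vmf, &example_iomap_ops);
	sb_end_pagefault(inode->i_sb);
	return ret;
}
#endif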

static void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
		size_t len)
{}

/*
 * We're now finished for good with this ioend structure.  Update the page
 * state, release holds on bios, and finally free up memory.  Do not use the
 * ioend after this.
 */
static u32
iomap_finish_ioend(struct iomap_ioend *ioend, int error)
{}

/*
 * Ioend completion routine for merged bios. This can only be called from task
 * contexts as merged ioends can be of unbounded length. Hence we have to break up
 * the writeback completions into manageable chunks to avoid long scheduler
 * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get
 * good batch processing throughput without creating adverse scheduler latency
 * conditions.
 */
void
iomap_finish_ioends(struct iomap_ioend *ioend, int error)
{}
EXPORT_SYMBOL_GPL(iomap_finish_ioends);

/*
 * We can merge two adjacent ioends if they have the same set of work to do.
 */
static bool
iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
{}

void
iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends)
{}
EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);

static int
iomap_ioend_compare(void *priv, const struct list_head *a,
		const struct list_head *b)
{}

void
iomap_sort_ioends(struct list_head *ioend_list)
{}
EXPORT_SYMBOL_GPL(iomap_sort_ioends);
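
/*
 * Illustrative sketch only: a filesystem completion worker typically splices
 * its pending ioends onto a private list, sorts them, then merges and
 * finishes them one chain at a time. The splice step and error propagation
 * are elided into comments; all example_* names are assumptions for
 * illustration.
 */
#if 0	/* example sketch */
static void example_end_io_work(struct work_struct *work)
{
	LIST_HEAD(completions);
	struct iomap_ioend *ioend;

	/* splice pending ioends onto @completions under the owner's lock */

	iomap_sort_ioends(&completions);
	while ((ioend = list_first_entry_or_null(&completions,
			struct iomap_ioend, io_list))) {
		list_del_init(&ioend->io_list);
		iomap_ioend_try_merge(ioend, &completions);
		iomap_finish_ioends(ioend, 0);	/* pass the real I/O error here */
	}
}
#endif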

static void iomap_writepage_end_bio(struct bio *bio)
{}

/*
 * Submit the final bio for an ioend.
 *
 * If @error is non-zero, it means that we have a situation where some part of
 * the submission process has failed after we've marked pages for writeback.
 * We cannot cancel the ioend directly in that case, so call the bio end I/O
 * handler with the error status here to run the normal I/O completion handler
 * to clear the writeback bit and let the file system process the errors.
 */
static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
{}

static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
		struct writeback_control *wbc, struct inode *inode, loff_t pos)
{}

static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos)
{}

/*
 * Test to see if we have an existing ioend structure that we could append to
 * first; otherwise finish off the current ioend and start another.
 *
 * If a new ioend is created and cached, the old ioend is submitted to the block
 * layer instantly.  Batching optimisations are provided by higher level block
 * plugging.
 *
 * At the end of a writeback pass, there will be a cached ioend remaining on the
 * writepage context that the caller will need to submit.
 */
static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
		struct writeback_control *wbc, struct folio *folio,
		struct inode *inode, loff_t pos, unsigned len)
{}

static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc,
		struct writeback_control *wbc, struct folio *folio,
		struct inode *inode, u64 pos, unsigned dirty_len,
		unsigned *count)
{}

/*
 * Check interaction of the folio with the file end.
 *
 * If the folio is entirely beyond i_size, return false.  If it straddles
 * i_size, adjust end_pos and zero all data beyond i_size.
 */
static bool iomap_writepage_handle_eof(struct folio *folio, struct inode *inode,
		u64 *end_pos)
{}

static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
		struct writeback_control *wbc, struct folio *folio)
{}

int
iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
		struct iomap_writepage_ctx *wpc,
		const struct iomap_writeback_ops *ops)
{}
EXPORT_SYMBOL_GPL(iomap_writepages);
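
/*
 * Illustrative sketch only: the ->writepages address_space operation sets up
 * an on-stack writepage context and hands it to iomap_writepages() together
 * with the filesystem's writeback ops; example_writeback_ops is assumed for
 * illustration.
 */
#if 0	/* example sketch */
static int example_writepages(struct address_space *mapping,
		struct writeback_control *wbc)
{
	struct iomap_writepage_ctx wpc = { };

	return iomap_writepages(mapping, wbc, &wpc, &example_writeback_ops);
}
#endif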

static int __init iomap_init(void)
{}
fs_initcall(iomap_init);