page-writeback.c | Explore in Territory

// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/page-writeback.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
 *
 * Contains functions related to writing back dirty pages at the
 * address_space level.
 *
 * 10Apr2002	Andrew Morton
 *		Initial version
 */

#include <linux/kernel.h>
#include <linux/math64.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/init.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/blkdev.h>
#include <linux/mpage.h>
#include <linux/rmap.h>
#include <linux/percpu.h>
#include <linux/smp.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/syscalls.h>
#include <linux/pagevec.h>
#include <linux/timer.h>
#include <linux/sched/rt.h>
#include <linux/sched/signal.h>
#include <linux/mm_inline.h>
#include <trace/events/writeback.h>

#include "internal.h"

/*
 * Sleep at most 200ms at a time in balance_dirty_pages().
 */
#define MAX_PAUSE …

/*
 * Try to keep balance_dirty_pages() call intervals higher than this many pages
 * by raising pause time to max_pause when falls below it.
 */
#define DIRTY_POLL_THRESH …

/*
 * Estimate write bandwidth at 200ms intervals.
 */
#define BANDWIDTH_INTERVAL …

#define RATELIMIT_CALC_SHIFT …

/*
 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
 * will look to see if it needs to force writeback or throttling.
 */
static long ratelimit_pages = …;

/* The following parameters are exported via /proc/sys/vm */

/*
 * Start background writeback (via writeback threads) at this percentage
 */
static int dirty_background_ratio = …;

/*
 * dirty_background_bytes starts at 0 (disabled) so that it is a function of
 * dirty_background_ratio * the amount of dirtyable memory
 */
static unsigned long dirty_background_bytes;

/*
 * free highmem will not be subtracted from the total free memory
 * for calculating free ratios if vm_highmem_is_dirtyable is true
 */
static int vm_highmem_is_dirtyable;

/*
 * The generator of dirty data starts writeback at this percentage
 */
static int vm_dirty_ratio = …;

/*
 * vm_dirty_bytes starts at 0 (disabled) so that it is a function of
 * vm_dirty_ratio * the amount of dirtyable memory
 */
static unsigned long vm_dirty_bytes;

/*
 * The interval between `kupdate'-style writebacks
 */
unsigned int dirty_writeback_interval = …; /* centiseconds */

EXPORT_SYMBOL_GPL(…);

/*
 * The longest time for which data is allowed to remain dirty
 */
unsigned int dirty_expire_interval = …; /* centiseconds */

/*
 * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies:
 * a full sync is triggered after this time elapses without any disk activity.
 */
int laptop_mode;

EXPORT_SYMBOL(…);

/* End of sysctl-exported parameters */

struct wb_domain global_wb_domain;

/* consolidated parameters for balance_dirty_pages() and its subroutines */
struct dirty_throttle_control { … };

/*
 * Length of period for aging writeout fractions of bdis. This is an
 * arbitrarily chosen number. The longer the period, the slower fractions will
 * reflect changes in current writeout rate.
 */
#define VM_COMPLETIONS_PERIOD_LEN …

#ifdef CONFIG_CGROUP_WRITEBACK

#define GDTC_INIT(__wb) …

#define GDTC_INIT_NO_WB …

#define MDTC_INIT(__wb, __gdtc) …

static bool mdtc_valid(struct dirty_throttle_control *dtc)
{ … }

static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
{ … }

static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
{ … }

static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
{ … }

static void wb_min_max_ratio(struct bdi_writeback *wb,
			     unsigned long *minp, unsigned long *maxp)
{ … }

#else	/* CONFIG_CGROUP_WRITEBACK */

#define GDTC_INIT …
#define GDTC_INIT_NO_WB
#define MDTC_INIT …

static bool mdtc_valid(struct dirty_throttle_control *dtc)
{
	return false;
}

static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
{
	return &global_wb_domain;
}

static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
{
	return NULL;
}

static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
{
	return NULL;
}

static void wb_min_max_ratio(struct bdi_writeback *wb,
			     unsigned long *minp, unsigned long *maxp)
{
	*minp = wb->bdi->min_ratio;
	*maxp = wb->bdi->max_ratio;
}

#endif	/* CONFIG_CGROUP_WRITEBACK */

/*
 * In a memory zone, there is a certain amount of pages we consider
 * available for the page cache, which is essentially the number of
 * free and reclaimable pages, minus some zone reserves to protect
 * lowmem and the ability to uphold the zone's watermarks without
 * requiring writeback.
 *
 * This number of dirtyable pages is the base value of which the
 * user-configurable dirty ratio is the effective number of pages that
 * are allowed to be actually dirtied.  Per individual zone, or
 * globally by using the sum of dirtyable pages over all zones.
 *
 * Because the user is allowed to specify the dirty limit globally as
 * absolute number of bytes, calculating the per-zone dirty limit can
 * require translating the configured limit into a percentage of
 * global dirtyable memory first.
 */

/**
 * node_dirtyable_memory - number of dirtyable pages in a node
 * @pgdat: the node
 *
 * Return: the node's number of pages potentially available for dirty
 * page cache.  This is the base value for the per-node dirty limits.
 */
static unsigned long node_dirtyable_memory(struct pglist_data *pgdat)
{ … }

static unsigned long highmem_dirtyable_memory(unsigned long total)
{ … }

/**
 * global_dirtyable_memory - number of globally dirtyable pages
 *
 * Return: the global number of pages potentially available for dirty
 * page cache.  This is the base value for the global dirty limits.
 */
static unsigned long global_dirtyable_memory(void)
{ … }

/**
 * domain_dirty_limits - calculate thresh and bg_thresh for a wb_domain
 * @dtc: dirty_throttle_control of interest
 *
 * Calculate @dtc->thresh and ->bg_thresh considering
 * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}.  The caller
 * must ensure that @dtc->avail is set before calling this function.  The
 * dirty limits will be lifted by 1/4 for real-time tasks.
 */
static void domain_dirty_limits(struct dirty_throttle_control *dtc)
{ … }

/**
 * global_dirty_limits - background-writeback and dirty-throttling thresholds
 * @pbackground: out parameter for bg_thresh
 * @pdirty: out parameter for thresh
 *
 * Calculate bg_thresh and thresh for global_wb_domain.  See
 * domain_dirty_limits() for details.
 */
void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
{ … }

/**
 * node_dirty_limit - maximum number of dirty pages allowed in a node
 * @pgdat: the node
 *
 * Return: the maximum number of dirty pages allowed in a node, based
 * on the node's dirtyable memory.
 */
static unsigned long node_dirty_limit(struct pglist_data *pgdat)
{ … }

/**
 * node_dirty_ok - tells whether a node is within its dirty limits
 * @pgdat: the node to check
 *
 * Return: %true when the dirty pages in @pgdat are within the node's
 * dirty limit, %false if the limit is exceeded.
 */
bool node_dirty_ok(struct pglist_data *pgdat)
{ … }

#ifdef CONFIG_SYSCTL
static int dirty_background_ratio_handler(const struct ctl_table *table, int write,
		void *buffer, size_t *lenp, loff_t *ppos)
{ … }

static int dirty_background_bytes_handler(const struct ctl_table *table, int write,
		void *buffer, size_t *lenp, loff_t *ppos)
{ … }

static int dirty_ratio_handler(const struct ctl_table *table, int write, void *buffer,
		size_t *lenp, loff_t *ppos)
{ … }

static int dirty_bytes_handler(const struct ctl_table *table, int write,
		void *buffer, size_t *lenp, loff_t *ppos)
{ … }
#endif

static unsigned long wp_next_time(unsigned long cur_time)
{ … }

static void wb_domain_writeout_add(struct wb_domain *dom,
				   struct fprop_local_percpu *completions,
				   unsigned int max_prop_frac, long nr)
{ … }

/*
 * Increment @wb's writeout completion count and the global writeout
 * completion count. Called from __folio_end_writeback().
 */
static inline void __wb_writeout_add(struct bdi_writeback *wb, long nr)
{ … }

void wb_writeout_inc(struct bdi_writeback *wb)
{ … }
EXPORT_SYMBOL_GPL(…);

/*
 * On idle system, we can be called long after we scheduled because we use
 * deferred timers so count with missed periods.
 */
static void writeout_period(struct timer_list *t)
{ … }

int wb_domain_init(struct wb_domain *dom, gfp_t gfp)
{ … }

#ifdef CONFIG_CGROUP_WRITEBACK
void wb_domain_exit(struct wb_domain *dom)
{ … }
#endif

/*
 * bdi_min_ratio keeps the sum of the minimum dirty shares of all
 * registered backing devices, which, for obvious reasons, can not
 * exceed 100%.
 */
static unsigned int bdi_min_ratio;

static int bdi_check_pages_limit(unsigned long pages)
{ … }

static unsigned long bdi_ratio_from_pages(unsigned long pages)
{ … }

static u64 bdi_get_bytes(unsigned int ratio)
{ … }

static int __bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
{ … }

static int __bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio)
{ … }

int bdi_set_min_ratio_no_scale(struct backing_dev_info *bdi, unsigned int min_ratio)
{ … }

int bdi_set_max_ratio_no_scale(struct backing_dev_info *bdi, unsigned int max_ratio)
{ … }

int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
{ … }

int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio)
{ … }
EXPORT_SYMBOL(…);

u64 bdi_get_min_bytes(struct backing_dev_info *bdi)
{ … }

int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes)
{ … }

u64 bdi_get_max_bytes(struct backing_dev_info *bdi)
{ … }

int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes)
{ … }

int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit)
{ … }

static unsigned long dirty_freerun_ceiling(unsigned long thresh,
					   unsigned long bg_thresh)
{ … }

static unsigned long hard_dirty_limit(struct wb_domain *dom,
				      unsigned long thresh)
{ … }

/*
 * Memory which can be further allocated to a memcg domain is capped by
 * system-wide clean memory excluding the amount being used in the domain.
 */
static void mdtc_calc_avail(struct dirty_throttle_control *mdtc,
			    unsigned long filepages, unsigned long headroom)
{ … }

static inline bool dtc_is_global(struct dirty_throttle_control *dtc)
{ … }

/*
 * Dirty background will ignore pages being written as we're trying to
 * decide whether to put more under writeback.
 */
static void domain_dirty_avail(struct dirty_throttle_control *dtc,
			       bool include_writeback)
{ … }

/**
 * __wb_calc_thresh - @wb's share of dirty threshold
 * @dtc: dirty_throttle_context of interest
 * @thresh: dirty throttling or dirty background threshold of wb_domain in @dtc
 *
 * Note that balance_dirty_pages() will only seriously take dirty throttling
 * threshold as a hard limit when sleeping max_pause per page is not enough
 * to keep the dirty pages under control. For example, when the device is
 * completely stalled due to some error conditions, or when there are 1000
 * dd tasks writing to a slow 10MB/s USB key.
 * In the other normal situations, it acts more gently by throttling the tasks
 * more (rather than completely block them) when the wb dirty pages go high.
 *
 * It allocates high/low dirty limits to fast/slow devices, in order to prevent
 * - starving fast devices
 * - piling up dirty pages (that will take long time to sync) on slow devices
 *
 * The wb's share of dirty limit will be adapting to its throughput and
 * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
 *
 * Return: @wb's dirty limit in pages. For dirty throttling limit, the term
 * "dirty" in the context of dirty balancing includes all PG_dirty and
 * PG_writeback pages.
 */
static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc,
				      unsigned long thresh)
{ … }

unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
{ … }

unsigned long cgwb_calc_thresh(struct bdi_writeback *wb)
{ … }

/*
 *                           setpoint - dirty 3
 *        f(dirty) := 1.0 + (----------------)
 *                           limit - setpoint
 *
 * it's a 3rd order polynomial that subjects to
 *
 * (1) f(freerun)  = 2.0 => rampup dirty_ratelimit reasonably fast
 * (2) f(setpoint) = 1.0 => the balance point
 * (3) f(limit)    = 0   => the hard limit
 * (4) df/dx      <= 0	 => negative feedback control
 * (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
 *     => fast response on large errors; small oscillation near setpoint
 */
static long long pos_ratio_polynom(unsigned long setpoint,
					  unsigned long dirty,
					  unsigned long limit)
{ … }

/*
 * Dirty position control.
 *
 * (o) global/bdi setpoints
 *
 * We want the dirty pages be balanced around the global/wb setpoints.
 * When the number of dirty pages is higher/lower than the setpoint, the
 * dirty position control ratio (and hence task dirty ratelimit) will be
 * decreased/increased to bring the dirty pages back to the setpoint.
 *
 *     pos_ratio = 1 << RATELIMIT_CALC_SHIFT
 *
 *     if (dirty < setpoint) scale up   pos_ratio
 *     if (dirty > setpoint) scale down pos_ratio
 *
 *     if (wb_dirty < wb_setpoint) scale up   pos_ratio
 *     if (wb_dirty > wb_setpoint) scale down pos_ratio
 *
 *     task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT
 *
 * (o) global control line
 *
 *     ^ pos_ratio
 *     |
 *     |            |<===== global dirty control scope ======>|
 * 2.0  * * * * * * *
 *     |            .*
 *     |            . *
 *     |            .   *
 *     |            .     *
 *     |            .        *
 *     |            .            *
 * 1.0 ................................*
 *     |            .                  .     *
 *     |            .                  .          *
 *     |            .                  .              *
 *     |            .                  .                 *
 *     |            .                  .                    *
 *   0 +------------.------------------.----------------------*------------->
 *           freerun^          setpoint^                 limit^   dirty pages
 *
 * (o) wb control line
 *
 *     ^ pos_ratio
 *     |
 *     |            *
 *     |              *
 *     |                *
 *     |                  *
 *     |                    * |<=========== span ============>|
 * 1.0 .......................*
 *     |                      . *
 *     |                      .   *
 *     |                      .     *
 *     |                      .       *
 *     |                      .         *
 *     |                      .           *
 *     |                      .             *
 *     |                      .               *
 *     |                      .                 *
 *     |                      .                   *
 *     |                      .                     *
 * 1/4 ...............................................* * * * * * * * * * * *
 *     |                      .                         .
 *     |                      .                           .
 *     |                      .                             .
 *   0 +----------------------.-------------------------------.------------->
 *                wb_setpoint^                    x_intercept^
 *
 * The wb control line won't drop below pos_ratio=1/4, so that wb_dirty can
 * be smoothly throttled down to normal if it starts high in situations like
 * - start writing to a slow SD card and a fast disk at the same time. The SD
 *   card's wb_dirty may rush to many times higher than wb_setpoint.
 * - the wb dirty thresh drops quickly due to change of JBOD workload
 */
static void wb_position_ratio(struct dirty_throttle_control *dtc)
{ … }

static void wb_update_write_bandwidth(struct bdi_writeback *wb,
				      unsigned long elapsed,
				      unsigned long written)
{ … }

static void update_dirty_limit(struct dirty_throttle_control *dtc)
{ … }

static void domain_update_dirty_limit(struct dirty_throttle_control *dtc,
				      unsigned long now)
{ … }

/*
 * Maintain wb->dirty_ratelimit, the base dirty throttle rate.
 *
 * Normal wb tasks will be curbed at or below it in long term.
 * Obviously it should be around (write_bw / N) when there are N dd tasks.
 */
static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
				      unsigned long dirtied,
				      unsigned long elapsed)
{ … }

static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
				  struct dirty_throttle_control *mdtc,
				  bool update_ratelimit)
{ … }

void wb_update_bandwidth(struct bdi_writeback *wb)
{ … }

/* Interval after which we consider wb idle and don't estimate bandwidth */
#define WB_BANDWIDTH_IDLE_JIF …

static void wb_bandwidth_estimate_start(struct bdi_writeback *wb)
{ … }

/*
 * After a task dirtied this many pages, balance_dirty_pages_ratelimited()
 * will look to see if it needs to start dirty throttling.
 *
 * If dirty_poll_interval is too low, big NUMA machines will call the expensive
 * global_zone_page_state() too often. So scale it near-sqrt to the safety margin
 * (the number of pages we may dirty without exceeding the dirty limits).
 */
static unsigned long dirty_poll_interval(unsigned long dirty,
					 unsigned long thresh)
{ … }

static unsigned long wb_max_pause(struct bdi_writeback *wb,
				  unsigned long wb_dirty)
{ … }

static long wb_min_pause(struct bdi_writeback *wb,
			 long max_pause,
			 unsigned long task_ratelimit,
			 unsigned long dirty_ratelimit,
			 int *nr_dirtied_pause)
{ … }

static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
{ … }

static unsigned long domain_poll_intv(struct dirty_throttle_control *dtc,
				      bool strictlimit)
{ … }

/*
 * Throttle it only when the background writeback cannot catch-up. This avoids
 * (excessively) small writeouts when the wb limits are ramping up in case of
 * !strictlimit.
 *
 * In strictlimit case make decision based on the wb counters and limits. Small
 * writeouts when the wb limits are ramping up are the price we consciously pay
 * for strictlimit-ing.
 */
static void domain_dirty_freerun(struct dirty_throttle_control *dtc,
				 bool strictlimit)
{ … }

static void balance_domain_limits(struct dirty_throttle_control *dtc,
				  bool strictlimit)
{ … }

static void wb_dirty_freerun(struct dirty_throttle_control *dtc,
			     bool strictlimit)
{ … }

static inline void wb_dirty_exceeded(struct dirty_throttle_control *dtc,
				     bool strictlimit)
{ … }

/*
 * The limits fields dirty_exceeded and pos_ratio won't be updated if wb is
 * in freerun state. Please don't use these invalid fields in freerun case.
 */
static void balance_wb_limits(struct dirty_throttle_control *dtc,
			      bool strictlimit)
{ … }

/*
 * balance_dirty_pages() must be called by processes which are generating dirty
 * data.  It looks at the number of dirty pages in the machine and will force
 * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2.
 * If we're over `background_thresh' then the writeback threads are woken to
 * perform some writeout.
 */
static int balance_dirty_pages(struct bdi_writeback *wb,
			       unsigned long pages_dirtied, unsigned int flags)
{ … }

static DEFINE_PER_CPU(int, bdp_ratelimits);

/*
 * Normal tasks are throttled by
 *	loop {
 *		dirty tsk->nr_dirtied_pause pages;
 *		take a snap in balance_dirty_pages();
 *	}
 * However there is a worst case. If every task exit immediately when dirtied
 * (tsk->nr_dirtied_pause - 1) pages, balance_dirty_pages() will never be
 * called to throttle the page dirties. The solution is to save the not yet
 * throttled page dirties in dirty_throttle_leaks on task exit and charge them
 * randomly into the running tasks. This works well for the above worst case,
 * as the new task will pick up and accumulate the old task's leaked dirty
 * count and eventually get throttled.
 */
DEFINE_PER_CPU(int, dirty_throttle_leaks) = …;

/**
 * balance_dirty_pages_ratelimited_flags - Balance dirty memory state.
 * @mapping: address_space which was dirtied.
 * @flags: BDP flags.
 *
 * Processes which are dirtying memory should call in here once for each page
 * which was newly dirtied.  The function will periodically check the system's
 * dirty state and will initiate writeback if needed.
 *
 * See balance_dirty_pages_ratelimited() for details.
 *
 * Return: If @flags contains BDP_ASYNC, it may return -EAGAIN to
 * indicate that memory is out of balance and the caller must wait
 * for I/O to complete.  Otherwise, it will return 0 to indicate
 * that either memory was already in balance, or it was able to sleep
 * until the amount of dirty memory returned to balance.
 */
int balance_dirty_pages_ratelimited_flags(struct address_space *mapping,
					unsigned int flags)
{ … }
EXPORT_SYMBOL_GPL(…);

/**
 * balance_dirty_pages_ratelimited - balance dirty memory state.
 * @mapping: address_space which was dirtied.
 *
 * Processes which are dirtying memory should call in here once for each page
 * which was newly dirtied.  The function will periodically check the system's
 * dirty state and will initiate writeback if needed.
 *
 * Once we're over the dirty memory limit we decrease the ratelimiting
 * by a lot, to prevent individual processes from overshooting the limit
 * by (ratelimit_pages) each.
 */
void balance_dirty_pages_ratelimited(struct address_space *mapping)
{ … }
EXPORT_SYMBOL(…);

/*
 * Similar to wb_dirty_limits, wb_bg_dirty_limits also calculates dirty
 * and thresh, but it's for background writeback.
 */
static void wb_bg_dirty_limits(struct dirty_throttle_control *dtc)
{ … }

static bool domain_over_bg_thresh(struct dirty_throttle_control *dtc)
{ … }

/**
 * wb_over_bg_thresh - does @wb need to be written back?
 * @wb: bdi_writeback of interest
 *
 * Determines whether background writeback should keep writing @wb or it's
 * clean enough.
 *
 * Return: %true if writeback should continue.
 */
bool wb_over_bg_thresh(struct bdi_writeback *wb)
{ … }

#ifdef CONFIG_SYSCTL
/*
 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
 */
static int dirty_writeback_centisecs_handler(const struct ctl_table *table, int write,
		void *buffer, size_t *length, loff_t *ppos)
{ … }
#endif

void laptop_mode_timer_fn(struct timer_list *t)
{ … }

/*
 * We've spun up the disk and we're in laptop mode: schedule writeback
 * of all dirty data a few seconds from now.  If the flush is already scheduled
 * then push it back - the user is still using the disk.
 */
void laptop_io_completion(struct backing_dev_info *info)
{ … }

/*
 * We're in laptop mode and we've just synced. The sync's writes will have
 * caused another writeback to be scheduled by laptop_io_completion.
 * Nothing needs to be written back anymore, so we unschedule the writeback.
 */
void laptop_sync_completion(void)
{ … }

/*
 * If ratelimit_pages is too high then we can get into dirty-data overload
 * if a large number of processes all perform writes at the same time.
 *
 * Here we set ratelimit_pages to a level which ensures that when all CPUs are
 * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
 * thresholds.
 */

void writeback_set_ratelimit(void)
{ … }

static int page_writeback_cpu_online(unsigned int cpu)
{ … }

#ifdef CONFIG_SYSCTL

/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
static const unsigned long dirty_bytes_min = …;

static struct ctl_table vm_page_writeback_sysctls[] = …;
#endif

/*
 * Called early on to tune the page writeback dirty limits.
 *
 * We used to scale dirty pages according to how total memory
 * related to pages that could be allocated for buffers.
 *
 * However, that was when we used "dirty_ratio" to scale with
 * all memory, and we don't do that any more. "dirty_ratio"
 * is now applied to total non-HIGHPAGE memory, and as such we can't
 * get into the old insane situation any more where we had
 * large amounts of dirty pages compared to a small amount of
 * non-HIGHMEM memory.
 *
 * But we might still want to scale the dirty_ratio by how
 * much memory the box has..
 */
void __init page_writeback_init(void)
{ … }

/**
 * tag_pages_for_writeback - tag pages to be written by writeback
 * @mapping: address space structure to write
 * @start: starting page index
 * @end: ending page index (inclusive)
 *
 * This function scans the page range from @start to @end (inclusive) and tags
 * all pages that have DIRTY tag set with a special TOWRITE tag.  The caller
 * can then use the TOWRITE tag to identify pages eligible for writeback.
 * This mechanism is used to avoid livelocking of writeback by a process
 * steadily creating new dirty pages in the file (thus it is important for this
 * function to be quick so that it can tag pages faster than a dirtying process
 * can create them).
 */
void tag_pages_for_writeback(struct address_space *mapping,
			     pgoff_t start, pgoff_t end)
{ … }
EXPORT_SYMBOL(…);

static bool folio_prepare_writeback(struct address_space *mapping,
		struct writeback_control *wbc, struct folio *folio)
{ … }

static xa_mark_t wbc_to_tag(struct writeback_control *wbc)
{ … }

static pgoff_t wbc_end(struct writeback_control *wbc)
{ … }

static struct folio *writeback_get_folio(struct address_space *mapping,
		struct writeback_control *wbc)
{ … }

/**
 * writeback_iter - iterate folio of a mapping for writeback
 * @mapping: address space structure to write
 * @wbc: writeback context
 * @folio: previously iterated folio (%NULL to start)
 * @error: in-out pointer for writeback errors (see below)
 *
 * This function returns the next folio for the writeback operation described by
 * @wbc on @mapping and  should be called in a while loop in the ->writepages
 * implementation.
 *
 * To start the writeback operation, %NULL is passed in the @folio argument, and
 * for every subsequent iteration the folio returned previously should be passed
 * back in.
 *
 * If there was an error in the per-folio writeback inside the writeback_iter()
 * loop, @error should be set to the error value.
 *
 * Once the writeback described in @wbc has finished, this function will return
 * %NULL and if there was an error in any iteration restore it to @error.
 *
 * Note: callers should not manually break out of the loop using break or goto
 * but must keep calling writeback_iter() until it returns %NULL.
 *
 * Return: the folio to write or %NULL if the loop is done.
 */
struct folio *writeback_iter(struct address_space *mapping,
		struct writeback_control *wbc, struct folio *folio, int *error)
{ … }
EXPORT_SYMBOL_GPL(…);

/**
 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
 * @mapping: address space structure to write
 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
 * @writepage: function called for each page
 * @data: data passed to writepage function
 *
 * Return: %0 on success, negative error code otherwise
 *
 * Note: please use writeback_iter() instead.
 */
int write_cache_pages(struct address_space *mapping,
		      struct writeback_control *wbc, writepage_t writepage,
		      void *data)
{ … }
EXPORT_SYMBOL(…);

static int writeback_use_writepage(struct address_space *mapping,
		struct writeback_control *wbc)
{ … }

int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{ … }

/*
 * For address_spaces which do not use buffers nor write back.
 */
bool noop_dirty_folio(struct address_space *mapping, struct folio *folio)
{ … }
EXPORT_SYMBOL(…);

/*
 * Helper function for set_page_dirty family.
 *
 * Caller must hold folio_memcg_lock().
 *
 * NOTE: This relies on being atomic wrt interrupts.
 */
static void folio_account_dirtied(struct folio *folio,
		struct address_space *mapping)
{ … }

/*
 * Helper function for deaccounting dirty page without writeback.
 *
 * Caller must hold folio_memcg_lock().
 */
void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb)
{ … }

/*
 * Mark the folio dirty, and set it dirty in the page cache.
 *
 * If warn is true, then emit a warning if the folio is not uptodate and has
 * not been truncated.
 *
 * The caller must hold folio_memcg_lock().  It is the caller's
 * responsibility to prevent the folio from being truncated while
 * this function is in progress, although it may have been truncated
 * before this function is called.  Most callers have the folio locked.
 * A few have the folio blocked from truncation through other means (e.g.
 * zap_vma_pages() has it mapped and is holding the page table lock).
 * When called from mark_buffer_dirty(), the filesystem should hold a
 * reference to the buffer_head that is being marked dirty, which causes
 * try_to_free_buffers() to fail.
 */
void __folio_mark_dirty(struct folio *folio, struct address_space *mapping,
			     int warn)
{ … }

/**
 * filemap_dirty_folio - Mark a folio dirty for filesystems which do not use buffer_heads.
 * @mapping: Address space this folio belongs to.
 * @folio: Folio to be marked as dirty.
 *
 * Filesystems which do not use buffer heads should call this function
 * from their dirty_folio address space operation.  It ignores the
 * contents of folio_get_private(), so if the filesystem marks individual
 * blocks as dirty, the filesystem should handle that itself.
 *
 * This is also sometimes used by filesystems which use buffer_heads when
 * a single buffer is being dirtied: we want to set the folio dirty in
 * that case, but not all the buffers.  This is a "bottom-up" dirtying,
 * whereas block_dirty_folio() is a "top-down" dirtying.
 *
 * The caller must ensure this doesn't race with truncation.  Most will
 * simply hold the folio lock, but e.g. zap_pte_range() calls with the
 * folio mapped and the pte lock held, which also locks out truncation.
 */
bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio)
{ … }
EXPORT_SYMBOL(…);

/**
 * folio_redirty_for_writepage - Decline to write a dirty folio.
 * @wbc: The writeback control.
 * @folio: The folio.
 *
 * When a writepage implementation decides that it doesn't want to write
 * @folio for some reason, it should call this function, unlock @folio and
 * return 0.
 *
 * Return: True if we redirtied the folio.  False if someone else dirtied
 * it first.
 */
bool folio_redirty_for_writepage(struct writeback_control *wbc,
		struct folio *folio)
{ … }
EXPORT_SYMBOL(…);

/**
 * folio_mark_dirty - Mark a folio as being modified.
 * @folio: The folio.
 *
 * The folio may not be truncated while this function is running.
 * Holding the folio lock is sufficient to prevent truncation, but some
 * callers cannot acquire a sleeping lock.  These callers instead hold
 * the page table lock for a page table which contains at least one page
 * in this folio.  Truncation will block on the page table lock as it
 * unmaps pages before removing the folio from its mapping.
 *
 * Return: True if the folio was newly dirtied, false if it was already dirty.
 */
bool folio_mark_dirty(struct folio *folio)
{ … }
EXPORT_SYMBOL(…);

/*
 * set_page_dirty() is racy if the caller has no reference against
 * page->mapping->host, and if the page is unlocked.  This is because another
 * CPU could truncate the page off the mapping and then free the mapping.
 *
 * Usually, the page _is_ locked, or the caller is a user-space process which
 * holds a reference on the inode by having an open file.
 *
 * In other cases, the page should be locked before running set_page_dirty().
 */
int set_page_dirty_lock(struct page *page)
{ … }
EXPORT_SYMBOL(…);

/*
 * This cancels just the dirty bit on the kernel page itself, it does NOT
 * actually remove dirty bits on any mmap's that may be around. It also
 * leaves the page tagged dirty, so any sync activity will still find it on
 * the dirty lists, and in particular, clear_page_dirty_for_io() will still
 * look at the dirty bits in the VM.
 *
 * Doing this should *normally* only ever be done when a page is truncated,
 * and is not actually mapped anywhere at all. However, fs/buffer.c does
 * this when it notices that somebody has cleaned out all the buffers on a
 * page without actually doing it through the VM. Can you say "ext3 is
 * horribly ugly"? Thought you could.
 */
void __folio_cancel_dirty(struct folio *folio)
{ … }
EXPORT_SYMBOL(…);

/*
 * Clear a folio's dirty flag, while caring for dirty memory accounting.
 * Returns true if the folio was previously dirty.
 *
 * This is for preparing to put the folio under writeout.  We leave
 * the folio tagged as dirty in the xarray so that a concurrent
 * write-for-sync can discover it via a PAGECACHE_TAG_DIRTY walk.
 * The ->writepage implementation will run either folio_start_writeback()
 * or folio_mark_dirty(), at which stage we bring the folio's dirty flag
 * and xarray dirty tag back into sync.
 *
 * This incoherency between the folio's dirty flag and xarray tag is
 * unfortunate, but it only exists while the folio is locked.
 */
bool folio_clear_dirty_for_io(struct folio *folio)
{ … }
EXPORT_SYMBOL(…);

static void wb_inode_writeback_start(struct bdi_writeback *wb)
{ … }

static void wb_inode_writeback_end(struct bdi_writeback *wb)
{ … }

bool __folio_end_writeback(struct folio *folio)
{ … }

void __folio_start_writeback(struct folio *folio, bool keep_write)
{ … }
EXPORT_SYMBOL(…);

/**
 * folio_wait_writeback - Wait for a folio to finish writeback.
 * @folio: The folio to wait for.
 *
 * If the folio is currently being written back to storage, wait for the
 * I/O to complete.
 *
 * Context: Sleeps.  Must be called in process context and with
 * no spinlocks held.  Caller should hold a reference on the folio.
 * If the folio is not locked, writeback may start again after writeback
 * has finished.
 */
void folio_wait_writeback(struct folio *folio)
{ … }
EXPORT_SYMBOL_GPL(…);

/**
 * folio_wait_writeback_killable - Wait for a folio to finish writeback.
 * @folio: The folio to wait for.
 *
 * If the folio is currently being written back to storage, wait for the
 * I/O to complete or a fatal signal to arrive.
 *
 * Context: Sleeps.  Must be called in process context and with
 * no spinlocks held.  Caller should hold a reference on the folio.
 * If the folio is not locked, writeback may start again after writeback
 * has finished.
 * Return: 0 on success, -EINTR if we get a fatal signal while waiting.
 */
int folio_wait_writeback_killable(struct folio *folio)
{ … }
EXPORT_SYMBOL_GPL(…);

/**
 * folio_wait_stable() - wait for writeback to finish, if necessary.
 * @folio: The folio to wait on.
 *
 * This function determines if the given folio is related to a backing
 * device that requires folio contents to be held stable during writeback.
 * If so, then it will wait for any pending writeback to complete.
 *
 * Context: Sleeps.  Must be called in process context and with
 * no spinlocks held.  Caller should hold a reference on the folio.
 * If the folio is not locked, writeback may start again after writeback
 * has finished.
 */
void folio_wait_stable(struct folio *folio)
{ … }
EXPORT_SYMBOL_GPL(…);
linux/mm/page-writeback.c