// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/page_alloc.c
 *
 *  Manages the free list; the system allocates free pages here.
 *  Note that kmalloc() lives in slab.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
 *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
 *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
 *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
 *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/interrupt.h>
#include <linux/jiffies.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/kasan.h>
#include <linux/kmsan.h>
#include <linux/module.h>
#include <linux/suspend.h>
#include <linux/ratelimit.h>
#include <linux/oom.h>
#include <linux/topology.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/pagevec.h>
#include <linux/memory_hotplug.h>
#include <linux/nodemask.h>
#include <linux/vmstat.h>
#include <linux/fault-inject.h>
#include <linux/compaction.h>
#include <trace/events/kmem.h>
#include <trace/events/oom.h>
#include <linux/prefetch.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/sched/mm.h>
#include <linux/page_owner.h>
#include <linux/page_table_check.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
#include <linux/lockdep.h>
#include <linux/psi.h>
#include <linux/khugepaged.h>
#include <linux/delayacct.h>
#include <linux/cacheinfo.h>
#include <linux/pgalloc_tag.h>
#include <asm/div64.h>
#include "internal.h"
#include "shuffle.h"
#include "page_reporting.h"

/* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
typedef int __bitwise fpi_t;

/* No special request */
#define FPI_NONE		((__force fpi_t)0)

/*
 * Skip free page reporting notification for the (possibly merged) page.
 * This does not hinder free page reporting from grabbing the page,
 * reporting it and marking it "reported" - it only skips notifying
 * the free page reporting infrastructure about a newly freed page. For
 * example, used when temporarily pulling a page from a freelist and
 * putting it back unmodified.
 */
#define FPI_SKIP_REPORT_NOTIFY	((__force fpi_t)BIT(0))

/*
 * Place the (possibly merged) page to the tail of the freelist. Will ignore
 * page shuffling (relevant code - e.g., memory onlining - is expected to
 * shuffle the whole zone).
 *
 * Note: No code should rely on this flag for correctness - it's purely
 *       to allow for optimizations when handing back either fresh pages
 *       (memory onlining) or untouched pages (page isolation, free page
 *       reporting).
 */
#define FPI_TO_TAIL		((__force fpi_t)BIT(1))

/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)

#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
/*
 * On SMP, spin_trylock is sufficient protection.
 * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP.
 */
#define pcp_trylock_prepare(flags)	do { } while (0)
#define pcp_trylock_finish(flag)	do { } while (0)
#else

/* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */
#define pcp_trylock_prepare(flags)	local_irq_save(flags)
#define pcp_trylock_finish(flags)	local_irq_restore(flags)
#endif

/*
 * Locking a pcp requires a PCP lookup followed by a spinlock. To avoid
 * a migration causing the wrong PCP to be locked and remote memory being
 * potentially allocated, pin the task to the CPU for the lookup+lock.
 * preempt_disable is used on !RT because it is faster than migrate_disable.
 * migrate_disable is used on RT because otherwise RT spinlock usage is
 * interfered with and a high priority task cannot preempt the allocator.
 */
#ifndef CONFIG_PREEMPT_RT
#define pcpu_task_pin()		preempt_disable()
#define pcpu_task_unpin()	preempt_enable()
#else
#define pcpu_task_pin()		migrate_disable()
#define pcpu_task_unpin()	migrate_enable()
#endif

/*
 * Generic helper to look up and lock a per-cpu variable with an embedded
 * spinlock. The return value should be used with the equivalent unlock helper.
 */
#define pcpu_spin_lock(type, member, ptr)				\
({									\
	type *_ret;							\
	pcpu_task_pin();						\
	_ret = this_cpu_ptr(ptr);					\
	spin_lock(&_ret->member);					\
	_ret;								\
})

#define pcpu_spin_trylock(type, member, ptr)				\
({									\
	type *_ret;							\
	pcpu_task_pin();						\
	_ret = this_cpu_ptr(ptr);					\
	if (!spin_trylock(&_ret->member)) {				\
		pcpu_task_unpin();					\
		_ret = NULL;						\
	}								\
	_ret;								\
})

#define pcpu_spin_unlock(member, ptr)					\
({									\
	spin_unlock(&ptr->member);					\
	pcpu_task_unpin();						\
})

/* struct per_cpu_pages specific helpers. */
#define pcp_spin_lock(ptr)						\
	pcpu_spin_lock(struct per_cpu_pages, lock, ptr)

#define pcp_spin_trylock(ptr)						\
	pcpu_spin_trylock(struct per_cpu_pages, lock, ptr)

#define pcp_spin_unlock(ptr)						\
	pcpu_spin_unlock(lock, ptr)
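
/*
 * Illustrative sketch (not part of this file): the helpers above are
 * typically used for an opportunistic trylock on the local CPU's pageset,
 * falling back to the buddy freelists when the lock is contended:
 *
 *	struct per_cpu_pages *pcp;
 *
 *	pcp = pcp_spin_trylock(zone->per_cpu_pageset);
 *	if (pcp) {
 *		...operate on the local pcplist...
 *		pcp_spin_unlock(pcp);
 *	} else {
 *		...fall back to the zone's buddy freelists...
 *	}
 */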

#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node);
EXPORT_PER_CPU_SYMBOL(numa_node);
#endif

DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key);

#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
 * defined in <linux/topology.h>.
 */
DEFINE_PER_CPU(int, _numa_mem_);		/* Kernel "local memory" node */
EXPORT_PER_CPU_SYMBOL(_numa_mem_);
#endif

static DEFINE_MUTEX(pcpu_drain_mutex);

#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
volatile unsigned long latent_entropy __latent_entropy;
EXPORT_SYMBOL(latent_entropy);
#endif

/*
 * Array of node states.
 */
nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
	[N_POSSIBLE] = NODE_MASK_ALL,
	[N_ONLINE] = { { [0] = 1UL } },
#ifndef CONFIG_NUMA
	[N_NORMAL_MEMORY] = { { [0] = 1UL } },
#ifdef CONFIG_HIGHMEM
	[N_HIGH_MEMORY] = { { [0] = 1UL } },
#endif
	[N_MEMORY] = { { [0] = 1UL } },
#endif
	[N_CPU] = { { [0] = 1UL } },
};
EXPORT_SYMBOL(node_states);

gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;

#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
unsigned int pageblock_order __read_mostly;
#endif

static void __free_pages_ok(struct page *page, unsigned int order,
			    fpi_t fpi_flags);

/*
 * results with 256, 32 in the lowmem_reserve sysctl:
 *	1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
 *	1G machine -> (16M dma, 784M normal, 224M high)
 *	NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
 *	HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
 *	HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
 *
 * TBD: should special case ZONE_DMA32 machines here - in those we normally
 * don't need any ZONE_NORMAL reservation
 */
static int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
	[ZONE_DMA] = 256,
#endif
#ifdef CONFIG_ZONE_DMA32
	[ZONE_DMA32] = 256,
#endif
	[ZONE_NORMAL] = 32,
#ifdef CONFIG_HIGHMEM
	[ZONE_HIGHMEM] = 0,
#endif
	[ZONE_MOVABLE] = 0,
};

char * const zone_names[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
	 "DMA",
#endif
#ifdef CONFIG_ZONE_DMA32
	 "DMA32",
#endif
	 "Normal",
#ifdef CONFIG_HIGHMEM
	 "HighMem",
#endif
	 "Movable",
#ifdef CONFIG_ZONE_DEVICE
	 "Device",
#endif
};

const char * const migratetype_names[MIGRATE_TYPES] = {
	"Unmovable",
	"Movable",
	"Reclaimable",
	"HighAtomic",
#ifdef CONFIG_CMA
	"CMA",
#endif
#ifdef CONFIG_MEMORY_ISOLATION
	"Isolate",
#endif
};

int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
static int watermark_boost_factor __read_mostly = 15000;
static int watermark_scale_factor = 10;

/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
EXPORT_SYMBOL(movable_zone);

#if MAX_NUMNODES > 1
unsigned int nr_node_ids __read_mostly = MAX_NUMNODES;
unsigned int nr_online_nodes __read_mostly = 1;
EXPORT_SYMBOL(nr_node_ids);
EXPORT_SYMBOL(nr_online_nodes);
#endif

static bool page_contains_unaccepted(struct page *page, unsigned int order);
static void accept_page(struct page *page, unsigned int order);
static bool try_to_accept_memory(struct zone *zone, unsigned int order);
static inline bool has_unaccepted_memory(void);
static bool __free_unaccepted(struct page *page);

int page_group_by_mobility_disabled __read_mostly;

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
 * During boot we initialize deferred pages on-demand, as needed, but once
 * page_alloc_init_late() has finished, the deferred pages are all initialized,
 * and we can permanently disable that path.
 */
DEFINE_STATIC_KEY_TRUE(deferred_pages);

static inline bool deferred_pages_enabled(void)
{}

/*
 * deferred_grow_zone() is __init, but it is called from
 * get_page_from_freelist() during early boot until deferred_pages permanently
 * disables this call. This is why we have refdata wrapper to avoid warning,
 * and to ensure that the function body gets unloaded.
 */
static bool __ref
_deferred_grow_zone(struct zone *zone, unsigned int order)
{}
#else
static inline bool deferred_pages_enabled(void)
{
	return false;
}
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */

/* Return a pointer to the bitmap storing bits affecting a block of pages */
static inline unsigned long *get_pageblock_bitmap(const struct page *page,
							unsigned long pfn)
{}

static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn)
{}

/**
 * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
 * @page: The page within the block of interest
 * @pfn: The target page frame number
 * @mask: mask of bits that the caller is interested in
 *
 * Return: pageblock_bits flags
 */
unsigned long get_pfnblock_flags_mask(const struct page *page,
					unsigned long pfn, unsigned long mask)
{}

static __always_inline int get_pfnblock_migratetype(const struct page *page,
					unsigned long pfn)
{}

/**
 * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
 * @page: The page within the block of interest
 * @flags: The flags to set
 * @pfn: The target page frame number
 * @mask: mask of bits that the caller is interested in
 */
void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
					unsigned long pfn,
					unsigned long mask)
{}

void set_pageblock_migratetype(struct page *page, int migratetype)
{}

#ifdef CONFIG_DEBUG_VM
static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
{}

/*
 * Temporary debugging check for pages not lying within a given zone.
 */
static bool __maybe_unused bad_range(struct zone *zone, struct page *page)
{}
#else
static inline bool __maybe_unused bad_range(struct zone *zone, struct page *page)
{
	return false;
}
#endif

static void bad_page(struct page *page, const char *reason)
{}

static inline unsigned int order_to_pindex(int migratetype, int order)
{}

static inline int pindex_to_order(unsigned int pindex)
{}

static inline bool pcp_allowed_order(unsigned int order)
{}

/*
 * Higher-order pages are called "compound pages".  They are structured thusly:
 *
 * The first PAGE_SIZE page is called the "head page" and has PG_head set.
 *
 * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
 * in bit 0 of page->compound_head. The rest of the bits are a pointer to the
 * head page.
 *
 * The first tail page's ->compound_order holds the order of allocation.
 * This usage means that zero-order pages may not be compound.
 */
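
/*
 * Illustrative sketch (not part of this file) of the encoding described
 * above. The real decoding lives in compound_head() in <linux/page-flags.h>
 * and boils down to:
 *
 *	unsigned long head = READ_ONCE(page->compound_head);
 *
 *	if (head & 1)
 *		page = (struct page *)(head - 1);
 *
 * i.e. bit 0 marks a tail page and masking it off yields the head page;
 * for head and non-compound pages, bit 0 is clear and @page is used as-is.
 */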

void prep_compound_page(struct page *page, unsigned int order)
{}

static inline void set_buddy_order(struct page *page, unsigned int order)
{}

#ifdef CONFIG_COMPACTION
static inline struct capture_control *task_capc(struct zone *zone)
{}

static inline bool
compaction_capture(struct capture_control *capc, struct page *page,
		   int order, int migratetype)
{}

#else
static inline struct capture_control *task_capc(struct zone *zone)
{
	return NULL;
}

static inline bool
compaction_capture(struct capture_control *capc, struct page *page,
		   int order, int migratetype)
{
	return false;
}
#endif /* CONFIG_COMPACTION */

static inline void account_freepages(struct zone *zone, int nr_pages,
				     int migratetype)
{}

/* Used for pages not on another list */
static inline void __add_to_free_list(struct page *page, struct zone *zone,
				      unsigned int order, int migratetype,
				      bool tail)
{}

/*
 * Used for pages which are on another list. Move the pages to the tail
 * of the list - so the moved pages won't immediately be considered for
 * allocation again (e.g., optimization for memory onlining).
 */
static inline void move_to_free_list(struct page *page, struct zone *zone,
				     unsigned int order, int old_mt, int new_mt)
{}

static inline void __del_page_from_free_list(struct page *page, struct zone *zone,
					     unsigned int order, int migratetype)
{}

static inline void del_page_from_free_list(struct page *page, struct zone *zone,
					   unsigned int order, int migratetype)
{}

static inline struct page *get_page_from_free_area(struct free_area *area,
					    int migratetype)
{}

/*
 * If this is less than the 2nd largest possible page, check if the buddy
 * of the next-higher order is free. If it is, it's possible
 * that pages are being freed that will coalesce soon. If that is
 * happening, add the free page to the tail of the list
 * so it's less likely to be used soon and more likely to be merged
 * into a block two orders higher.
 */
static inline bool
buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
		   struct page *page, unsigned int order)
{}

/*
 * Freeing function for a buddy system allocator.
 *
 * The concept of a buddy system is to maintain a direct-mapped table
 * (containing bit values) for memory blocks of various "orders".
 * The bottom level table contains the map for the smallest allocatable
 * units of memory (here, pages), and each level above it describes
 * pairs of units from the levels below, hence, "buddies".
 * At a high level, all that happens here is marking the table entry
 * at the bottom level available, and propagating the changes upward
 * as necessary, plus some accounting needed to play nicely with other
 * parts of the VM system.
 * At each level, we keep a list of pages, which are heads of contiguous
 * free pages of length (1 << order) and marked with PageBuddy.
 * A page's order is recorded in the page_private(page) field.
 * So when we are allocating or freeing one, we can derive the state of the
 * other.  That is, if we allocate a small block, and both were
 * free, the remainder of the region must be split into blocks.
 * If a block is freed, and its buddy is also free, then this
 * triggers coalescing into a block of larger size.
 *
 * -- nyc
 */
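
/*
 * Worked example of the buddy arithmetic used below (the real helper is
 * __find_buddy_pfn() in mm/internal.h): the buddy of a block is found by
 * flipping bit @order of its pfn, and the merged block starts at the lower
 * pfn of the pair:
 *
 *	buddy_pfn    = pfn ^ (1UL << order);
 *	combined_pfn = buddy_pfn & pfn;
 *
 * E.g. freeing the order-1 block at pfn 8 finds its buddy at pfn 10; if the
 * buddy is also free, they merge into the order-2 block starting at pfn 8.
 */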

static inline void __free_one_page(struct page *page,
		unsigned long pfn,
		struct zone *zone, unsigned int order,
		int migratetype, fpi_t fpi_flags)
{}

/*
 * A bad page could be due to a number of fields. Instead of multiple branches,
 * try to check multiple fields with one check. The caller must do a detailed
 * check if necessary.
 */
static inline bool page_expected_state(struct page *page,
					unsigned long check_flags)
{}

static const char *page_bad_reason(struct page *page, unsigned long flags)
{}

static void free_page_is_bad_report(struct page *page)
{}

static inline bool free_page_is_bad(struct page *page)
{}

static inline bool is_check_pages_enabled(void)
{}

static int free_tail_page_prepare(struct page *head_page, struct page *page)
{}

/*
 * Skip KASAN memory poisoning when either:
 *
 * 1. For generic KASAN: deferred memory initialization has not yet completed.
 *    Tag-based KASAN modes skip pages freed via deferred memory initialization
 *    using page tags instead (see below).
 * 2. For tag-based KASAN modes: the page has a match-all KASAN tag, indicating
 *    that error detection is disabled for accesses via the page address.
 *
 * Pages will have match-all tags in the following circumstances:
 *
 * 1. Pages are being initialized for the first time, including during deferred
 *    memory init; see the call to page_kasan_tag_reset in __init_single_page.
 * 2. The allocation was not unpoisoned due to __GFP_SKIP_KASAN, with the
 *    exception of pages unpoisoned by kasan_unpoison_vmalloc.
 * 3. The allocation was excluded from being checked due to sampling,
 *    see the call to kasan_unpoison_pages.
 *
 * Poisoning pages during deferred memory init will greatly lengthen the
 * process and cause problems in large memory systems as the deferred pages
 * initialization is done with interrupts disabled.
 *
 * Assuming that there will be no reference to those newly initialized
 * pages before they are ever allocated, this should have no effect on
 * KASAN memory tracking as the poison will be properly inserted at page
 * allocation time. The only corner case is when pages are allocated by
 * on-demand allocation and then freed again before the deferred pages
 * initialization is done, but this is not likely to happen.
 */
static inline bool should_skip_kasan_poison(struct page *page)
{}

static void kernel_init_pages(struct page *page, int numpages)
{}

__always_inline bool free_pages_prepare(struct page *page,
			unsigned int order)
{}

/*
 * Frees a number of pages from the PCP lists
 * Assumes all pages on list are in same zone.
 * count is the number of pages to free.
 */
static void free_pcppages_bulk(struct zone *zone, int count,
					struct per_cpu_pages *pcp,
					int pindex)
{}

static void free_one_page(struct zone *zone, struct page *page,
			  unsigned long pfn, unsigned int order,
			  fpi_t fpi_flags)
{}

static void __free_pages_ok(struct page *page, unsigned int order,
			    fpi_t fpi_flags)
{}

void __meminit __free_pages_core(struct page *page, unsigned int order,
		enum meminit_context context)
{}

/*
 * Check that the whole (or subset of) a pageblock given by the interval of
 * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
 * with the migration or free compaction scanner.
 *
 * Return struct page pointer of start_pfn, or NULL if checks were not passed.
 *
 * It's possible on some configurations to have a setup like node0 node1 node0
 * i.e. it's possible that all pages within a zones range of pages do not
 * belong to a single zone. We assume that a border between node0 and node1
 * can occur within a single pageblock, but not a node0 node1 node0
 * interleaving within a single pageblock. It is therefore sufficient to check
 * the first and last page of a pageblock and avoid checking each individual
 * page in a pageblock.
 *
 * Note: the function may return non-NULL struct page even for a page block
 * which contains a memory hole (i.e. there is no physical memory for a subset
 * of the pfn range). For example, if the pageblock order is MAX_PAGE_ORDER, it
 * will fall into 2 sub-sections, and the end pfn of the pageblock may be a hole
 * even though the start pfn is online and valid. This should be safe most of
 * the time because struct pages are still initialized via init_unavailable_range()
 * and pfn walkers shouldn't touch any physical memory range for which they do
 * not recognize any specific metadata in struct pages.
 */
struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
				     unsigned long end_pfn, struct zone *zone)
{}

/*
 * The order of subdivision here is critical for the IO subsystem.
 * Please do not alter this order without good reasons and regression
 * testing. Specifically, as large blocks of memory are subdivided,
 * the order in which smaller blocks are delivered depends on the order
 * they're subdivided in this function. This is the primary factor
 * influencing the order in which pages are delivered to the IO
 * subsystem according to empirical testing, and this is also justified
 * by considering the behavior of a buddy system containing a single
 * large block of memory acted on by a series of small allocations.
 * This behavior is a critical factor in sglist merging's success.
 *
 * -- nyc
 */
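
/*
 * Worked example (illustrative): serving an order-2 request from an order-5
 * (32-page) free block splits off the upper half three times. Pages 16-31
 * are freed at order 4, pages 8-15 at order 3, pages 4-7 at order 2, and
 * pages 0-3 are returned to the caller.
 */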
static inline void expand(struct zone *zone, struct page *page,
	int low, int high, int migratetype)
{}

static void check_new_page_bad(struct page *page)
{}

/*
 * This page is about to be returned from the page allocator
 */
static bool check_new_page(struct page *page)
{}

static inline bool check_new_pages(struct page *page, unsigned int order)
{}

static inline bool should_skip_kasan_unpoison(gfp_t flags)
{}

static inline bool should_skip_init(gfp_t flags)
{}

inline void post_alloc_hook(struct page *page, unsigned int order,
				gfp_t gfp_flags)
{}

static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
							unsigned int alloc_flags)
{}

/*
 * Go through the free lists for the given migratetype and remove
 * the smallest available page from the freelists
 */
static __always_inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
						int migratetype)
{}


/*
 * This array describes the order in which free lists are fallen back to
 * when the free lists for the desired migratetype are depleted.
 *
 * The other migratetypes do not have fallbacks.
 */
static int fallbacks[MIGRATE_PCPTYPES][MIGRATE_PCPTYPES - 1] = {
	[MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE   },
	[MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE },
	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE   },
};

#ifdef CONFIG_CMA
static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone,
					unsigned int order)
{}
#else
static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
					unsigned int order) { return NULL; }
#endif

/*
 * Change the type of a block and move all its free pages to that
 * type's freelist.
 */
static int __move_freepages_block(struct zone *zone, unsigned long start_pfn,
				  int old_mt, int new_mt)
{}

static bool prep_move_freepages_block(struct zone *zone, struct page *page,
				      unsigned long *start_pfn,
				      int *num_free, int *num_movable)
{}

static int move_freepages_block(struct zone *zone, struct page *page,
				int old_mt, int new_mt)
{}

#ifdef CONFIG_MEMORY_ISOLATION
/* Look for a buddy that straddles start_pfn */
static unsigned long find_large_buddy(unsigned long start_pfn)
{}

/* Split a multi-block free page into its individual pageblocks */
static void split_large_buddy(struct zone *zone, struct page *page,
			      unsigned long pfn, int order)
{}

/**
 * move_freepages_block_isolate - move free pages in block for page isolation
 * @zone: the zone
 * @page: the pageblock page
 * @migratetype: migratetype to set on the pageblock
 *
 * This is similar to move_freepages_block(), but handles the special
 * case encountered in page isolation, where the block of interest
 * might be part of a larger buddy spanning multiple pageblocks.
 *
 * Unlike the regular page allocator path, which moves pages while
 * stealing buddies off the freelist, page isolation is interested in
 * arbitrary pfn ranges that may have overlapping buddies on both ends.
 *
 * This function handles that. Straddling buddies are split into
 * individual pageblocks. Only the block of interest is moved.
 *
 * Returns %true if pages could be moved, %false otherwise.
 */
bool move_freepages_block_isolate(struct zone *zone, struct page *page,
				  int migratetype)
{}
#endif /* CONFIG_MEMORY_ISOLATION */

static void change_pageblock_range(struct page *pageblock_page,
					int start_order, int migratetype)
{}

/*
 * When we are falling back to another migratetype during allocation, try to
 * steal extra free pages from the same pageblocks to satisfy further
 * allocations, instead of polluting multiple pageblocks.
 *
 * If we are stealing a relatively large buddy page, it is likely there will
 * be more free pages in the pageblock, so try to steal them all. For
 * reclaimable and unmovable allocations, we steal regardless of page size,
 * as fragmentation caused by those allocations polluting movable pageblocks
 * is worse than movable allocations stealing from unmovable and reclaimable
 * pageblocks.
 */
static bool can_steal_fallback(unsigned int order, int start_mt)
{}

static inline bool boost_watermark(struct zone *zone)
{}

/*
 * This function implements actual steal behaviour. If order is large enough, we
 * can claim the whole pageblock for the requested migratetype. If not, we check
 * the pageblock for constituent pages; if at least half of the pages are free
 * or compatible, we can still claim the whole block, so pages freed in the
 * future will be put on the correct free list. Otherwise, we isolate exactly
 * the order we need from the fallback block and leave its migratetype alone.
 */
static struct page *
steal_suitable_fallback(struct zone *zone, struct page *page,
			int current_order, int order, int start_type,
			unsigned int alloc_flags, bool whole_block)
{}

/*
 * Check whether there is a suitable fallback freepage with requested order.
 * If only_stealable is true, this function returns fallback_mt only if
 * we can steal other freepages all together. This would help to reduce
 * fragmentation due to mixed migratetype pages in one pageblock.
 */
int find_suitable_fallback(struct free_area *area, unsigned int order,
			int migratetype, bool only_stealable, bool *can_steal)
{}

/*
 * Reserve the pageblock(s) surrounding an allocation request for
 * exclusive use of high-order atomic allocations if there are no
 * empty page blocks that contain a page with a suitable order
 */
static void reserve_highatomic_pageblock(struct page *page, int order,
					 struct zone *zone)
{}

/*
 * Used when an allocation is about to fail under memory pressure. This
 * potentially hurts the reliability of high-order allocations when under
 * intense memory pressure but failed atomic allocations should be easier
 * to recover from than an OOM.
 *
 * If @force is true, try to unreserve pageblocks even though highatomic
 * pageblock is exhausted.
 */
static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
						bool force)
{}

/*
 * Try finding a free buddy page on the fallback list and put it on the free
 * list of requested migratetype, possibly along with other pages from the same
 * block, depending on fragmentation avoidance heuristics. Returns true if
 * fallback was found so that __rmqueue_smallest() can grab it.
 *
 * The use of signed ints for order and current_order is a deliberate
 * deviation from the rest of this file, to make the for loop
 * condition simpler.
 */
static __always_inline struct page *
__rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
						unsigned int alloc_flags)
{}

/*
 * Do the hard work of removing an element from the buddy allocator.
 * Call me with the zone->lock already held.
 */
static __always_inline struct page *
__rmqueue(struct zone *zone, unsigned int order, int migratetype,
						unsigned int alloc_flags)
{}

/*
 * Obtain a specified number of elements from the buddy allocator, all under
 * a single hold of the lock, for efficiency.  Add them to the supplied list.
 * Returns the number of new pages which were placed at *list.
 */
static int rmqueue_bulk(struct zone *zone, unsigned int order,
			unsigned long count, struct list_head *list,
			int migratetype, unsigned int alloc_flags)
{}

/*
 * Called from the vmstat counter updater to decay the PCP high.
 * Return whether there is additional work to do.
 */
int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
{}

#ifdef CONFIG_NUMA
/*
 * Called from the vmstat counter updater to drain pagesets of this
 * currently executing processor on remote nodes after they have
 * expired.
 */
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
{}
#endif

/*
 * Drain pcplists of the indicated processor and zone.
 */
static void drain_pages_zone(unsigned int cpu, struct zone *zone)
{}

/*
 * Drain pcplists of all zones on the indicated processor.
 */
static void drain_pages(unsigned int cpu)
{}

/*
 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
 */
void drain_local_pages(struct zone *zone)
{}

/*
 * The implementation of drain_all_pages(), exposing an extra parameter to
 * drain on all cpus.
 *
 * drain_all_pages() is optimized to only execute on cpus where pcplists are
 * not empty. The check for non-emptiness can however race with a free to
 * pcplist that has not yet increased the pcp->count from 0 to 1. Callers
 * that need the guarantee that every CPU has drained can disable the
 * optimizing racy check.
 */
static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
{}

/*
 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
 *
 * When zone parameter is non-NULL, spill just the single zone's pages.
 */
void drain_all_pages(struct zone *zone)
{}

static int nr_pcp_free(struct per_cpu_pages *pcp, int batch, int high, bool free_high)
{}

static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
		       int batch, bool free_high)
{}

static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
				   struct page *page, int migratetype,
				   unsigned int order)
{}

/*
 * Free a pcp page
 */
void free_unref_page(struct page *page, unsigned int order)
{}

/*
 * Free a batch of folios
 */
void free_unref_folios(struct folio_batch *folios)
{}

/*
 * split_page takes a non-compound higher-order page, and splits it into
 * n (1 << order) order-0 sub-pages: page[0] .. page[n-1].
 * Each sub-page must be freed individually.
 *
 * Note: this is probably too low level an operation for use in drivers.
 * Please consult with lkml before using this in your driver.
 */
void split_page(struct page *page, unsigned int order)
{}
EXPORT_SYMBOL_GPL(split_page);
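
/*
 * Usage sketch (illustrative, error handling elided): split_page() lets a
 * caller free the sub-pages of a non-compound higher-order allocation
 * individually:
 *
 *	struct page *page = alloc_pages(GFP_KERNEL, 2);
 *
 *	split_page(page, 2);
 *
 * after which each of page + 0 .. page + 3 is an independent order-0 page
 * and may be freed on its own with __free_page().
 */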

int __isolate_free_page(struct page *page, unsigned int order)
{}

/**
 * __putback_isolated_page - Return a now-isolated page back where we got it
 * @page: Page that was isolated
 * @order: Order of the isolated page
 * @mt: The page's pageblock's migratetype
 *
 * This function is meant to return a page pulled from the free lists via
 * __isolate_free_page back to the free lists they were pulled from.
 */
void __putback_isolated_page(struct page *page, unsigned int order, int mt)
{}

/*
 * Update NUMA hit/miss statistics
 */
static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
				   long nr_account)
{}

static __always_inline
struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
			   unsigned int order, unsigned int alloc_flags,
			   int migratetype)
{}

static int nr_pcp_alloc(struct per_cpu_pages *pcp, struct zone *zone, int order)
{}

/* Remove page from the per-cpu list, caller must protect the list */
static inline
struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
			int migratetype,
			unsigned int alloc_flags,
			struct per_cpu_pages *pcp,
			struct list_head *list)
{}

/* Lock and remove page from the per-cpu list */
static struct page *rmqueue_pcplist(struct zone *preferred_zone,
			struct zone *zone, unsigned int order,
			int migratetype, unsigned int alloc_flags)
{}

/*
 * Allocate a page from the given zone.
 * Use pcplists for THP or "cheap" high-order allocations.
 */

/*
 * Do not instrument rmqueue() with KMSAN. This function may call
 * __msan_poison_alloca() through a call to set_pfnblock_flags_mask().
 * If __msan_poison_alloca() attempts to allocate pages for the stack depot, it
 * may call rmqueue() again, which will result in a deadlock.
 */
__no_sanitize_memory
static inline
struct page *rmqueue(struct zone *preferred_zone,
			struct zone *zone, unsigned int order,
			gfp_t gfp_flags, unsigned int alloc_flags,
			int migratetype)
{}

static inline long __zone_watermark_unusable_free(struct zone *z,
				unsigned int order, unsigned int alloc_flags)
{}

/*
 * Return true if free base pages are above 'mark'. For high-order checks it
 * will return true if the order-0 watermark is reached and there is at least
 * one free page of a suitable size. Checking now avoids taking the zone lock
 * to check in the allocation paths if no pages are free.
 */
bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
			 int highest_zoneidx, unsigned int alloc_flags,
			 long free_pages)
{}

bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
		      int highest_zoneidx, unsigned int alloc_flags)
{}

static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
				unsigned long mark, int highest_zoneidx,
				unsigned int alloc_flags, gfp_t gfp_mask)
{}

bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
			unsigned long mark, int highest_zoneidx)
{}

#ifdef CONFIG_NUMA
int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;

static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
{}
#else	/* CONFIG_NUMA */
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
{
	return true;
}
#endif	/* CONFIG_NUMA */

/*
 * The restriction on ZONE_DMA32 as being a suitable zone to use to avoid
 * fragmentation is subtle. If the preferred zone was HIGHMEM then
 * premature use of a lower zone may cause lowmem pressure problems that
 * are worse than fragmentation. If the next zone is ZONE_DMA then it is
 * probably too small. It only makes sense to spread allocations to avoid
 * fragmentation between the Normal and DMA32 zones.
 */
static inline unsigned int
alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
{}

/* Must be called after current_gfp_context() which can change gfp_mask */
static inline unsigned int gfp_to_alloc_flags_cma(gfp_t gfp_mask,
						  unsigned int alloc_flags)
{}

/*
 * get_page_from_freelist goes through the zonelist trying to allocate
 * a page.
 */
static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
						const struct alloc_context *ac)
{}

static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
{}

void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
{}

static inline struct page *
__alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order,
			      unsigned int alloc_flags,
			      const struct alloc_context *ac)
{}

static inline struct page *
__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
	const struct alloc_context *ac, unsigned long *did_some_progress)
{}

/*
 * Maximum number of compaction retries with progress before the OOM
 * killer is considered the only way to move forward.
 */
#define MAX_COMPACT_RETRIES	MAX_RECLAIM_RETRIES

#ifdef CONFIG_COMPACTION
/* Try memory compaction for high-order allocations before reclaim */
static struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
		unsigned int alloc_flags, const struct alloc_context *ac,
		enum compact_priority prio, enum compact_result *compact_result)
{}

static inline bool
should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
		     enum compact_result compact_result,
		     enum compact_priority *compact_priority,
		     int *compaction_retries)
{}
#else
static inline struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
		unsigned int alloc_flags, const struct alloc_context *ac,
		enum compact_priority prio, enum compact_result *compact_result)
{
	*compact_result = COMPACT_SKIPPED;
	return NULL;
}

static inline bool
should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
		     enum compact_result compact_result,
		     enum compact_priority *compact_priority,
		     int *compaction_retries)
{
	struct zone *zone;
	struct zoneref *z;

	if (!order || order > PAGE_ALLOC_COSTLY_ORDER)
		return false;

	/*
	 * There are setups with compaction disabled which would prefer to loop
	 * inside the allocator rather than hit the oom killer prematurely.
	 * Let's give them a good hope and keep retrying while the order-0
	 * watermarks are OK.
	 */
	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
				ac->highest_zoneidx, ac->nodemask) {
		if (zone_watermark_ok(zone, 0, min_wmark_pages(zone),
					ac->highest_zoneidx, alloc_flags))
			return true;
	}
	return false;
}
#endif /* CONFIG_COMPACTION */

#ifdef CONFIG_LOCKDEP
static struct lockdep_map __fs_reclaim_map =
	STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map);

static bool __need_reclaim(gfp_t gfp_mask)
{}

void __fs_reclaim_acquire(unsigned long ip)
{}

void __fs_reclaim_release(unsigned long ip)
{}

void fs_reclaim_acquire(gfp_t gfp_mask)
{}
EXPORT_SYMBOL_GPL(fs_reclaim_acquire);

void fs_reclaim_release(gfp_t gfp_mask)
{}
EXPORT_SYMBOL_GPL(fs_reclaim_release);
#endif

/*
 * Zonelists may change due to hotplug during allocation. Detect when zonelists
 * have been rebuilt so that the allocation can be retried. The reader side
 * does not lock and simply retries the allocation if the zonelist changes.
 * The writer side is protected by the
 * embedded spin_lock.
 */
static DEFINE_SEQLOCK(zonelist_update_seq);

static unsigned int zonelist_iter_begin(void)
{}

static unsigned int check_retry_zonelist(unsigned int seq)
{}

/* Perform direct synchronous page reclaim */
static unsigned long
__perform_reclaim(gfp_t gfp_mask, unsigned int order,
					const struct alloc_context *ac)
{}

/* The really slow allocator path where we enter direct reclaim */
static inline struct page *
__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
		unsigned int alloc_flags, const struct alloc_context *ac,
		unsigned long *did_some_progress)
{}

static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
			     const struct alloc_context *ac)
{}

static inline unsigned int
gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
{}

static bool oom_reserves_allowed(struct task_struct *tsk)
{}

/*
 * Distinguish requests which really need access to full memory
 * reserves from oom victims which can live with a portion of it
 */
static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask)
{}

bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
{}

/*
 * Checks whether it makes sense to retry the reclaim to make forward progress
 * for the given allocation request.
 *
 * We give up when we either have tried MAX_RECLAIM_RETRIES in a row
 * without success, or when we couldn't even meet the watermark if we
 * reclaimed all remaining pages on the LRU lists.
 *
 * Returns true if a retry is viable or false to enter the oom path.
 */
static inline bool
should_reclaim_retry(gfp_t gfp_mask, unsigned order,
		     struct alloc_context *ac, int alloc_flags,
		     bool did_some_progress, int *no_progress_loops)
{}

static inline bool
check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac)
{}

static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
						struct alloc_context *ac)
{}

static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
		int preferred_nid, nodemask_t *nodemask,
		struct alloc_context *ac, gfp_t *alloc_gfp,
		unsigned int *alloc_flags)
{}

/*
 * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array
 * @gfp: GFP flags for the allocation
 * @preferred_nid: The preferred NUMA node ID to allocate from
 * @nodemask: Set of nodes to allocate from, may be NULL
 * @nr_pages: The number of pages desired on the list or array
 * @page_list: Optional list to store the allocated pages
 * @page_array: Optional array to store the pages
 *
 * This is a batched version of the page allocator that attempts to
 * allocate nr_pages quickly. Pages are added to page_list if page_list
 * is not NULL, otherwise it is assumed that the page_array is valid.
 *
 * For lists, nr_pages is the number of pages that should be allocated.
 *
 * For arrays, only NULL elements are populated with pages and nr_pages
 * is the maximum number of pages that will be stored in the array.
 *
 * Returns the number of pages on the list or array.
 */
unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
			nodemask_t *nodemask, int nr_pages,
			struct list_head *page_list,
			struct page **page_array)
{}
EXPORT_SYMBOL_GPL(alloc_pages_bulk_noprof);
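
/*
 * Usage sketch (illustrative, error handling elided) via the
 * alloc_pages_bulk_array() wrapper from <linux/gfp.h>:
 *
 *	struct page *pages[16] = { NULL };
 *	unsigned long filled;
 *
 *	filled = alloc_pages_bulk_array(GFP_KERNEL, ARRAY_SIZE(pages), pages);
 *
 * Only the NULL slots of pages[] are populated, and 'filled' may be smaller
 * than requested; callers retry or fall back to alloc_pages() for the rest.
 */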

/*
 * This is the 'heart' of the zoned buddy allocator.
 */
struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order,
				      int preferred_nid, nodemask_t *nodemask)
{}
EXPORT_SYMBOL(__alloc_pages_noprof);

struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_nid,
		nodemask_t *nodemask)
{}
EXPORT_SYMBOL(__folio_alloc_noprof);

/*
 * Common helper functions. Never use with __GFP_HIGHMEM because the returned
 * address cannot represent highmem pages. Use alloc_pages and then kmap if
 * you need to access high mem.
 */
unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order)
{}
EXPORT_SYMBOL(get_free_pages_noprof);

unsigned long get_zeroed_page_noprof(gfp_t gfp_mask)
{}
EXPORT_SYMBOL(get_zeroed_page_noprof);

/**
 * __free_pages - Free pages allocated with alloc_pages().
 * @page: The page pointer returned from alloc_pages().
 * @order: The order of the allocation.
 *
 * This function can free multi-page allocations that are not compound
 * pages.  It does not check that the @order passed in matches that of
 * the allocation, so it is easy to leak memory.  Freeing more memory
 * than was allocated will probably emit a warning.
 *
 * If the last reference to this page is speculative, it will be released
 * by put_page() which only frees the first page of a non-compound
 * allocation.  To prevent the remaining pages from being leaked, we free
 * the subsequent pages here.  If you want to use the page's reference
 * count to decide when to free the allocation, you should allocate a
 * compound page, and use put_page() instead of __free_pages().
 *
 * Context: May be called in interrupt context or while holding a normal
 * spinlock, but not in NMI context or while holding a raw spinlock.
 */
void __free_pages(struct page *page, unsigned int order)
{}
EXPORT_SYMBOL(__free_pages);
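
/*
 * Usage sketch (illustrative): the @order passed to __free_pages() must
 * match the original allocation, as non-compound pages do not record it:
 *
 *	struct page *page = alloc_pages(GFP_KERNEL, 3);
 *	...
 *	__free_pages(page, 3);
 */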

void free_pages(unsigned long addr, unsigned int order)
{}

EXPORT_SYMBOL(free_pages);

/*
 * Page Fragment:
 *  An arbitrary-length arbitrary-offset area of memory which resides
 *  within a 0 or higher order page.  Multiple fragments within that page
 *  are individually refcounted, in the page's reference counter.
 *
 * The page_frag functions below provide a simple allocation framework for
 * page fragments.  This is used by the network stack and network device
 * drivers to provide a backing region of memory for use as either an
 * sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
 */
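
/*
 * Usage sketch (illustrative, assuming a caller-owned, zero-initialized
 * cache): fragments are carved out of the cache's current page and
 * refcounted individually:
 *
 *	static struct page_frag_cache nc;
 *
 *	void *buf = page_frag_alloc(&nc, 256, GFP_ATOMIC);
 *	if (buf) {
 *		...use the 256-byte fragment...
 *		page_frag_free(buf);
 *	}
 */
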
static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
					     gfp_t gfp_mask)
{}

void page_frag_cache_drain(struct page_frag_cache *nc)
{}
EXPORT_SYMBOL(page_frag_cache_drain);

void __page_frag_cache_drain(struct page *page, unsigned int count)
{}
EXPORT_SYMBOL(__page_frag_cache_drain);

void *__page_frag_alloc_align(struct page_frag_cache *nc,
			      unsigned int fragsz, gfp_t gfp_mask,
			      unsigned int align_mask)
{}
EXPORT_SYMBOL(__page_frag_alloc_align);

/*
 * Frees a page fragment allocated out of either a compound or order 0 page.
 */
void page_frag_free(void *addr)
{}
EXPORT_SYMBOL(page_frag_free);

static void *make_alloc_exact(unsigned long addr, unsigned int order,
		size_t size)
{}

/**
 * alloc_pages_exact - allocate an exact number of physically-contiguous pages.
 * @size: the number of bytes to allocate
 * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
 *
 * This function is similar to alloc_pages(), except that it allocates the
 * minimum number of pages to satisfy the request.  alloc_pages() can only
 * allocate memory in power-of-two pages.
 *
 * This function is also limited by MAX_PAGE_ORDER.
 *
 * Memory allocated by this function must be released by free_pages_exact().
 *
 * Return: pointer to the allocated area or %NULL in case of error.
 */
void *alloc_pages_exact_noprof(size_t size, gfp_t gfp_mask)
{}
EXPORT_SYMBOL(alloc_pages_exact_noprof);
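
/*
 * Usage sketch (illustrative): a 5-page request is served from an order-3
 * (8-page) block and the 3 trailing pages are returned to the buddy
 * allocator immediately:
 *
 *	void *buf = alloc_pages_exact(5 * PAGE_SIZE, GFP_KERNEL);
 *	...
 *	free_pages_exact(buf, 5 * PAGE_SIZE);
 */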

/**
 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
 *			   pages on a node.
 * @nid: the preferred node ID where memory should be allocated
 * @size: the number of bytes to allocate
 * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
 *
 * Like alloc_pages_exact(), but try to allocate on node nid first before falling
 * back.
 *
 * Return: pointer to the allocated area or %NULL in case of error.
 */
void * __meminit alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mask)
{}

/**
 * free_pages_exact - release memory allocated via alloc_pages_exact()
 * @virt: the value returned by alloc_pages_exact.
 * @size: size of allocation, same value as passed to alloc_pages_exact().
 *
 * Release the memory allocated by a previous call to alloc_pages_exact.
 */
void free_pages_exact(void *virt, size_t size)
{}
EXPORT_SYMBOL(free_pages_exact);

/**
 * nr_free_zone_pages - count number of pages beyond high watermark
 * @offset: The zone index of the highest zone
 *
 * nr_free_zone_pages() counts the number of pages which are beyond the
 * high watermark within all zones at or below a given zone index.  For each
 * zone, the number of pages is calculated as:
 *
 *     nr_free_zone_pages = managed_pages - high_pages
 *
 * Return: number of pages beyond high watermark.
 */
static unsigned long nr_free_zone_pages(int offset)
{}

/**
 * nr_free_buffer_pages - count number of pages beyond high watermark
 *
 * nr_free_buffer_pages() counts the number of pages which are beyond the high
 * watermark within ZONE_DMA and ZONE_NORMAL.
 *
 * Return: number of pages beyond high watermark within ZONE_DMA and
 * ZONE_NORMAL.
 */
unsigned long nr_free_buffer_pages(void)
{}
EXPORT_SYMBOL_GPL(nr_free_buffer_pages);

static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
{}

/*
 * Builds allocation fallback zone lists.
 *
 * Add all populated zones of a node to the zonelist.
 */
static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
{}

#ifdef CONFIG_NUMA

static int __parse_numa_zonelist_order(char *s)
{}

static char numa_zonelist_order[] = "Node";
#define NUMA_ZONELIST_ORDER_LEN	16
/*
 * sysctl handler for numa_zonelist_order
 */
static int numa_zonelist_order_handler(const struct ctl_table *table, int write,
		void *buffer, size_t *length, loff_t *ppos)
{}

static int node_load[MAX_NUMNODES];

/**
 * find_next_best_node - find the next node that should appear in a given node's fallback list
 * @node: node whose fallback list we're appending
 * @used_node_mask: nodemask_t of already used nodes
 *
 * We use a number of factors to determine which is the next node that should
 * appear on a given node's fallback list.  The node should not have appeared
 * already in @node's fallback list, and it should be the next closest node
 * according to the distance array (which contains arbitrary distance values
 * from each node to each node in the system), and should also prefer nodes
 * with no CPUs, since presumably they'll have very little allocation pressure
 * on them otherwise.
 *
 * Return: node id of the found node or %NUMA_NO_NODE if no node is found.
 */
int find_next_best_node(int node, nodemask_t *used_node_mask)
{}


/*
 * Build zonelists ordered by node and zones within node.
 * This results in maximum locality--normal zone overflows into local
 * DMA zone, if any--but risks exhausting DMA zone.
 */
static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
		unsigned nr_nodes)
{}

/*
 * Build __GFP_THISNODE zonelists
 */
static void build_thisnode_zonelists(pg_data_t *pgdat)
{}

/*
 * Build zonelists ordered by zone and nodes within zones.
 * This results in conserving DMA zone[s] until all Normal memory is
 * exhausted, but results in overflowing to remote node while memory
 * may still exist in local DMA zone.
 */

static void build_zonelists(pg_data_t *pgdat)
{}

#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
 * Return node id of node used for "local" allocations.
 * I.e., first node id of first zone in arg node's generic zonelist.
 * Used for initializing percpu 'numa_mem', which is used primarily
 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
 */
int local_memory_node(int node)
{
	struct zoneref *z;

	z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
				   gfp_zone(GFP_KERNEL),
				   NULL);
	return zone_to_nid(z->zone);
}
#endif

static void setup_min_unmapped_ratio(void);
static void setup_min_slab_ratio(void);
#else	/* CONFIG_NUMA */

static void build_zonelists(pg_data_t *pgdat)
{
	struct zoneref *zonerefs;
	int nr_zones;

	zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
	nr_zones = build_zonerefs_node(pgdat, zonerefs);
	zonerefs += nr_zones;

	zonerefs->zone = NULL;
	zonerefs->zone_idx = 0;
}

#endif	/* CONFIG_NUMA */

/*
 * Boot pageset table. One per cpu which is going to be used for all
 * zones and all nodes. The parameters will be set in such a way
 * that an item put on a list will immediately be handed over to
 * the buddy list. This is safe since pageset manipulation is done
 * with interrupts disabled.
 *
 * The boot_pagesets must be kept even after bootup is complete for
 * unused processors and/or zones. They do play a role for bootstrapping
 * hotplugged processors.
 *
 * zoneinfo_show() and maybe other functions do
 * not check if the processor is online before following the pageset pointer.
 * Other parts of the kernel may not check if the zone is available.
 */
static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats);
/* These effectively disable the pcplists in the boot pageset completely */
#define BOOT_PAGESET_HIGH	0
#define BOOT_PAGESET_BATCH	1
static DEFINE_PER_CPU(struct per_cpu_pages, boot_pageset);
static DEFINE_PER_CPU(struct per_cpu_zonestat, boot_zonestats);

static void __build_all_zonelists(void *data)
{}

static noinline void __init
build_all_zonelists_init(void)
{}

/*
 * Rebuild all zonelists. Called from memory hotplug code, and once from
 * early boot where system_state == SYSTEM_BOOTING.
 *
 * __ref due to call of __init annotated helper build_all_zonelists_init
 * [protected by SYSTEM_BOOTING].
 */
void __ref build_all_zonelists(pg_data_t *pgdat)
{}

static int zone_batchsize(struct zone *zone)
{}

static int percpu_pagelist_high_fraction;
static int zone_highsize(struct zone *zone, int batch, int cpu_online,
			 int high_fraction)
{}

/*
 * pcp->high and pcp->batch values are related and generally batch is lower
 * than high. They are also related to pcp->count such that count is lower
 * than high, and as soon as it reaches high, the pcplist is flushed.
 *
 * However, guaranteeing these relations at all times would require e.g. write
 * barriers here but also careful usage of read barriers at the read side, and
 * thus be prone to error and bad for performance. Thus the update only prevents
 * store tearing. Any new users of pcp->batch, pcp->high_min and pcp->high_max
 * should ensure they can cope with those fields changing asynchronously, and
 * fully trust only the pcp->count field on the local CPU with interrupts
 * disabled.
 *
 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
 * outside of boot time (or some other assurance that no concurrent updaters
 * exist).
 */
static void pageset_update(struct per_cpu_pages *pcp, unsigned long high_min,
			   unsigned long high_max, unsigned long batch)
{}

static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats)
{}

static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high_min,
					      unsigned long high_max, unsigned long batch)
{}

/*
 * Calculate and set new high and batch values for all per-cpu pagesets of a
 * zone based on the zone's size.
 */
static void zone_set_pageset_high_and_batch(struct zone *zone, int cpu_online)
{}

void __meminit setup_zone_pageset(struct zone *zone)
{}

/*
 * The zone indicated has a new number of managed_pages; batch sizes and percpu
 * page high values need to be recalculated.
 */
static void zone_pcp_update(struct zone *zone, int cpu_online)
{}

static void zone_pcp_update_cacheinfo(struct zone *zone, unsigned int cpu)
{}

void setup_pcp_cacheinfo(unsigned int cpu)
{}

/*
 * Allocate per cpu pagesets and initialize them.
 * Before this call only boot pagesets were available.
 */
void __init setup_per_cpu_pageset(void)
{}

__meminit void zone_pcp_init(struct zone *zone)
{}

void adjust_managed_page_count(struct page *page, long count)
{}
EXPORT_SYMBOL(adjust_managed_page_count);

unsigned long free_reserved_area(void *start, void *end, int poison, const char *s)
{}

void free_reserved_page(struct page *page)
{}
EXPORT_SYMBOL(free_reserved_page);

static int page_alloc_cpu_dead(unsigned int cpu)
{}

static int page_alloc_cpu_online(unsigned int cpu)
{}

void __init page_alloc_init_cpuhp(void)
{}

/*
 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
 *	or min_free_kbytes changes.
 */
static void calculate_totalreserve_pages(void)
{}

/*
 * setup_per_zone_lowmem_reserve - called whenever
 *	sysctl_lowmem_reserve_ratio changes.  Ensures that each zone
 *	has a correct pages reserved value, so an adequate number of
 *	pages are left in the zone after a successful __alloc_pages().
 */
static void setup_per_zone_lowmem_reserve(void)
{}

static void __setup_per_zone_wmarks(void)
{}

/**
 * setup_per_zone_wmarks - called when min_free_kbytes changes
 * or when memory is hot-{added|removed}
 *
 * Ensures that the watermark[min,low,high] values for each zone are set
 * correctly with respect to min_free_kbytes.
 */
void setup_per_zone_wmarks(void)
{}

/*
 * Initialise min_free_kbytes.
 *
 * For small machines we want it small (128k min).  For large machines
 * we want it large (256MB max).  But it is not linear, because network
 * bandwidth does not increase linearly with machine size.  We use
 *
 *	min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
 *	min_free_kbytes = sqrt(lowmem_kbytes * 16)
 *
 * which yields
 *
 * 16MB:	512k
 * 32MB:	724k
 * 64MB:	1024k
 * 128MB:	1448k
 * 256MB:	2048k
 * 512MB:	2896k
 * 1024MB:	4096k
 * 2048MB:	5792k
 * 4096MB:	8192k
 * 8192MB:	11584k
 * 16384MB:	16384k
 */
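
/*
 * Worked example: with 128MB of lowmem, lowmem_kbytes = 131072, so
 * min_free_kbytes = int_sqrt(131072 * 16) = int_sqrt(2097152) = 1448,
 * matching the table above. The result is clamped to a sane range in
 * calculate_min_free_kbytes() below.
 */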
void calculate_min_free_kbytes(void)
{}

int __meminit init_per_zone_wmark_min(void)
{}
postcore_initcall(init_per_zone_wmark_min)

/*
 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
 *	that we can call two helper functions whenever min_free_kbytes
 *	changes.
 */
static int min_free_kbytes_sysctl_handler(const struct ctl_table *table, int write,
		void *buffer, size_t *length, loff_t *ppos)
{}

static int watermark_scale_factor_sysctl_handler(const struct ctl_table *table, int write,
		void *buffer, size_t *length, loff_t *ppos)
{}

#ifdef CONFIG_NUMA
static void setup_min_unmapped_ratio(void)
{}


static int sysctl_min_unmapped_ratio_sysctl_handler(const struct ctl_table *table, int write,
		void *buffer, size_t *length, loff_t *ppos)
{}

static void setup_min_slab_ratio(void)
{}

static int sysctl_min_slab_ratio_sysctl_handler(const struct ctl_table *table, int write,
		void *buffer, size_t *length, loff_t *ppos)
{}
#endif

/*
 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
 *	proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
 *	whenever sysctl_lowmem_reserve_ratio changes.
 *
 * The reserve ratio obviously has absolutely no relation with the
 * minimum watermarks. The lowmem reserve ratio can only make sense
 * as a function of the boot-time zone sizes.
 */
static int lowmem_reserve_ratio_sysctl_handler(const struct ctl_table *table,
		int write, void *buffer, size_t *length, loff_t *ppos)
{}

/*
 * percpu_pagelist_high_fraction - changes the pcp->high for each zone on each
 * cpu. It is the fraction of total pages in each zone that a hot per cpu
 * pagelist can have before it gets flushed back to buddy allocator.
 */
static int percpu_pagelist_high_fraction_sysctl_handler(const struct ctl_table *table,
		int write, void *buffer, size_t *length, loff_t *ppos)
{}

static struct ctl_table page_alloc_sysctl_table[] = {};

void __init page_alloc_sysctl_init(void)
{}

#ifdef CONFIG_CONTIG_ALLOC
/* Usage: See admin-guide/dynamic-debug-howto.rst */
static void alloc_contig_dump_pages(struct list_head *page_list)
{}

/*
 * [start, end) must belong to a single zone.
 * @migratetype: migratetype used to filter the type of migration reported in
 *		trace_mm_alloc_contig_migrate_range_info.
 */
int __alloc_contig_migrate_range(struct compact_control *cc,
					unsigned long start, unsigned long end,
					int migratetype)
{}

/**
 * alloc_contig_range() -- tries to allocate given range of pages
 * @start:	start PFN to allocate
 * @end:	one-past-the-last PFN to allocate
 * @migratetype:	migratetype of the underlying pageblocks (either
 *			#MIGRATE_MOVABLE or #MIGRATE_CMA).  All pageblocks
 *			in range must have the same migratetype and it must
 *			be either of the two.
 * @gfp_mask:	GFP mask to use during compaction
 *
 * The PFN range does not have to be pageblock aligned. The PFN range must
 * belong to a single zone.
 *
 * The first thing this routine does is attempt to MIGRATE_ISOLATE all
 * pageblocks in the range.  Once isolated, the pageblocks should not
 * be modified by others.
 *
 * Return: zero on success or negative error code.  On success all
 * pages whose PFN is in [start, end) are allocated for the caller and
 * need to be freed with free_contig_range().
 */
int alloc_contig_range_noprof(unsigned long start, unsigned long end,
		       unsigned migratetype, gfp_t gfp_mask)
{}
EXPORT_SYMBOL(alloc_contig_range_noprof);

static int __alloc_contig_pages(unsigned long start_pfn,
				unsigned long nr_pages, gfp_t gfp_mask)
{}

static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
				   unsigned long nr_pages)
{}

static bool zone_spans_last_pfn(const struct zone *zone,
				unsigned long start_pfn, unsigned long nr_pages)
{}

/**
 * alloc_contig_pages() -- tries to find and allocate contiguous range of pages
 * @nr_pages:	Number of contiguous pages to allocate
 * @gfp_mask:	GFP mask to limit search and used during compaction
 * @nid:	Target node
 * @nodemask:	Mask for other possible nodes
 *
 * This routine is a wrapper around alloc_contig_range(). It scans over zones
 * on an applicable zonelist to find a contiguous pfn range which can then be
 * tried for allocation with alloc_contig_range(). This routine is intended
 * for allocation requests which cannot be fulfilled with the buddy allocator.
 *
 * The allocated memory is always aligned to a page boundary. If nr_pages is a
 * power of two, then the allocated range is also guaranteed to be aligned to
 * nr_pages (e.g. a 1GB request is aligned to 1GB).
 *
 * Allocated pages can be freed with free_contig_range() or by manually calling
 * __free_page() on each allocated page.
 *
 * Return: pointer to contiguous pages on success, or NULL if not successful.
 */
struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask,
				 int nid, nodemask_t *nodemask)
{}
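
/*
 * Usage sketch (illustrative, error handling elided): allocating and
 * releasing a physically contiguous 4MB range near the local node:
 *
 *	unsigned long nr = SZ_4M / PAGE_SIZE;
 *	struct page *page;
 *
 *	page = alloc_contig_pages(nr, GFP_KERNEL, numa_node_id(), NULL);
 *	if (page)
 *		free_contig_range(page_to_pfn(page), nr);
 */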
#endif /* CONFIG_CONTIG_ALLOC */

void free_contig_range(unsigned long pfn, unsigned long nr_pages)
{}
EXPORT_SYMBOL(free_contig_range);

/*
 * Effectively disable pcplists for the zone by setting the high limit to 0
 * and draining all cpus. A concurrent page freeing on another CPU that's about
 * to put the page on pcplist will either finish before the drain and the page
 * will be drained, or observe the new high limit and skip the pcplist.
 *
 * Must be paired with a call to zone_pcp_enable().
 */
void zone_pcp_disable(struct zone *zone)
{}

void zone_pcp_enable(struct zone *zone)
{}

void zone_pcp_reset(struct zone *zone)
{}

#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * All pages in the range must be in a single zone, must not contain holes,
 * must span full sections, and must be isolated before calling this function.
 *
 * Returns the number of managed (non-PageOffline()) pages in the range: the
 * number of pages for which memory offlining code must adjust managed page
 * counters using adjust_managed_page_count().
 */
unsigned long __offline_isolated_pages(unsigned long start_pfn,
		unsigned long end_pfn)
{}
#endif

/*
 * This function returns a stable result only if called under zone lock.
 */
bool is_free_buddy_page(const struct page *page)
{}
EXPORT_SYMBOL(is_free_buddy_page);

#ifdef CONFIG_MEMORY_FAILURE
static inline void add_to_free_list(struct page *page, struct zone *zone,
				    unsigned int order, int migratetype,
				    bool tail)
{}

/*
 * Break down a higher-order page into sub-pages, and keep our target out of
 * the buddy allocator.
 */
static void break_down_buddy_pages(struct zone *zone, struct page *page,
				   struct page *target, int low, int high,
				   int migratetype)
{}

/*
 * Take a page that will be marked as poisoned off the buddy allocator.
 */
bool take_page_off_buddy(struct page *page)
{}

/*
 * Cancel takeoff done by take_page_off_buddy().
 */
bool put_page_back_buddy(struct page *page)
{}
#endif

#ifdef CONFIG_ZONE_DMA
bool has_managed_dma(void)
{}
#endif /* CONFIG_ZONE_DMA */

#ifdef CONFIG_UNACCEPTED_MEMORY

/* Counts number of zones with unaccepted pages. */
static DEFINE_STATIC_KEY_FALSE(zones_with_unaccepted_pages);

static bool lazy_accept = true;

static int __init accept_memory_parse(char *p)
{}
early_param("accept_memory", accept_memory_parse);

static bool page_contains_unaccepted(struct page *page, unsigned int order)
{}

static void accept_page(struct page *page, unsigned int order)
{}

static bool try_to_accept_memory_one(struct zone *zone)
{}

static bool try_to_accept_memory(struct zone *zone, unsigned int order)
{}

static inline bool has_unaccepted_memory(void)
{}

static bool __free_unaccepted(struct page *page)
{}

#else

static bool page_contains_unaccepted(struct page *page, unsigned int order)
{
	return false;
}

static void accept_page(struct page *page, unsigned int order)
{
}

static bool try_to_accept_memory(struct zone *zone, unsigned int order)
{
	return false;
}

static inline bool has_unaccepted_memory(void)
{
	return false;
}

static bool __free_unaccepted(struct page *page)
{
	BUILD_BUG();
	return false;
}

#endif /* CONFIG_UNACCEPTED_MEMORY */