// SPDX-License-Identifier: GPL-2.0-only /* * linux/mm/page_alloc.c * * Manages the free list, the system allocates free pages here. * Note that kmalloc() lives in slab.c * * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds * Swap reorganised 29.12.95, Stephen Tweedie * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 * (lots of bits borrowed from Ingo Molnar & Andrew Morton) */ #include <linux/stddef.h> #include <linux/mm.h> #include <linux/highmem.h> #include <linux/interrupt.h> #include <linux/jiffies.h> #include <linux/compiler.h> #include <linux/kernel.h> #include <linux/kasan.h> #include <linux/kmsan.h> #include <linux/module.h> #include <linux/suspend.h> #include <linux/ratelimit.h> #include <linux/oom.h> #include <linux/topology.h> #include <linux/sysctl.h> #include <linux/cpu.h> #include <linux/cpuset.h> #include <linux/pagevec.h> #include <linux/memory_hotplug.h> #include <linux/nodemask.h> #include <linux/vmstat.h> #include <linux/fault-inject.h> #include <linux/compaction.h> #include <trace/events/kmem.h> #include <trace/events/oom.h> #include <linux/prefetch.h> #include <linux/mm_inline.h> #include <linux/mmu_notifier.h> #include <linux/migrate.h> #include <linux/sched/mm.h> #include <linux/page_owner.h> #include <linux/page_table_check.h> #include <linux/memcontrol.h> #include <linux/ftrace.h> #include <linux/lockdep.h> #include <linux/psi.h> #include <linux/khugepaged.h> #include <linux/delayacct.h> #include <linux/cacheinfo.h> #include <linux/pgalloc_tag.h> #include <asm/div64.h> #include "internal.h" #include "shuffle.h" #include "page_reporting.h" /* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */ fpi_t; /* No special request */ #define FPI_NONE … /* * Skip free page reporting notification for the (possibly merged) page. * This does not hinder free page reporting from grabbing the page, * reporting it and marking it "reported" - it only skips notifying * the free page reporting infrastructure about a newly freed page. For * example, used when temporarily pulling a page from a freelist and * putting it back unmodified. */ #define FPI_SKIP_REPORT_NOTIFY … /* * Place the (possibly merged) page to the tail of the freelist. Will ignore * page shuffling (relevant code - e.g., memory onlining - is expected to * shuffle the whole zone). * * Note: No code should rely on this flag for correctness - it's purely * to allow for optimizations when handing back either fresh pages * (memory onlining) or untouched pages (page isolation, free page * reporting). */ #define FPI_TO_TAIL … /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); #define MIN_PERCPU_PAGELIST_HIGH_FRACTION … #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) /* * On SMP, spin_trylock is sufficient protection. * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP. */ #define pcp_trylock_prepare(flags) … #define pcp_trylock_finish(flag) … #else /* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */ #define pcp_trylock_prepare … #define pcp_trylock_finish … #endif /* * Locking a pcp requires a PCP lookup followed by a spinlock. 
To avoid * a migration causing the wrong PCP to be locked and remote memory being * potentially allocated, pin the task to the CPU for the lookup+lock. * preempt_disable is used on !RT because it is faster than migrate_disable. * migrate_disable is used on RT because otherwise RT spinlock usage is * interfered with and a high priority task cannot preempt the allocator. */ #ifndef CONFIG_PREEMPT_RT #define pcpu_task_pin() … #define pcpu_task_unpin() … #else #define pcpu_task_pin … #define pcpu_task_unpin … #endif /* * Generic helper to look up a per-cpu variable with an embedded spinlock. * The return value should be used with the equivalent unlock helper. */ #define pcpu_spin_lock(type, member, ptr) … #define pcpu_spin_trylock(type, member, ptr) … #define pcpu_spin_unlock(member, ptr) … /* struct per_cpu_pages specific helpers. */ #define pcp_spin_lock(ptr) … #define pcp_spin_trylock(ptr) … #define pcp_spin_unlock(ptr) … #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DEFINE_PER_CPU(int, numa_node); EXPORT_PER_CPU_SYMBOL(…); #endif DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key); #ifdef CONFIG_HAVE_MEMORYLESS_NODES /* * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem() * defined in <linux/topology.h>. */ DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ EXPORT_PER_CPU_SYMBOL(_numa_mem_); #endif static DEFINE_MUTEX(pcpu_drain_mutex); #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY volatile unsigned long latent_entropy __latent_entropy; EXPORT_SYMBOL(latent_entropy); #endif /* * Array of node states. */ nodemask_t node_states[NR_NODE_STATES] __read_mostly = …; EXPORT_SYMBOL(…); gfp_t gfp_allowed_mask __read_mostly = …; #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE unsigned int pageblock_order __read_mostly; #endif static void __free_pages_ok(struct page *page, unsigned int order, fpi_t fpi_flags); /* * results with 256, 32 in the lowmem_reserve sysctl: * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) * 1G machine -> (16M dma, 784M normal, 224M high) * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA * * TBD: should special case ZONE_DMA32 machines here - in those we normally * don't need any ZONE_NORMAL reservation */ static int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = …; char * const zone_names[MAX_NR_ZONES] = …; const char * const migratetype_names[MIGRATE_TYPES] = …; int min_free_kbytes = …; int user_min_free_kbytes = …; static int watermark_boost_factor __read_mostly = …; static int watermark_scale_factor = …; /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ int movable_zone; EXPORT_SYMBOL(…); #if MAX_NUMNODES > 1 unsigned int nr_node_ids __read_mostly = …; unsigned int nr_online_nodes __read_mostly = …; EXPORT_SYMBOL(…); EXPORT_SYMBOL(…); #endif static bool page_contains_unaccepted(struct page *page, unsigned int order); static void accept_page(struct page *page, unsigned int order); static bool try_to_accept_memory(struct zone *zone, unsigned int order); static inline bool has_unaccepted_memory(void); static bool __free_unaccepted(struct page *page); int page_group_by_mobility_disabled __read_mostly; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* * During boot we initialize deferred pages on-demand, as needed, but once *
page_alloc_init_late() has finished, the deferred pages are all initialized, * and we can permanently disable that path. */ DEFINE_STATIC_KEY_TRUE(deferred_pages); static inline bool deferred_pages_enabled(void) { … } /* * deferred_grow_zone() is __init, but it is called from * get_page_from_freelist() during early boot until deferred_pages permanently * disables this call. This is why we have the __ref wrapper, to avoid the * section mismatch warning and to ensure that the function body gets unloaded. */ static bool __ref _deferred_grow_zone(struct zone *zone, unsigned int order) { … } #else static inline bool deferred_pages_enabled(void) { return false; } #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ /* Return a pointer to the bitmap storing bits affecting a block of pages */ static inline unsigned long *get_pageblock_bitmap(const struct page *page, unsigned long pfn) { … } static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn) { … } /** * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages * @page: The page within the block of interest * @pfn: The target page frame number * @mask: mask of bits that the caller is interested in * * Return: pageblock_bits flags */ unsigned long get_pfnblock_flags_mask(const struct page *page, unsigned long pfn, unsigned long mask) { … } static __always_inline int get_pfnblock_migratetype(const struct page *page, unsigned long pfn) { … } /** * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages * @page: The page within the block of interest * @flags: The flags to set * @pfn: The target page frame number * @mask: mask of bits that the caller is interested in */ void set_pfnblock_flags_mask(struct page *page, unsigned long flags, unsigned long pfn, unsigned long mask) { … } void set_pageblock_migratetype(struct page *page, int migratetype) { … } #ifdef CONFIG_DEBUG_VM static int page_outside_zone_boundaries(struct zone *zone, struct page *page) { … } /* * Temporary debugging check for pages not lying within a given zone. */ static bool __maybe_unused bad_range(struct zone *zone, struct page *page) { … } #else static inline bool __maybe_unused bad_range(struct zone *zone, struct page *page) { return false; } #endif static void bad_page(struct page *page, const char *reason) { … } static inline unsigned int order_to_pindex(int migratetype, int order) { … } static inline int pindex_to_order(unsigned int pindex) { … } static inline bool pcp_allowed_order(unsigned int order) { … } /* * Higher-order pages are called "compound pages". They are structured thusly: * * The first PAGE_SIZE page is called the "head page" and has PG_head set. * * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded * in bit 0 of page->compound_head. The rest of the bits are a pointer to the head page. * * The first tail page's ->compound_order holds the order of allocation. * This usage means that zero-order pages may not be compound.
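 *
 * Illustrative sketch of the encoding described above (roughly what the
 * compound_head helpers do, not a public API):
 *
 *	tail->compound_head = (unsigned long)head | 1;	// marks PageTail()
 *	head = (struct page *)(tail->compound_head - 1);	// recovers the head
 *
 * so an order-2 allocation is one head page followed by three tail pages,
 * and compound_order(head) returns 2.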
*/ void prep_compound_page(struct page *page, unsigned int order) { … } static inline void set_buddy_order(struct page *page, unsigned int order) { … } #ifdef CONFIG_COMPACTION static inline struct capture_control *task_capc(struct zone *zone) { … } static inline bool compaction_capture(struct capture_control *capc, struct page *page, int order, int migratetype) { … } #else static inline struct capture_control *task_capc(struct zone *zone) { return NULL; } static inline bool compaction_capture(struct capture_control *capc, struct page *page, int order, int migratetype) { return false; } #endif /* CONFIG_COMPACTION */ static inline void account_freepages(struct zone *zone, int nr_pages, int migratetype) { … } /* Used for pages not on another list */ static inline void __add_to_free_list(struct page *page, struct zone *zone, unsigned int order, int migratetype, bool tail) { … } /* * Used for pages which are on another list. Move the pages to the tail * of the list - so the moved pages won't immediately be considered for * allocation again (e.g., optimization for memory onlining). */ static inline void move_to_free_list(struct page *page, struct zone *zone, unsigned int order, int old_mt, int new_mt) { … } static inline void __del_page_from_free_list(struct page *page, struct zone *zone, unsigned int order, int migratetype) { … } static inline void del_page_from_free_list(struct page *page, struct zone *zone, unsigned int order, int migratetype) { … } static inline struct page *get_page_from_free_area(struct free_area *area, int migratetype) { … } /* * If this is less than the 2nd largest possible page, check if the buddy * of the next-higher order is free. If it is, it's possible * that pages are being freed that will coalesce soon. In case, * that is happening, add the free page to the tail of the list * so it's less likely to be used soon and more likely to be merged * as a 2-level higher order page */ static inline bool buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn, struct page *page, unsigned int order) { … } /* * Freeing function for a buddy system allocator. * * The concept of a buddy system is to maintain direct-mapped table * (containing bit values) for memory blocks of various "orders". * The bottom level table contains the map for the smallest allocatable * units of memory (here, pages), and each level above it describes * pairs of units from the levels below, hence, "buddies". * At a high level, all that happens here is marking the table entry * at the bottom level available, and propagating the changes upward * as necessary, plus some accounting needed to play nicely with other * parts of the VM system. * At each level, we keep a list of pages, which are heads of continuous * free pages of length of (1 << order) and marked with PageBuddy. * Page's order is recorded in page_private(page) field. * So when we are allocating or freeing one, we can derive the state of the * other. That is, if we allocate a small block, and both were * free, the remainder of the region must be split into blocks. * If a block is freed, and its buddy is also free, then this * triggers coalescing into a block of larger size. * * -- nyc */ static inline void __free_one_page(struct page *page, unsigned long pfn, struct zone *zone, unsigned int order, int migratetype, fpi_t fpi_flags) { … } /* * A bad page could be due to a number of fields. Instead of multiple branches, * try and check multiple fields with one check. The caller must do a detailed * check if necessary. 
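 *
 * Illustrative sketch of the idea (simplified; not the exact kernel check):
 *
 *	if (unlikely((unsigned long)page->mapping | page_ref_count(page) |
 *		     (page->flags & check_flags)))
 *		return false;	// caller then reports which field was bad
 *	return true;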
*/ static inline bool page_expected_state(struct page *page, unsigned long check_flags) { … } static const char *page_bad_reason(struct page *page, unsigned long flags) { … } static void free_page_is_bad_report(struct page *page) { … } static inline bool free_page_is_bad(struct page *page) { … } static inline bool is_check_pages_enabled(void) { … } static int free_tail_page_prepare(struct page *head_page, struct page *page) { … } /* * Skip KASAN memory poisoning when either: * * 1. For generic KASAN: deferred memory initialization has not yet completed. * Tag-based KASAN modes skip pages freed via deferred memory initialization * using page tags instead (see below). * 2. For tag-based KASAN modes: the page has a match-all KASAN tag, indicating * that error detection is disabled for accesses via the page address. * * Pages will have match-all tags in the following circumstances: * * 1. Pages are being initialized for the first time, including during deferred * memory init; see the call to page_kasan_tag_reset in __init_single_page. * 2. The allocation was not unpoisoned due to __GFP_SKIP_KASAN, with the * exception of pages unpoisoned by kasan_unpoison_vmalloc. * 3. The allocation was excluded from being checked due to sampling, * see the call to kasan_unpoison_pages. * * Poisoning pages during deferred memory init will greatly lengthen the * process and cause problem in large memory systems as the deferred pages * initialization is done with interrupt disabled. * * Assuming that there will be no reference to those newly initialized * pages before they are ever allocated, this should have no effect on * KASAN memory tracking as the poison will be properly inserted at page * allocation time. The only corner case is when pages are allocated by * on-demand allocation and then freed again before the deferred pages * initialization is done, but this is not likely to happen. */ static inline bool should_skip_kasan_poison(struct page *page) { … } static void kernel_init_pages(struct page *page, int numpages) { … } __always_inline bool free_pages_prepare(struct page *page, unsigned int order) { … } /* * Frees a number of pages from the PCP lists * Assumes all pages on list are in same zone. * count is the number of pages to free. */ static void free_pcppages_bulk(struct zone *zone, int count, struct per_cpu_pages *pcp, int pindex) { … } static void free_one_page(struct zone *zone, struct page *page, unsigned long pfn, unsigned int order, fpi_t fpi_flags) { … } static void __free_pages_ok(struct page *page, unsigned int order, fpi_t fpi_flags) { … } void __meminit __free_pages_core(struct page *page, unsigned int order, enum meminit_context context) { … } /* * Check that the whole (or subset of) a pageblock given by the interval of * [start_pfn, end_pfn) is valid and within the same zone, before scanning it * with the migration of free compaction scanner. * * Return struct page pointer of start_pfn, or NULL if checks were not passed. * * It's possible on some configurations to have a setup like node0 node1 node0 * i.e. it's possible that all pages within a zones range of pages do not * belong to a single zone. We assume that a border between node0 and node1 * can occur within a single pageblock, but not a node0 node1 node0 * interleaving within a single pageblock. It is therefore sufficient to check * the first and last page of a pageblock and avoid checking each individual * page in a pageblock. 
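 *
 * Illustrative sketch (simplified; not the exact implementation):
 *
 *	start_page = pfn_to_online_page(start_pfn);
 *	end_page = pfn_to_online_page(end_pfn - 1);
 *	if (!start_page || !end_page ||
 *	    page_zone(start_page) != zone || page_zone(end_page) != zone)
 *		return NULL;
 *	return start_page;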
* * Note: the function may return non-NULL struct page even for a page block * which contains a memory hole (i.e. there is no physical memory for a subset * of the pfn range). For example, if the pageblock order is MAX_PAGE_ORDER, which * will fall into 2 sub-sections, and the end pfn of the pageblock may be hole * even though the start pfn is online and valid. This should be safe most of * the time because struct pages are still initialized via init_unavailable_range() * and pfn walkers shouldn't touch any physical memory range for which they do * not recognize any specific metadata in struct pages. */ struct page *__pageblock_pfn_to_page(unsigned long start_pfn, unsigned long end_pfn, struct zone *zone) { … } /* * The order of subdivision here is critical for the IO subsystem. * Please do not alter this order without good reasons and regression * testing. Specifically, as large blocks of memory are subdivided, * the order in which smaller blocks are delivered depends on the order * they're subdivided in this function. This is the primary factor * influencing the order in which pages are delivered to the IO * subsystem according to empirical testing, and this is also justified * by considering the behavior of a buddy system containing a single * large block of memory acted on by a series of small allocations. * This behavior is a critical factor in sglist merging's success. * * -- nyc */ static inline void expand(struct zone *zone, struct page *page, int low, int high, int migratetype) { … } static void check_new_page_bad(struct page *page) { … } /* * This page is about to be returned from the page allocator */ static bool check_new_page(struct page *page) { … } static inline bool check_new_pages(struct page *page, unsigned int order) { … } static inline bool should_skip_kasan_unpoison(gfp_t flags) { … } static inline bool should_skip_init(gfp_t flags) { … } inline void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags) { … } static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, unsigned int alloc_flags) { … } /* * Go through the free lists for the given migratetype and remove * the smallest available page from the freelists */ static __always_inline struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, int migratetype) { … } /* * This array describes the order lists are fallen back to when * the free lists for the desirable migrate type are depleted * * The other migratetypes do not have fallbacks. */ static int fallbacks[MIGRATE_PCPTYPES][MIGRATE_PCPTYPES - 1] = …; #ifdef CONFIG_CMA static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone, unsigned int order) { … } #else static inline struct page *__rmqueue_cma_fallback(struct zone *zone, unsigned int order) { return NULL; } #endif /* * Change the type of a block and move all its free pages to that * type's freelist. 
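 *
 * Illustrative sketch (simplified; assumes the zone lock is held): walk the
 * block and move every free buddy found onto the new type's list:
 *
 *	for (pfn = start_pfn; pfn < start_pfn + pageblock_nr_pages;) {
 *		page = pfn_to_page(pfn);
 *		if (!PageBuddy(page)) {
 *			pfn++;
 *			continue;
 *		}
 *		order = buddy_order(page);
 *		move_to_free_list(page, zone, order, old_mt, new_mt);
 *		pfn += 1 << order;
 *	}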
*/ static int __move_freepages_block(struct zone *zone, unsigned long start_pfn, int old_mt, int new_mt) { … } static bool prep_move_freepages_block(struct zone *zone, struct page *page, unsigned long *start_pfn, int *num_free, int *num_movable) { … } static int move_freepages_block(struct zone *zone, struct page *page, int old_mt, int new_mt) { … } #ifdef CONFIG_MEMORY_ISOLATION /* Look for a buddy that straddles start_pfn */ static unsigned long find_large_buddy(unsigned long start_pfn) { … } /* Split a multi-block free page into its individual pageblocks */ static void split_large_buddy(struct zone *zone, struct page *page, unsigned long pfn, int order) { … } /** * move_freepages_block_isolate - move free pages in block for page isolation * @zone: the zone * @page: the pageblock page * @migratetype: migratetype to set on the pageblock * * This is similar to move_freepages_block(), but handles the special * case encountered in page isolation, where the block of interest * might be part of a larger buddy spanning multiple pageblocks. * * Unlike the regular page allocator path, which moves pages while * stealing buddies off the freelist, page isolation is interested in * arbitrary pfn ranges that may have overlapping buddies on both ends. * * This function handles that. Straddling buddies are split into * individual pageblocks. Only the block of interest is moved. * * Returns %true if pages could be moved, %false otherwise. */ bool move_freepages_block_isolate(struct zone *zone, struct page *page, int migratetype) { … } #endif /* CONFIG_MEMORY_ISOLATION */ static void change_pageblock_range(struct page *pageblock_page, int start_order, int migratetype) { … } /* * When we are falling back to another migratetype during allocation, try to * steal extra free pages from the same pageblocks to satisfy further * allocations, instead of polluting multiple pageblocks. * * If we are stealing a relatively large buddy page, it is likely there will * be more free pages in the pageblock, so try to steal them all. For * reclaimable and unmovable allocations, we steal regardless of page size, * as fragmentation caused by those allocations polluting movable pageblocks * is worse than movable allocations stealing from unmovable and reclaimable * pageblocks. */ static bool can_steal_fallback(unsigned int order, int start_mt) { … } static inline bool boost_watermark(struct zone *zone) { … } /* * This function implements actual steal behaviour. If order is large enough, we * can claim the whole pageblock for the requested migratetype. If not, we check * the pageblock for constituent pages; if at least half of the pages are free * or compatible, we can still claim the whole block, so pages freed in the * future will be put on the correct free list. Otherwise, we isolate exactly * the order we need from the fallback block and leave its migratetype alone. */ static struct page * steal_suitable_fallback(struct zone *zone, struct page *page, int current_order, int order, int start_type, unsigned int alloc_flags, bool whole_block) { … } /* * Check whether there is a suitable fallback freepage with requested order. * If only_stealable is true, this function returns fallback_mt only if * we can steal other freepages all together. This would help to reduce * fragmentation due to mixed migratetype pages in one pageblock. 
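 *
 * Illustrative sketch (simplified; fallbacks[] is the table defined above):
 *
 *	for (i = 0; i < MIGRATE_PCPTYPES - 1; i++) {
 *		fallback_mt = fallbacks[migratetype][i];
 *		if (free_area_empty(area, fallback_mt))
 *			continue;
 *		*can_steal = can_steal_fallback(order, migratetype);
 *		if (!only_stealable || *can_steal)
 *			return fallback_mt;
 *	}
 *	return -1;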
*/ int find_suitable_fallback(struct free_area *area, unsigned int order, int migratetype, bool only_stealable, bool *can_steal) { … } /* * Reserve the pageblock(s) surrounding an allocation request for * exclusive use of high-order atomic allocations if there are no * empty page blocks that contain a page with a suitable order */ static void reserve_highatomic_pageblock(struct page *page, int order, struct zone *zone) { … } /* * Used when an allocation is about to fail under memory pressure. This * potentially hurts the reliability of high-order allocations when under * intense memory pressure but failed atomic allocations should be easier * to recover from than an OOM. * * If @force is true, try to unreserve pageblocks even though highatomic * pageblock is exhausted. */ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, bool force) { … } /* * Try finding a free buddy page on the fallback list and put it on the free * list of requested migratetype, possibly along with other pages from the same * block, depending on fragmentation avoidance heuristics. Returns true if * fallback was found so that __rmqueue_smallest() can grab it. * * The use of signed ints for order and current_order is a deliberate * deviation from the rest of this file, to make the for loop * condition simpler. */ static __always_inline struct page * __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, unsigned int alloc_flags) { … } /* * Do the hard work of removing an element from the buddy allocator. * Call me with the zone->lock already held. */ static __always_inline struct page * __rmqueue(struct zone *zone, unsigned int order, int migratetype, unsigned int alloc_flags) { … } /* * Obtain a specified number of elements from the buddy allocator, all under * a single hold of the lock, for efficiency. Add them to the supplied list. * Returns the number of new pages which were placed at *list. */ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, int migratetype, unsigned int alloc_flags) { … } /* * Called from the vmstat counter updater to decay the PCP high. * Return whether there are addition works to do. */ int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp) { … } #ifdef CONFIG_NUMA /* * Called from the vmstat counter updater to drain pagesets of this * currently executing processor on remote nodes after they have * expired. */ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) { … } #endif /* * Drain pcplists of the indicated processor and zone. */ static void drain_pages_zone(unsigned int cpu, struct zone *zone) { … } /* * Drain pcplists of all zones on the indicated processor. */ static void drain_pages(unsigned int cpu) { … } /* * Spill all of this CPU's per-cpu pages back into the buddy allocator. */ void drain_local_pages(struct zone *zone) { … } /* * The implementation of drain_all_pages(), exposing an extra parameter to * drain on all cpus. * * drain_all_pages() is optimized to only execute on cpus where pcplists are * not empty. The check for non-emptiness can however race with a free to * pcplist that has not yet increased the pcp->count from 0 to 1. Callers * that need the guarantee that every CPU has drained can disable the * optimizing racy check. */ static void __drain_all_pages(struct zone *zone, bool force_all_cpus) { … } /* * Spill all the per-cpu pages from all CPUs back into the buddy allocator. * * When zone parameter is non-NULL, spill just the single zone's pages. 
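 *
 * Example usage (illustrative):
 *
 *	drain_all_pages(NULL);	// spill pcplists of every zone on every CPU
 *	drain_all_pages(zone);	// spill only this zone's pcplists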
*/ void drain_all_pages(struct zone *zone) { … } static int nr_pcp_free(struct per_cpu_pages *pcp, int batch, int high, bool free_high) { … } static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone, int batch, bool free_high) { … } static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, struct page *page, int migratetype, unsigned int order) { … } /* * Free a pcp page */ void free_unref_page(struct page *page, unsigned int order) { … } /* * Free a batch of folios */ void free_unref_folios(struct folio_batch *folios) { … } /* * split_page takes a non-compound higher-order page, and splits it into * n (1<<order) sub-pages: page[0..n] * Each sub-page must be freed individually. * * Note: this is probably too low level an operation for use in drivers. * Please consult with lkml before using this in your driver. */ void split_page(struct page *page, unsigned int order) { … } EXPORT_SYMBOL_GPL(…); int __isolate_free_page(struct page *page, unsigned int order) { … } /** * __putback_isolated_page - Return a now-isolated page back where we got it * @page: Page that was isolated * @order: Order of the isolated page * @mt: The page's pageblock's migratetype * * This function is meant to return a page pulled from the free lists via * __isolate_free_page back to the free lists they were pulled from. */ void __putback_isolated_page(struct page *page, unsigned int order, int mt) { … } /* * Update NUMA hit/miss statistics */ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z, long nr_account) { … } static __always_inline struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, unsigned int order, unsigned int alloc_flags, int migratetype) { … } static int nr_pcp_alloc(struct per_cpu_pages *pcp, struct zone *zone, int order) { … } /* Remove page from the per-cpu list, caller must protect the list */ static inline struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order, int migratetype, unsigned int alloc_flags, struct per_cpu_pages *pcp, struct list_head *list) { … } /* Lock and remove page from the per-cpu list */ static struct page *rmqueue_pcplist(struct zone *preferred_zone, struct zone *zone, unsigned int order, int migratetype, unsigned int alloc_flags) { … } /* * Allocate a page from the given zone. * Use pcplists for THP or "cheap" high-order allocations. */ /* * Do not instrument rmqueue() with KMSAN. This function may call * __msan_poison_alloca() through a call to set_pfnblock_flags_mask(). * If __msan_poison_alloca() attempts to allocate pages for the stack depot, it * may call rmqueue() again, which will result in a deadlock. */ __no_sanitize_memory static inline struct page *rmqueue(struct zone *preferred_zone, struct zone *zone, unsigned int order, gfp_t gfp_flags, unsigned int alloc_flags, int migratetype) { … } static inline long __zone_watermark_unusable_free(struct zone *z, unsigned int order, unsigned int alloc_flags) { … } /* * Return true if free base pages are above 'mark'. For high-order checks it * will return true if the order-0 watermark is reached and there is at least * one free page of a suitable size. Checking now avoids taking the zone lock * to check in the allocation paths if no pages are free.
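 *
 * Example usage (illustrative; the same pattern appears in the
 * compaction-retry fallback further down in this file):
 *
 *	if (zone_watermark_ok(zone, 0, min_wmark_pages(zone),
 *			      ac->highest_zoneidx, alloc_flags))
 *		;	// the order-0 min watermark is still met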
*/ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int highest_zoneidx, unsigned int alloc_flags, long free_pages) { … } bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int highest_zoneidx, unsigned int alloc_flags) { … } static inline bool zone_watermark_fast(struct zone *z, unsigned int order, unsigned long mark, int highest_zoneidx, unsigned int alloc_flags, gfp_t gfp_mask) { … } bool zone_watermark_ok_safe(struct zone *z, unsigned int order, unsigned long mark, int highest_zoneidx) { … } #ifdef CONFIG_NUMA int __read_mostly node_reclaim_distance = …; static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) { … } #else /* CONFIG_NUMA */ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) { return true; } #endif /* CONFIG_NUMA */ /* * The restriction on ZONE_DMA32 as being a suitable zone to use to avoid * fragmentation is subtle. If the preferred zone was HIGHMEM then * premature use of a lower zone may cause lowmem pressure problems that * are worse than fragmentation. If the next zone is ZONE_DMA then it is * probably too small. It only makes sense to spread allocations to avoid * fragmentation between the Normal and DMA32 zones. */ static inline unsigned int alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask) { … } /* Must be called after current_gfp_context() which can change gfp_mask */ static inline unsigned int gfp_to_alloc_flags_cma(gfp_t gfp_mask, unsigned int alloc_flags) { … } /* * get_page_from_freelist goes through the zonelist trying to allocate * a page. */ static struct page * get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, const struct alloc_context *ac) { … } static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) { … } void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) { … } static inline struct page * __alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac) { … } static inline struct page * __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, const struct alloc_context *ac, unsigned long *did_some_progress) { … } /* * Maximum number of compaction retries with progress before the OOM * killer is considered the only way to move forward.
*/ #define MAX_COMPACT_RETRIES … #ifdef CONFIG_COMPACTION /* Try memory compaction for high-order allocations before reclaim */ static struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, enum compact_priority prio, enum compact_result *compact_result) { … } static inline bool should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, enum compact_result compact_result, enum compact_priority *compact_priority, int *compaction_retries) { … } #else static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, enum compact_priority prio, enum compact_result *compact_result) { *compact_result = COMPACT_SKIPPED; return NULL; } static inline bool should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags, enum compact_result compact_result, enum compact_priority *compact_priority, int *compaction_retries) { struct zone *zone; struct zoneref *z; if (!order || order > PAGE_ALLOC_COSTLY_ORDER) return false; /* * There are setups with compaction disabled which would prefer to loop * inside the allocator rather than hit the oom killer prematurely. * Let's give them a good hope and keep retrying while the order-0 * watermarks are OK. */ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->highest_zoneidx, ac->nodemask) { if (zone_watermark_ok(zone, 0, min_wmark_pages(zone), ac->highest_zoneidx, alloc_flags)) return true; } return false; } #endif /* CONFIG_COMPACTION */ #ifdef CONFIG_LOCKDEP static struct lockdep_map __fs_reclaim_map = …; static bool __need_reclaim(gfp_t gfp_mask) { … } void __fs_reclaim_acquire(unsigned long ip) { … } void __fs_reclaim_release(unsigned long ip) { … } void fs_reclaim_acquire(gfp_t gfp_mask) { … } EXPORT_SYMBOL_GPL(…); void fs_reclaim_release(gfp_t gfp_mask) { … } EXPORT_SYMBOL_GPL(…); #endif /* * Zonelists may change due to hotplug during allocation. Detect when zonelists * have been rebuilt so allocation retries. Reader side does not lock and * retries the allocation if zonelist changes. Writer side is protected by the * embedded spin_lock. */ static DEFINE_SEQLOCK(zonelist_update_seq); static unsigned int zonelist_iter_begin(void) { … } static unsigned int check_retry_zonelist(unsigned int seq) { … } /* Perform direct synchronous page reclaim */ static unsigned long __perform_reclaim(gfp_t gfp_mask, unsigned int order, const struct alloc_context *ac) { … } /* The really slow allocator path where we enter direct reclaim */ static inline struct page * __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, unsigned long *did_some_progress) { … } static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask, const struct alloc_context *ac) { … } static inline unsigned int gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order) { … } static bool oom_reserves_allowed(struct task_struct *tsk) { … } /* * Distinguish requests which really need access to full memory * reserves from oom victims which can live with a portion of it */ static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask) { … } bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) { … } /* * Checks whether it makes sense to retry the reclaim to make a forward progress * for the given allocation request. 
* * We give up when we either have tried MAX_RECLAIM_RETRIES in a row * without success, or when we couldn't even meet the watermark if we * reclaimed all remaining pages on the LRU lists. * * Returns true if a retry is viable or false to enter the oom path. */ static inline bool should_reclaim_retry(gfp_t gfp_mask, unsigned order, struct alloc_context *ac, int alloc_flags, bool did_some_progress, int *no_progress_loops) { … } static inline bool check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac) { … } static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) { … } static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, int preferred_nid, nodemask_t *nodemask, struct alloc_context *ac, gfp_t *alloc_gfp, unsigned int *alloc_flags) { … } /* * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array * @gfp: GFP flags for the allocation * @preferred_nid: The preferred NUMA node ID to allocate from * @nodemask: Set of nodes to allocate from, may be NULL * @nr_pages: The number of pages desired on the list or array * @page_list: Optional list to store the allocated pages * @page_array: Optional array to store the pages * * This is a batched version of the page allocator that attempts to * allocate nr_pages quickly. Pages are added to page_list if page_list * is not NULL, otherwise it is assumed that the page_array is valid. * * For lists, nr_pages is the number of pages that should be allocated. * * For arrays, only NULL elements are populated with pages and nr_pages * is the maximum number of pages that will be stored in the array. * * Returns the number of pages on the list or array. */ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, nodemask_t *nodemask, int nr_pages, struct list_head *page_list, struct page **page_array) { … } EXPORT_SYMBOL_GPL(…); /* * This is the 'heart' of the zoned buddy allocator. */ struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, int preferred_nid, nodemask_t *nodemask) { … } EXPORT_SYMBOL(…); struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_nid, nodemask_t *nodemask) { … } EXPORT_SYMBOL(…); /* * Common helper functions. Never use with __GFP_HIGHMEM because the returned * address cannot represent highmem pages. Use alloc_pages and then kmap if * you need to access high mem. */ unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order) { … } EXPORT_SYMBOL(…); unsigned long get_zeroed_page_noprof(gfp_t gfp_mask) { … } EXPORT_SYMBOL(…); /** * __free_pages - Free pages allocated with alloc_pages(). * @page: The page pointer returned from alloc_pages(). * @order: The order of the allocation. * * This function can free multi-page allocations that are not compound * pages. It does not check that the @order passed in matches that of * the allocation, so it is easy to leak memory. Freeing more memory * than was allocated will probably emit a warning. * * If the last reference to this page is speculative, it will be released * by put_page() which only frees the first page of a non-compound * allocation. To prevent the remaining pages from being leaked, we free * the subsequent pages here. If you want to use the page's reference * count to decide when to free the allocation, you should allocate a * compound page, and use put_page() instead of __free_pages(). 
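 *
 * Example (illustrative):
 *
 *	page = alloc_pages(GFP_KERNEL, 2);	// four contiguous, non-compound pages
 *	if (page) {
 *		...
 *		__free_pages(page, 2);		// order must match the allocation
 *	}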
* * Context: May be called in interrupt context or while holding a normal * spinlock, but not in NMI context or while holding a raw spinlock. */ void __free_pages(struct page *page, unsigned int order) { … } EXPORT_SYMBOL(…); void free_pages(unsigned long addr, unsigned int order) { … } EXPORT_SYMBOL(…); /* * Page Fragment: * An arbitrary-length arbitrary-offset area of memory which resides * within a 0 or higher order page. Multiple fragments within that page * are individually refcounted, in the page's reference counter. * * The page_frag functions below provide a simple allocation framework for * page fragments. This is used by the network stack and network device * drivers to provide a backing region of memory for use as either an * sk_buff->head, or to be used in the "frags" portion of skb_shared_info. */ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc, gfp_t gfp_mask) { … } void page_frag_cache_drain(struct page_frag_cache *nc) { … } EXPORT_SYMBOL(…); void __page_frag_cache_drain(struct page *page, unsigned int count) { … } EXPORT_SYMBOL(…); void *__page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask, unsigned int align_mask) { … } EXPORT_SYMBOL(…); /* * Frees a page fragment allocated out of either a compound or order 0 page. */ void page_frag_free(void *addr) { … } EXPORT_SYMBOL(…); static void *make_alloc_exact(unsigned long addr, unsigned int order, size_t size) { … } /** * alloc_pages_exact - allocate an exact number physically-contiguous pages. * @size: the number of bytes to allocate * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP * * This function is similar to alloc_pages(), except that it allocates the * minimum number of pages to satisfy the request. alloc_pages() can only * allocate memory in power-of-two pages. * * This function is also limited by MAX_PAGE_ORDER. * * Memory allocated by this function must be released by free_pages_exact(). * * Return: pointer to the allocated area or %NULL in case of error. */ void *alloc_pages_exact_noprof(size_t size, gfp_t gfp_mask) { … } EXPORT_SYMBOL(…); /** * alloc_pages_exact_nid - allocate an exact number of physically-contiguous * pages on a node. * @nid: the preferred node ID where memory should be allocated * @size: the number of bytes to allocate * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP * * Like alloc_pages_exact(), but try to allocate on node nid first before falling * back. * * Return: pointer to the allocated area or %NULL in case of error. */ void * __meminit alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mask) { … } /** * free_pages_exact - release memory allocated via alloc_pages_exact() * @virt: the value returned by alloc_pages_exact. * @size: size of allocation, same value as passed to alloc_pages_exact(). * * Release the memory allocated by a previous call to alloc_pages_exact. */ void free_pages_exact(void *virt, size_t size) { … } EXPORT_SYMBOL(…); /** * nr_free_zone_pages - count number of pages beyond high watermark * @offset: The zone index of the highest zone * * nr_free_zone_pages() counts the number of pages which are beyond the * high watermark within all zones at or below a given zone index. For each * zone, the number of pages is calculated as: * * nr_free_zone_pages = managed_pages - high_pages * * Return: number of pages beyond high watermark. 
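 *
 * Worked example (illustrative numbers): a zone with 262144 managed pages and
 * a high watermark of 4096 pages contributes 262144 - 4096 = 258048 pages.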
*/ static unsigned long nr_free_zone_pages(int offset) { … } /** * nr_free_buffer_pages - count number of pages beyond high watermark * * nr_free_buffer_pages() counts the number of pages which are beyond the high * watermark within ZONE_DMA and ZONE_NORMAL. * * Return: number of pages beyond high watermark within ZONE_DMA and * ZONE_NORMAL. */ unsigned long nr_free_buffer_pages(void) { … } EXPORT_SYMBOL_GPL(…); static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) { … } /* * Builds allocation fallback zone lists. * * Add all populated zones of a node to the zonelist. */ static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs) { … } #ifdef CONFIG_NUMA static int __parse_numa_zonelist_order(char *s) { … } static char numa_zonelist_order[] = …; #define NUMA_ZONELIST_ORDER_LEN … /* * sysctl handler for numa_zonelist_order */ static int numa_zonelist_order_handler(const struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { … } static int node_load[MAX_NUMNODES]; /** * find_next_best_node - find the next node that should appear in a given node's fallback list * @node: node whose fallback list we're appending * @used_node_mask: nodemask_t of already used nodes * * We use a number of factors to determine which is the next node that should * appear on a given node's fallback list. The node should not have appeared * already in @node's fallback list, and it should be the next closest node * according to the distance array (which contains arbitrary distance values * from each node to each node in the system), and should also prefer nodes * with no CPUs, since presumably they'll have very little allocation pressure * on them otherwise. * * Return: node id of the found node or %NUMA_NO_NODE if no node is found. */ int find_next_best_node(int node, nodemask_t *used_node_mask) { … } /* * Build zonelists ordered by node and zones within node. * This results in maximum locality--normal zone overflows into local * DMA zone, if any--but risks exhausting DMA zone. */ static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order, unsigned nr_nodes) { … } /* * Build __GFP_THISNODE zonelists */ static void build_thisnode_zonelists(pg_data_t *pgdat) { … } /* * Build zonelists ordered by zone and nodes within zones. * This results in conserving DMA zone[s] until all Normal memory is * exhausted, but results in overflowing to remote node while memory * may still exist in local DMA zone. */ static void build_zonelists(pg_data_t *pgdat) { … } #ifdef CONFIG_HAVE_MEMORYLESS_NODES /* * Return node id of node used for "local" allocations. * I.e., first node id of first zone in arg node's generic zonelist. * Used for initializing percpu 'numa_mem', which is used primarily * for kernel allocations, so use GFP_KERNEL flags to locate zonelist. */ int local_memory_node(int node) { struct zoneref *z; z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL), gfp_zone(GFP_KERNEL), NULL); return zone_to_nid(z->zone); } #endif static void setup_min_unmapped_ratio(void); static void setup_min_slab_ratio(void); #else /* CONFIG_NUMA */ static void build_zonelists(pg_data_t *pgdat) { struct zoneref *zonerefs; int nr_zones; zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs; nr_zones = build_zonerefs_node(pgdat, zonerefs); zonerefs += nr_zones; zonerefs->zone = NULL; zonerefs->zone_idx = 0; } #endif /* CONFIG_NUMA */ /* * Boot pageset table. One per cpu which is going to be used for all * zones and all nodes. 
The parameters will be set in such a way * that an item put on a list will immediately be handed over to * the buddy list. This is safe since pageset manipulation is done * with interrupts disabled. * * The boot_pagesets must be kept even after bootup is complete for * unused processors and/or zones. They do play a role for bootstrapping * hotplugged processors. * * zoneinfo_show() and maybe other functions do * not check if the processor is online before following the pageset pointer. * Other parts of the kernel may not check if the zone is available. */ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats); /* These effectively disable the pcplists in the boot pageset completely */ #define BOOT_PAGESET_HIGH … #define BOOT_PAGESET_BATCH … static DEFINE_PER_CPU(struct per_cpu_pages, boot_pageset); static DEFINE_PER_CPU(struct per_cpu_zonestat, boot_zonestats); static void __build_all_zonelists(void *data) { … } static noinline void __init build_all_zonelists_init(void) { … } /* * unless system_state == SYSTEM_BOOTING. * * __ref due to call of __init annotated helper build_all_zonelists_init * [protected by SYSTEM_BOOTING]. */ void __ref build_all_zonelists(pg_data_t *pgdat) { … } static int zone_batchsize(struct zone *zone) { … } static int percpu_pagelist_high_fraction; static int zone_highsize(struct zone *zone, int batch, int cpu_online, int high_fraction) { … } /* * pcp->high and pcp->batch values are related and generally batch is lower * than high. They are also related to pcp->count such that count is lower * than high, and as soon as it reaches high, the pcplist is flushed. * * However, guaranteeing these relations at all times would require e.g. write * barriers here but also careful usage of read barriers at the read side, and * thus be prone to error and bad for performance. Thus the update only prevents * store tearing. Any new users of pcp->batch, pcp->high_min and pcp->high_max * should ensure they can cope with those fields changing asynchronously, and * fully trust only the pcp->count field on the local CPU with interrupts * disabled. * * mutex_is_locked(&pcp_batch_high_lock) required when calling this function * outside of boot time (or some other assurance that no concurrent updaters * exist). */ static void pageset_update(struct per_cpu_pages *pcp, unsigned long high_min, unsigned long high_max, unsigned long batch) { … } static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats) { … } static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high_min, unsigned long high_max, unsigned long batch) { … } /* * Calculate and set new high and batch values for all per-cpu pagesets of a * zone based on the zone's size. */ static void zone_set_pageset_high_and_batch(struct zone *zone, int cpu_online) { … } void __meminit setup_zone_pageset(struct zone *zone) { … } /* * The zone indicated has a new number of managed_pages; batch sizes and percpu * page high values need to be recalculated. */ static void zone_pcp_update(struct zone *zone, int cpu_online) { … } static void zone_pcp_update_cacheinfo(struct zone *zone, unsigned int cpu) { … } void setup_pcp_cacheinfo(unsigned int cpu) { … } /* * Allocate per cpu pagesets and initialize them. * Before this call only boot pagesets were available. 
*/ void __init setup_per_cpu_pageset(void) { … } __meminit void zone_pcp_init(struct zone *zone) { … } void adjust_managed_page_count(struct page *page, long count) { … } EXPORT_SYMBOL(…); unsigned long free_reserved_area(void *start, void *end, int poison, const char *s) { … } void free_reserved_page(struct page *page) { … } EXPORT_SYMBOL(…); static int page_alloc_cpu_dead(unsigned int cpu) { … } static int page_alloc_cpu_online(unsigned int cpu) { … } void __init page_alloc_init_cpuhp(void) { … } /* * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio * or min_free_kbytes changes. */ static void calculate_totalreserve_pages(void) { … } /* * setup_per_zone_lowmem_reserve - called whenever * sysctl_lowmem_reserve_ratio changes. Ensures that each zone * has a correct pages reserved value, so an adequate number of * pages are left in the zone after a successful __alloc_pages(). */ static void setup_per_zone_lowmem_reserve(void) { … } static void __setup_per_zone_wmarks(void) { … } /** * setup_per_zone_wmarks - called when min_free_kbytes changes * or when memory is hot-{added|removed} * * Ensures that the watermark[min,low,high] values for each zone are set * correctly with respect to min_free_kbytes. */ void setup_per_zone_wmarks(void) { … } /* * Initialise min_free_kbytes. * * For small machines we want it small (128k min). For large machines * we want it large (256MB max). But it is not linear, because network * bandwidth does not increase linearly with machine size. We use * * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: * min_free_kbytes = sqrt(lowmem_kbytes * 16) * * which yields * * 16MB: 512k * 32MB: 724k * 64MB: 1024k * 128MB: 1448k * 256MB: 2048k * 512MB: 2896k * 1024MB: 4096k * 2048MB: 5792k * 4096MB: 8192k * 8192MB: 11584k * 16384MB: 16384k */ void calculate_min_free_kbytes(void) { … } int __meminit init_per_zone_wmark_min(void) { … } postcore_initcall(…) … /* * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so * that we can call two helper functions whenever min_free_kbytes * changes. */ static int min_free_kbytes_sysctl_handler(const struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { … } static int watermark_scale_factor_sysctl_handler(const struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { … } #ifdef CONFIG_NUMA static void setup_min_unmapped_ratio(void) { … } static int sysctl_min_unmapped_ratio_sysctl_handler(const struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { … } static void setup_min_slab_ratio(void) { … } static int sysctl_min_slab_ratio_sysctl_handler(const struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { … } #endif /* * lowmem_reserve_ratio_sysctl_handler - just a wrapper around * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() * whenever sysctl_lowmem_reserve_ratio changes. * * The reserve ratio obviously has absolutely no relation with the * minimum watermarks. The lowmem reserve ratio can only make sense * if in function of the boot time zone sizes. */ static int lowmem_reserve_ratio_sysctl_handler(const struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { … } /* * percpu_pagelist_high_fraction - changes the pcp->high for each zone on each * cpu. It is the fraction of total pages in each zone that a hot per cpu * pagelist can have before it gets flushed back to buddy allocator. 
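 *
 * Worked example (illustrative numbers; the per-CPU value is an
 * approximation): with 1048576 managed pages in a zone, a fraction of 8 and
 * 4 online CPUs, each CPU's pcp->high ends up around
 * 1048576 / 8 / 4 = 32768 pages.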
*/ static int percpu_pagelist_high_fraction_sysctl_handler(const struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { … } static struct ctl_table page_alloc_sysctl_table[] = …; void __init page_alloc_sysctl_init(void) { … } #ifdef CONFIG_CONTIG_ALLOC /* Usage: See admin-guide/dynamic-debug-howto.rst */ static void alloc_contig_dump_pages(struct list_head *page_list) { … } /* * [start, end) must belong to a single zone. * @migratetype: using migratetype to filter the type of migration in * trace_mm_alloc_contig_migrate_range_info. */ int __alloc_contig_migrate_range(struct compact_control *cc, unsigned long start, unsigned long end, int migratetype) { … } /** * alloc_contig_range() -- tries to allocate given range of pages * @start: start PFN to allocate * @end: one-past-the-last PFN to allocate * @migratetype: migratetype of the underlying pageblocks (either * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks * in range must have the same migratetype and it must * be either of the two. * @gfp_mask: GFP mask to use during compaction * * The PFN range does not have to be pageblock aligned. The PFN range must * belong to a single zone. * * The first thing this routine does is attempt to MIGRATE_ISOLATE all * pageblocks in the range. Once isolated, the pageblocks should not * be modified by others. * * Return: zero on success or negative error code. On success all * pages which PFN is in [start, end) are allocated for the caller and * need to be freed with free_contig_range(). */ int alloc_contig_range_noprof(unsigned long start, unsigned long end, unsigned migratetype, gfp_t gfp_mask) { … } EXPORT_SYMBOL(…); static int __alloc_contig_pages(unsigned long start_pfn, unsigned long nr_pages, gfp_t gfp_mask) { … } static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn, unsigned long nr_pages) { … } static bool zone_spans_last_pfn(const struct zone *zone, unsigned long start_pfn, unsigned long nr_pages) { … } /** * alloc_contig_pages() -- tries to find and allocate contiguous range of pages * @nr_pages: Number of contiguous pages to allocate * @gfp_mask: GFP mask to limit search and used during compaction * @nid: Target node * @nodemask: Mask for other possible nodes * * This routine is a wrapper around alloc_contig_range(). It scans over zones * on an applicable zonelist to find a contiguous pfn range which can then be * tried for allocation with alloc_contig_range(). This routine is intended * for allocation requests which can not be fulfilled with the buddy allocator. * * The allocated memory is always aligned to a page boundary. If nr_pages is a * power of two, then allocated range is also guaranteed to be aligned to same * nr_pages (e.g. 1GB request would be aligned to 1GB). * * Allocated pages can be freed with free_contig_range() or by manually calling * __free_page() on each allocated page. * * Return: pointer to contiguous pages on success, or NULL if not successful. */ struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { … } #endif /* CONFIG_CONTIG_ALLOC */ void free_contig_range(unsigned long pfn, unsigned long nr_pages) { … } EXPORT_SYMBOL(…); /* * Effectively disable pcplists for the zone by setting the high limit to 0 * and draining all cpus. A concurrent page freeing on another CPU that's about * to put the page on pcplist will either finish before the drain and the page * will be drained, or observe the new high limit and skip the pcplist. 
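 *
 * Example usage (illustrative):
 *
 *	zone_pcp_disable(zone);
 *	... operate on the zone's free lists without pcplist interference ...
 *	zone_pcp_enable(zone);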
* * Must be paired with a call to zone_pcp_enable(). */ void zone_pcp_disable(struct zone *zone) { … } void zone_pcp_enable(struct zone *zone) { … } void zone_pcp_reset(struct zone *zone) { … } #ifdef CONFIG_MEMORY_HOTREMOVE /* * All pages in the range must be in a single zone, must not contain holes, * must span full sections, and must be isolated before calling this function. * * Returns the number of managed (non-PageOffline()) pages in the range: the * number of pages for which memory offlining code must adjust managed page * counters using adjust_managed_page_count(). */ unsigned long __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) { … } #endif /* * This function returns a stable result only if called under zone lock. */ bool is_free_buddy_page(const struct page *page) { … } EXPORT_SYMBOL(…); #ifdef CONFIG_MEMORY_FAILURE static inline void add_to_free_list(struct page *page, struct zone *zone, unsigned int order, int migratetype, bool tail) { … } /* * Break down a higher-order page in sub-pages, and keep our target out of * buddy allocator. */ static void break_down_buddy_pages(struct zone *zone, struct page *page, struct page *target, int low, int high, int migratetype) { … } /* * Take a page that will be marked as poisoned off the buddy allocator. */ bool take_page_off_buddy(struct page *page) { … } /* * Cancel takeoff done by take_page_off_buddy(). */ bool put_page_back_buddy(struct page *page) { … } #endif #ifdef CONFIG_ZONE_DMA bool has_managed_dma(void) { … } #endif /* CONFIG_ZONE_DMA */ #ifdef CONFIG_UNACCEPTED_MEMORY /* Counts number of zones with unaccepted pages. */ static DEFINE_STATIC_KEY_FALSE(zones_with_unaccepted_pages); static bool lazy_accept = …; static int __init accept_memory_parse(char *p) { … } early_param(…); static bool page_contains_unaccepted(struct page *page, unsigned int order) { … } static void accept_page(struct page *page, unsigned int order) { … } static bool try_to_accept_memory_one(struct zone *zone) { … } static bool try_to_accept_memory(struct zone *zone, unsigned int order) { … } static inline bool has_unaccepted_memory(void) { … } static bool __free_unaccepted(struct page *page) { … } #else static bool page_contains_unaccepted(struct page *page, unsigned int order) { return false; } static void accept_page(struct page *page, unsigned int order) { } static bool try_to_accept_memory(struct zone *zone, unsigned int order) { return false; } static inline bool has_unaccepted_memory(void) { return false; } static bool __free_unaccepted(struct page *page) { BUILD_BUG(); return false; } #endif /* CONFIG_UNACCEPTED_MEMORY */