// SPDX-License-Identifier: GPL-2.0
/*
 * linux/mm/compaction.c
 *
 * Memory compaction for the reduction of external fragmentation. Note that
 * this heavily depends upon page migration to do all the real heavy
 * lifting.
 *
 * Copyright IBM Corp. 2007-2010 Mel Gorman <[email protected]>
 */
#include <linux/cpu.h>
#include <linux/swap.h>
#include <linux/migrate.h>
#include <linux/compaction.h>
#include <linux/mm_inline.h>
#include <linux/sched/signal.h>
#include <linux/backing-dev.h>
#include <linux/sysctl.h>
#include <linux/sysfs.h>
#include <linux/page-isolation.h>
#include <linux/kasan.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/page_owner.h>
#include <linux/psi.h>
#include <linux/cpuset.h>
#include "internal.h"

#ifdef CONFIG_COMPACTION
/*
 * Fragmentation score check interval for proactive compaction purposes.
 */
#define HPAGE_FRAG_CHECK_INTERVAL_MSEC	…

static inline void count_compact_event(enum vm_event_item item)
{
	…
}

static inline void count_compact_events(enum vm_event_item item, long delta)
{
	…
}

/*
 * order == -1 is expected when compacting proactively via
 * 1. /proc/sys/vm/compact_memory
 * 2. /sys/devices/system/node/nodex/compact
 * 3. /proc/sys/vm/compaction_proactiveness
 */
static inline bool is_via_compact_memory(int order)
{
	…
}

#else
#define count_compact_event	…
#define count_compact_events	…
static inline bool is_via_compact_memory(int order) { return false; }
#endif

#if defined CONFIG_COMPACTION || defined CONFIG_CMA

#define CREATE_TRACE_POINTS
#include <trace/events/compaction.h>

#define block_start_pfn(pfn, order)	…
#define block_end_pfn(pfn, order)	…

/*
 * Page order with respect to which proactive compaction
 * calculates external fragmentation, which is used as
 * the "fragmentation score" of a node/zone.
 */
#if defined CONFIG_TRANSPARENT_HUGEPAGE
#define COMPACTION_HPAGE_ORDER	…
#elif defined CONFIG_HUGETLBFS
#define COMPACTION_HPAGE_ORDER	…
#else
#define COMPACTION_HPAGE_ORDER	…
#endif

static struct page *mark_allocated_noprof(struct page *page, unsigned int order,
					  gfp_t gfp_flags)
{
	…
}
#define mark_allocated(...)	…

static unsigned long release_free_list(struct list_head *freepages)
{
	…
}

#ifdef CONFIG_COMPACTION
bool PageMovable(struct page *page)
{
	…
}

void __SetPageMovable(struct page *page, const struct movable_operations *mops)
{
	…
}
EXPORT_SYMBOL(…);

void __ClearPageMovable(struct page *page)
{
	…
}
EXPORT_SYMBOL(…);

/* Do not skip compaction more than 64 times */
#define COMPACT_MAX_DEFER_SHIFT	…

/*
 * Compaction is deferred when compaction fails to result in a page
 * allocation success. The next 1 << compact_defer_shift compactions are
 * skipped, up to a limit of 1 << COMPACT_MAX_DEFER_SHIFT.
 */
static void defer_compaction(struct zone *zone, int order)
{
	…
}

/* Returns true if compaction should be skipped this time */
static bool compaction_deferred(struct zone *zone, int order)
{
	…
}

/*
 * Update defer tracking counters after successful compaction of given order,
 * which means an allocation either succeeded (alloc_success == true) or is
 * expected to succeed.
 */
void compaction_defer_reset(struct zone *zone, int order, bool alloc_success)
{
	…
}

/* Returns true if restarting compaction after many failures */
static bool compaction_restarting(struct zone *zone, int order)
{
	…
}
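/*
 * Illustrative sketch of the deferral back-off described above, not the
 * kernel's defer_compaction()/compaction_deferred() bodies. The idea: each
 * compaction failure doubles the number of subsequent attempts that are
 * skipped, capped at 1 << COMPACT_MAX_DEFER_SHIFT. The struct, field and
 * helper names below are hypothetical and exist only to show the
 * arithmetic; compact_defer_shift is the per-zone counter named in the
 * comment above.
 */
struct defer_state_example {
	unsigned int considered;	/* attempts skipped since last failure */
	unsigned int defer_shift;	/* current back-off exponent */
};

/* Record a failure: widen the back-off window, up to the cap. */
static void example_defer(struct defer_state_example *ds, unsigned int max_shift)
{
	ds->considered = 0;
	if (++ds->defer_shift > max_shift)
		ds->defer_shift = max_shift;
}

/* Return true if this compaction attempt should be skipped. */
static bool example_deferred(struct defer_state_example *ds)
{
	unsigned int limit = 1U << ds->defer_shift;

	if (++ds->considered >= limit) {
		ds->considered = limit;
		return false;	/* window exhausted: try compaction again */
	}
	return true;		/* still inside the window: skip */
}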
/* Returns true if the pageblock should be scanned for pages to isolate. */
static inline bool isolation_suitable(struct compact_control *cc,
					struct page *page)
{
	…
}

static void reset_cached_positions(struct zone *zone)
{
	…
}

#ifdef CONFIG_SPARSEMEM
/*
 * If the PFN falls into an offline section, return the start PFN of the
 * next online section. If the PFN falls into an online section or if
 * there is no next online section, return 0.
 */
static unsigned long skip_offline_sections(unsigned long start_pfn)
{
	…
}

/*
 * If the PFN falls into an offline section, return the end PFN of the
 * next online section in reverse. If the PFN falls into an online section
 * or if there is no next online section in reverse, return 0.
 */
static unsigned long skip_offline_sections_reverse(unsigned long start_pfn)
{
	…
}
#else
static unsigned long skip_offline_sections(unsigned long start_pfn)
{
	return 0;
}

static unsigned long skip_offline_sections_reverse(unsigned long start_pfn)
{
	return 0;
}
#endif

/*
 * Compound pages of >= pageblock_order should consistently be skipped until
 * released. It is always pointless to compact pages of such order (if they are
 * migratable), and the pageblocks they occupy cannot contain any free pages.
 */
static bool pageblock_skip_persistent(struct page *page)
{
	…
}

static bool __reset_isolation_pfn(struct zone *zone, unsigned long pfn,
					bool check_source, bool check_target)
{
	…
}

/*
 * This function is called to clear all cached information on pageblocks that
 * should be skipped for page isolation when the migrate and free page scanner
 * meet.
 */
static void __reset_isolation_suitable(struct zone *zone)
{
	…
}

void reset_isolation_suitable(pg_data_t *pgdat)
{
	…
}

/*
 * Sets the pageblock skip bit if it was clear. Note that this is a hint as
 * locks are not required for read/writers. Returns true if it was already set.
 */
static bool test_and_set_skip(struct compact_control *cc, struct page *page)
{
	…
}

static void update_cached_migrate(struct compact_control *cc, unsigned long pfn)
{
	…
}

/*
 * If no pages were isolated then mark this pageblock to be skipped in the
 * future. The information is later cleared by __reset_isolation_suitable().
 */
static void update_pageblock_skip(struct compact_control *cc,
				  struct page *page, unsigned long pfn)
{
	…
}

#else
static inline bool isolation_suitable(struct compact_control *cc,
					struct page *page)
{
	return true;
}

static inline bool pageblock_skip_persistent(struct page *page)
{
	return false;
}

static inline void update_pageblock_skip(struct compact_control *cc,
					 struct page *page, unsigned long pfn)
{
}

static void update_cached_migrate(struct compact_control *cc, unsigned long pfn)
{
}

static bool test_and_set_skip(struct compact_control *cc, struct page *page)
{
	return false;
}
#endif /* CONFIG_COMPACTION */

/*
 * Compaction requires the taking of some coarse locks that are potentially
 * very heavily contended. For async compaction, trylock and record if the
 * lock is contended. The lock will still be acquired but compaction will
 * abort when the current block is finished regardless of success rate.
 * Sync compaction acquires the lock.
 *
 * Always returns true which makes it easier to track lock state in callers.
 */
static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags,
						struct compact_control *cc)
	__acquires(lock)
{
	…
}
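/*
 * Illustrative sketch of the async trylock pattern described above, not the
 * body of compact_lock_irqsave() itself. For asynchronous compaction the
 * lock is tried first; on contention a flag is recorded so the caller can
 * bail out at the next pageblock boundary, but the lock is still taken so
 * the current block can be finished. The "async" and "contended" parameters
 * stand in for the corresponding compact_control state and are hypothetical.
 */
static bool example_lock_irqsave(spinlock_t *lock, unsigned long *flags,
				 bool async, bool *contended)
{
	if (async) {
		if (!spin_trylock_irqsave(lock, *flags)) {
			*contended = true;
			spin_lock_irqsave(lock, *flags);
		}
	} else {
		spin_lock_irqsave(lock, *flags);
	}

	return true;	/* always true, mirroring the convention noted above */
}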
/*
 * Compaction requires the taking of some coarse locks that are potentially
 * very heavily contended. The lock should be periodically unlocked to avoid
 * having disabled IRQs for a long time, even when there is nobody waiting on
 * the lock. It might also be that allowing the IRQs will result in
 * need_resched() becoming true. If scheduling is needed, compaction schedules.
 * Either compaction type will also abort if a fatal signal is pending.
 * In either case, if the lock was locked, it is dropped and not regained.
 *
 * Returns true if compaction should abort due to a pending fatal signal.
 * Returns false when compaction can continue.
 */
static bool compact_unlock_should_abort(spinlock_t *lock,
		unsigned long flags, bool *locked, struct compact_control *cc)
{
	…
}

/*
 * Isolate free pages onto a private freelist. If @strict is true, will abort
 * returning 0 on any invalid PFNs or non-free pages inside of the pageblock
 * (even though it may still end up isolating some pages).
 */
static unsigned long isolate_freepages_block(struct compact_control *cc,
				unsigned long *start_pfn,
				unsigned long end_pfn,
				struct list_head *freelist,
				unsigned int stride,
				bool strict)
{
	…
}

/**
 * isolate_freepages_range() - isolate free pages.
 * @cc:        Compaction control structure.
 * @start_pfn: The first PFN to start isolating.
 * @end_pfn:   The one-past-last PFN.
 *
 * Non-free pages, invalid PFNs, or zone boundaries within the
 * [start_pfn, end_pfn) range are considered errors and cause the function to
 * undo its actions and return zero. cc->freepages[] are empty.
 *
 * Otherwise, the function returns the one-past-the-last PFN of the isolated
 * page (which may be greater than end_pfn if the end fell in the middle of
 * a free page). cc->freepages[] contain the free pages isolated.
 */
unsigned long
isolate_freepages_range(struct compact_control *cc,
			unsigned long start_pfn, unsigned long end_pfn)
{
	…
}

/* Similar to reclaim, but different enough that they don't share logic */
static bool too_many_isolated(struct compact_control *cc)
{
	…
}

/**
 * skip_isolation_on_order() - determine when to skip folio isolation based on
 *			       folio order and compaction target order
 * @order:		to-be-isolated folio order
 * @target_order:	compaction target order
 *
 * This avoids unnecessary folio isolations during compaction.
 */
static bool skip_isolation_on_order(int order, int target_order)
{
	…
}

/**
 * isolate_migratepages_block() - isolate all migrate-able pages within
 *				  a single pageblock
 * @cc:		Compaction control structure.
 * @low_pfn:	The first PFN to isolate
 * @end_pfn:	The one-past-the-last PFN to isolate, within the same pageblock
 * @mode:	Isolation mode to be used.
 *
 * Isolate all pages that can be migrated from the range specified by
 * [low_pfn, end_pfn). The range is expected to be within the same pageblock.
 * Returns an errno, like -EAGAIN or -EINTR in case of, e.g., a pending signal
 * or congestion, -ENOMEM in case we could not allocate a page, or 0.
 * cc->migrate_pfn will contain the next pfn to scan.
 *
 * The pages are isolated on the cc->migratepages list (not required to be
 * empty), and cc->nr_migratepages is updated accordingly.
 */
static int
isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
			unsigned long end_pfn, isolate_mode_t mode)
{
	…
}
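/*
 * Illustrative sketch of the skip heuristic documented for
 * skip_isolation_on_order() above, not necessarily its exact body.
 * Isolating a folio whose order already meets or exceeds the compaction
 * target order cannot help produce a free page of that order, and
 * compaction never tries to build free blocks larger than a pageblock, so
 * such folios can be skipped. target_order == -1 follows the
 * is_via_compact_memory() convention noted earlier; the helper name and
 * the explicit example_pageblock_order parameter are hypothetical.
 */
static bool example_skip_isolation(int order, int target_order,
				   int example_pageblock_order)
{
	/* Already at least as large as what the caller is trying to build. */
	if (target_order != -1 && order >= target_order)
		return true;

	/* Compaction works within pageblocks; larger folios gain nothing. */
	return order >= example_pageblock_order;
}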
/**
 * isolate_migratepages_range() - isolate migrate-able pages in a PFN range
 * @cc:        Compaction control structure.
 * @start_pfn: The first PFN to start isolating.
 * @end_pfn:   The one-past-last PFN.
 *
 * Returns -EAGAIN when contended, -EINTR in case of a signal pending, -ENOMEM
 * in case we could not allocate a page, or 0.
 */
int
isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
							unsigned long end_pfn)
{
	…
}

#endif /* CONFIG_COMPACTION || CONFIG_CMA */
#ifdef CONFIG_COMPACTION

static bool suitable_migration_source(struct compact_control *cc,
							struct page *page)
{
	…
}

/* Returns true if the page is within a block suitable for migration to */
static bool suitable_migration_target(struct compact_control *cc,
							struct page *page)
{
	…
}

static inline unsigned int freelist_scan_limit(struct compact_control *cc)
{
	…
}

/*
 * Test whether the free scanner has reached the same or lower pageblock than
 * the migration scanner, and compaction should thus terminate.
 */
static inline bool compact_scanners_met(struct compact_control *cc)
{
	…
}

/*
 * Used when scanning for a suitable migration target which scans freelists
 * in reverse. Reorders the list such that the unscanned pages are scanned
 * first on the next iteration of the free scanner.
 */
static void move_freelist_head(struct list_head *freelist, struct page *freepage)
{
	…
}

/*
 * Similar to move_freelist_head except used by the migration scanner
 * when scanning forward. It's possible for these list operations to
 * move against each other if they search the free list exactly in
 * lockstep.
 */
static void move_freelist_tail(struct list_head *freelist, struct page *freepage)
{
	…
}

static void fast_isolate_around(struct compact_control *cc, unsigned long pfn)
{
	…
}

/* Search orders in round-robin fashion */
static int next_search_order(struct compact_control *cc, int order)
{
	…
}

static void fast_isolate_freepages(struct compact_control *cc)
{
	…
}

/*
 * Based on information in the current compact_control, find blocks
 * suitable for isolating free pages from and then isolate them.
 */
static void isolate_freepages(struct compact_control *cc)
{
	…
}

/*
 * This is a migrate-callback that "allocates" freepages by taking pages
 * from the isolated freelists in the block we are migrating to.
 */
static struct folio *compaction_alloc_noprof(struct folio *src, unsigned long data)
{
	…
}

static struct folio *compaction_alloc(struct folio *src, unsigned long data)
{
	…
}

/*
 * This is a migrate-callback that "frees" freepages back to the isolated
 * freelist. All pages on the freelist are from the same zone, so there is no
 * special handling needed for NUMA.
 */
static void compaction_free(struct folio *dst, unsigned long data)
{
	…
}

/* possible outcome of isolate_migratepages */
typedef enum {
	…
} isolate_migrate_t;

/*
 * Allow userspace to control policy on scanning the unevictable LRU for
 * compactable pages.
 */
static int sysctl_compact_unevictable_allowed __read_mostly = …;

/*
 * Tunable for proactive compaction. It determines how
 * aggressively the kernel should compact memory in the
 * background. It takes values in the range [0, 100].
 */
static unsigned int __read_mostly sysctl_compaction_proactiveness = …;
static int sysctl_extfrag_threshold = …;
static int __read_mostly sysctl_compact_memory;

static inline void update_fast_start_pfn(struct compact_control *cc, unsigned long pfn)
{
	…
}

static inline unsigned long reinit_migrate_pfn(struct compact_control *cc)
{
	…
}

/*
 * Briefly search the free lists for a migration source that already has
 * some free pages to reduce the number of pages that need migration
 * before a pageblock is free.
 */
static unsigned long fast_find_migrateblock(struct compact_control *cc)
{
	…
}
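/*
 * Illustrative sketch of the termination condition described for
 * compact_scanners_met() above: the free scanner walks downward from the
 * end of the zone while the migration scanner walks upward from its start,
 * and compaction of the zone finishes once the free scanner has reached
 * the same pageblock as (or one below) the migration scanner. The helper
 * name and the explicit pageblock-size parameter are hypothetical.
 */
static bool example_scanners_met(unsigned long free_pfn,
				 unsigned long migrate_pfn,
				 unsigned long pageblock_nr)
{
	/* Compare pageblock indices rather than raw PFNs. */
	return (free_pfn / pageblock_nr) <= (migrate_pfn / pageblock_nr);
}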
/*
 * Isolate all pages that can be migrated from the first suitable block,
 * starting at the block pointed to by the migrate scanner pfn within
 * compact_control.
 */
static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
{
	…
}

/*
 * Determine whether kswapd is (or recently was!) running on this node.
 *
 * pgdat_kswapd_lock() pins pgdat->kswapd, so a concurrent kswapd_stop() can't
 * zero it.
 */
static bool kswapd_is_running(pg_data_t *pgdat)
{
	…
}

/*
 * A zone's fragmentation score is the external fragmentation with respect to
 * COMPACTION_HPAGE_ORDER. It returns a value in the range [0, 100].
 */
static unsigned int fragmentation_score_zone(struct zone *zone)
{
	…
}

/*
 * A weighted zone's fragmentation score is the external fragmentation
 * with respect to COMPACTION_HPAGE_ORDER, scaled by the zone's size. It
 * returns a value in the range [0, 100].
 *
 * The scaling factor ensures that proactive compaction focuses on larger
 * zones like ZONE_NORMAL, rather than smaller, specialized zones like
 * ZONE_DMA32. For smaller zones, the score value remains close to zero,
 * and thus never exceeds the high threshold for proactive compaction.
 */
static unsigned int fragmentation_score_zone_weighted(struct zone *zone)
{
	…
}

/*
 * The per-node proactive (background) compaction process is started by its
 * corresponding kcompactd thread when the node's fragmentation score
 * exceeds the high threshold. The compaction process remains active until
 * the node's score falls below the low threshold, or one of the back-off
 * conditions is met.
 */
static unsigned int fragmentation_score_node(pg_data_t *pgdat)
{
	…
}

static unsigned int fragmentation_score_wmark(bool low)
{
	…
}

static bool should_proactive_compact_node(pg_data_t *pgdat)
{
	…
}

static enum compact_result __compact_finished(struct compact_control *cc)
{
	…
}

static enum compact_result compact_finished(struct compact_control *cc)
{
	…
}

static bool __compaction_suitable(struct zone *zone, int order,
				  int highest_zoneidx,
				  unsigned long wmark_target)
{
	…
}

/*
 * compaction_suitable: Is this suitable to run compaction on this zone now?
 */
bool compaction_suitable(struct zone *zone, int order, int highest_zoneidx)
{
	…
}

bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
				  int alloc_flags)
{
	…
}

/*
 * Should we do compaction for the target allocation order?
 * Return COMPACT_SUCCESS if an allocation of the target order can already be
 * satisfied.
 * Return COMPACT_SKIPPED if compaction for the target order is likely to fail.
 * Return COMPACT_CONTINUE if compaction for the target order should be run.
 */
static enum compact_result
compaction_suit_allocation_order(struct zone *zone, unsigned int order,
				 int highest_zoneidx, unsigned int alloc_flags)
{
	…
}

static enum compact_result
compact_zone(struct compact_control *cc, struct capture_control *capc)
{
	…
}

static enum compact_result compact_zone_order(struct zone *zone, int order,
		gfp_t gfp_mask, enum compact_priority prio,
		unsigned int alloc_flags, int highest_zoneidx,
		struct page **capture)
{
	…
}
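/*
 * Illustrative sketch of the zone-size weighting described for the
 * fragmentation_score_*() helpers above, not their kernel bodies. Each zone
 * contributes its own external-fragmentation score (0..100) weighted by the
 * fraction of the node's pages it holds, so small zones such as ZONE_DMA32
 * barely move the node-wide value. The struct and helper names are
 * hypothetical; a real implementation would want 64-bit arithmetic to avoid
 * overflow of the intermediate product, and node_present_pages is assumed
 * to be non-zero.
 */
struct zone_score_example {
	unsigned long present_pages;	/* zone size in pages */
	unsigned int frag_score;	/* per-zone score, 0..100 */
};

static unsigned int example_node_score(const struct zone_score_example *zones,
				       int nr_zones,
				       unsigned long node_present_pages)
{
	unsigned int score = 0;
	int i;

	for (i = 0; i < nr_zones; i++)
		score += zones[i].frag_score * zones[i].present_pages /
			 node_present_pages;

	return score;	/* still in the range 0..100 */
}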
/**
 * try_to_compact_pages - Direct compact to satisfy a high-order allocation
 * @gfp_mask: The GFP mask of the current allocation
 * @order: The order of the current allocation
 * @alloc_flags: The allocation flags of the current allocation
 * @ac: The context of current allocation
 * @prio: Determines how hard direct compaction should try to succeed
 * @capture: Pointer to where a free page created by compaction will be stored
 *
 * This is the main entry point for direct page compaction.
 */
enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
		unsigned int alloc_flags, const struct alloc_context *ac,
		enum compact_priority prio, struct page **capture)
{
	…
}

/*
 * compact_node() - compact all zones within a node
 * @pgdat: The node page data
 * @proactive: Whether the compaction is proactive
 *
 * For proactive compaction, compact until each zone's fragmentation score
 * reaches within the proactive compaction thresholds (as determined by the
 * proactiveness tunable). It is possible that the function returns before
 * reaching score targets due to various back-off conditions, such as
 * contention on per-node or per-zone locks.
 */
static int compact_node(pg_data_t *pgdat, bool proactive)
{
	…
}

/* Compact all zones of all nodes in the system */
static int compact_nodes(void)
{
	…
}

static int compaction_proactiveness_sysctl_handler(const struct ctl_table *table,
		int write, void *buffer, size_t *length, loff_t *ppos)
{
	…
}

/*
 * This is the entry point for compacting all nodes via
 * /proc/sys/vm/compact_memory
 */
static int sysctl_compaction_handler(const struct ctl_table *table, int write,
			void *buffer, size_t *length, loff_t *ppos)
{
	…
}

#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
static ssize_t compact_store(struct device *dev,
			     struct device_attribute *attr,
			     const char *buf, size_t count)
{
	…
}
static DEVICE_ATTR_WO(compact);

int compaction_register_node(struct node *node)
{
	…
}

void compaction_unregister_node(struct node *node)
{
	…
}
#endif /* CONFIG_SYSFS && CONFIG_NUMA */

static inline bool kcompactd_work_requested(pg_data_t *pgdat)
{
	…
}

static bool kcompactd_node_suitable(pg_data_t *pgdat)
{
	…
}

static void kcompactd_do_work(pg_data_t *pgdat)
{
	…
}

void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx)
{
	…
}

/*
 * The background compaction daemon, started as a kernel thread
 * from the init process.
 */
static int kcompactd(void *p)
{
	…
}

/*
 * This kcompactd start function will be called by init and node-hot-add.
 * On node-hot-add, kcompactd will be moved to the proper cpus if cpus are
 * hot-added.
 */
void __meminit kcompactd_run(int nid)
{
	…
}

/*
 * Called by memory hotplug when all memory in a node is offlined. Caller must
 * be holding mem_hotplug_begin/done().
 */
void __meminit kcompactd_stop(int nid)
{
	…
}

/*
 * It's optimal to keep kcompactd on the same CPUs as their memory, but
 * not required for correctness. So if the last cpu in a node goes
 * away, we get changed to run anywhere: as the first one comes back,
 * restore their cpu bindings.
 */
static int kcompactd_cpu_online(unsigned int cpu)
{
	…
}

static int proc_dointvec_minmax_warn_RT_change(const struct ctl_table *table,
		int write, void *buffer, size_t *lenp, loff_t *ppos)
{
	…
}

static struct ctl_table vm_compaction[] = …;

static int __init kcompactd_init(void)
{
	…
}
subsys_initcall(…)

#endif /* CONFIG_COMPACTION */
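/*
 * Illustrative sketch of the proactive-compaction hysteresis described in
 * the fragmentation_score_node() and compact_node() comments above, not the
 * kernel's fragmentation_score_wmark()/should_proactive_compact_node()
 * bodies. The assumption here is that the low watermark is derived directly
 * from the proactiveness tunable and the high watermark sits a fixed step
 * above it, capped at 100: kcompactd starts proactive work once the node
 * score exceeds the high watermark and keeps going until the score drops
 * below the low one. Names and the +10 step are hypothetical.
 */
static unsigned int example_score_wmark(unsigned int proactiveness, bool low)
{
	unsigned int wmark_low = 100U - proactiveness;
	unsigned int wmark_high = wmark_low + 10U;

	if (wmark_high > 100U)
		wmark_high = 100U;

	return low ? wmark_low : wmark_high;
}

/* Trigger condition: node score has risen above the high watermark. */
static bool example_should_compact(unsigned int node_score,
				   unsigned int proactiveness)
{
	return node_score > example_score_wmark(proactiveness, false);
}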