// SPDX-License-Identifier: GPL-2.0
/*
 * linux/mm/compaction.c
 *
 * Memory compaction for the reduction of external fragmentation. Note that
 * this heavily depends upon page migration to do all the real heavy
 * lifting.
 *
 * Copyright IBM Corp. 2007-2010 Mel Gorman <[email protected]>
 */
#include <linux/cpu.h>
#include <linux/swap.h>
#include <linux/migrate.h>
#include <linux/compaction.h>
#include <linux/mm_inline.h>
#include <linux/sched/signal.h>
#include <linux/backing-dev.h>
#include <linux/sysctl.h>
#include <linux/sysfs.h>
#include <linux/page-isolation.h>
#include <linux/kasan.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/page_owner.h>
#include <linux/psi.h>
#include <linux/cpuset.h>
#include "internal.h"

#ifdef CONFIG_COMPACTION
/*
 * Fragmentation score check interval for proactive compaction purposes.
 */
#define HPAGE_FRAG_CHECK_INTERVAL_MSEC	…

static inline void count_compact_event(enum vm_event_item item)
{
	…
}

static inline void count_compact_events(enum vm_event_item item, long delta)
{
	…
}

/*
 * order == -1 is expected when compacting proactively via
 * 1. /proc/sys/vm/compact_memory
 * 2. /sys/devices/system/node/nodex/compact
 * 3. /proc/sys/vm/compaction_proactiveness
 */
static inline bool is_via_compact_memory(int order)
{
	…
}

#else
#define count_compact_event	…
#define count_compact_events	…
static inline bool is_via_compact_memory(int order) { return false; }
#endif

#if defined CONFIG_COMPACTION || defined CONFIG_CMA

#define CREATE_TRACE_POINTS
#include <trace/events/compaction.h>

#define block_start_pfn(pfn, order)	…
#define block_end_pfn(pfn, order)	…

/*
 * Page order with respect to which proactive compaction
 * calculates external fragmentation, which is used as
 * the "fragmentation score" of a node/zone.
 */
#if defined CONFIG_TRANSPARENT_HUGEPAGE
#define COMPACTION_HPAGE_ORDER	…
#elif defined CONFIG_HUGETLBFS
#define COMPACTION_HPAGE_ORDER	…
#else
#define COMPACTION_HPAGE_ORDER	…
#endif

static struct page *mark_allocated_noprof(struct page *page, unsigned int order,
					  gfp_t gfp_flags)
{
	…
}
#define mark_allocated(...)	…

static unsigned long release_free_list(struct list_head *freepages)
{
	…
}

#ifdef CONFIG_COMPACTION
bool PageMovable(struct page *page)
{
	…
}

void __SetPageMovable(struct page *page, const struct movable_operations *mops)
{
	…
}
EXPORT_SYMBOL(…);

void __ClearPageMovable(struct page *page)
{
	…
}
EXPORT_SYMBOL(…);

/* Do not skip compaction more than 64 times */
#define COMPACT_MAX_DEFER_SHIFT	…

/*
 * Compaction is deferred when compaction fails to result in a page
 * allocation success. The next 1 << compact_defer_shift compactions are
 * skipped, up to a limit of 1 << COMPACT_MAX_DEFER_SHIFT.
 */
static void defer_compaction(struct zone *zone, int order)
{
	…
}

/* Returns true if compaction should be skipped this time */
static bool compaction_deferred(struct zone *zone, int order)
{
	…
}

/*
 * Update defer tracking counters after successful compaction of given order,
 * which means an allocation either succeeded (alloc_success == true) or is
 * expected to succeed.
 */
void compaction_defer_reset(struct zone *zone, int order, bool alloc_success)
{
	…
}

/* Returns true if restarting compaction after many failures */
static bool compaction_restarting(struct zone *zone, int order)
{
	…
}
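/*
 * Illustrative sketch of the deferral back-off described above, not the
 * kernel's defer_compaction()/compaction_deferred() bodies. The idea: each
 * compaction failure doubles the number of subsequent attempts that are
 * skipped, capped at 1 << COMPACT_MAX_DEFER_SHIFT. The struct, field and
 * helper names below are hypothetical and exist only to show the
 * arithmetic; compact_defer_shift is the per-zone counter named in the
 * comment above.
 */
struct defer_state_example {
	unsigned int considered;	/* attempts skipped since last failure */
	unsigned int defer_shift;	/* current back-off exponent */
};

/* Record a failure: widen the back-off window, up to the cap. */
static void example_defer(struct defer_state_example *ds, unsigned int max_shift)
{
	ds->considered = 0;
	if (++ds->defer_shift > max_shift)
		ds->defer_shift = max_shift;
}

/* Return true if this compaction attempt should be skipped. */
static bool example_deferred(struct defer_state_example *ds)
{
	unsigned int limit = 1U << ds->defer_shift;

	if (++ds->considered >= limit) {
		ds->considered = limit;
		return false;	/* window exhausted: try compaction again */
	}
	return true;		/* still inside the window: skip */
}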
/* Returns true if the pageblock should be scanned for pages to isolate. */
static inline bool isolation_suitable(struct compact_control *cc,
					struct page *page)
{
	…
}

static void reset_cached_positions(struct zone *zone)
{
	…
}

#ifdef CONFIG_SPARSEMEM
/*
 * If the PFN falls into an offline section, return the start PFN of the
 * next online section. If the PFN falls into an online section or if
 * there is no next online section, return 0.
 */
static unsigned long skip_offline_sections(unsigned long start_pfn)
{
	…
}

/*
 * If the PFN falls into an offline section, return the end PFN of the
 * next online section in reverse. If the PFN falls into an online section
 * or if there is no next online section in reverse, return 0.
 */
static unsigned long skip_offline_sections_reverse(unsigned long start_pfn)
{
	…
}
#else
static unsigned long skip_offline_sections(unsigned long start_pfn)
{
	return 0;
}

static unsigned long skip_offline_sections_reverse(unsigned long start_pfn)
{
	return 0;
}
#endif

/*
 * Compound pages of >= pageblock_order should consistently be skipped until
 * released. It is always pointless to compact pages of such order (if they are
 * migratable), and the pageblocks they occupy cannot contain any free pages.
 */
static bool pageblock_skip_persistent(struct page *page)
{
	…
}

static bool __reset_isolation_pfn(struct zone *zone, unsigned long pfn,
					bool check_source, bool check_target)
{
	…
}

/*
 * This function is called to clear all cached information on pageblocks that
 * should be skipped for page isolation when the migrate and free page scanner
 * meet.
 */
static void __reset_isolation_suitable(struct zone *zone)
{
	…
}

void reset_isolation_suitable(pg_data_t *pgdat)
{
	…
}

/*
 * Sets the pageblock skip bit if it was clear. Note that this is a hint as
 * locks are not required for read/writers. Returns true if it was already set.
 */
static bool test_and_set_skip(struct compact_control *cc, struct page *page)
{
	…
}

static void update_cached_migrate(struct compact_control *cc, unsigned long pfn)
{
	…
}

/*
 * If no pages were isolated then mark this pageblock to be skipped in the
 * future. The information is later cleared by __reset_isolation_suitable().
 */
static void update_pageblock_skip(struct compact_control *cc,
				  struct page *page, unsigned long pfn)
{
	…
}

#else
static inline bool isolation_suitable(struct compact_control *cc,
					struct page *page)
{
	return true;
}

static inline bool pageblock_skip_persistent(struct page *page)
{
	return false;
}

static inline void update_pageblock_skip(struct compact_control *cc,
					 struct page *page, unsigned long pfn)
{
}

static void update_cached_migrate(struct compact_control *cc, unsigned long pfn)
{
}

static bool test_and_set_skip(struct compact_control *cc, struct page *page)
{
	return false;
}
#endif /* CONFIG_COMPACTION */

/*
 * Compaction requires the taking of some coarse locks that are potentially
 * very heavily contended. For async compaction, trylock and record if the
 * lock is contended. The lock will still be acquired but compaction will
 * abort when the current block is finished regardless of success rate.
 * Sync compaction acquires the lock.
 *
 * Always returns true which makes it easier to track lock state in callers.
 */
static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags,
						struct compact_control *cc)
	__acquires(lock)
{
	…
}
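/*
 * Illustrative sketch of the async trylock pattern described above, not the
 * body of compact_lock_irqsave() itself. For asynchronous compaction the
 * lock is tried first; on contention a flag is recorded so the caller can
 * bail out at the next pageblock boundary, but the lock is still taken so
 * the current block can be finished. The "async" and "contended" parameters
 * stand in for the corresponding compact_control state and are hypothetical.
 */
static bool example_lock_irqsave(spinlock_t *lock, unsigned long *flags,
				 bool async, bool *contended)
{
	if (async) {
		if (!spin_trylock_irqsave(lock, *flags)) {
			*contended = true;
			spin_lock_irqsave(lock, *flags);
		}
	} else {
		spin_lock_irqsave(lock, *flags);
	}

	return true;	/* always true, mirroring the convention noted above */
}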
/*
 * Compaction requires the taking of some coarse locks that are potentially
 * very heavily contended. The lock should be periodically unlocked to avoid
 * having disabled IRQs for a long time, even when there is nobody waiting on
 * the lock. It might also be that allowing the IRQs will result in
 * need_resched() becoming true. If scheduling is needed, compaction schedules.
 * Either compaction type will also abort if a fatal signal is pending.
 * In either case, if the lock was locked, it is dropped and not regained.
 *
 * Returns true if compaction should abort due to a pending fatal signal.
 * Returns false when compaction can continue.
 */
static bool compact_unlock_should_abort(spinlock_t *lock,
		unsigned long flags, bool *locked, struct compact_control *cc)
{
	…
}

/*
 * Isolate free pages onto a private freelist. If @strict is true, will abort
 * returning 0 on any invalid PFNs or non-free pages inside of the pageblock
 * (even though it may still end up isolating some pages).
 */
static unsigned long isolate_freepages_block(struct compact_control *cc,
				unsigned long *start_pfn,
				unsigned long end_pfn,
				struct list_head *freelist,
				unsigned int stride,
				bool strict)
{
	…
}

/**
 * isolate_freepages_range() - isolate free pages.
 * @cc:        Compaction control structure.
 * @start_pfn: The first PFN to start isolating.
 * @end_pfn:   The one-past-last PFN.
 *
 * Non-free pages, invalid PFNs, or zone boundaries within the
 * [start_pfn, end_pfn) range are considered errors and cause the function to
 * undo its actions and return zero. cc->freepages[] are empty.
 *
 * Otherwise, the function returns the one-past-the-last PFN of the isolated
 * page (which may be greater than end_pfn if the end fell in the middle of
 * a free page). cc->freepages[] contain the free pages isolated.
 */
unsigned long
isolate_freepages_range(struct compact_control *cc,
			unsigned long start_pfn, unsigned long end_pfn)
{
	…
}

/* Similar to reclaim, but different enough that they don't share logic */
static bool too_many_isolated(struct compact_control *cc)
{
	…
}

/**
 * skip_isolation_on_order() - determine when to skip folio isolation based on
 *			       folio order and compaction target order
 * @order:		to-be-isolated folio order
 * @target_order:	compaction target order
 *
 * This avoids unnecessary folio isolations during compaction.
 */
static bool skip_isolation_on_order(int order, int target_order)
{
	…
}

/**
 * isolate_migratepages_block() - isolate all migrate-able pages within
 *				  a single pageblock
 * @cc:		Compaction control structure.
 * @low_pfn:	The first PFN to isolate
 * @end_pfn:	The one-past-the-last PFN to isolate, within the same pageblock
 * @mode:	Isolation mode to be used.
 *
 * Isolate all pages that can be migrated from the range specified by
 * [low_pfn, end_pfn). The range is expected to be within the same pageblock.
 * Returns an errno, like -EAGAIN or -EINTR in case of, e.g., a pending signal
 * or congestion, -ENOMEM in case we could not allocate a page, or 0.
 * cc->migrate_pfn will contain the next pfn to scan.
 *
 * The pages are isolated on the cc->migratepages list (not required to be
 * empty), and cc->nr_migratepages is updated accordingly.
 */
static int
isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
			unsigned long end_pfn, isolate_mode_t mode)
{
	…
}
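/*
 * Illustrative sketch of the skip heuristic documented for
 * skip_isolation_on_order() above, not necessarily its exact body.
 * Isolating a folio whose order already meets or exceeds the compaction
 * target order cannot help produce a free page of that order, and
 * compaction never tries to build free blocks larger than a pageblock, so
 * such folios can be skipped. target_order == -1 follows the
 * is_via_compact_memory() convention noted earlier; the helper name and
 * the explicit example_pageblock_order parameter are hypothetical.
 */
static bool example_skip_isolation(int order, int target_order,
				   int example_pageblock_order)
{
	/* Already at least as large as what the caller is trying to build. */
	if (target_order != -1 && order >= target_order)
		return true;

	/* Compaction works within pageblocks; larger folios gain nothing. */
	return order >= example_pageblock_order;
}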
/**
 * isolate_migratepages_range() - isolate migrate-able pages in a PFN range
 * @cc:        Compaction control structure.
 * @start_pfn: The first PFN to start isolating.
 * @end_pfn:   The one-past-last PFN.
 *
 * Returns -EAGAIN when contended, -EINTR in case of a signal pending, -ENOMEM
 * in case we could not allocate a page, or 0.
 */
int
isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
							unsigned long end_pfn)
{
	…
}

#endif /* CONFIG_COMPACTION || CONFIG_CMA */
#ifdef CONFIG_COMPACTION

static bool suitable_migration_source(struct compact_control *cc,
							struct page *page)
{
	…
}

/* Returns true if the page is within a block suitable for migration to */
static bool suitable_migration_target(struct compact_control *cc,
							struct page *page)
{
	…
}

static inline unsigned int freelist_scan_limit(struct compact_control *cc)
{
	…
}

/*
 * Test whether the free scanner has reached the same or lower pageblock than
 * the migration scanner, and compaction should thus terminate.
 */
static inline bool compact_scanners_met(struct compact_control *cc)
{
	…
}

/*
 * Used when scanning for a suitable migration target which scans freelists
 * in reverse. Reorders the list such that the unscanned pages are scanned
 * first on the next iteration of the free scanner.
 */
static void move_freelist_head(struct list_head *freelist, struct page *freepage)
{
	…
}

/*
 * Similar to move_freelist_head except used by the migration scanner
 * when scanning forward. It's possible for these list operations to
 * move against each other if they search the free list exactly in
 * lockstep.
 */
static void move_freelist_tail(struct list_head *freelist, struct page *freepage)
{
	…
}

static void fast_isolate_around(struct compact_control *cc, unsigned long pfn)
{
	…
}

/* Search orders in round-robin fashion */
static int next_search_order(struct compact_control *cc, int order)
{
	…
}

static void fast_isolate_freepages(struct compact_control *cc)
{
	…
}

/*
 * Based on information in the current compact_control, find blocks
 * suitable for isolating free pages from and then isolate them.
 */
static void isolate_freepages(struct compact_control *cc)
{
	…
}

/*
 * This is a migrate-callback that "allocates" freepages by taking pages
 * from the isolated freelists in the block we are migrating to.
 */
static struct folio *compaction_alloc_noprof(struct folio *src, unsigned long data)
{
	…
}

static struct folio *compaction_alloc(struct folio *src, unsigned long data)
{
	…
}

/*
 * This is a migrate-callback that "frees" freepages back to the isolated
 * freelist. All pages on the freelist are from the same zone, so there is no
 * special handling needed for NUMA.
 */
static void compaction_free(struct folio *dst, unsigned long data)
{
	…
}

/* possible outcome of isolate_migratepages */
typedef enum {
	…
} isolate_migrate_t;

/*
 * Allow userspace to control policy on scanning the unevictable LRU for
 * compactable pages.
 */
static int sysctl_compact_unevictable_allowed __read_mostly = …;

/*
 * Tunable for proactive compaction. It determines how
 * aggressively the kernel should compact memory in the
 * background. It takes values in the range [0, 100].
 */
static unsigned int __read_mostly sysctl_compaction_proactiveness = …;
static int sysctl_extfrag_threshold = …;
static int __read_mostly sysctl_compact_memory;

static inline void update_fast_start_pfn(struct compact_control *cc, unsigned long pfn)
{
	…
}

static inline unsigned long reinit_migrate_pfn(struct compact_control *cc)
{
	…
}

/*
 * Briefly search the free lists for a migration source that already has
 * some free pages to reduce the number of pages that need migration
 * before a pageblock is free.
 */
static unsigned long fast_find_migrateblock(struct compact_control *cc)
{
	…
}
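/*
 * Illustrative sketch of the termination condition described for
 * compact_scanners_met() above: the free scanner walks downward from the
 * end of the zone while the migration scanner walks upward from its start,
 * and compaction of the zone finishes once the free scanner has reached
 * the same pageblock as (or one below) the migration scanner. The helper
 * name and the explicit pageblock-size parameter are hypothetical.
 */
static bool example_scanners_met(unsigned long free_pfn,
				 unsigned long migrate_pfn,
				 unsigned long pageblock_nr)
{
	/* Compare pageblock indices rather than raw PFNs. */
	return (free_pfn / pageblock_nr) <= (migrate_pfn / pageblock_nr);
}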
/*
 * Isolate all pages that can be migrated from the first suitable block,
 * starting at the block pointed to by the migrate scanner pfn within
 * compact_control.
 */
static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
{
	…
}

/*
 * Determine whether kswapd is (or recently was!) running on this node.
 *
 * pgdat_kswapd_lock() pins pgdat->kswapd, so a concurrent kswapd_stop() can't
 * zero it.
 */
static bool kswapd_is_running(pg_data_t *pgdat)
{
	…
}

/*
 * A zone's fragmentation score is the external fragmentation with respect to
 * COMPACTION_HPAGE_ORDER. It returns a value in the range [0, 100].
 */
static unsigned int fragmentation_score_zone(struct zone *zone)
{
	…
}

/*
 * A weighted zone's fragmentation score is the external fragmentation
 * with respect to COMPACTION_HPAGE_ORDER, scaled by the zone's size. It
 * returns a value in the range [0, 100].
 *
 * The scaling factor ensures that proactive compaction focuses on larger
 * zones like ZONE_NORMAL, rather than smaller, specialized zones like
 * ZONE_DMA32. For smaller zones, the score value remains close to zero,
 * and thus never exceeds the high threshold for proactive compaction.
 */
static unsigned int fragmentation_score_zone_weighted(struct zone *zone)
{
	…
}

/*
 * The per-node proactive (background) compaction process is started by its
 * corresponding kcompactd thread when the node's fragmentation score
 * exceeds the high threshold. The compaction process remains active until
 * the node's score falls below the low threshold, or one of the back-off
 * conditions is met.
 */
static unsigned int fragmentation_score_node(pg_data_t *pgdat)
{
	…
}

static unsigned int fragmentation_score_wmark(bool low)
{
	…
}

static bool should_proactive_compact_node(pg_data_t *pgdat)
{
	…
}

static enum compact_result __compact_finished(struct compact_control *cc)
{
	…
}

static enum compact_result compact_finished(struct compact_control *cc)
{
	…
}

static bool __compaction_suitable(struct zone *zone, int order,
				  int highest_zoneidx,
				  unsigned long wmark_target)
{
	…
}

/*
 * compaction_suitable: Is this suitable to run compaction on this zone now?
 */
bool compaction_suitable(struct zone *zone, int order, int highest_zoneidx)
{
	…
}

bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
				  int alloc_flags)
{
	…
}

/*
 * Should we do compaction for the target allocation order?
 * Return COMPACT_SUCCESS if an allocation of the target order can already be
 * satisfied.
 * Return COMPACT_SKIPPED if compaction for the target order is likely to fail.
 * Return COMPACT_CONTINUE if compaction for the target order should be run.
 */
static enum compact_result
compaction_suit_allocation_order(struct zone *zone, unsigned int order,
				 int highest_zoneidx, unsigned int alloc_flags)
{
	…
}

static enum compact_result
compact_zone(struct compact_control *cc, struct capture_control *capc)
{
	…
}

static enum compact_result compact_zone_order(struct zone *zone, int order,
		gfp_t gfp_mask, enum compact_priority prio,
		unsigned int alloc_flags, int highest_zoneidx,
		struct page **capture)
{
	…
}
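/*
 * Illustrative sketch of the zone-size weighting described for the
 * fragmentation_score_*() helpers above, not their kernel bodies. Each zone
 * contributes its own external-fragmentation score (0..100) weighted by the
 * fraction of the node's pages it holds, so small zones such as ZONE_DMA32
 * barely move the node-wide value. The struct and helper names are
 * hypothetical; a real implementation would want 64-bit arithmetic to avoid
 * overflow of the intermediate product, and node_present_pages is assumed
 * to be non-zero.
 */
struct zone_score_example {
	unsigned long present_pages;	/* zone size in pages */
	unsigned int frag_score;	/* per-zone score, 0..100 */
};

static unsigned int example_node_score(const struct zone_score_example *zones,
				       int nr_zones,
				       unsigned long node_present_pages)
{
	unsigned int score = 0;
	int i;

	for (i = 0; i < nr_zones; i++)
		score += zones[i].frag_score * zones[i].present_pages /
			 node_present_pages;

	return score;	/* still in the range 0..100 */
}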
/**
 * try_to_compact_pages - Direct compact to satisfy a high-order allocation
 * @gfp_mask: The GFP mask of the current allocation
 * @order: The order of the current allocation
 * @alloc_flags: The allocation flags of the current allocation
 * @ac: The context of current allocation
 * @prio: Determines how hard direct compaction should try to succeed
 * @capture: Pointer to where a free page created by compaction will be stored
 *
 * This is the main entry point for direct page compaction.
 */
enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
		unsigned int alloc_flags, const struct alloc_context *ac,
		enum compact_priority prio, struct page **capture)
{
	…
}

/*
 * compact_node() - compact all zones within a node
 * @pgdat: The node page data
 * @proactive: Whether the compaction is proactive
 *
 * For proactive compaction, compact until each zone's fragmentation score
 * reaches within the proactive compaction thresholds (as determined by the
 * proactiveness tunable). It is possible that the function returns before
 * reaching score targets due to various back-off conditions, such as
 * contention on per-node or per-zone locks.
 */
static int compact_node(pg_data_t *pgdat, bool proactive)
{
	…
}

/* Compact all zones of all nodes in the system */
static int compact_nodes(void)
{
	…
}

static int compaction_proactiveness_sysctl_handler(const struct ctl_table *table,
		int write, void *buffer, size_t *length, loff_t *ppos)
{
	…
}

/*
 * This is the entry point for compacting all nodes via
 * /proc/sys/vm/compact_memory
 */
static int sysctl_compaction_handler(const struct ctl_table *table, int write,
			void *buffer, size_t *length, loff_t *ppos)
{
	…
}

#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
static ssize_t compact_store(struct device *dev,
			     struct device_attribute *attr,
			     const char *buf, size_t count)
{
	…
}
static DEVICE_ATTR_WO(compact);

int compaction_register_node(struct node *node)
{
	…
}

void compaction_unregister_node(struct node *node)
{
	…
}
#endif /* CONFIG_SYSFS && CONFIG_NUMA */

static inline bool kcompactd_work_requested(pg_data_t *pgdat)
{
	…
}

static bool kcompactd_node_suitable(pg_data_t *pgdat)
{
	…
}

static void kcompactd_do_work(pg_data_t *pgdat)
{
	…
}

void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx)
{
	…
}

/*
 * The background compaction daemon, started as a kernel thread
 * from the init process.
 */
static int kcompactd(void *p)
{
	…
}

/*
 * This kcompactd start function will be called by init and node-hot-add.
 * On node-hot-add, kcompactd will be moved to the proper cpus if cpus are
 * hot-added.
 */
void __meminit kcompactd_run(int nid)
{
	…
}

/*
 * Called by memory hotplug when all memory in a node is offlined. Caller must
 * be holding mem_hotplug_begin/done().
 */
void __meminit kcompactd_stop(int nid)
{
	…
}

/*
 * It's optimal to keep kcompactd on the same CPUs as their memory, but
 * not required for correctness. So if the last cpu in a node goes
 * away, we get changed to run anywhere: as the first one comes back,
 * restore their cpu bindings.
 */
static int kcompactd_cpu_online(unsigned int cpu)
{
	…
}

static int proc_dointvec_minmax_warn_RT_change(const struct ctl_table *table,
		int write, void *buffer, size_t *lenp, loff_t *ppos)
{
	…
}

static struct ctl_table vm_compaction[] = …;

static int __init kcompactd_init(void)
{
	…
}
subsys_initcall(…)

#endif /* CONFIG_COMPACTION */
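/*
 * Illustrative sketch of the proactive-compaction hysteresis described in
 * the fragmentation_score_node() and compact_node() comments above, not the
 * kernel's fragmentation_score_wmark()/should_proactive_compact_node()
 * bodies. The assumption here is that the low watermark is derived directly
 * from the proactiveness tunable and the high watermark sits a fixed step
 * above it, capped at 100: kcompactd starts proactive work once the node
 * score exceeds the high watermark and keeps going until the score drops
 * below the low one. Names and the +10 step are hypothetical.
 */
static unsigned int example_score_wmark(unsigned int proactiveness, bool low)
{
	unsigned int wmark_low = 100U - proactiveness;
	unsigned int wmark_high = wmark_low + 10U;

	if (wmark_high > 100U)
		wmark_high = 100U;

	return low ? wmark_low : wmark_high;
}

/* Trigger condition: node score has risen above the high watermark. */
static bool example_should_compact(unsigned int node_score,
				   unsigned int proactiveness)
{
	return node_score > example_score_wmark(proactiveness, false);
}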