// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds * * Swap reorganised 29.12.95, Stephen Tweedie. * kswapd added: 7.1.96 sct * Removed kswapd_ctl limits, and swap out as many pages as needed * to bring the system back to freepages.high: 2.4.97, Rik van Riel. * Zone aware kswapd started 02/00, Kanoj Sarcar ([email protected]). * Multiqueue VM started 5.8.00, Rik van Riel. */ #define pr_fmt(fmt) … #include <linux/mm.h> #include <linux/sched/mm.h> #include <linux/module.h> #include <linux/gfp.h> #include <linux/kernel_stat.h> #include <linux/swap.h> #include <linux/pagemap.h> #include <linux/init.h> #include <linux/highmem.h> #include <linux/vmpressure.h> #include <linux/vmstat.h> #include <linux/file.h> #include <linux/writeback.h> #include <linux/blkdev.h> #include <linux/buffer_head.h> /* for buffer_heads_over_limit */ #include <linux/mm_inline.h> #include <linux/backing-dev.h> #include <linux/rmap.h> #include <linux/topology.h> #include <linux/cpu.h> #include <linux/cpuset.h> #include <linux/compaction.h> #include <linux/notifier.h> #include <linux/delay.h> #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/memcontrol.h> #include <linux/migrate.h> #include <linux/delayacct.h> #include <linux/sysctl.h> #include <linux/memory-tiers.h> #include <linux/oom.h> #include <linux/pagevec.h> #include <linux/prefetch.h> #include <linux/printk.h> #include <linux/dax.h> #include <linux/psi.h> #include <linux/pagewalk.h> #include <linux/shmem_fs.h> #include <linux/ctype.h> #include <linux/debugfs.h> #include <linux/khugepaged.h> #include <linux/rculist_nulls.h> #include <linux/random.h> #include <linux/mmu_notifier.h> #include <asm/tlbflush.h> #include <asm/div64.h> #include <linux/swapops.h> #include <linux/balloon_compaction.h> #include <linux/sched/sysctl.h> #include "internal.h" #include "swap.h" #define CREATE_TRACE_POINTS #include <trace/events/vmscan.h> struct scan_control { … }; #ifdef ARCH_HAS_PREFETCHW 
#define prefetchw_prev_lru_folio(_folio, _base, _field) … #else #define prefetchw_prev_lru_folio … #endif /* * From 0 .. MAX_SWAPPINESS. Higher means more swappy. */ int vm_swappiness = …; #ifdef CONFIG_MEMCG /* Returns true for reclaim through cgroup limits or cgroup interfaces. */ static bool cgroup_reclaim(struct scan_control *sc) { … } /* * Returns true for reclaim on the root cgroup. This is true for direct * allocator reclaim and reclaim through cgroup interfaces on the root cgroup. */ static bool root_reclaim(struct scan_control *sc) { … } /** * writeback_throttling_sane - is the usual dirty throttling mechanism available? * @sc: scan_control in question * * The normal page dirty throttling mechanism in balance_dirty_pages() is * completely broken with the legacy memcg and direct stalling in * shrink_folio_list() is used for throttling instead, which lacks all the * niceties such as fairness, adaptive pausing, bandwidth proportional * allocation and configurability. * * This function tests whether the vmscan currently in progress can assume * that the normal dirty throttling mechanism is operational. */ static bool writeback_throttling_sane(struct scan_control *sc) { … } static int sc_swappiness(struct scan_control *sc, struct mem_cgroup *memcg) { … } #else /* !CONFIG_MEMCG: constant fallbacks for the four helpers above; @sc is unused in all of them. */ /* Without CONFIG_MEMCG, reclaim is never reported as cgroup reclaim. */ static bool cgroup_reclaim(struct scan_control *sc) { return false; } /* Without CONFIG_MEMCG, every reclaim is reported as root reclaim. */ static bool root_reclaim(struct scan_control *sc) { return true; } /* Without CONFIG_MEMCG, the normal dirty throttling mechanism is always reported as usable. */ static bool writeback_throttling_sane(struct scan_control *sc) { return true; } /* Without CONFIG_MEMCG, @memcg is ignored and the global vm_swappiness is returned, read with READ_ONCE(). */ static int sc_swappiness(struct scan_control *sc, struct mem_cgroup *memcg) { return READ_ONCE(vm_swappiness); } #endif /* CONFIG_MEMCG */ static void set_task_reclaim_state(struct task_struct *task, struct reclaim_state *rs) { … } /* * flush_reclaim_state(): add pages reclaimed outside of LRU-based reclaim to * scan_control->nr_reclaimed. 
*/ static void flush_reclaim_state(struct scan_control *sc) { … } static bool can_demote(int nid, struct scan_control *sc) { … } static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg, int nid, struct scan_control *sc) { … } /* * This misses isolated folios which are not accounted for to save counters. * As the data only determines if reclaim or compaction continues, it is * not expected that isolated folios will be a dominating factor. */ unsigned long zone_reclaimable_pages(struct zone *zone) { … } /** * lruvec_lru_size - Returns the number of pages on the given LRU list. * @lruvec: lru vector * @lru: lru to use * @zone_idx: zones to consider (use MAX_NR_ZONES - 1 for the whole LRU list) */ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) { … } static unsigned long drop_slab_node(int nid) { … } void drop_slab(void) { … } static int reclaimer_offset(void) { … } static inline int is_page_cache_freeable(struct folio *folio) { … } /* * We detected a synchronous write error writing a folio out. Probably * -ENOSPC. We need to propagate that into the address_space for a subsequent * fsync(), msync() or close(). * * The tricky part is that after writepage we cannot touch the mapping: nothing * prevents it from being freed up. But we have a ref on the folio and once * that folio is locked, the mapping is pinned. * * We're allowed to run sleeping folio_lock() here because we know the caller has * __GFP_FS. */ static void handle_write_error(struct address_space *mapping, struct folio *folio, int error) { … } static bool skip_throttle_noprogress(pg_data_t *pgdat) { … } void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason) { … } /* * Account for folios written if tasks are throttled waiting on dirty * folios to clean. If enough folios have been cleaned since throttling * started then wakeup the throttled tasks. 
*/ void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio, int nr_throttled) { … } /* possible outcome of pageout() */ pageout_t; /* * pageout is called by shrink_folio_list() for each dirty folio. * Calls ->writepage(). */ static pageout_t pageout(struct folio *folio, struct address_space *mapping, struct swap_iocb **plug, struct list_head *folio_list) { … } /* * Same as remove_mapping, but if the folio is removed from the mapping, it * gets returned with a refcount of 0. */ static int __remove_mapping(struct address_space *mapping, struct folio *folio, bool reclaimed, struct mem_cgroup *target_memcg) { … } /** * remove_mapping() - Attempt to remove a folio from its mapping. * @mapping: The address space. * @folio: The folio to remove. * * If the folio is dirty, under writeback or if someone else has a ref * on it, removal will fail. * Return: The number of pages removed from the mapping. 0 if the folio * could not be removed. * Context: The caller should have a single refcount on the folio and * hold its lock. */ long remove_mapping(struct address_space *mapping, struct folio *folio) { … } /** * folio_putback_lru - Put previously isolated folio onto appropriate LRU list. * @folio: Folio to be returned to an LRU list. * * Add previously isolated @folio to appropriate LRU list. * The folio may still be unevictable for other reasons. * * Context: lru_lock must not be held, interrupts must be enabled. */ void folio_putback_lru(struct folio *folio) { … } enum folio_references { … }; static enum folio_references folio_check_references(struct folio *folio, struct scan_control *sc) { … } /* Check if a folio is dirty or under writeback */ static void folio_check_dirty_writeback(struct folio *folio, bool *dirty, bool *writeback) { … } struct folio *alloc_migrate_folio(struct folio *src, unsigned long private) { … } /* * Take folios on @demote_folios and attempt to demote them to another node. * Folios which are not demoted are left on @demote_folios. 
*/ static unsigned int demote_folio_list(struct list_head *demote_folios, struct pglist_data *pgdat) { … } static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask) { … } /* * shrink_folio_list() returns the number of reclaimed pages */ static unsigned int shrink_folio_list(struct list_head *folio_list, struct pglist_data *pgdat, struct scan_control *sc, struct reclaim_stat *stat, bool ignore_references) { … } unsigned int reclaim_clean_pages_from_list(struct zone *zone, struct list_head *folio_list) { … } /* * Update LRU sizes after isolating pages. The LRU size updates must * be complete before mem_cgroup_update_lru_size due to a sanity check. */ static __always_inline void update_lru_sizes(struct lruvec *lruvec, enum lru_list lru, unsigned long *nr_zone_taken) { … } /* * Isolating page from the lruvec to fill in @dst list by nr_to_scan times. * * lruvec->lru_lock is heavily contended. Some of the functions that * shrink the lists perform better by taking out a batch of pages * and working on them outside the LRU lock. * * For pagecache intensive workloads, this function is the hottest * spot in the kernel (apart from copy_*_user functions). * * Lru_lock must be held before calling this function. * * @nr_to_scan: The number of eligible pages to look through on the list. * @lruvec: The LRU vector to pull pages from. * @dst: The temp list to put pages on to. * @nr_scanned: The number of pages that were scanned. * @sc: The scan_control struct for this reclaim session * @lru: LRU list id for isolating * * returns how many pages were moved onto *@dst. */ static unsigned long isolate_lru_folios(unsigned long nr_to_scan, struct lruvec *lruvec, struct list_head *dst, unsigned long *nr_scanned, struct scan_control *sc, enum lru_list lru) { … } /** * folio_isolate_lru() - Try to isolate a folio from its LRU list. * @folio: Folio to isolate from its LRU list. 
* * Isolate a @folio from an LRU list and adjust the vmstat statistic * corresponding to whatever LRU list the folio was on. * * The folio will have its LRU flag cleared. If it was found on the * active list, it will have the Active flag set. If it was found on the * unevictable list, it will have the Unevictable flag set. These flags * may need to be cleared by the caller before letting the page go. * * Context: * * (1) Must be called with an elevated refcount on the folio. This is a * fundamental difference from isolate_lru_folios() (which is called * without a stable reference). * (2) The lru_lock must not be held. * (3) Interrupts must be enabled. * * Return: true if the folio was removed from an LRU list. * false if the folio was not on an LRU list. */ bool folio_isolate_lru(struct folio *folio) { … } /* * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and * then get rescheduled. When there is a massive number of tasks doing page * allocation, such sleeping direct reclaimers may keep piling up on each CPU, * the LRU list will shrink and be scanned faster than necessary, leading to * unnecessary swapping, thrashing and OOM. */ static bool too_many_isolated(struct pglist_data *pgdat, int file, struct scan_control *sc) { … } /* * move_folios_to_lru() moves folios from the private @list to the appropriate * LRU list. * * Returns the number of pages moved to the given lruvec. */ static unsigned int move_folios_to_lru(struct lruvec *lruvec, struct list_head *list) { … } /* * If a kernel thread (such as nfsd for loop-back mounts) services a backing * device by writing to the page cache it sets PF_LOCAL_THROTTLE. In this case * we should not throttle. Otherwise it is safe to do so. */ static int current_may_throttle(void) { … } /* * shrink_inactive_list() is a helper for shrink_node(). 
It returns the number * of reclaimed pages */ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc, enum lru_list lru) { … } /* * shrink_active_list() moves folios from the active LRU to the inactive LRU. * * We move them the other way if the folio is referenced by one or more * processes. * * If the folios are mostly unmapped, the processing is fast and it is * appropriate to hold lru_lock across the whole operation. But if * the folios are mapped, the processing is slow (folio_referenced()), so * we should drop lru_lock around each folio. It's impossible to balance * this, so instead we remove the folios from the LRU while processing them. * It is safe to rely on the active flag against the non-LRU folios in here * because nobody will play with that bit on a non-LRU folio. * * The downside is that we have to touch folio->_refcount against each folio. * But we had to alter folio->flags anyway. */ static void shrink_active_list(unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc, enum lru_list lru) { … } static unsigned int reclaim_folio_list(struct list_head *folio_list, struct pglist_data *pgdat) { … } unsigned long reclaim_pages(struct list_head *folio_list) { … } static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc) { … } /* * The inactive anon list should be small enough that the VM never has * to do too much work. * * The inactive file list should be small enough to leave most memory * to the established workingset on the scan-resistant active list, * but large enough to avoid thrashing the aggregate readahead window. * * Both inactive lists should also be large enough that each inactive * folio has a chance to be referenced again before it is reclaimed. * * If that fails and refaulting is observed, the inactive list grows. 
* * The inactive_ratio is the target ratio of ACTIVE to INACTIVE folios * on this LRU, maintained by the pageout code. An inactive_ratio * of 3 means 3:1 or 25% of the folios are kept on the inactive list. * * total target max * memory ratio inactive * ------------------------------------- * 10MB 1 5MB * 100MB 1 50MB * 1GB 3 250MB * 10GB 10 0.9GB * 100GB 31 3GB * 1TB 101 10GB * 10TB 320 32GB */ static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru) { … } enum scan_balance { … }; static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc) { … } /* * Determine how aggressively the anon and file LRU lists should be * scanned. * * nr[0] = anon inactive folios to scan; nr[1] = anon active folios to scan * nr[2] = file inactive folios to scan; nr[3] = file active folios to scan */ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, unsigned long *nr) { … } /* * Anonymous LRU management is a waste if there is * ultimately no way to reclaim the memory. 
*/ static bool can_age_anon_pages(struct pglist_data *pgdat, struct scan_control *sc) { … } #ifdef CONFIG_LRU_GEN #ifdef CONFIG_LRU_GEN_ENABLED DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS); #define get_cap(cap) … #else DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS); #define get_cap … #endif static bool should_walk_mmu(void) { … } static bool should_clear_pmd_young(void) { … } /****************************************************************************** * shorthand helpers ******************************************************************************/ #define LRU_REFS_FLAGS … #define DEFINE_MAX_SEQ(lruvec) … #define DEFINE_MIN_SEQ(lruvec) … #define for_each_gen_type_zone(gen, type, zone) … #define get_memcg_gen(seq) … #define get_memcg_bin(bin) … static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid) { … } static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc) { … } static int get_nr_gens(struct lruvec *lruvec, int type) { … } static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) { … } /****************************************************************************** * Bloom filters ******************************************************************************/ /* * Bloom filters with m=1<<15, k=2 and false positive rates of ~1/5 when * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of * bits in a bitmap, k is the number of hash functions and n is the number of * inserted items. * * Page table walkers use one of the two filters to reduce their search space. * To get rid of non-leaf entries that no longer have enough leaf entries, the * aging uses the double-buffering technique to flip to the other filter each * time it produces a new generation. For non-leaf entries that have enough * leaf entries, the aging carries them over to the next generation in * walk_pmd_range(); the eviction also reports them when walking the rmap * in lru_gen_look_around(). 
* * For future optimizations: * 1. It's not necessary to keep both filters all the time. The spare one can be * freed after the RCU grace period and reallocated if needed again. * 2. And when reallocating, it's worth scaling its size according to the number * of inserted entries in the other filter, to reduce the memory overhead on * small systems and false positives on large systems. * 3. Jenkins' hash function is an alternative to Knuth's. */ #define BLOOM_FILTER_SHIFT … static inline int filter_gen_from_seq(unsigned long seq) { … } static void get_item_key(void *item, int *key) { … } static bool test_bloom_filter(struct lru_gen_mm_state *mm_state, unsigned long seq, void *item) { … } static void update_bloom_filter(struct lru_gen_mm_state *mm_state, unsigned long seq, void *item) { … } static void reset_bloom_filter(struct lru_gen_mm_state *mm_state, unsigned long seq) { … } /****************************************************************************** * mm_struct list ******************************************************************************/ #ifdef CONFIG_LRU_GEN_WALKS_MMU static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg) { … } static struct lru_gen_mm_state *get_mm_state(struct lruvec *lruvec) { … } static struct mm_struct *get_next_mm(struct lru_gen_mm_walk *walk) { … } void lru_gen_add_mm(struct mm_struct *mm) { … } void lru_gen_del_mm(struct mm_struct *mm) { … } #ifdef CONFIG_MEMCG void lru_gen_migrate_mm(struct mm_struct *mm) { … } #endif #else /* !CONFIG_LRU_GEN_WALKS_MMU */ /* * Stubs for builds without CONFIG_LRU_GEN_WALKS_MMU: the three static lookup * helpers above are replaced by versions that ignore their argument and * return NULL. */ /* Stub: always returns NULL; @memcg is ignored. */ static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg) { return NULL; } /* Stub: always returns NULL; @lruvec is ignored. */ static struct lru_gen_mm_state *get_mm_state(struct lruvec *lruvec) { return NULL; } /* Stub: always returns NULL; @walk is ignored. */ static struct mm_struct *get_next_mm(struct lru_gen_mm_walk *walk) { return NULL; } #endif /* CONFIG_LRU_GEN_WALKS_MMU */ static void reset_mm_stats(struct lru_gen_mm_walk *walk, bool last) { … } static bool iterate_mm_list(struct lru_gen_mm_walk *walk, struct mm_struct **iter) { … } static bool 
iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long seq) { … } /****************************************************************************** * PID controller ******************************************************************************/ /* * A feedback loop based on Proportional-Integral-Derivative (PID) controller. * * The P term is refaulted/(evicted+protected) from a tier in the generation * currently being evicted; the I term is the exponential moving average of the * P term over the generations previously evicted, using the smoothing factor * 1/2; the D term isn't supported. * * The setpoint (SP) is always the first tier of one type; the process variable * (PV) is either any tier of the other type or any other tier of the same * type. * * The error is the difference between the SP and the PV; the correction is to * turn off protection when SP>PV or turn on protection when SP<PV. * * For future optimizations: * 1. The D term may discount the other two terms over time so that long-lived * generations can resist stale information. 
*/ struct ctrl_pos { … }; static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, struct ctrl_pos *pos) { … } static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover) { … } static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv) { … } /****************************************************************************** * the aging ******************************************************************************/ /* promote pages accessed through page tables */ static int folio_update_gen(struct folio *folio, int gen) { … } /* protect pages accessed multiple times through file descriptors */ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { … } static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio, int old_gen, int new_gen) { … } static void reset_batch_size(struct lru_gen_mm_walk *walk) { … } static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *args) { … } /* * Some userspace memory allocators map many single-page VMAs. Instead of * returning back to the PGD table for each of such VMAs, finish an entire PMD * table to reduce zigzags and improve cache performance. 
*/ static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk *args, unsigned long *vm_start, unsigned long *vm_end) { … } static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr, struct pglist_data *pgdat) { … } static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr, struct pglist_data *pgdat) { … } static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg, struct pglist_data *pgdat, bool can_swap) { … } static bool suitable_to_scan(int total, int young) { … } static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, struct mm_walk *args) { … } static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma, struct mm_walk *args, unsigned long *bitmap, unsigned long *first) { … } static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, struct mm_walk *args) { … } static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end, struct mm_walk *args) { … } static void walk_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk) { … } static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat, bool force_alloc) { … } static void clear_mm_walk(void) { … } static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) { … } static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) { … } static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq, bool can_swap, bool force_scan) { … } static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq, bool can_swap, bool force_scan) { … } /****************************************************************************** * working set protection ******************************************************************************/ static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc) { … } static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc) { 
… } static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc, unsigned long min_ttl) { … } /* to protect the working set of the last N jiffies */ static unsigned long lru_gen_min_ttl __read_mostly; static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) { … } /****************************************************************************** * rmap/PT walk feedback ******************************************************************************/ /* * This function exploits spatial locality when shrink_folio_list() walks the * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If * the scan was done cacheline efficiently, it adds the PMD entry pointing to * the PTE table to the Bloom filter. This forms a feedback loop between the * eviction and the aging. */ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) { … } /****************************************************************************** * memcg LRU ******************************************************************************/ /* see the comment on MEMCG_NR_GENS */ enum { … }; static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op) { … } #ifdef CONFIG_MEMCG void lru_gen_online_memcg(struct mem_cgroup *memcg) { … } void lru_gen_offline_memcg(struct mem_cgroup *memcg) { … } void lru_gen_release_memcg(struct mem_cgroup *memcg) { … } void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid) { … } #endif /* CONFIG_MEMCG */ /****************************************************************************** * the eviction ******************************************************************************/ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc, int tier_idx) { … } static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc) { … } static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, int type, int tier, struct list_head *list) { … } static int 
get_tier_idx(struct lruvec *lruvec, int type) { … } static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx) { … } static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness, int *type_scanned, struct list_head *list) { … } static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness) { … } static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, bool can_swap, unsigned long *nr_to_scan) { … } /* * For future optimizations: * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg * reclaim. */ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap) { … } static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc) { … } static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { … } static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) { … } static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) { … } static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { … } static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc) { … } /****************************************************************************** * state change ******************************************************************************/ static bool __maybe_unused state_is_valid(struct lruvec *lruvec) { … } static bool fill_evictable(struct lruvec *lruvec) { … } static bool drain_evictable(struct lruvec *lruvec) { … } static void lru_gen_change_state(bool enabled) { … } /****************************************************************************** * sysfs interface ******************************************************************************/ static ssize_t min_ttl_ms_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { … } /* see Documentation/admin-guide/mm/multigen_lru.rst for details */ static ssize_t 
min_ttl_ms_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t len) { … } static struct kobj_attribute lru_gen_min_ttl_attr = …; static ssize_t enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { … } /* see Documentation/admin-guide/mm/multigen_lru.rst for details */ static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t len) { … } static struct kobj_attribute lru_gen_enabled_attr = …; static struct attribute *lru_gen_attrs[] = …; static const struct attribute_group lru_gen_attr_group = …; /****************************************************************************** * debugfs interface ******************************************************************************/ static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos) { … } static void lru_gen_seq_stop(struct seq_file *m, void *v) { … } static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos) { … } static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq, unsigned long seq) { … } /* see Documentation/admin-guide/mm/multigen_lru.rst for details */ static int lru_gen_seq_show(struct seq_file *m, void *v) { … } static const struct seq_operations lru_gen_seq_ops = …; static int run_aging(struct lruvec *lruvec, unsigned long seq, bool can_swap, bool force_scan) { … } static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc, int swappiness, unsigned long nr_to_reclaim) { … } static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, struct scan_control *sc, int swappiness, unsigned long opt) { … } /* see Documentation/admin-guide/mm/multigen_lru.rst for details */ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, size_t len, loff_t *pos) { … } static int lru_gen_seq_open(struct inode *inode, struct file *file) { … } static const struct file_operations 
lru_gen_rw_fops = …; static const struct file_operations lru_gen_ro_fops = …; /****************************************************************************** * initialization ******************************************************************************/ void lru_gen_init_pgdat(struct pglist_data *pgdat) { … } void lru_gen_init_lruvec(struct lruvec *lruvec) { … } #ifdef CONFIG_MEMCG void lru_gen_init_memcg(struct mem_cgroup *memcg) { … } void lru_gen_exit_memcg(struct mem_cgroup *memcg) { … } #endif /* CONFIG_MEMCG */ static int __init init_lru_gen(void) { BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); if (sysfs_create_group(mm_kobj, &lru_gen_attr_group)) pr_err("lru_gen: failed to create sysfs group\n"); debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops); debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops); return 0; }; late_initcall(init_lru_gen); #else /* !CONFIG_LRU_GEN */ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) { BUILD_BUG(); } static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { BUILD_BUG(); } static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc) { BUILD_BUG(); } #endif /* CONFIG_LRU_GEN */ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { … } /* Use reclaim/compaction for costly allocs or under memory pressure */ static bool in_reclaim_compaction(struct scan_control *sc) { … } /* * Reclaim/compaction is used for high-order allocation requests. It reclaims * order-0 pages before compacting the zone. should_continue_reclaim() returns * true if more pages should be reclaimed such that when the page allocator * calls try_to_compact_pages() that it will have enough free pages to succeed. * It will give up earlier than that if there is difficulty reclaiming pages. 
*/ static inline bool should_continue_reclaim(struct pglist_data *pgdat, unsigned long nr_reclaimed, struct scan_control *sc) { … } static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) { … } static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) { … } /* * Returns true if compaction should go ahead for a costly-order request, or * the allocation would already succeed without compaction. Returns false if we * should reclaim first. */ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) { … } static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc) { … } /* * This is the direct reclaim path, for page-allocating processes. We only * try to reclaim pages from zones which will satisfy the caller's allocation * request. * * If a zone is deemed to be full of pinned pages then just give it a light * scan and then give up on it. */ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) { … } static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat) { … } /* * This is the main entry point to direct page reclaim. * * If a full scan of the inactive list fails to free enough memory then we * are "out of memory" and something needs to be killed. * * If the caller is !__GFP_FS then the probability of a failure is reasonably * high - the zone may be full of dirty or under-writeback pages, which this * caller can't do much about. We kick the writeback threads and take explicit * naps in the hope that some of these pages can be written. But if the * allocating task holds filesystem locks which prevent writeout this might not * work, and the allocation attempt will fail.
* * Returns: 0 if no pages were reclaimed, * otherwise the number of pages reclaimed. */ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct scan_control *sc) { … } static bool allow_direct_reclaim(pg_data_t *pgdat) { … } /* * Throttle direct reclaimers if backing storage is backed by the network * and the PFMEMALLOC reserve for the preferred node is getting dangerously * depleted. kswapd will continue to make progress and wake the processes * when the low watermark is reached. * * Returns true if a fatal signal was delivered during throttling. If this * happens, the page allocator should not consider triggering the OOM killer. */ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, nodemask_t *nodemask) { … } unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask) { … } #ifdef CONFIG_MEMCG /* Only used by soft limit reclaim. Do not reuse for anything else. */ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, gfp_t gfp_mask, bool noswap, pg_data_t *pgdat, unsigned long *nr_scanned) { … } unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, unsigned int reclaim_options, int *swappiness) { … } #endif static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc) { … } static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx) { … } /* * Returns true if there is an eligible zone balanced for the request order * and highest_zoneidx. */ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx) { … } /* Clear pgdat state for congested, dirty or under writeback. */ static void clear_pgdat_congested(pg_data_t *pgdat) { … } /* * Prepare kswapd for sleeping. This verifies that there are no processes * waiting in throttle_direct_reclaim() and that watermarks have been met.
* * Returns true if kswapd is ready to sleep */ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int highest_zoneidx) { … } /* * kswapd shrinks a node of pages that are at or below the highest usable * zone that is currently unbalanced. * * Returns true if kswapd scanned at least the requested number of pages to * reclaim or if the lack of progress was due to pages under writeback. * This is used to determine if the scanning priority needs to be raised. */ static bool kswapd_shrink_node(pg_data_t *pgdat, struct scan_control *sc) { … } /* Page allocator PCP high watermark is lowered if reclaim is active. */ static inline void update_reclaim_active(pg_data_t *pgdat, int highest_zoneidx, bool active) { … } static inline void set_reclaim_active(pg_data_t *pgdat, int highest_zoneidx) { … } static inline void clear_reclaim_active(pg_data_t *pgdat, int highest_zoneidx) { … } /* * For kswapd, balance_pgdat() will reclaim pages across a node from zones * that are eligible for use by the caller until at least one zone is * balanced. * * Returns the order kswapd finished reclaiming at. * * kswapd scans the zones in the highmem->normal->dma direction. It skips * zones which have free_pages > high_wmark_pages(zone), but once a zone is * found to have free_pages <= high_wmark_pages(zone), any page in that zone * or lower is eligible for reclaim until at least one usable zone is * balanced. */ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) { … } /* * The pgdat->kswapd_highest_zoneidx is used to pass the highest zone index to * be reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is * not a valid index then either kswapd runs for the first time or kswapd couldn't * sleep after the previous reclaim attempt (node is still unbalanced). In that * case return the zone index of the previous kswapd reclaim cycle.
*/ static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat, enum zone_type prev_highest_zoneidx) { … } static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order, unsigned int highest_zoneidx) { … } /* * The background pageout daemon, started as a kernel thread * from the init process. * * This basically trickles out pages so that we have _some_ * free memory available even if there is no other activity * that frees anything up. This is needed for things like routing * etc, where we otherwise might have all activity going on in * asynchronous contexts that cannot page things out. * * If there are applications that are active memory-allocators * (most normal use), this basically shouldn't matter. */ static int kswapd(void *p) { … } /* * A zone is low on free memory or too fragmented for high-order memory. If * kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's * pgdat. It will wake up kcompactd after reclaiming memory. If kswapd reclaim * has failed or is not needed, still wake up kcompactd if only compaction is * needed. */ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, enum zone_type highest_zoneidx) { … } #ifdef CONFIG_HIBERNATION /* * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of * freed pages. * * Rather than trying to age LRUs the aim is to preserve the overall * LRU order by reclaiming preferentially * inactive > active > active referenced > active mapped */ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) { … } #endif /* CONFIG_HIBERNATION */ /* * This kswapd start function will be called by init and node-hot-add. */ void __meminit kswapd_run(int nid) { … } /* * Called by memory hotplug when all memory in a node is offlined. Caller must * be holding mem_hotplug_begin/done().
*/ void __meminit kswapd_stop(int nid) { … } static int __init kswapd_init(void) { … } module_init(…) … #ifdef CONFIG_NUMA /* * Node reclaim mode * * If non-zero, call node_reclaim when the number of free pages falls below * the watermarks. */ int node_reclaim_mode __read_mostly; /* * Priority for NODE_RECLAIM. This determines the fraction of pages * of a node considered for each zone_reclaim. 4 scans 1/16th of * a zone. */ #define NODE_RECLAIM_PRIORITY … /* * Percentage of pages in a zone that must be unmapped for node_reclaim to * occur. */ int sysctl_min_unmapped_ratio = …; /* * If the number of slab pages in a zone grows beyond this percentage then * slab reclaim needs to occur. */ int sysctl_min_slab_ratio = …; static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat) { … } /* Work out how many page cache pages we can reclaim in this reclaim_mode */ static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat) { … } /* * Try to free up some pages from this node through reclaim. */ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) { … } int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) { … } #endif /** * check_move_unevictable_folios - Move evictable folios to appropriate zone * lru list * @fbatch: Batch of lru folios to check. * * Checks folios for evictability; if an evictable folio is in the unevictable * lru list, moves it to the appropriate evictable lru list. This function * should only be used for lru folios. */ void check_move_unevictable_folios(struct folio_batch *fbatch) { … } EXPORT_SYMBOL_GPL(…);