// SPDX-License-Identifier: GPL-2.0-or-later /* memcontrol.c - Memory Controller * * Copyright IBM Corporation, 2007 * Author Balbir Singh <[email protected]> * * Copyright 2007 OpenVZ SWsoft Inc * Author: Pavel Emelianov <[email protected]> * * Memory thresholds * Copyright (C) 2009 Nokia Corporation * Author: Kirill A. Shutemov * * Kernel Memory Controller * Copyright (C) 2012 Parallels Inc. and Google Inc. * Authors: Glauber Costa and Suleiman Souhlal * * Native page reclaim * Charge lifetime sanitation * Lockless page tracking & accounting * Unified hierarchy configuration model * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner * * Per memcg lru locking * Copyright (C) 2020 Alibaba, Inc, Alex Shi */ #include <linux/page_counter.h> #include <linux/memcontrol.h> #include <linux/cgroup.h> #include <linux/sched/mm.h> #include <linux/shmem_fs.h> #include <linux/hugetlb.h> #include <linux/pagemap.h> #include <linux/pagevec.h> #include <linux/vm_event_item.h> #include <linux/smp.h> #include <linux/page-flags.h> #include <linux/backing-dev.h> #include <linux/bit_spinlock.h> #include <linux/rcupdate.h> #include <linux/limits.h> #include <linux/export.h> #include <linux/mutex.h> #include <linux/rbtree.h> #include <linux/slab.h> #include <linux/swapops.h> #include <linux/spinlock.h> #include <linux/fs.h> #include <linux/seq_file.h> #include <linux/parser.h> #include <linux/vmpressure.h> #include <linux/memremap.h> #include <linux/mm_inline.h> #include <linux/swap_cgroup.h> #include <linux/cpu.h> #include <linux/oom.h> #include <linux/lockdep.h> #include <linux/resume_user_mode.h> #include <linux/psi.h> #include <linux/seq_buf.h> #include <linux/sched/isolation.h> #include <linux/kmemleak.h> #include "internal.h" #include <net/sock.h> #include <net/ip.h> #include "slab.h" #include "memcontrol-v1.h" #include <linux/uaccess.h> #include <trace/events/vmscan.h> struct cgroup_subsys memory_cgrp_subsys __read_mostly; EXPORT_SYMBOL(…); struct mem_cgroup *root_mem_cgroup __read_mostly; /* Active memory cgroup to use from an interrupt context */ DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg); EXPORT_PER_CPU_SYMBOL_GPL(…); /* Socket memory accounting disabled? */ static bool cgroup_memory_nosocket __ro_after_init; /* Kernel memory accounting disabled? */ static bool cgroup_memory_nokmem __ro_after_init; /* BPF memory accounting disabled? */ static bool cgroup_memory_nobpf __ro_after_init; #ifdef CONFIG_CGROUP_WRITEBACK static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); #endif #define THRESHOLDS_EVENTS_TARGET … #define SOFTLIMIT_EVENTS_TARGET … static inline bool task_is_dying(void) { … } /* Some nice accessors for the vmpressure. */ struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) { … } struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr) { … } #define CURRENT_OBJCG_UPDATE_BIT … #define CURRENT_OBJCG_UPDATE_FLAG … static DEFINE_SPINLOCK(objcg_lock); bool mem_cgroup_kmem_disabled(void) { … } static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg, unsigned int nr_pages); static void obj_cgroup_release(struct percpu_ref *ref) { … } static struct obj_cgroup *obj_cgroup_alloc(void) { … } static void memcg_reparent_objcgs(struct mem_cgroup *memcg, struct mem_cgroup *parent) { … } /* * A lot of the calls to the cache allocation functions are expected to be * inlined by the compiler. 
Since the calls to memcg_slab_post_alloc_hook() are * conditional to this static branch, we'll have to allow modules that do * kmem_cache_alloc and the like to see this symbol as well */ DEFINE_STATIC_KEY_FALSE(memcg_kmem_online_key); EXPORT_SYMBOL(…); DEFINE_STATIC_KEY_FALSE(memcg_bpf_enabled_key); EXPORT_SYMBOL(…); /** * mem_cgroup_css_from_folio - css of the memcg associated with a folio * @folio: folio of interest * * If memcg is bound to the default hierarchy, css of the memcg associated * with @folio is returned. The returned css remains associated with @folio * until it is released. * * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup * is returned. */ struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio) { … } /** * page_cgroup_ino - return inode number of the memcg a page is charged to * @page: the page * * Look up the closest online ancestor of the memory cgroup @page is charged to * and return its inode number or 0 if @page is not charged to any cgroup. It * is safe to call this function without holding a reference to @page. * * Note, this function is inherently racy, because there is nothing to prevent * the cgroup inode from getting torn down and potentially reallocated a moment * after page_cgroup_ino() returns, so it should only be used by callers that * do not care (such as procfs interfaces). */ ino_t page_cgroup_ino(struct page *page) { … } /* Subset of node_stat_item for memcg stats */ static const unsigned int memcg_node_stat_items[] = …; static const unsigned int memcg_stat_items[] = …; #define NR_MEMCG_NODE_STAT_ITEMS … #define MEMCG_VMSTAT_SIZE … static int8_t mem_cgroup_stats_index[MEMCG_NR_STAT] __read_mostly; static void init_memcg_stats(void) { … } static inline int memcg_stats_index(int idx) { … } struct lruvec_stats_percpu { … }; struct lruvec_stats { … }; unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) { … } unsigned long lruvec_page_state_local(struct lruvec *lruvec, enum node_stat_item idx) { … } /* Subset of vm_event_item to report for memcg event stats */ static const unsigned int memcg_vm_event_stat[] = …; #define NR_MEMCG_EVENTS … static int8_t mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly; static void init_memcg_events(void) { … } static inline int memcg_events_index(enum vm_event_item idx) { … } struct memcg_vmstats_percpu { … } ____cacheline_aligned; struct memcg_vmstats { … }; /* * memcg and lruvec stats flushing * * Many codepaths leading to stats update or read are performance sensitive and * adding stats flushing in such codepaths is not desirable. So, to optimize * flushing, the kernel does: * * 1) Periodically and asynchronously flush the stats every 2 seconds so that * the rstat update tree does not grow unbounded. * * 2) Flush the stats synchronously on the reader side only when there are more * than (MEMCG_CHARGE_BATCH * nr_cpus) update events. This optimization can * let the stats be out of sync by at most (MEMCG_CHARGE_BATCH * nr_cpus) * update events, but only for 2 seconds due to (1). */ static void flush_memcg_stats_dwork(struct work_struct *w); static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork); static u64 flush_last_time; #define FLUSH_TIME … /* * Accessors to ensure that preemption is disabled on PREEMPT_RT, because an * acquired spinlock_t lock does not imply disabled preemption there. These * functions are never used in hardirq context on PREEMPT_RT and therefore * disabling preemption is sufficient.
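 *
 * As a hedged illustration only (the elided definitions below are the
 * authoritative ones), such an accessor pair is expected to boil down to
 * roughly:
 *
 *	static void memcg_stats_lock(void)
 *	{
 *		preempt_disable_nested();
 *		VM_WARN_ON_IRQS_ENABLED();
 *	}
 *
 *	static void memcg_stats_unlock(void)
 *	{
 *		preempt_enable_nested();
 *	}
 *
 * where preempt_disable_nested()/preempt_enable_nested() only disable and
 * re-enable preemption when CONFIG_PREEMPT_RT is enabled.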
*/ static void memcg_stats_lock(void) { … } static void __memcg_stats_lock(void) { … } static void memcg_stats_unlock(void) { … } static bool memcg_vmstats_needs_flush(struct memcg_vmstats *vmstats) { … } static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) { … } static void do_flush_stats(struct mem_cgroup *memcg) { … } /* * mem_cgroup_flush_stats - flush the stats of a memory cgroup subtree * @memcg: root of the subtree to flush * * Flushing is serialized by the underlying global rstat lock. There is also a * minimum amount of work to be done even if there are no stat updates to flush. * Hence, we only flush the stats if the updates delta exceeds a threshold. This * avoids unnecessary work and contention on the underlying lock. */ void mem_cgroup_flush_stats(struct mem_cgroup *memcg) { … } void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg) { … } static void flush_memcg_stats_dwork(struct work_struct *w) { … } unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { … } static int memcg_page_state_unit(int item); /* * Normalize the value passed into memcg_rstat_updated() to be in pages. Round * up non-zero sub-page updates to 1 page as zero page updates are ignored. */ static int memcg_state_val_in_pages(int idx, int val) { … } /** * __mod_memcg_state - update cgroup memory statistics * @memcg: the memory cgroup * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item * @val: delta to add to the counter, can be negative */ void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, int val) { … } /* idx can be of type enum memcg_stat_item or node_stat_item. */ unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) { … } static void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { … } /** * __mod_lruvec_state - update lruvec memory statistics * @lruvec: the lruvec * @idx: the stat item * @val: delta to add to the counter, can be negative * * The lruvec is the intersection of the NUMA node and a cgroup. This * function updates all three counters that are affected by a * change of state at this level: per-node, per-cgroup, per-lruvec. */ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { … } void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val) { … } EXPORT_SYMBOL(…); void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { … } /** * __count_memcg_events - account VM events in a cgroup * @memcg: the memory cgroup * @idx: the event item * @count: the number of events that occurred */ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count) { … } unsigned long memcg_events(struct mem_cgroup *memcg, int event) { … } unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) { … } void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, int nr_pages) { … } bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, enum mem_cgroup_events_target target) { … } struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) { … } EXPORT_SYMBOL(…); static __always_inline struct mem_cgroup *active_memcg(void) { … } /** * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg. * @mm: mm from which memcg should be extracted. It can be NULL. * * Obtains a reference on mm->memcg and returns it if successful. If mm * is NULL, then the memcg is chosen as follows: * 1) The active memcg, if set.
* 2) current->mm->memcg, if available * 3) root memcg * If mem_cgroup is disabled, NULL is returned. */ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) { … } EXPORT_SYMBOL(…); /** * get_mem_cgroup_from_current - Obtain a reference on current task's memcg. */ struct mem_cgroup *get_mem_cgroup_from_current(void) { … } /** * mem_cgroup_iter - iterate over memory cgroup hierarchy * @root: hierarchy root * @prev: previously returned memcg, NULL on first invocation * @reclaim: cookie for shared reclaim walks, NULL for full walks * * Returns references to children of the hierarchy below @root, or * @root itself, or %NULL after a full round-trip. * * Caller must pass the return value in @prev on subsequent * invocations for reference counting, or use mem_cgroup_iter_break() * to cancel a hierarchy walk before the round-trip is complete. * * Reclaimers can specify a node in @reclaim to divide up the memcgs * in the hierarchy among all concurrent reclaimers operating on the * same node. */ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup *prev, struct mem_cgroup_reclaim_cookie *reclaim) { … } /** * mem_cgroup_iter_break - abort a hierarchy walk prematurely * @root: hierarchy root * @prev: last visited hierarchy member as returned by mem_cgroup_iter() */ void mem_cgroup_iter_break(struct mem_cgroup *root, struct mem_cgroup *prev) { … } static void __invalidate_reclaim_iterators(struct mem_cgroup *from, struct mem_cgroup *dead_memcg) { … } static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) { … } /** * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy * @memcg: hierarchy root * @fn: function to call for each task * @arg: argument passed to @fn * * This function iterates over tasks attached to @memcg or to any of its * descendants and calls @fn for each task. If @fn returns a non-zero * value, the function breaks the iteration loop. Otherwise, it will iterate * over all tasks and return 0. * * This function must not be called for the root memory cgroup. */ void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, int (*fn)(struct task_struct *, void *), void *arg) { … } #ifdef CONFIG_DEBUG_VM void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) { … } #endif /** * folio_lruvec_lock - Lock the lruvec for a folio. * @folio: Pointer to the folio. * * These functions are safe to use under any of the following conditions: * - folio locked * - folio_test_lru false * - folio_memcg_lock() * - folio frozen (refcount of 0) * * Return: The lruvec this folio is on with its lock held. */ struct lruvec *folio_lruvec_lock(struct folio *folio) { … } /** * folio_lruvec_lock_irq - Lock the lruvec for a folio. * @folio: Pointer to the folio. * * These functions are safe to use under any of the following conditions: * - folio locked * - folio_test_lru false * - folio_memcg_lock() * - folio frozen (refcount of 0) * * Return: The lruvec this folio is on with its lock held and interrupts * disabled. */ struct lruvec *folio_lruvec_lock_irq(struct folio *folio) { … } /** * folio_lruvec_lock_irqsave - Lock the lruvec for a folio. * @folio: Pointer to the folio. * @flags: Pointer to irqsave flags. * * These functions are safe to use under any of the following conditions: * - folio locked * - folio_test_lru false * - folio_memcg_lock() * - folio frozen (refcount of 0) * * Return: The lruvec this folio is on with its lock held and interrupts * disabled. 
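 *
 * A hedged usage sketch (assuming the caller satisfies one of the
 * conditions above, for example it holds the folio lock), paired with
 * unlock_page_lruvec_irqrestore():
 *
 *	struct lruvec *lruvec;
 *	unsigned long flags;
 *
 *	lruvec = folio_lruvec_lock_irqsave(folio, &flags);
 *	... update the folio's LRU state ...
 *	unlock_page_lruvec_irqrestore(lruvec, flags);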
*/ struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, unsigned long *flags) { … } /** * mem_cgroup_update_lru_size - account for adding or removing an lru page * @lruvec: mem_cgroup per zone lru vector * @lru: index of lru list the page is sitting on * @zid: zone id of the accounted pages * @nr_pages: positive when adding or negative when removing * * This function must be called under lru_lock, just before a page is added * to or just after a page is removed from an lru list. */ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, int zid, int nr_pages) { … } /** * mem_cgroup_margin - calculate chargeable space of a memory cgroup * @memcg: the memory cgroup * * Returns the maximum amount of memory @mem can be charged with, in * pages. */ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) { … } struct memory_stat { … }; static const struct memory_stat memory_stats[] = …; /* The actual unit of the state item, not the same as the output unit */ static int memcg_page_state_unit(int item) { … } /* Translate stat items to the correct unit for memory.stat output */ static int memcg_page_state_output_unit(int item) { … } unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item) { … } unsigned long memcg_page_state_local_output(struct mem_cgroup *memcg, int item) { … } static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) { … } static void memory_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) { … } /** * mem_cgroup_print_oom_context: Print OOM information relevant to * memory controller. * @memcg: The memory cgroup that went over limit * @p: Task that is going to be killed * * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is * enabled */ void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p) { … } /** * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to * memory controller. * @memcg: The memory cgroup that went over limit */ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) { … } /* * Return the memory (and swap, if configured) limit for a memcg. */ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) { … } unsigned long mem_cgroup_size(struct mem_cgroup *memcg) { … } static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, int order) { … } /* * Returns true if successfully killed one or more processes. Though in some * corner cases it can return true even without killing any process. */ static bool mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) { … } /** * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM * @victim: task to be killed by the OOM killer * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM * * Returns a pointer to a memory cgroup, which has to be cleaned up * by killing all belonging OOM-killable tasks. * * Caller has to call mem_cgroup_put() on the returned non-NULL memcg. */ struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, struct mem_cgroup *oom_domain) { … } void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) { … } struct memcg_stock_pcp { … }; static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = …; static DEFINE_MUTEX(percpu_charge_mutex); static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock); static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, struct mem_cgroup *root_memcg); /** * consume_stock: Try to consume stocked charge on this cpu. 
* @memcg: memcg to consume from. * @nr_pages: how many pages to charge. * * The charges will only happen if @memcg matches the current cpu's memcg * stock, and at least @nr_pages are available in that stock. Failure to * service an allocation will refill the stock. * * Returns true if successful, false otherwise. */ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { … } /* * Returns stocks cached in percpu and resets cached information. */ static void drain_stock(struct memcg_stock_pcp *stock) { … } static void drain_local_stock(struct work_struct *dummy) { … } /* * Cache charges(val) to the local per_cpu area. * This will be consumed by the consume_stock() function later. */ static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { … } static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { … } /* * Drains all per-CPU charge caches for the given root_memcg and the subtree * of the hierarchy under it. */ void drain_all_stock(struct mem_cgroup *root_memcg) { … } static int memcg_hotplug_cpu_dead(unsigned int cpu) { … } static unsigned long reclaim_high(struct mem_cgroup *memcg, unsigned int nr_pages, gfp_t gfp_mask) { … } static void high_work_func(struct work_struct *work) { … } /* * Clamp the maximum sleep time per allocation batch to 2 seconds. This is * enough to cause a significant slowdown in most cases, while still * allowing diagnostics and tracing to proceed without becoming stuck. */ #define MEMCG_MAX_HIGH_DELAY_JIFFIES … /* * When calculating the delay, we use these on either side of the exponentiation * to maintain precision and scale to a reasonable number of jiffies (see the * table below). * * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the * overage ratio to a delay. * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the * proposed penalty in order to reduce it to a reasonable number of jiffies, * and to produce a reasonable delay curve. * * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a * reasonable delay curve compared to precision-adjusted overage, not * penalising heavily at first, but still making sure that growth beyond the * limit penalises misbehaving cgroups by slowing them down exponentially. For * example, with a high of 100 megabytes: * * +-------+------------------------+ * | usage | time to allocate in ms | * +-------+------------------------+ * | 100M | 0 | * | 101M | 6 | * | 102M | 25 | * | 103M | 57 | * | 104M | 102 | * | 105M | 159 | * | 106M | 230 | * | 107M | 313 | * | 108M | 409 | * | 109M | 518 | * | 110M | 639 | * | 111M | 774 | * | 112M | 921 | * | 113M | 1081 | * | 114M | 1254 | * | 115M | 1439 | * | 116M | 1638 | * | 117M | 1849 | * | 118M | 2000 | * | 119M | 2000 | * | 120M | 2000 | * +-------+------------------------+ */ #define MEMCG_DELAY_PRECISION_SHIFT … #define MEMCG_DELAY_SCALING_SHIFT … static u64 calculate_overage(unsigned long usage, unsigned long high) { … } static u64 mem_find_max_overage(struct mem_cgroup *memcg) { … } static u64 swap_find_max_overage(struct mem_cgroup *memcg) { … } /* * Get the number of jiffies that we should penalise a mischievous cgroup which * is exceeding its memory.high by checking both it and its ancestors. */ static unsigned long calculate_high_delay(struct mem_cgroup *memcg, unsigned int nr_pages, u64 max_overage) { … } /* * Reclaims memory over the high limit.
Called directly from * try_charge() (context permitting), as well as from the userland * return path where reclaim is always able to block. */ void mem_cgroup_handle_over_high(gfp_t gfp_mask) { … } int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned int nr_pages) { … } /** * mem_cgroup_cancel_charge() - cancel an uncommitted try_charge() call. * @memcg: memcg previously charged. * @nr_pages: number of pages previously charged. */ void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) { … } static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) { … } /** * mem_cgroup_commit_charge - commit a previously successful try_charge(). * @folio: folio to commit the charge to. * @memcg: memcg previously charged. */ void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg) { … } static inline void __mod_objcg_mlstate(struct obj_cgroup *objcg, struct pglist_data *pgdat, enum node_stat_item idx, int nr) { … } static __always_inline struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p) { … } /* * Returns a pointer to the memory cgroup to which the kernel object is charged. * * A passed kernel object can be a slab object, vmalloc object or a generic * kernel page, so different mechanisms for getting the memory cgroup pointer * should be used. * * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller * cannot know for sure how the kernel object is implemented. * mem_cgroup_from_obj() can be safely used in such cases. * * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(), * cgroup_mutex, etc. */ struct mem_cgroup *mem_cgroup_from_obj(void *p) { … } /* * Returns a pointer to the memory cgroup to which the kernel object is charged. * Similar to mem_cgroup_from_obj(), but faster and not suitable for objects * allocated using vmalloc(). * * A passed kernel object must be a slab object or a generic kernel page. * * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(), * cgroup_mutex, etc. */ struct mem_cgroup *mem_cgroup_from_slab_obj(void *p) { … } static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg) { … } static struct obj_cgroup *current_objcg_update(void) { … } __always_inline struct obj_cgroup *current_obj_cgroup(void) { … } struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio) { … } /* * obj_cgroup_uncharge_pages: uncharge a number of kernel pages from an objcg * @objcg: object cgroup to uncharge * @nr_pages: number of pages to uncharge */ static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg, unsigned int nr_pages) { … } /* * obj_cgroup_charge_pages: charge a number of kernel pages to an objcg * @objcg: object cgroup to charge * @gfp: reclaim mode * @nr_pages: number of pages to charge * * Returns 0 on success, an error code on failure. */ static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp, unsigned int nr_pages) { … } /** * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup * @page: page to charge * @gfp: reclaim mode * @order: allocation order * * Returns 0 on success, an error code on failure.
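 *
 * Hedged usage sketch, mirroring in spirit how a __GFP_ACCOUNT page
 * allocation is charged and backed out on failure (gfp and order are
 * assumed to be in scope; this is not the literal allocator code):
 *
 *	struct page *page;
 *
 *	page = alloc_pages(gfp | __GFP_ACCOUNT, order);
 *	if (page && __memcg_kmem_charge_page(page, gfp, order)) {
 *		__free_pages(page, order);
 *		page = NULL;
 *	}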
*/ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) { … } /** * __memcg_kmem_uncharge_page: uncharge a kmem page * @page: page to uncharge * @order: allocation order */ void __memcg_kmem_uncharge_page(struct page *page, int order) { … } static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, enum node_stat_item idx, int nr) { … } static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) { … } static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) { … } static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, struct mem_cgroup *root_memcg) { … } static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, bool allow_uncharge) { … } int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) { … } void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) { … } static inline size_t obj_full_size(struct kmem_cache *s) { … } bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, gfp_t flags, size_t size, void **p) { … } void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, int objects, struct slabobj_ext *obj_exts) { … } /* * Because folio_memcg(head) is not set on tails, set it now. */ void split_page_memcg(struct page *head, int old_order, int new_order) { … } unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) { … } static int memcg_online_kmem(struct mem_cgroup *memcg) { … } static void memcg_offline_kmem(struct mem_cgroup *memcg) { … } #ifdef CONFIG_CGROUP_WRITEBACK #include <trace/events/writeback.h> static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) { … } static void memcg_wb_domain_exit(struct mem_cgroup *memcg) { … } static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) { … } struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) { … } /** * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg * @wb: bdi_writeback in question * @pfilepages: out parameter for number of file pages * @pheadroom: out parameter for number of allocatable pages according to memcg * @pdirty: out parameter for number of dirty pages * @pwriteback: out parameter for number of pages under writeback * * Determine the numbers of file, headroom, dirty, and writeback pages in * @wb's memcg. File, dirty and writeback are self-explanatory. Headroom * is a bit more involved. * * A memcg's headroom is "min(max, high) - used". In the hierarchy, the * headroom is calculated as the lowest headroom of itself and the * ancestors. Note that this doesn't consider the actual amount of * available memory in the system. The caller should further cap * *@pheadroom accordingly. */ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, unsigned long *pheadroom, unsigned long *pdirty, unsigned long *pwriteback) { … } /* * Foreign dirty flushing * * There's an inherent mismatch between memcg and writeback. The former * tracks ownership per-page while the latter per-inode. This was a * deliberate design decision because honoring per-page ownership in the * writeback path is complicated, may lead to higher CPU and IO overheads * and deemed unnecessary given that write-sharing an inode across * different cgroups isn't a common use-case. * * Combined with inode majority-writer ownership switching, this works well * enough in most cases but there are some pathological cases. 
For * example, let's say there are two cgroups A and B which keep writing to * different but confined parts of the same inode. B owns the inode and * A's memory is limited far below B's. A's dirty ratio can rise enough to * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid * triggering background writeback. A will be slowed down without a way to * make writeback of the dirty pages happen. * * Conditions like the above can lead to a cgroup getting repeatedly and * severely throttled after making some progress after each * dirty_expire_interval while the underlying IO device is almost * completely idle. * * Solving this problem completely requires matching the ownership tracking * granularities between memcg and writeback in either direction. However, * the more egregious behaviors can be avoided by simply remembering the * most recent foreign dirtying events and initiating remote flushes on * them when local writeback isn't enough to keep the memory clean enough. * * The following two functions implement such mechanism. When a foreign * page - a page whose memcg and writeback ownerships don't match - is * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning * bdi_writeback on the page owning memcg. When balance_dirty_pages() * decides that the memcg needs to sleep due to high dirty ratio, it calls * mem_cgroup_flush_foreign() which queues writeback on the recorded * foreign bdi_writebacks which haven't expired. Both the numbers of * recorded bdi_writebacks and concurrent in-flight foreign writebacks are * limited to MEMCG_CGWB_FRN_CNT. * * The mechanism only remembers IDs and doesn't hold any object references. * As being wrong occasionally doesn't matter, updates and accesses to the * records are lockless and racy. */ void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio, struct bdi_writeback *wb) { … } /* issue foreign writeback flushes for recorded foreign dirtying events */ void mem_cgroup_flush_foreign(struct bdi_writeback *wb) { … } #else /* CONFIG_CGROUP_WRITEBACK */ static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) { return 0; } static void memcg_wb_domain_exit(struct mem_cgroup *memcg) { } static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) { } #endif /* CONFIG_CGROUP_WRITEBACK */ /* * Private memory cgroup IDR * * Swap-out records and page cache shadow entries need to store memcg * references in constrained space, so we maintain an ID space that is * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of * memory-controlled cgroups to 64k. * * However, there usually are many references to the offline CSS after * the cgroup has been destroyed, such as page cache or reclaimable * slab objects, that don't need to hang on to the ID. We want to keep * those dead CSS from occupying IDs, or we might quickly exhaust the * relatively small ID space and prevent the creation of new cgroups * even when there are much fewer than 64k cgroups - possibly none. * * Maintain a private 16-bit ID space for memcg, and allow the ID to * be freed and recycled when it's no longer needed, which is usually * when the CSS is offlined. * * The only exception to that are records of swapped out tmpfs/shmem * pages that need to be attributed to live ancestors on swapin. But * those references are manageable from userspace. 
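 *
 * As a hedged sketch of the allocation side (the elided helpers below are
 * authoritative), IDs are expected to come from an IDR capped at
 * MEM_CGROUP_ID_MAX:
 *
 *	idr_preload(GFP_KERNEL);
 *	spin_lock(&memcg_idr_lock);
 *	id = idr_alloc(&mem_cgroup_idr, NULL, 1, MEM_CGROUP_ID_MAX + 1,
 *		       GFP_NOWAIT);
 *	spin_unlock(&memcg_idr_lock);
 *	idr_preload_end();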
*/ #define MEM_CGROUP_ID_MAX … static DEFINE_IDR(mem_cgroup_idr); static DEFINE_SPINLOCK(memcg_idr_lock); static int mem_cgroup_alloc_id(void) { … } static void mem_cgroup_id_remove(struct mem_cgroup *memcg) { … } void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n) { … } void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) { … } static inline void mem_cgroup_id_put(struct mem_cgroup *memcg) { … } /** * mem_cgroup_from_id - look up a memcg from a memcg id * @id: the memcg id to look up * * Caller must hold rcu_read_lock(). */ struct mem_cgroup *mem_cgroup_from_id(unsigned short id) { … } #ifdef CONFIG_SHRINKER_DEBUG struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) { … } #endif static bool alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) { … } static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) { … } static void __mem_cgroup_free(struct mem_cgroup *memcg) { … } static void mem_cgroup_free(struct mem_cgroup *memcg) { … } static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) { … } static struct cgroup_subsys_state * __ref mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { … } static int mem_cgroup_css_online(struct cgroup_subsys_state *css) { … } static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) { … } static void mem_cgroup_css_released(struct cgroup_subsys_state *css) { … } static void mem_cgroup_css_free(struct cgroup_subsys_state *css) { … } /** * mem_cgroup_css_reset - reset the states of a mem_cgroup * @css: the target css * * Reset the states of the mem_cgroup associated with @css. This is * invoked when the userland requests disabling on the default hierarchy * but the memcg is pinned through dependency. The memcg should stop * applying policies and should revert to the vanilla state as it may be * made visible again. * * The current implementation only resets the essential configurations. * This needs to be expanded to cover all the visible parts. 
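 *
 * A hedged sketch of what "essential configurations" means in practice
 * (the elided body below is authoritative): the counters are returned to
 * their unlimited defaults, e.g.:
 *
 *	page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
 *	page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
 *	page_counter_set_min(&memcg->memory, 0);
 *	page_counter_set_low(&memcg->memory, 0);
 *	page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
 *	memcg_wb_domain_size_changed(memcg);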
*/ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) { … } static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) { … } static void mem_cgroup_fork(struct task_struct *task) { … } static void mem_cgroup_exit(struct task_struct *task) { … } #ifdef CONFIG_LRU_GEN static void mem_cgroup_lru_gen_attach(struct cgroup_taskset *tset) { … } #else static void mem_cgroup_lru_gen_attach(struct cgroup_taskset *tset) {} #endif /* CONFIG_LRU_GEN */ static void mem_cgroup_kmem_attach(struct cgroup_taskset *tset) { … } static void mem_cgroup_attach(struct cgroup_taskset *tset) { … } static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) { … } static u64 memory_current_read(struct cgroup_subsys_state *css, struct cftype *cft) { … } static u64 memory_peak_read(struct cgroup_subsys_state *css, struct cftype *cft) { … } static int memory_min_show(struct seq_file *m, void *v) { … } static ssize_t memory_min_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { … } static int memory_low_show(struct seq_file *m, void *v) { … } static ssize_t memory_low_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { … } static int memory_high_show(struct seq_file *m, void *v) { … } static ssize_t memory_high_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { … } static int memory_max_show(struct seq_file *m, void *v) { … } static ssize_t memory_max_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { … } /* * Note: don't forget to update the 'samples/cgroup/memcg_event_listener' * if any new events become available. */ static void __memory_events_show(struct seq_file *m, atomic_long_t *events) { … } static int memory_events_show(struct seq_file *m, void *v) { … } static int memory_events_local_show(struct seq_file *m, void *v) { … } int memory_stat_show(struct seq_file *m, void *v) { … } #ifdef CONFIG_NUMA static inline unsigned long lruvec_page_state_output(struct lruvec *lruvec, int item) { … } static int memory_numa_stat_show(struct seq_file *m, void *v) { … } #endif static int memory_oom_group_show(struct seq_file *m, void *v) { … } static ssize_t memory_oom_group_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { … } enum { … }; static const match_table_t tokens = …; static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { … } static struct cftype memory_files[] = …; struct cgroup_subsys memory_cgrp_subsys = …; /** * mem_cgroup_calculate_protection - check if memory consumption is in the normal range * @root: the top ancestor of the sub-tree being checked * @memcg: the memory cgroup to check * * WARNING: This function is not stateless! It can only be used as part * of a top-down tree iteration, not for isolated queries. */ void mem_cgroup_calculate_protection(struct mem_cgroup *root, struct mem_cgroup *memcg) { … } static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg, gfp_t gfp) { … } int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp) { … } /** * mem_cgroup_hugetlb_try_charge - try to charge the memcg for a hugetlb folio * @memcg: memcg to charge. * @gfp: reclaim mode. * @nr_pages: number of pages to charge. * * This function is called when allocating a huge page folio to determine if * the memcg has the capacity for it. It does not commit the charge yet, * as the hugetlb folio itself has not been obtained from the hugetlb pool. 
* * Once we have obtained the hugetlb folio, we can call * mem_cgroup_commit_charge() to commit the charge. If we fail to obtain the * folio, we should instead call mem_cgroup_cancel_charge() to undo the effect * of try_charge(). * * Returns 0 on success. Otherwise, an error code is returned. */ int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, long nr_pages) { … } /** * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin. * @folio: folio to charge. * @mm: mm context of the victim * @gfp: reclaim mode * @entry: swap entry for which the folio is allocated * * This function charges a folio allocated for swapin. Please call this before * adding the folio to the swapcache. * * Returns 0 on success. Otherwise, an error code is returned. */ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry) { … } /* * mem_cgroup_swapin_uncharge_swap - uncharge swap slot * @entry: swap entry for which the page is charged * * Call this function after successfully adding the charged page to swapcache. * * Note: This function assumes the page for which swap slot is being uncharged * is order 0 page. */ void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry) { … } struct uncharge_gather { … }; static inline void uncharge_gather_clear(struct uncharge_gather *ug) { … } static void uncharge_batch(const struct uncharge_gather *ug) { … } static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) { … } void __mem_cgroup_uncharge(struct folio *folio) { … } void __mem_cgroup_uncharge_folios(struct folio_batch *folios) { … } /** * mem_cgroup_replace_folio - Charge a folio's replacement. * @old: Currently circulating folio. * @new: Replacement folio. * * Charge @new as a replacement folio for @old. @old will * be uncharged upon free. * * Both folios must be locked, @new->mapping must be set up. */ void mem_cgroup_replace_folio(struct folio *old, struct folio *new) { … } /** * mem_cgroup_migrate - Transfer the memcg data from the old to the new folio. * @old: Currently circulating folio. * @new: Replacement folio. * * Transfer the memcg data from the old folio to the new folio for migration. * The old folio's data info will be cleared. Note that the memory counters * will remain unchanged throughout the process. * * Both folios must be locked, @new->mapping must be set up. */ void mem_cgroup_migrate(struct folio *old, struct folio *new) { … } DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key); EXPORT_SYMBOL(…); void mem_cgroup_sk_alloc(struct sock *sk) { … } void mem_cgroup_sk_free(struct sock *sk) { … } /** * mem_cgroup_charge_skmem - charge socket memory * @memcg: memcg to charge * @nr_pages: number of pages to charge * @gfp_mask: reclaim mode * * Charges @nr_pages to @memcg. Returns %true if the charge fit within * @memcg's configured limit, %false if it doesn't. */ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages, gfp_t gfp_mask) { … } /** * mem_cgroup_uncharge_skmem - uncharge socket memory * @memcg: memcg to uncharge * @nr_pages: number of pages to uncharge */ void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) { … } static int __init cgroup_memory(char *s) { … } __setup(…); /* * subsys_initcall() for memory controller. 
* * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this * context because of lock dependencies (cgroup_lock -> cpu hotplug), but * basically everything that doesn't depend on a specific mem_cgroup structure * should be initialized from here. */ static int __init mem_cgroup_init(void) { … } subsys_initcall(mem_cgroup_init); #ifdef CONFIG_SWAP static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) { … } /** * mem_cgroup_swapout - transfer a memsw charge to swap * @folio: folio whose memsw charge to transfer * @entry: swap entry to move the charge to * * Transfer the memsw charge of @folio to @entry. */ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) { … } /** * __mem_cgroup_try_charge_swap - try charging swap space for a folio * @folio: folio being added to swap * @entry: swap entry to charge * * Try to charge @folio's memcg for the swap space at @entry. * * Returns 0 on success, -ENOMEM on failure. */ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) { … } /** * __mem_cgroup_uncharge_swap - uncharge swap space * @entry: swap entry to uncharge * @nr_pages: the amount of swap space to uncharge */ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) { … } long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) { … } bool mem_cgroup_swap_full(struct folio *folio) { … } static int __init setup_swap_account(char *s) { … } __setup(…); static u64 swap_current_read(struct cgroup_subsys_state *css, struct cftype *cft) { … } static u64 swap_peak_read(struct cgroup_subsys_state *css, struct cftype *cft) { … } static int swap_high_show(struct seq_file *m, void *v) { … } static ssize_t swap_high_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { … } static int swap_max_show(struct seq_file *m, void *v) { … } static ssize_t swap_max_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { … } static int swap_events_show(struct seq_file *m, void *v) { … } static struct cftype swap_files[] = …; #ifdef CONFIG_ZSWAP /** * obj_cgroup_may_zswap - check if this cgroup can zswap * @objcg: the object cgroup * * Check if the hierarchical zswap limit has been reached. * * This doesn't check for specific headroom, and it is not atomic * either. But with zswap, the size of the allocation is only known * once compression has occurred, and this optimistic pre-check avoids * spending cycles on compression when there is already no room left * or zswap is disabled altogether somewhere in the hierarchy. */ bool obj_cgroup_may_zswap(struct obj_cgroup *objcg) { … } /** * obj_cgroup_charge_zswap - charge compression backend memory * @objcg: the object cgroup * @size: size of compressed object * * This forces the charge after obj_cgroup_may_zswap() allowed * compression and storage in zswap for this cgroup to go ahead. */ void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size) { … } /** * obj_cgroup_uncharge_zswap - uncharge compression backend memory * @objcg: the object cgroup * @size: size of compressed object * * Uncharges zswap memory on page in.
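 *
 * A hedged sketch of how the three zswap hooks pair up over a store/free
 * cycle (the real zswap code paths differ in detail; objcg and len are
 * assumed to be in scope):
 *
 *	if (!obj_cgroup_may_zswap(objcg))
 *		goto reject;
 *	... compress the page, producing 'len' bytes ...
 *	obj_cgroup_charge_zswap(objcg, len);
 *
 * and later, when the entry is loaded back in or invalidated:
 *
 *	obj_cgroup_uncharge_zswap(objcg, len);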
*/ void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size) { … } bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg) { … } static u64 zswap_current_read(struct cgroup_subsys_state *css, struct cftype *cft) { … } static int zswap_max_show(struct seq_file *m, void *v) { … } static ssize_t zswap_max_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { … } static int zswap_writeback_show(struct seq_file *m, void *v) { … } static ssize_t zswap_writeback_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { … } static struct cftype zswap_files[] = …; #endif /* CONFIG_ZSWAP */ static int __init mem_cgroup_swap_init(void) { … } subsys_initcall(mem_cgroup_swap_init); #endif /* CONFIG_SWAP */