// SPDX-License-Identifier: GPL-2.0-only
/*
 * linux/mm/oom_kill.c
 *
 * Copyright (C) 1998,2000 Rik van Riel
 *	Thanks go out to Claus Fischer for some serious inspiration and
 *	for goading me into coding this file...
 * Copyright (C) 2010 Google, Inc.
 *	Rewritten by David Rientjes
 *
 * The routines in this file are used to kill a process when
 * we're seriously out of memory. This gets called from __alloc_pages()
 * in mm/page_alloc.c when we really run out of memory.
 *
 * Since we won't call these routines often (on a well-configured
 * machine) this file will double as a 'coding guide' and a signpost
 * for newbie kernel hackers. It features several pointers to major
 * kernel subsystems and hints as to where to find out what things do.
 */

#include <linux/oom.h>
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/gfp.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/task.h>
#include <linux/sched/debug.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
#include <linux/timex.h>
#include <linux/jiffies.h>
#include <linux/cpuset.h>
#include <linux/export.h>
#include <linux/notifier.h>
#include <linux/memcontrol.h>
#include <linux/mempolicy.h>
#include <linux/security.h>
#include <linux/ptrace.h>
#include <linux/freezer.h>
#include <linux/ftrace.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/init.h>
#include <linux/mmu_notifier.h>
#include <linux/cred.h>

#include <asm/tlb.h>
#include "internal.h"
#include "slab.h"

#define CREATE_TRACE_POINTS
#include <trace/events/oom.h>

static int sysctl_panic_on_oom;
static int sysctl_oom_kill_allocating_task;
static int sysctl_oom_dump_tasks = …;

/*
 * Serializes oom killer invocations (out_of_memory()) from all contexts to
 * prevent overeager oom killing (e.g. when the oom killer is invoked from
 * different domains).
 *
 * oom_killer_disable() relies on this lock to stabilize oom_killer_disabled
 * and mark_oom_victim().
 */
DEFINE_MUTEX(…) …;

/* Serializes oom_score_adj and oom_score_adj_min updates */
DEFINE_MUTEX(…) …;

static inline bool is_memcg_oom(struct oom_control *oc) { … }

#ifdef CONFIG_NUMA
/**
 * oom_cpuset_eligible() - check task eligibility for kill
 * @start: task struct of the task to consider
 * @oc: pointer to struct oom_control
 *
 * Task eligibility is determined by whether or not a candidate task, @start,
 * shares the same mempolicy nodes as current if it is bound by such a policy
 * and whether or not it has the same set of allowed cpuset nodes.
 *
 * This function assumes oom-killer context: 'current' has triggered
 * the oom killer.
 */
static bool oom_cpuset_eligible(struct task_struct *start, struct oom_control *oc) { … }
#else
static bool oom_cpuset_eligible(struct task_struct *tsk, struct oom_control *oc)
{
	return true;
}
#endif /* CONFIG_NUMA */

/*
 * The process p may have detached its own ->mm while exiting or through
 * kthread_use_mm(), but one or more of its subthreads may still have a valid
 * pointer. Return p, or any of its subthreads with a valid ->mm, with
 * task_lock() held.
 */
struct task_struct *find_lock_task_mm(struct task_struct *p) { … }

/*
 * order == -1 means the oom kill is required by sysrq, otherwise only
 * for display purposes.
 */
static inline bool is_sysrq_oom(struct oom_control *oc) { … }

/* Return true if the task is not adequate as a candidate victim task. */
static bool oom_unkillable_task(struct task_struct *p) { … }
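/*
 * A minimal sketch of the thread walk that the find_lock_task_mm() contract
 * above describes: scan @p and its subthreads under RCU, take task_lock() on
 * each candidate and return the first one whose ->mm is still valid, with
 * that lock held. The name find_lock_task_mm_sketch is hypothetical and for
 * illustration only, not the in-tree body; for_each_thread(), task_lock()/
 * task_unlock() and rcu_read_lock()/rcu_read_unlock() are the usual kernel
 * primitives.
 */
static struct task_struct *find_lock_task_mm_sketch(struct task_struct *p)
{
	struct task_struct *t;

	rcu_read_lock();
	for_each_thread(p, t) {
		task_lock(t);
		if (likely(t->mm))
			goto found;	/* return with task_lock(t) held */
		task_unlock(t);
	}
	t = NULL;
found:
	rcu_read_unlock();
	return t;
}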
/*
 * Check whether the amount of unreclaimable slab is greater than
 * all user memory (LRU pages). dump_unreclaimable_slab() could help
 * in the case that an oom is due to too much unreclaimable slab
 * used by the kernel.
 */
static bool should_dump_unreclaim_slab(void) { … }

/**
 * oom_badness - heuristic function to determine which candidate task to kill
 * @p: task struct of the task whose badness we should calculate
 * @totalpages: total present RAM allowed for page allocation
 *
 * The heuristic for determining which task to kill is made to be as simple and
 * predictable as possible. The goal is to return the highest value for the
 * task consuming the most memory to avoid subsequent oom failures.
 */
long oom_badness(struct task_struct *p, unsigned long totalpages) { … }

static const char * const oom_constraint_text[] = …;

/*
 * Determine the type of allocation constraint.
 */
static enum oom_constraint constrained_alloc(struct oom_control *oc) { … }

static int oom_evaluate_task(struct task_struct *task, void *arg) { … }

/*
 * Simple selection loop. We choose the process with the highest number of
 * 'points'. If the scan was aborted, oc->chosen is set to -1.
 */
static void select_bad_process(struct oom_control *oc) { … }

static int dump_task(struct task_struct *p, void *arg) { … }

/**
 * dump_tasks - dump current memory state of all system tasks
 * @oc: pointer to struct oom_control
 *
 * Dumps the current memory state of all eligible tasks. Tasks not in the same
 * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
 * are not shown.
 * State information includes task's pid, uid, tgid, vm size, rss,
 * pgtables_bytes, swapents, oom_score_adj value, and name.
 */
static void dump_tasks(struct oom_control *oc) { … }

static void dump_oom_victim(struct oom_control *oc, struct task_struct *victim) { … }

static void dump_header(struct oom_control *oc) { … }

/*
 * Number of OOM victims in flight
 */
static atomic_t oom_victims = …;
static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);

static bool oom_killer_disabled __read_mostly;

/*
 * task->mm can be NULL if the task is the exited group leader. So to
 * determine whether the task is using a particular mm, we examine all the
 * task's threads: if one of those is using this mm then this task was also
 * using it.
 */
bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) { … }

#ifdef CONFIG_MMU
/*
 * OOM Reaper kernel thread which tries to reap the memory used by the OOM
 * victim (if that is possible) to help the OOM killer to move on.
 */
static struct task_struct *oom_reaper_th;
static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
static struct task_struct *oom_reaper_list;
static DEFINE_SPINLOCK(oom_reaper_lock);

static bool __oom_reap_task_mm(struct mm_struct *mm) { … }

/*
 * Reaps the address space of the given task.
 *
 * Returns true on success and false if none or only part of the address space
 * has been reclaimed and the caller should retry later.
 */
static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) { … }

#define MAX_OOM_REAP_RETRIES …

static void oom_reap_task(struct task_struct *tsk) { … }

static int oom_reaper(void *unused) { … }

static void wake_oom_reaper(struct timer_list *timer) { … }
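/*
 * Looking back at oom_badness() above: a minimal sketch of the kind of
 * accounting its heuristic implies, assuming resident pages, swap entries
 * and page-table pages all count against the task and that each
 * oom_score_adj point is worth a thousandth of @totalpages. The name
 * oom_badness_sketch is hypothetical and illustrative only, not the
 * in-tree body.
 */
static long oom_badness_sketch(struct task_struct *p, unsigned long totalpages)
{
	long points, adj;

	p = find_lock_task_mm(p);
	if (!p)
		return LONG_MIN;	/* no mm left to charge */

	/* Charge resident pages, swap entries and page table pages. */
	points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
		 mm_pgtables_bytes(p->mm) / PAGE_SIZE;
	adj = (long)p->signal->oom_score_adj;
	task_unlock(p);

	/* Scale the userspace bias into the same units as points. */
	points += adj * (long)(totalpages / 1000);
	return points;
}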
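/*
 * A minimal sketch of the producer/consumer handoff behind the reaper
 * machinery above: victims are chained through task_struct::oom_reaper_list,
 * and the reaper thread sleeps on oom_reaper_wait until the list is
 * non-empty, then pops one victim under oom_reaper_lock and reaps it. The
 * name oom_reaper_loop_sketch is hypothetical and illustrative only, not
 * the in-tree oom_reaper() body.
 */
static int oom_reaper_loop_sketch(void *unused)
{
	while (true) {
		struct task_struct *tsk = NULL;

		/* Sleep until queueing chains a victim onto the list. */
		wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);

		/* Pop one victim; the lock is also taken from timer context. */
		spin_lock_irq(&oom_reaper_lock);
		if (oom_reaper_list != NULL) {
			tsk = oom_reaper_list;
			oom_reaper_list = tsk->oom_reaper_list;
		}
		spin_unlock_irq(&oom_reaper_lock);

		if (tsk)
			oom_reap_task(tsk);
	}
	return 0;
}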
/*
 * Give the OOM victim time to exit naturally before invoking oom reaping.
 * The timer's timeout is arbitrary... the longer it is, the longer the worst
 * case scenario for the OOM can take. If it is too small, the oom_reaper can
 * get in the way and release resources needed by the process exit path,
 * e.g. the futex robust list can sit in Anon|Private memory that gets reaped
 * before the exit path is able to wake the futex waiters.
 */
#define OOM_REAPER_DELAY …
static void queue_oom_reaper(struct task_struct *tsk) { … }

#ifdef CONFIG_SYSCTL
static struct ctl_table vm_oom_kill_table[] = …;
#endif

static int __init oom_init(void) { … }
subsys_initcall(…) …
#else
static inline void queue_oom_reaper(struct task_struct *tsk)
{
}
#endif /* CONFIG_MMU */

/**
 * mark_oom_victim - mark the given task as OOM victim
 * @tsk: task to mark
 *
 * Has to be called with oom_lock held and never after
 * oom has been disabled already.
 *
 * tsk->mm has to be non-NULL and the caller has to guarantee it is stable
 * (either under task_lock or by operating on current).
 */
static void mark_oom_victim(struct task_struct *tsk) { … }

/**
 * exit_oom_victim - note the exit of an OOM victim
 */
void exit_oom_victim(void) { … }

/**
 * oom_killer_enable - enable OOM killer
 */
void oom_killer_enable(void) { … }

/**
 * oom_killer_disable - disable OOM killer
 * @timeout: maximum timeout to wait for oom victims in jiffies
 *
 * Forces all page allocations to fail rather than trigger OOM killer.
 * Will block and wait until all OOM victims are killed or the given
 * timeout expires.
 *
 * The function cannot be called when there are runnable user tasks because
 * userspace would see unexpected allocation failures as a result. Any new
 * usage of this function should be discussed with the MM people.
 *
 * Returns true if successful and false if the OOM killer cannot be
 * disabled.
 */
bool oom_killer_disable(signed long timeout) { … }

static inline bool __task_will_free_mem(struct task_struct *task) { … }

/*
 * Checks whether the given task is dying or exiting and likely to
 * release its address space. This means that all threads and processes
 * sharing the same mm have to be killed or exiting.
 * The caller has to make sure that task->mm is stable (hold task_lock or
 * operate on current).
 */
static bool task_will_free_mem(struct task_struct *task) { … }

static void __oom_kill_process(struct task_struct *victim, const char *message) { … }

/*
 * Kill the provided task unless it is secured by setting
 * oom_score_adj to OOM_SCORE_ADJ_MIN.
 */
static int oom_kill_memcg_member(struct task_struct *task, void *message) { … }

static void oom_kill_process(struct oom_control *oc, const char *message) { … }

/*
 * Determines whether the kernel must panic because of the panic_on_oom sysctl.
 */
static void check_panic_on_oom(struct oom_control *oc) { … }

static BLOCKING_NOTIFIER_HEAD(oom_notify_list);

int register_oom_notifier(struct notifier_block *nb) { … }
EXPORT_SYMBOL_GPL(…);

int unregister_oom_notifier(struct notifier_block *nb) { … }
EXPORT_SYMBOL_GPL(…);

/**
 * out_of_memory - kill the "best" process when we run out of memory
 * @oc: pointer to struct oom_control
 *
 * If we run out of memory, we have the choice between either
 * killing a random task (bad), letting the system crash (worse)
 * OR trying to be smart about which process to kill. Note that we
 * don't have to be perfect here, we just have to be good.
 */
bool out_of_memory(struct oom_control *oc) { … }

/*
 * The pagefault handler calls here because some allocation has failed. We
 * have to take care of the memcg OOM here because this is the only safe
 * context without any locks held, but we let the oom killer triggered from
 * the allocation context handle the global OOM.
 */
void pagefault_out_of_memory(void) { … }

SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) { … }
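/*
 * A userspace usage sketch for the process_mrelease() syscall above: after
 * delivering SIGKILL through a pidfd, the victim's address space can be
 * reaped eagerly instead of waiting for it to reach exit_mmap(). This is
 * illustrative only; it assumes a kernel with the syscall (v5.15+) and
 * supplies __NR_process_mrelease (448 on x86-64) if the libc headers lack
 * it. kill_and_reap() is a hypothetical helper, not part of any API.
 *
 *	#include <signal.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	#ifndef __NR_process_mrelease
 *	#define __NR_process_mrelease 448	// x86-64; check your arch
 *	#endif
 *
 *	static int kill_and_reap(int pidfd)
 *	{
 *		// Deliver SIGKILL via the pidfd, then release the address
 *		// space without waiting for the victim's exit path.
 *		if (syscall(SYS_pidfd_send_signal, pidfd, SIGKILL, NULL, 0))
 *			return -1;
 *		return syscall(__NR_process_mrelease, pidfd, 0);
 *	}
 */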