// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2008, 2009 Intel Corporation * Authors: Andi Kleen, Fengguang Wu * * High level machine check handler. Handles pages reported by the * hardware as being corrupted, usually due to a multi-bit ECC memory or cache * failure. * * In addition there is a "soft offline" entry point that allows one to stop * using not-yet-corrupted-but-suspicious pages without killing anything. * * Handles page cache pages in various states. The tricky part * here is that we can access any page asynchronously with respect to * other VM users, because memory failures could happen anytime and * anywhere. This could violate some of their assumptions. This is why * this code has to be extremely careful. Generally it tries to use * normal locking rules, as in get the standard locks, even if that means * the error handling takes potentially a long time. * * It can be very tempting to add handling for obscure cases here. * In general any code for handling new cases should only be added if: * - You know how to test it. * - You have a test that can be added to mce-test * https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/ * - The case actually shows up as a frequent (top 10) page state in * tools/mm/page-types when running a real workload. * * There are several operations here with exponential complexity because * of unsuitable VM data structures. For example the operation to map back * from RMAP chains to processes has to walk the complete process list and * has non-linear complexity in the number of processes. But since memory * corruptions are rare we hope to get away with this. This avoids impacting * the core VM.
*/ #define pr_fmt(fmt) … #include <linux/kernel.h> #include <linux/mm.h> #include <linux/page-flags.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/dax.h> #include <linux/ksm.h> #include <linux/rmap.h> #include <linux/export.h> #include <linux/pagemap.h> #include <linux/swap.h> #include <linux/backing-dev.h> #include <linux/migrate.h> #include <linux/slab.h> #include <linux/swapops.h> #include <linux/hugetlb.h> #include <linux/memory_hotplug.h> #include <linux/mm_inline.h> #include <linux/memremap.h> #include <linux/kfifo.h> #include <linux/ratelimit.h> #include <linux/pagewalk.h> #include <linux/shmem_fs.h> #include <linux/sysctl.h> #include "swap.h" #include "internal.h" #include "ras/ras_event.h" static int sysctl_memory_failure_early_kill __read_mostly; static int sysctl_memory_failure_recovery __read_mostly = …; static int sysctl_enable_soft_offline __read_mostly = …; atomic_long_t num_poisoned_pages __read_mostly = …; static bool hw_memory_failure __read_mostly = …; static DEFINE_MUTEX(mf_mutex); void num_poisoned_pages_inc(unsigned long pfn) { … } void num_poisoned_pages_sub(unsigned long pfn, long i) { … } /** * MF_ATTR_RO - Create sysfs entry for each memory failure statistics. * @_name: name of the file in the per NUMA sysfs directory. */ #define MF_ATTR_RO(_name) … MF_ATTR_RO(…); MF_ATTR_RO(…); MF_ATTR_RO(…); MF_ATTR_RO(…); MF_ATTR_RO(…); static struct attribute *memory_failure_attr[] = …; const struct attribute_group memory_failure_attr_group = …; static struct ctl_table memory_failure_table[] = …; /* * Return values: * 1: the page is dissolved (if needed) and taken off from buddy, * 0: the page is dissolved (if needed) and not taken off from buddy, * < 0: failed to dissolve. 
*/ static int __page_handle_poison(struct page *page) { … } static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release) { … } #if IS_ENABLED(CONFIG_HWPOISON_INJECT) u32 hwpoison_filter_enable = …; u32 hwpoison_filter_dev_major = …; u32 hwpoison_filter_dev_minor = …; u64 hwpoison_filter_flags_mask; u64 hwpoison_filter_flags_value; EXPORT_SYMBOL_GPL(…); EXPORT_SYMBOL_GPL(…); EXPORT_SYMBOL_GPL(…); EXPORT_SYMBOL_GPL(…); EXPORT_SYMBOL_GPL(…); static int hwpoison_filter_dev(struct page *p) { … } static int hwpoison_filter_flags(struct page *p) { … } /* * This allows stress tests to limit test scope to a collection of tasks * by putting them under some memcg. This prevents killing unrelated/important * processes such as /sbin/init. Note that the target task may share clean * pages with init (e.g. libc text), which is harmless. If the target task * shares _dirty_ pages with another task B, the test scheme must make sure B * is also included in the memcg. Finally, due to race conditions this filter * can only guarantee that the page either belongs to the memcg tasks, or is * a freed page. */ #ifdef CONFIG_MEMCG u64 hwpoison_filter_memcg; EXPORT_SYMBOL_GPL(…); static int hwpoison_filter_task(struct page *p) { … } #else static int hwpoison_filter_task(struct page *p) { return 0; } #endif int hwpoison_filter(struct page *p) { … } EXPORT_SYMBOL_GPL(…); #else int hwpoison_filter(struct page *p) { return 0; } #endif /* * Kill all processes that have a poisoned page mapped and then isolate * the page. * * General strategy: * Find all processes having the page mapped and kill them. * But we keep a page reference around so that the page is not * actually freed yet. * Then stash the page away * * There's no convenient way to get back to mapped processes * from the VMAs. So do a brute-force search over all * running processes.
* * Remember that machine checks are not common (or rather * if they are common you have other problems), so this shouldn't * be a performance issue. * * Also there are some races possible while we get from the * error detection to actually handle it. */ struct to_kill { … }; /* * Send all the processes who have the page mapped a signal. * ``action optional'' if they are not immediately affected by the error * ``action required'' if error happened in current execution context */ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags) { … } /* * Unknown page type encountered. Try to check whether it can turn PageLRU by * lru_add_drain_all. */ void shake_folio(struct folio *folio) { … } EXPORT_SYMBOL_GPL(…); static void shake_page(struct page *page) { … } static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma, unsigned long address) { … } /* * Failure handling: if we can't find or can't kill a process there's * not much we can do. We just print a message and ignore otherwise. */ /* * Schedule a process for later kill. * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM. */ static void __add_to_kill(struct task_struct *tsk, struct page *p, struct vm_area_struct *vma, struct list_head *to_kill, unsigned long addr) { … } static void add_to_kill_anon_file(struct task_struct *tsk, struct page *p, struct vm_area_struct *vma, struct list_head *to_kill, unsigned long addr) { … } #ifdef CONFIG_KSM static bool task_in_to_kill_list(struct list_head *to_kill, struct task_struct *tsk) { … } void add_to_kill_ksm(struct task_struct *tsk, struct page *p, struct vm_area_struct *vma, struct list_head *to_kill, unsigned long addr) { … } #endif /* * Kill the processes that have been collected earlier. 
* * Only do anything when FORCEKILL is set, otherwise just free the * list (this is used for clean pages which do not need killing) */ static void kill_procs(struct list_head *to_kill, int forcekill, unsigned long pfn, int flags) { … } /* * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO) * on behalf of the thread group. Return task_struct of the (first found) * dedicated thread if found, and return NULL otherwise. * * We already hold rcu lock in the caller, so we don't have to call * rcu_read_lock/unlock() in this function. */ static struct task_struct *find_early_kill_thread(struct task_struct *tsk) { … } /* * Determine whether a given process is an "early kill" process, i.e. one * which expects to be signaled when some page under the process is * hwpoisoned. Return the task_struct of the dedicated thread (main thread * unless explicitly specified) if the process is "early kill", and return * NULL otherwise. * * Note that the above is true for the Action Optional case. For the Action * Required case, it's only meaningful to the current thread, which needs to * be signaled with SIGBUS; the error is Action Optional for other non-current * processes sharing the same error page, and if such a process is * "early kill", the task_struct of its dedicated thread will also be * returned. */ struct task_struct *task_early_kill(struct task_struct *tsk, int force_early) { … } /* * Collect processes when the error hit an anonymous page. */ static void collect_procs_anon(struct folio *folio, struct page *page, struct list_head *to_kill, int force_early) { … } /* * Collect processes when the error hit a file mapped page. */ static void collect_procs_file(struct folio *folio, struct page *page, struct list_head *to_kill, int force_early) { … } #ifdef CONFIG_FS_DAX static void add_to_kill_fsdax(struct task_struct *tsk, struct page *p, struct vm_area_struct *vma, struct list_head *to_kill, pgoff_t pgoff) { … } /* * Collect processes when the error hit a fsdax page.
*/ static void collect_procs_fsdax(struct page *page, struct address_space *mapping, pgoff_t pgoff, struct list_head *to_kill, bool pre_remove) { … } #endif /* CONFIG_FS_DAX */ /* * Collect the processes who have the corrupted page mapped to kill. */ static void collect_procs(struct folio *folio, struct page *page, struct list_head *tokill, int force_early) { … } struct hwpoison_walk { … }; static void set_to_kill(struct to_kill *tk, unsigned long addr, short shift) { … } static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift, unsigned long poisoned_pfn, struct to_kill *tk) { … } #ifdef CONFIG_TRANSPARENT_HUGEPAGE static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr, struct hwpoison_walk *hwp) { … } #else static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr, struct hwpoison_walk *hwp) { return 0; } #endif static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr, unsigned long end, struct mm_walk *walk) { … } #ifdef CONFIG_HUGETLB_PAGE static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { … } #else #define hwpoison_hugetlb_range … #endif static const struct mm_walk_ops hwpoison_walk_ops = …; /* * Sends SIGBUS to the current process with error info. * * This function is intended to handle "Action Required" MCEs on already * hardware poisoned pages. They could happen, for example, when * memory_failure() failed to unmap the error page at the first call, or * when multiple local machine checks happened on different CPUs. * * MCE handler currently has no easy access to the error virtual address, * so this function walks page table to find it. The returned virtual address * is proper in most cases, but it could be wrong when the application * process has multiple entries mapping the error page. 
*/ static int kill_accessing_process(struct task_struct *p, unsigned long pfn, int flags) { … } /* * Result codes of the memory_failure() handler (abbreviated m-f() below): * * MF_IGNORED - The m-f() handler marks the page as PG_hwpoison'ed. * But it could not do more to isolate the page from being accessed again, * nor does it kill the process. This is extremely rare and one of the * potential causes is that the page state has been changed due to an * underlying race condition. This is the most severe outcome. * * MF_FAILED - The m-f() handler marks the page as PG_hwpoison'ed. * It should have killed the process, but it can't isolate the page, * due to conditions such as an extra pin, unmap failure, etc. Accessing * the page again may trigger another MCE and the process will be killed * by the m-f() handler immediately. * * MF_DELAYED - The m-f() handler marks the page as PG_hwpoison'ed. * The page is unmapped, and is removed from the LRU or file mapping. * An attempt to access the page again will trigger a page fault and the * PF handler will kill the process. * * MF_RECOVERED - The m-f() handler marks the page as PG_hwpoison'ed. * The page has been completely isolated, that is, unmapped, taken out of * the buddy system, or hole-punched out of the file mapping. */ static const char *action_name[] = …; static const char * const action_page_types[] = …; /* * XXX: It is possible that a page is isolated from LRU cache, * and then kept in swap cache or failed to remove from page cache. * The page count will stop it from being freed by unpoison. * Stress tests should be aware of this memory leak problem. */ static int delete_from_lru_cache(struct folio *folio) { … } static int truncate_error_folio(struct folio *folio, unsigned long pfn, struct address_space *mapping) { … } struct page_state { … }; /* * Return true if page is still referenced by others, otherwise return * false. * * The extra_pins is true when one extra refcount is expected.
*/ static bool has_extra_refcount(struct page_state *ps, struct page *p, bool extra_pins) { … } /* * Error hit kernel page. * Do nothing, try to be lucky and not touch this instead. For a few cases we * could be more sophisticated. */ static int me_kernel(struct page_state *ps, struct page *p) { … } /* * Page in unknown state. Do nothing. * This is a catch-all in case we fail to make sense of the page state. */ static int me_unknown(struct page_state *ps, struct page *p) { … } /* * Clean (or cleaned) page cache page. */ static int me_pagecache_clean(struct page_state *ps, struct page *p) { … } /* * Dirty pagecache page * Issues: when the error hit a hole page the error is not properly * propagated. */ static int me_pagecache_dirty(struct page_state *ps, struct page *p) { … } /* * Clean and dirty swap cache. * * Dirty swap cache page is tricky to handle. The page could live both in page * table and swap cache(ie. page is freshly swapped in). So it could be * referenced concurrently by 2 types of PTEs: * normal PTEs and swap PTEs. We try to handle them consistently by calling * try_to_unmap(!TTU_HWPOISON) to convert the normal PTEs to swap PTEs, * and then * - clear dirty bit to prevent IO * - remove from LRU * - but keep in the swap cache, so that when we return to it on * a later page fault, we know the application is accessing * corrupted data and shall be killed (we installed simple * interception code in do_swap_page to catch it). * * Clean swap cache pages can be directly isolated. A later page fault will * bring in the known good data from disk. */ static int me_swapcache_dirty(struct page_state *ps, struct page *p) { … } static int me_swapcache_clean(struct page_state *ps, struct page *p) { … } /* * Huge pages. Needs work. * Issues: * - Error on hugepage is contained in hugepage unit (not in raw page unit.) * To narrow down kill region to one page, we need to break up pmd. 
*/ static int me_huge_page(struct page_state *ps, struct page *p) { … } /* * Various page states we can handle. * * A page state is defined by its current page->flags bits. * The table matches them in order and calls the right handler. * * This is quite tricky because we can access a page at any time * in its life cycle, so all accesses have to be extremely careful. * * This is not complete. More states could be added. * For any missing state don't attempt recovery. */ #define dirty … #define sc … #define unevict … #define mlock … #define lru … #define head … #define reserved … static struct page_state error_states[] = …; #undef dirty #undef sc #undef unevict #undef mlock #undef lru #undef head #undef reserved static void update_per_node_mf_stats(unsigned long pfn, enum mf_result result) { … } /* * "Dirty/Clean" indication is not 100% accurate due to the possibility of * setting PG_dirty outside page lock. See also comment above set_page_dirty(). */ static int action_result(unsigned long pfn, enum mf_action_page_type type, enum mf_result result) { … } static int page_action(struct page_state *ps, struct page *p, unsigned long pfn) { … } static inline bool PageHWPoisonTakenOff(struct page *page) { … } void SetPageHWPoisonTakenOff(struct page *page) { … } void ClearPageHWPoisonTakenOff(struct page *page) { … } /* * Return true if a page type of a given page is supported by hwpoison * mechanism (while handling could fail), otherwise false. This function * does not return true for hugetlb or device memory pages, so it's assumed * to be called only in the context where we never have such pages.
*/ static inline bool HWPoisonHandlable(struct page *page, unsigned long flags) { … } static int __get_hwpoison_page(struct page *page, unsigned long flags) { … } #define GET_PAGE_MAX_RETRY_NUM … static int get_any_page(struct page *p, unsigned long flags) { … } static int __get_unpoison_page(struct page *page) { … } /** * get_hwpoison_page() - Get refcount for memory error handling * @p: Raw error page (hit by memory error) * @flags: Flags controlling behavior of error handling * * get_hwpoison_page() takes a page refcount of an error page to handle memory * error on it, after checking that the error page is in a well-defined state * (defined as a page-type we can successfully handle the memory error on it, * such as LRU page and hugetlb page). * * Memory error handling could be triggered at any time on any type of page, * so it's prone to race with typical memory management lifecycle (like * allocation and free). So to avoid such races, get_hwpoison_page() takes * extra care for the error page's state (as done in __get_hwpoison_page()), * and has some retry logic in get_any_page(). * * When called from unpoison_memory(), the caller should already ensure that * the given page has PG_hwpoison. So it's never reused for other page * allocations, and __get_unpoison_page() never races with them. * * Return: 0 on failure or free buddy (hugetlb) page, * 1 on success for in-use pages in a well-defined state, * -EIO for pages on which we can not handle memory errors, * -EBUSY when get_hwpoison_page() has raced with page lifecycle * operations like allocation and free, * -EHWPOISON when the page is hwpoisoned and taken off from buddy. */ static int get_hwpoison_page(struct page *p, unsigned long flags) { … } void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu) { … } /* * Do all that is necessary to remove user space mappings. Unmap * the pages and send SIGBUS to the processes if the data was dirty. 
*/ static bool hwpoison_user_mappings(struct folio *folio, struct page *p, unsigned long pfn, int flags) { … } static int identify_page_state(unsigned long pfn, struct page *p, unsigned long page_flags) { … } /* * When 'release' is 'false', it means that if thp split has failed, * there is still more to do, hence the page refcount we took earlier * is still needed. */ static int try_to_split_thp_page(struct page *page, bool release) { … } static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn, struct address_space *mapping, pgoff_t index, int flags) { … } /* * Only dev_pagemap pages get here, such as fsdax when the filesystem * either does not claim or fails to claim a hwpoison event, or devdax. * The fsdax pages are initialized per base page, and the devdax pages * could be initialized either as base pages, or as compound pages with * vmemmap optimization enabled. Devdax is simplistic in its dealing with * hwpoison, such that, if a subpage of a compound page is poisoned, * simply marking the compound head page is sufficient. */ static int mf_generic_kill_procs(unsigned long long pfn, int flags, struct dev_pagemap *pgmap) { … } #ifdef CONFIG_FS_DAX /** * mf_dax_kill_procs - Collect and kill processes that are using this file range * @mapping: address_space of the file in use * @index: start pgoff of the range within the file * @count: length of the range, in units of PAGE_SIZE * @mf_flags: memory failure flags */ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, unsigned long count, int mf_flags) { … } EXPORT_SYMBOL_GPL(…); #endif /* CONFIG_FS_DAX */ #ifdef CONFIG_HUGETLB_PAGE /* * Struct raw_hwp_page represents information about "raw error page", * constructing singly linked list from ->_hugetlb_hwpoison field of folio.
*/ struct raw_hwp_page { … }; static inline struct llist_head *raw_hwp_list_head(struct folio *folio) { … } bool is_raw_hwpoison_page_in_hugepage(struct page *page) { … } static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag) { … } static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page) { … } static unsigned long folio_free_raw_hwp(struct folio *folio, bool move_flag) { … } void folio_clear_hugetlb_hwpoison(struct folio *folio) { … } /* * Called from hugetlb code with hugetlb_lock held. * * Return values: * 0 - free hugepage * 1 - in-use hugepage * 2 - not a hugepage * -EBUSY - the hugepage is busy (try to retry) * -EHWPOISON - the hugepage is already hwpoisoned */ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared) { … } /* * Taking refcount of hugetlb pages needs extra care about race conditions * with basic operations like hugepage allocation/free/demotion. * So some of prechecks for hwpoison (pinning, and testing/setting * PageHWPoison) should be done in single hugetlb_lock range. */ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb) { … } #else static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb) { return 0; } static inline unsigned long folio_free_raw_hwp(struct folio *folio, bool flag) { return 0; } #endif /* CONFIG_HUGETLB_PAGE */ /* Drop the extra refcount in case we come from madvise() */ static void put_ref_page(unsigned long pfn, int flags) { … } static int memory_failure_dev_pagemap(unsigned long pfn, int flags, struct dev_pagemap *pgmap) { … } /* * The calling condition is as such: thp split failed, page might have * been RDMA pinned, not much can be done for recovery. * But a SIGBUS should be delivered with vaddr provided so that the user * application has a chance to recover. Also, application processes' * election for MCE early killed will be honored. 
*/ static void kill_procs_now(struct page *p, unsigned long pfn, int flags, struct folio *folio) { … } /** * memory_failure - Handle memory failure of a page. * @pfn: Page Number of the corrupted page * @flags: fine tune action taken * * This function is called by the low level machine check code * of an architecture when it detects hardware memory corruption * of a page. It tries its best to recover, which includes * dropping pages, killing processes etc. * * The function is primarily of use for corruptions that * happen outside the current execution context (e.g. when * detected by a background scrubber) * * Must run in process context (e.g. a work queue) with interrupts * enabled and no spinlocks held. * * Return: 0 for successfully handled the memory error, * -EOPNOTSUPP for hwpoison_filter() filtered the error event, * < 0(except -EOPNOTSUPP) on failure. */ int memory_failure(unsigned long pfn, int flags) { … } EXPORT_SYMBOL_GPL(…); #define MEMORY_FAILURE_FIFO_ORDER … #define MEMORY_FAILURE_FIFO_SIZE … struct memory_failure_entry { … }; struct memory_failure_cpu { … }; static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu); /** * memory_failure_queue - Schedule handling memory failure of a page. * @pfn: Page Number of the corrupted page * @flags: Flags for memory failure handling * * This function is called by the low level hardware error handler * when it detects hardware memory corruption of a page. It schedules * the recovering of error page, including dropping pages, killing * processes etc. * * The function is primarily of use for corruptions that * happen outside the current execution context (e.g. when * detected by a background scrubber) * * Can run in IRQ context. */ void memory_failure_queue(unsigned long pfn, int flags) { … } EXPORT_SYMBOL_GPL(…); static void memory_failure_work_func(struct work_struct *work) { … } /* * Process memory_failure work queued on the specified CPU. 
* Used to avoid return-to-userspace racing with the memory_failure workqueue. */ void memory_failure_queue_kick(int cpu) { … } static int __init memory_failure_init(void) { … } core_initcall(memory_failure_init); #undef pr_fmt #define pr_fmt(fmt) … #define unpoison_pr_info(fmt, pfn, rs) … /** * unpoison_memory - Unpoison a previously poisoned page * @pfn: Page number of the to be unpoisoned page * * Software-unpoison a page that has been poisoned by * memory_failure() earlier. * * This is only done on the software-level, so it only works * for linux injected failures, not real hardware failures * * Returns 0 for success, otherwise -errno. */ int unpoison_memory(unsigned long pfn) { … } EXPORT_SYMBOL(…); #undef pr_fmt #define pr_fmt(fmt) … /* * soft_offline_in_use_page handles hugetlb-pages and non-hugetlb pages. * If the page is a non-dirty unmapped page-cache page, it simply invalidates. * If the page is mapped, it migrates the contents over. */ static int soft_offline_in_use_page(struct page *page) { … } /** * soft_offline_page - Soft offline a page. * @pfn: pfn to soft-offline * @flags: flags. Same as memory_failure(). * * Returns 0 on success, * -EOPNOTSUPP for hwpoison_filter() filtered the error event, or * disabled by /proc/sys/vm/enable_soft_offline, * < 0 otherwise negated errno. * * Soft offline a page, by migration or invalidation, * without killing anything. This is for the case when * a page is not corrupted yet (so it's still valid to access), * but has had a number of corrected errors and is better taken * out. * * The actual policy on when to do that is maintained by * user space. * * This should never impact any application or cause data loss, * however it might take some time. * * This is not a 100% solution for all memory, but tries to be * ``good enough'' for the majority of memory. */ int soft_offline_page(unsigned long pfn, int flags) { … }