memory-failure.c | Explore in Territory

// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2008, 2009 Intel Corporation
 * Authors: Andi Kleen, Fengguang Wu
 *
 * High level machine check handler. Handles pages reported by the
 * hardware as being corrupted usually due to a multi-bit ECC memory or cache
 * failure.
 *
 * In addition there is a "soft offline" entry point that allows us to stop
 * using not-yet-corrupted-but-suspicious pages without killing anything.
 *
 * Handles page cache pages in various states.	The tricky part
 * here is that we can access any page asynchronously with respect to
 * other VM users, because memory failures could happen anytime and
 * anywhere. This could violate some of their assumptions. This is why
 * this code has to be extremely careful. Generally it tries to use
 * normal locking rules, as in get the standard locks, even if that means
 * the error handling takes potentially a long time.
 *
 * It can be very tempting to add handling for obscure cases here.
 * In general any code for handling new cases should only be added iff:
 * - You know how to test it.
 * - You have a test that can be added to mce-test
 *   https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/
 * - The case actually shows up as a frequent (top 10) page state in
 *   tools/mm/page-types when running a real workload.
 *
 * There are several operations here with exponential complexity because
 * of unsuitable VM data structures. For example the operation to map back
 * from RMAP chains to processes has to walk the complete process list and
 * has non-linear complexity in the number of processes. But since memory
 * corruptions are rare we hope to get away with this. This avoids impacting
 * the core VM.
 */

#define pr_fmt(fmt) …

#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/dax.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/backing-dev.h>
#include <linux/migrate.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memory_hotplug.h>
#include <linux/mm_inline.h>
#include <linux/memremap.h>
#include <linux/kfifo.h>
#include <linux/ratelimit.h>
#include <linux/pagewalk.h>
#include <linux/shmem_fs.h>
#include <linux/sysctl.h>
#include "swap.h"
#include "internal.h"
#include "ras/ras_event.h"

static int sysctl_memory_failure_early_kill __read_mostly;

static int sysctl_memory_failure_recovery __read_mostly = …;

static int sysctl_enable_soft_offline __read_mostly = …;

atomic_long_t num_poisoned_pages __read_mostly = …;

static bool hw_memory_failure __read_mostly = …;

static DEFINE_MUTEX(mf_mutex);

void num_poisoned_pages_inc(unsigned long pfn)
{ … }

void num_poisoned_pages_sub(unsigned long pfn, long i)
{ … }

/**
 * MF_ATTR_RO - Create sysfs entry for each memory failure statistics.
 * @_name: name of the file in the per NUMA sysfs directory.
 */
#define MF_ATTR_RO(_name) …

MF_ATTR_RO(…);
MF_ATTR_RO(…);
MF_ATTR_RO(…);
MF_ATTR_RO(…);
MF_ATTR_RO(…);

static struct attribute *memory_failure_attr[] = …;

const struct attribute_group memory_failure_attr_group = …;

static struct ctl_table memory_failure_table[] = …;

/*
 * Return values:
 *   1:   the page is dissolved (if needed) and taken off from buddy,
 *   0:   the page is dissolved (if needed) and not taken off from buddy,
 *   < 0: failed to dissolve.
 */
static int __page_handle_poison(struct page *page)
{ … }

static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release)
{ … }

#if IS_ENABLED(CONFIG_HWPOISON_INJECT)

u32 hwpoison_filter_enable = …;
u32 hwpoison_filter_dev_major = …;
u32 hwpoison_filter_dev_minor = …;
u64 hwpoison_filter_flags_mask;
u64 hwpoison_filter_flags_value;
EXPORT_SYMBOL_GPL(…);
EXPORT_SYMBOL_GPL(…);
EXPORT_SYMBOL_GPL(…);
EXPORT_SYMBOL_GPL(…);
EXPORT_SYMBOL_GPL(…);

static int hwpoison_filter_dev(struct page *p)
{ … }

static int hwpoison_filter_flags(struct page *p)
{ … }

/*
 * This allows stress tests to limit test scope to a collection of tasks
 * by putting them under some memcg. This prevents killing unrelated/important
 * processes such as /sbin/init. Note that the target task may share clean
 * pages with init (e.g. libc text), which is harmless. If the target task
 * shares _dirty_ pages with another task B, the test scheme must make sure B
 * is also included in the memcg. Finally, due to race conditions this filter
 * can only guarantee that the page either belongs to the memcg tasks, or is
 * a freed page.
 */
#ifdef CONFIG_MEMCG
u64 hwpoison_filter_memcg;
EXPORT_SYMBOL_GPL(…);
static int hwpoison_filter_task(struct page *p)
{ … }
#else
/*
 * Fallback for builds without CONFIG_MEMCG: the memcg-based task filter
 * is not available, so this unconditionally returns 0.
 */
static int hwpoison_filter_task(struct page *p)
{
	return 0;
}
#endif

int hwpoison_filter(struct page *p)
{ … }
EXPORT_SYMBOL_GPL(…);
#else
/*
 * Fallback for builds where CONFIG_HWPOISON_INJECT is not enabled:
 * none of the injection filters exist, so this unconditionally
 * returns 0 regardless of @p.
 */
int hwpoison_filter(struct page *p)
{
	return 0;
}
#endif

/*
 * Kill all processes that have a poisoned page mapped and then isolate
 * the page.
 *
 * General strategy:
 * Find all processes having the page mapped and kill them.
 * But we keep a page reference around so that the page is not
 * actually freed yet.
 * Then stash the page away
 *
 * There's no convenient way to get back to mapped processes
 * from the VMAs. So do a brute-force search over all
 * running processes.
 *
 * Remember that machine checks are not common (or rather
 * if they are common you have other problems), so this shouldn't
 * be a performance issue.
 *
 * Also there are some races possible while we get from the
 * error detection to actually handle it.
 */

struct to_kill { … };

/*
 * Send all the processes who have the page mapped a signal.
 * ``action optional'' if they are not immediately affected by the error
 * ``action required'' if error happened in current execution context
 */
static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
{ … }

/*
 * Unknown page type encountered. Try to check whether it can turn PageLRU by
 * lru_add_drain_all.
 */
void shake_folio(struct folio *folio)
{ … }
EXPORT_SYMBOL_GPL(…);

static void shake_page(struct page *page)
{ … }

static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
		unsigned long address)
{ … }

/*
 * Failure handling: if we can't find or can't kill a process there's
 * not much we can do.	We just print a message and ignore otherwise.
 */

/*
 * Schedule a process for later kill.
 * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
 */
static void __add_to_kill(struct task_struct *tsk, struct page *p,
			  struct vm_area_struct *vma, struct list_head *to_kill,
			  unsigned long addr)
{ … }

static void add_to_kill_anon_file(struct task_struct *tsk, struct page *p,
		struct vm_area_struct *vma, struct list_head *to_kill,
		unsigned long addr)
{ … }

#ifdef CONFIG_KSM
static bool task_in_to_kill_list(struct list_head *to_kill,
				 struct task_struct *tsk)
{ … }

void add_to_kill_ksm(struct task_struct *tsk, struct page *p,
		     struct vm_area_struct *vma, struct list_head *to_kill,
		     unsigned long addr)
{ … }
#endif
/*
 * Kill the processes that have been collected earlier.
 *
 * Only do anything when FORCEKILL is set, otherwise just free the
 * list (this is used for clean pages which do not need killing)
 */
static void kill_procs(struct list_head *to_kill, int forcekill,
		unsigned long pfn, int flags)
{ … }

/*
 * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO)
 * on behalf of the thread group. Return task_struct of the (first found)
 * dedicated thread if found, and return NULL otherwise.
 *
 * We already hold rcu lock in the caller, so we don't have to call
 * rcu_read_lock/unlock() in this function.
 */
static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
{ … }

/*
 * Determine whether a given process is "early kill" process which expects
 * to be signaled when some page under the process is hwpoisoned.
 * Return task_struct of the dedicated thread (main thread unless explicitly
 * specified) if the process is "early kill" and otherwise returns NULL.
 *
 * Note that the above is true for the Action Optional case. For the Action
 * Required case, it's only meaningful to the current thread, which needs to
 * be signaled with SIGBUS. The error is Action Optional for other, non-current
 * processes sharing the same error page; if such a process is "early kill",
 * the task_struct of its dedicated thread will also be returned.
 */
struct task_struct *task_early_kill(struct task_struct *tsk, int force_early)
{ … }

/*
 * Collect processes when the error hit an anonymous page.
 */
static void collect_procs_anon(struct folio *folio, struct page *page,
		struct list_head *to_kill, int force_early)
{ … }

/*
 * Collect processes when the error hit a file mapped page.
 */
static void collect_procs_file(struct folio *folio, struct page *page,
		struct list_head *to_kill, int force_early)
{ … }

#ifdef CONFIG_FS_DAX
static void add_to_kill_fsdax(struct task_struct *tsk, struct page *p,
			      struct vm_area_struct *vma,
			      struct list_head *to_kill, pgoff_t pgoff)
{ … }

/*
 * Collect processes when the error hit a fsdax page.
 */
static void collect_procs_fsdax(struct page *page,
		struct address_space *mapping, pgoff_t pgoff,
		struct list_head *to_kill, bool pre_remove)
{ … }
#endif /* CONFIG_FS_DAX */

/*
 * Collect the processes who have the corrupted page mapped to kill.
 */
static void collect_procs(struct folio *folio, struct page *page,
		struct list_head *tokill, int force_early)
{ … }

struct hwpoison_walk { … };

static void set_to_kill(struct to_kill *tk, unsigned long addr, short shift)
{ … }

static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift,
				unsigned long poisoned_pfn, struct to_kill *tk)
{ … }

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr,
				      struct hwpoison_walk *hwp)
{ … }
#else
/*
 * Fallback for builds without CONFIG_TRANSPARENT_HUGEPAGE: @pmdp, @addr
 * and @hwp are ignored and 0 is always returned.
 */
static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr,
				      struct hwpoison_walk *hwp)
{
	return 0;
}
#endif

static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr,
			      unsigned long end, struct mm_walk *walk)
{ … }

#ifdef CONFIG_HUGETLB_PAGE
static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask,
			    unsigned long addr, unsigned long end,
			    struct mm_walk *walk)
{ … }
#else
#define hwpoison_hugetlb_range …
#endif

static const struct mm_walk_ops hwpoison_walk_ops = …;

/*
 * Sends SIGBUS to the current process with error info.
 *
 * This function is intended to handle "Action Required" MCEs on already
 * hardware poisoned pages. They could happen, for example, when
 * memory_failure() failed to unmap the error page at the first call, or
 * when multiple local machine checks happened on different CPUs.
 *
 * MCE handler currently has no easy access to the error virtual address,
 * so this function walks page table to find it. The returned virtual address
 * is proper in most cases, but it could be wrong when the application
 * process has multiple entries mapping the error page.
 */
static int kill_accessing_process(struct task_struct *p, unsigned long pfn,
				  int flags)
{ … }

/*
 * MF_IGNORED - The m-f() handler marks the page as PG_hwpoisoned'ed.
 * But it could not do more to isolate the page from being accessed again,
 * nor does it kill the process. This is extremely rare and one of the
 * potential causes is that the page state has been changed due to
 * underlying race condition. This is the most severe outcome.
 *
 * MF_FAILED - The m-f() handler marks the page as PG_hwpoisoned'ed.
 * It should have killed the process, but it can't isolate the page,
 * due to conditions such as extra pin, unmap failure, etc. Accessing
 * the page again may trigger another MCE and the process will be killed
 * by the m-f() handler immediately.
 *
 * MF_DELAYED - The m-f() handler marks the page as PG_hwpoisoned'ed.
 * The page is unmapped, and is removed from the LRU or file mapping.
 * An attempt to access the page again will trigger page fault and the
 * PF handler will kill the process.
 *
 * MF_RECOVERED - The m-f() handler marks the page as PG_hwpoisoned'ed.
 * The page has been completely isolated, that is, unmapped, taken out of
 * the buddy system, or hole-punched out of the file mapping.
 */
static const char *action_name[] = …;

static const char * const action_page_types[] = …;

/*
 * XXX: It is possible that a page is isolated from LRU cache,
 * and then kept in swap cache or failed to remove from page cache.
 * The page count will stop it from being freed by unpoison.
 * Stress tests should be aware of this memory leak problem.
 */
static int delete_from_lru_cache(struct folio *folio)
{ … }

static int truncate_error_folio(struct folio *folio, unsigned long pfn,
				struct address_space *mapping)
{ … }

struct page_state { … };

/*
 * Return true if page is still referenced by others, otherwise return
 * false.
 *
 * The extra_pins is true when one extra refcount is expected.
 */
static bool has_extra_refcount(struct page_state *ps, struct page *p,
			       bool extra_pins)
{ … }

/*
 * Error hit kernel page.
 * Do nothing, try to be lucky and not touch this instead. For a few cases we
 * could be more sophisticated.
 */
static int me_kernel(struct page_state *ps, struct page *p)
{ … }

/*
 * Page in unknown state. Do nothing.
 * This is a catch-all in case we fail to make sense of the page state.
 */
static int me_unknown(struct page_state *ps, struct page *p)
{ … }

/*
 * Clean (or cleaned) page cache page.
 */
static int me_pagecache_clean(struct page_state *ps, struct page *p)
{ … }

/*
 * Dirty pagecache page
 * Issues: when the error hit a hole page the error is not properly
 * propagated.
 */
static int me_pagecache_dirty(struct page_state *ps, struct page *p)
{ … }

/*
 * Clean and dirty swap cache.
 *
 * Dirty swap cache page is tricky to handle. The page could live both in page
 * table and swap cache(ie. page is freshly swapped in). So it could be
 * referenced concurrently by 2 types of PTEs:
 * normal PTEs and swap PTEs. We try to handle them consistently by calling
 * try_to_unmap(!TTU_HWPOISON) to convert the normal PTEs to swap PTEs,
 * and then
 *      - clear dirty bit to prevent IO
 *      - remove from LRU
 *      - but keep in the swap cache, so that when we return to it on
 *        a later page fault, we know the application is accessing
 *        corrupted data and shall be killed (we installed simple
 *        interception code in do_swap_page to catch it).
 *
 * Clean swap cache pages can be directly isolated. A later page fault will
 * bring in the known good data from disk.
 */
static int me_swapcache_dirty(struct page_state *ps, struct page *p)
{ … }

static int me_swapcache_clean(struct page_state *ps, struct page *p)
{ … }

/*
 * Huge pages. Needs work.
 * Issues:
 * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
 *   To narrow down kill region to one page, we need to break up pmd.
 */
static int me_huge_page(struct page_state *ps, struct page *p)
{ … }

/*
 * Various page states we can handle.
 *
 * A page state is defined by its current page->flags bits.
 * The table matches them in order and calls the right handler.
 *
 * This is quite tricky because we can access page at any time
 * in its life cycle, so all accesses have to be extremely careful.
 *
 * This is not complete. More states could be added.
 * For any missing state don't attempt recovery.
 */

#define dirty …
#define sc …
#define unevict …
#define mlock …
#define lru …
#define head …
#define reserved …

static struct page_state error_states[] = …;

#undef dirty
#undef sc
#undef unevict
#undef mlock
#undef lru
#undef head
#undef reserved

static void update_per_node_mf_stats(unsigned long pfn,
				     enum mf_result result)
{ … }

/*
 * "Dirty/Clean" indication is not 100% accurate due to the possibility of
 * setting PG_dirty outside page lock. See also comment above set_page_dirty().
 */
static int action_result(unsigned long pfn, enum mf_action_page_type type,
			 enum mf_result result)
{ … }

static int page_action(struct page_state *ps, struct page *p,
			unsigned long pfn)
{ … }

static inline bool PageHWPoisonTakenOff(struct page *page)
{ … }

void SetPageHWPoisonTakenOff(struct page *page)
{ … }

void ClearPageHWPoisonTakenOff(struct page *page)
{ … }

/*
 * Return true if a page type of a given page is supported by hwpoison
 * mechanism (while handling could fail), otherwise false.  This function
 * does not return true for hugetlb or device memory pages, so it's assumed
 * to be called only in the context where we never have such pages.
 */
static inline bool HWPoisonHandlable(struct page *page, unsigned long flags)
{ … }

static int __get_hwpoison_page(struct page *page, unsigned long flags)
{ … }

#define GET_PAGE_MAX_RETRY_NUM …

static int get_any_page(struct page *p, unsigned long flags)
{ … }

static int __get_unpoison_page(struct page *page)
{ … }

/**
 * get_hwpoison_page() - Get refcount for memory error handling
 * @p:		Raw error page (hit by memory error)
 * @flags:	Flags controlling behavior of error handling
 *
 * get_hwpoison_page() takes a page refcount of an error page to handle memory
 * error on it, after checking that the error page is in a well-defined state
 * (defined as a page-type we can successfully handle the memory error on it,
 * such as LRU page and hugetlb page).
 *
 * Memory error handling could be triggered at any time on any type of page,
 * so it's prone to race with typical memory management lifecycle (like
 * allocation and free).  So to avoid such races, get_hwpoison_page() takes
 * extra care for the error page's state (as done in __get_hwpoison_page()),
 * and has some retry logic in get_any_page().
 *
 * When called from unpoison_memory(), the caller should already ensure that
 * the given page has PG_hwpoison. So it's never reused for other page
 * allocations, and __get_unpoison_page() never races with them.
 *
 * Return: 0 on failure or free buddy (hugetlb) page,
 *         1 on success for in-use pages in a well-defined state,
 *         -EIO for pages on which we can not handle memory errors,
 *         -EBUSY when get_hwpoison_page() has raced with page lifecycle
 *         operations like allocation and free,
 *         -EHWPOISON when the page is hwpoisoned and taken off from buddy.
 */
static int get_hwpoison_page(struct page *p, unsigned long flags)
{ … }

/*
 * Do all that is necessary to remove user space mappings. Unmap
 * the pages and send SIGBUS to the processes if the data was dirty.
 */
static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
		unsigned long pfn, int flags)
{ … }

static int identify_page_state(unsigned long pfn, struct page *p,
				unsigned long page_flags)
{ … }

/*
 * When 'release' is 'false', it means that if thp split has failed,
 * there is still more to do, hence the page refcount we took earlier
 * is still needed.
 */
static int try_to_split_thp_page(struct page *page, bool release)
{ … }

static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn,
		struct address_space *mapping, pgoff_t index, int flags)
{ … }

/*
 * Only dev_pagemap pages get here, such as fsdax when the filesystem
 * either does not claim or fails to claim a hwpoison event, or devdax.
 * The fsdax pages are initialized per base page, and the devdax pages
 * could be initialized either as base pages, or as compound pages with
 * vmemmap optimization enabled. Devdax is simplistic in its dealing with
 * hwpoison, such that, if a subpage of a compound page is poisoned,
 * simply marking the compound head page is by far sufficient.
 */
static int mf_generic_kill_procs(unsigned long long pfn, int flags,
		struct dev_pagemap *pgmap)
{ … }

#ifdef CONFIG_FS_DAX
/**
 * mf_dax_kill_procs - Collect and kill processes who are using this file range
 * @mapping:	address_space of the file in use
 * @index:	start pgoff of the range within the file
 * @count:	length of the range, in unit of PAGE_SIZE
 * @mf_flags:	memory failure flags
 */
int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
		unsigned long count, int mf_flags)
{ … }
EXPORT_SYMBOL_GPL(…);
#endif /* CONFIG_FS_DAX */

#ifdef CONFIG_HUGETLB_PAGE

/*
 * Struct raw_hwp_page represents information about "raw error page",
 * constructing singly linked list from ->_hugetlb_hwpoison field of folio.
 */
struct raw_hwp_page { … };

static inline struct llist_head *raw_hwp_list_head(struct folio *folio)
{ … }

bool is_raw_hwpoison_page_in_hugepage(struct page *page)
{ … }

static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag)
{ … }

static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
{ … }

static unsigned long folio_free_raw_hwp(struct folio *folio, bool move_flag)
{ … }

void folio_clear_hugetlb_hwpoison(struct folio *folio)
{ … }

/*
 * Called from hugetlb code with hugetlb_lock held.
 *
 * Return values:
 *   0             - free hugepage
 *   1             - in-use hugepage
 *   2             - not a hugepage
 *   -EBUSY        - the hugepage is busy (try to retry)
 *   -EHWPOISON    - the hugepage is already hwpoisoned
 */
int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
				 bool *migratable_cleared)
{ … }

/*
 * Taking refcount of hugetlb pages needs extra care about race conditions
 * with basic operations like hugepage allocation/free/demotion.
 * So some of the prechecks for hwpoison (pinning, and testing/setting
 * PageHWPoison) should be done in a single hugetlb_lock range.
 */
static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb)
{ … }

#else
/*
 * Fallback for builds without CONFIG_HUGETLB_PAGE: always returns 0 and
 * never writes through @hugetlb, so the caller's value is left as it was.
 */
static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags,
					     int *hugetlb)
{
	return 0;
}

/*
 * Fallback for builds without CONFIG_HUGETLB_PAGE: @folio and @flag are
 * ignored and 0 is always returned.
 */
static inline unsigned long folio_free_raw_hwp(struct folio *folio, bool flag)
{
	return 0;
}
#endif	/* CONFIG_HUGETLB_PAGE */

/* Drop the extra refcount in case we come from madvise() */
static void put_ref_page(unsigned long pfn, int flags)
{ … }

static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
		struct dev_pagemap *pgmap)
{ … }

/*
 * The calling condition is as such: thp split failed, page might have
 * been RDMA pinned, not much can be done for recovery.
 * But a SIGBUS should be delivered with vaddr provided so that the user
 * application has a chance to recover. Also, application processes'
 * election for MCE early killed will be honored.
 */
static void kill_procs_now(struct page *p, unsigned long pfn, int flags,
				struct folio *folio)
{ … }

/**
 * memory_failure - Handle memory failure of a page.
 * @pfn: Page Number of the corrupted page
 * @flags: fine tune action taken
 *
 * This function is called by the low level machine check code
 * of an architecture when it detects hardware memory corruption
 * of a page. It tries its best to recover, which includes
 * dropping pages, killing processes etc.
 *
 * The function is primarily of use for corruptions that
 * happen outside the current execution context (e.g. when
 * detected by a background scrubber)
 *
 * Must run in process context (e.g. a work queue) with interrupts
 * enabled and no spinlocks held.
 *
 * Return: 0 if the memory error was handled successfully,
 *         -EOPNOTSUPP if hwpoison_filter() filtered the error event,
 *         < 0 (other than -EOPNOTSUPP) on failure.
 */
int memory_failure(unsigned long pfn, int flags)
{ … }
EXPORT_SYMBOL_GPL(…);

#define MEMORY_FAILURE_FIFO_ORDER …
#define MEMORY_FAILURE_FIFO_SIZE …

struct memory_failure_entry { … };

struct memory_failure_cpu { … };

static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);

/**
 * memory_failure_queue - Schedule handling memory failure of a page.
 * @pfn: Page Number of the corrupted page
 * @flags: Flags for memory failure handling
 *
 * This function is called by the low level hardware error handler
 * when it detects hardware memory corruption of a page. It schedules
 * the recovering of error page, including dropping pages, killing
 * processes etc.
 *
 * The function is primarily of use for corruptions that
 * happen outside the current execution context (e.g. when
 * detected by a background scrubber)
 *
 * Can run in IRQ context.
 */
void memory_failure_queue(unsigned long pfn, int flags)
{ … }
EXPORT_SYMBOL_GPL(…);

static void memory_failure_work_func(struct work_struct *work)
{ … }

/*
 * Process memory_failure work queued on the specified CPU.
 * Used to avoid return-to-userspace racing with the memory_failure workqueue.
 */
void memory_failure_queue_kick(int cpu)
{ … }

static int __init memory_failure_init(void)
{ … }
core_initcall(memory_failure_init);

#undef pr_fmt
#define pr_fmt(fmt) …
#define unpoison_pr_info(fmt, pfn, rs) …

/**
 * unpoison_memory - Unpoison a previously poisoned page
 * @pfn: Page number of the to be unpoisoned page
 *
 * Software-unpoison a page that has been poisoned by
 * memory_failure() earlier.
 *
 * This is only done on the software-level, so it only works
 * for linux injected failures, not real hardware failures
 *
 * Returns 0 for success, otherwise -errno.
 */
int unpoison_memory(unsigned long pfn)
{ … }
EXPORT_SYMBOL(…);

#undef pr_fmt
#define pr_fmt(fmt) …

static bool mf_isolate_folio(struct folio *folio, struct list_head *pagelist)
{ … }

/*
 * soft_offline_in_use_page handles hugetlb-pages and non-hugetlb pages.
 * If the page is a non-dirty unmapped page-cache page, it simply invalidates.
 * If the page is mapped, it migrates the contents over.
 */
static int soft_offline_in_use_page(struct page *page)
{ … }

/**
 * soft_offline_page - Soft offline a page.
 * @pfn: pfn to soft-offline
 * @flags: flags. Same as memory_failure().
 *
 * Returns 0 on success,
 *         -EOPNOTSUPP if hwpoison_filter() filtered the error event, or if
 *         soft offline is disabled by /proc/sys/vm/enable_soft_offline,
 *         otherwise a negated errno (< 0).
 *
 * Soft offline a page, by migration or invalidation,
 * without killing anything. This is for the case when
 * a page is not corrupted yet (so it's still valid to access),
 * but has had a number of corrected errors and is better taken
 * out.
 *
 * The actual policy on when to do that is maintained by
 * user space.
 *
 * This should never impact any application or cause data loss,
 * however it might take some time.
 *
 * This is not a 100% solution for all memory, but tries to be
 * ``good enough'' for the majority of memory.
 */
int soft_offline_page(unsigned long pfn, int flags)
{ … }
linux/mm/memory-failure.c