linux/arch/x86/kernel/cpu/mce/core.c

// SPDX-License-Identifier: GPL-2.0-only
/*
 * Machine check handler.
 *
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */

#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
#include <linux/ratelimit.h>
#include <linux/rcupdate.h>
#include <linux/kobject.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/device.h>
#include <linux/syscore_ops.h>
#include <linux/delay.h>
#include <linux/ctype.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/cpu.h>
#include <linux/ras.h>
#include <linux/smp.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/debugfs.h>
#include <linux/irq_work.h>
#include <linux/export.h>
#include <linux/set_memory.h>
#include <linux/sync_core.h>
#include <linux/task_work.h>
#include <linux/hardirq.h>
#include <linux/kexec.h>

#include <asm/fred.h>
#include <asm/cpu_device_id.h>
#include <asm/processor.h>
#include <asm/traps.h>
#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/msr.h>
#include <asm/reboot.h>
#include <asm/tdx.h>

#include "internal.h"

/* sysfs synchronization */
static DEFINE_MUTEX(mce_sysfs_mutex);

#define CREATE_TRACE_POINTS
#include <trace/events/mce.h>

#define SPINUNIT

DEFINE_PER_CPU(unsigned, mce_exception_count);

DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks);

DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);

#define ATTR_LEN
/* One object for each MCE bank, shared by all CPUs */
struct mce_bank_dev {};
static struct mce_bank_dev mce_bank_devs[MAX_NR_BANKS];

struct mce_vendor_flags mce_flags __read_mostly;

struct mca_config mca_cfg __read_mostly =;

static DEFINE_PER_CPU(struct mce, mces_seen);
static unsigned long mce_need_notify;

/*
 * MCA banks polled by the periodic polling timer for corrected events.
 * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
 */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) =;

/*
 * MCA banks controlled through firmware first for corrected errors.
 * This is a global list of banks for which we won't enable CMCI and we
 * won't poll. Firmware controls these banks and is responsible for
 * reporting corrected errors through GHES. Uncorrected/recoverable
 * errors are still notified through a machine check.
 */
mce_banks_t mce_banks_ce_disabled;

static struct work_struct mce_work;
static struct irq_work mce_irq_work;

/*
 * CPU/chipset specific EDAC code can register a notifier call here to print
 * MCE errors in a human-readable form.
 */
BLOCKING_NOTIFIER_HEAD();
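
/*
 * Illustrative sketch (not part of this file): how an EDAC/decoder module
 * could hook into this chain via mce_register_decode_chain(). The callback
 * and variable names are hypothetical; only the notifier API and the
 * struct mce fields used (bank, status, addr) are real.
 */
#if 0	/* example only */
static int example_mce_decoder(struct notifier_block *nb, unsigned long val,
			       void *data)
{
	struct mce *m = data;

	if (!m)
		return NOTIFY_DONE;

	/* A real decoder would translate the raw fields into something readable. */
	pr_info("decoded MCE: bank %d status 0x%llx addr 0x%llx\n",
		m->bank, m->status, m->addr);

	return NOTIFY_OK;
}

static struct notifier_block example_mce_dec_nb = {
	.notifier_call	= example_mce_decoder,
};

static int __init example_decoder_init(void)
{
	mce_register_decode_chain(&example_mce_dec_nb);
	return 0;
}
#endif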

void mce_prep_record_common(struct mce *m)
{}

void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m)
{}

/* Do initial initialization of a struct mce */
void mce_prep_record(struct mce *m)
{}

DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);

void mce_log(struct mce *m)
{}
EXPORT_SYMBOL_GPL(mce_log);

void mce_register_decode_chain(struct notifier_block *nb)
{}
EXPORT_SYMBOL_GPL(mce_register_decode_chain);

void mce_unregister_decode_chain(struct notifier_block *nb)
{}
EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);

static void __print_mce(struct mce *m)
{}

static void print_mce(struct mce *m)
{}

#define PANIC_TIMEOUT

static atomic_t mce_panicked;

static int fake_panic;
static atomic_t mce_fake_panicked;

/* Panic in progress. Enable interrupts and wait for final IPI */
static void wait_for_panic(void)
{}

static const char *mce_dump_aux_info(struct mce *m)
{}

static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
{}

/* Support code for software error injection */

static int msr_to_offset(u32 msr)
{}

void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr)
{}

/* MSR access wrappers used for error injection */
noinstr u64 mce_rdmsrl(u32 msr)
{}

static noinstr void mce_wrmsrl(u32 msr, u64 v)
{}

/*
 * Collect all global (w.r.t. this processor) status about this machine
 * check into our "mce" struct so that we can use it later to assess
 * the severity of the problem as we read per-bank specific details.
 */
static noinstr void mce_gather_info(struct mce *m, struct pt_regs *regs)
{}
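
/*
 * Illustrative sketch of the "gather" step described above: capture the global
 * MCG_STATUS plus the interrupted ip/cs so the severity code can use them
 * later. Simplified; the real mce_gather_info() also runs mce_prep_record()
 * and applies the RIPV/EIPV rules before trusting regs->ip/cs.
 */
#if 0	/* example only */
static void example_gather_info(struct mce *m, struct pt_regs *regs)
{
	m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);

	if (regs) {
		/* Only meaningful when EIPV/RIPV say these values are valid. */
		m->ip = regs->ip;
		m->cs = regs->cs;
	}
}
#endif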

int mce_available(struct cpuinfo_x86 *c)
{}

static void mce_schedule_work(void)
{}

static void mce_irq_work_cb(struct irq_work *entry)
{}

bool mce_usable_address(struct mce *m)
{}
EXPORT_SYMBOL_GPL(mce_usable_address);

bool mce_is_memory_error(struct mce *m)
{}
EXPORT_SYMBOL_GPL(mce_is_memory_error);

static bool whole_page(struct mce *m)
{}

bool mce_is_correctable(struct mce *m)
{}
EXPORT_SYMBOL_GPL(mce_is_correctable);

static int mce_early_notifier(struct notifier_block *nb, unsigned long val,
			      void *data)
{}

static struct notifier_block early_nb =;

static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
			      void *data)
{}

static struct notifier_block mce_uc_nb =;

static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
				void *data)
{}

static struct notifier_block mce_default_nb =;

/*
 * Read ADDR and MISC registers.
 */
static noinstr void mce_read_aux(struct mce *m, int i)
{}

DEFINE_PER_CPU(unsigned, mce_poll_count);

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 *
 * Note: the spec recommends panicking for fatal unsignalled
 * errors here. However, this would be quite problematic --
 * we would need to reimplement the Monarch handling and
 * it would mess up the exclusion between the exception handler
 * and the poll handler -- so we skip this for now.
 * These cases should not happen anyway, or only when the CPU
 * is already totally confused. In that case it is likely that it
 * will not fully execute the machine check handler either.
 */
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{}
EXPORT_SYMBOL_GPL(machine_check_poll);
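
/*
 * Illustrative sketch of the polling flow described above: walk the banks
 * selected in *b, read each MCi_STATUS, log valid corrected events and clear
 * the bank. Heavily simplified; the real machine_check_poll() also honours
 * the mcp_flags, reads ADDR/MISC and filters errors that must be left for
 * the exception handler.
 */
#if 0	/* example only */
static void example_poll_banks(mce_banks_t *b)
{
	unsigned int i;
	struct mce m;

	mce_prep_record(&m);

	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
		if (!test_bit(i, *b))
			continue;

		m.bank   = i;
		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));

		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/* Leave uncorrected errors to the #MC exception handler. */
		if (m.status & MCI_STATUS_UC)
			continue;

		mce_log(&m);

		/* Clear the bank so this event is not seen again. */
		mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}
}
#endif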

/*
 * During IFU recovery, Sandy Bridge-EP 4S processors set the RIPV and
 * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
 * Vol 3B Table 15-20). But this confuses both the code that determines
 * whether the machine check occurred in kernel or user mode, and also
 * the severity assessment code. Pretend that EIPV was set, and take the
 * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
 */
static __always_inline void
quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
{}
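
/*
 * Illustrative sketch of the quirk described above: pretend EIPV was set and
 * take ip/cs from pt_regs. Simplified; the real quirk_sandybridge_ifu() also
 * matches the exact IFU error signature in MCi_STATUS before applying the
 * fixup, and the bank number used here is only the one on the affected parts.
 */
#if 0	/* example only */
static void example_sandybridge_ifu_fixup(int bank, struct mce *m,
					  struct pt_regs *regs)
{
	if (bank != 0)
		return;

	/* Only act when the CPU reported neither RIPV nor EIPV. */
	if (m->mcgstatus & (MCG_STATUS_EIPV | MCG_STATUS_RIPV))
		return;

	m->mcgstatus |= MCG_STATUS_EIPV;
	m->ip = regs->ip;
	m->cs = regs->cs;
}
#endif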

/*
 * Disable fast string copy and return from the MCE handler upon the first SRAR
 * MCE on bank 1 due to a CPU erratum on Intel Skylake/Cascade Lake/Cooper Lake
 * CPUs.
 * The fast string copy instructions ("REP; MOVS*") could consume an
 * uncorrectable memory error in the cache line _right after_ the desired region
 * to copy and raise an MCE with RIP pointing to the instruction _after_ the
 * "REP; MOVS*".
 * This mitigation addresses the issue completely with the caveat of performance
 * degradation on the CPU affected. This is still better than the OS crashing on
 * MCEs raised on an irrelevant process due to "REP; MOVS*" accesses from a
 * kernel context (e.g., copy_page).
 *
 * Returns true when fast string copy on CPU has been disabled.
 */
static noinstr bool quirk_skylake_repmov(void)
{}
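
/*
 * Illustrative sketch of the mitigation described above: clear the fast
 * string copy enable bit in MSR_IA32_MISC_ENABLE. Simplified; the real
 * quirk_skylake_repmov() first matches the precise SRAR signature on bank 1
 * and only then disables fast string copy, clears the bank and warns.
 */
#if 0	/* example only */
static void example_disable_fast_string_copy(void)
{
	u64 misc_enable = mce_rdmsrl(MSR_IA32_MISC_ENABLE);

	if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) {
		mce_wrmsrl(MSR_IA32_MISC_ENABLE,
			   misc_enable & ~MSR_IA32_MISC_ENABLE_FAST_STRING);
		pr_err_once("Disabled fast string copy due to CPU erratum\n");
	}
}
#endif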

/*
 * Some Zen-based Instruction Fetch Units set EIPV=RIPV=0 on poison consumption
 * errors. This means mce_gather_info() will not save the "ip" and "cs" registers.
 *
 * However, the context is still valid, so save the "cs" register for later use.
 *
 * The "ip" register is truly unknown, so don't save it or fixup EIPV/RIPV.
 *
 * The Instruction Fetch Unit is at MCA bank 1 for all affected systems.
 */
static __always_inline void quirk_zen_ifu(int bank, struct mce *m, struct pt_regs *regs)
{}
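
/*
 * Illustrative sketch of the fixup described above: on a poison consumption
 * error reported by the Instruction Fetch Unit (bank 1), save "cs" from
 * pt_regs but leave "ip" and the EIPV/RIPV bits alone. Simplified version of
 * what quirk_zen_ifu() is described to do.
 */
#if 0	/* example only */
static void example_zen_ifu_fixup(int bank, struct mce *m, struct pt_regs *regs)
{
	if (bank != 1)
		return;

	if (!(m->status & MCI_STATUS_POISON))
		return;

	m->cs = regs->cs;
}
#endif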

/*
 * Do a quick check if any of the events requires a panic.
 * This decides if we keep the events around or clear them.
 */
static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
					  struct pt_regs *regs)
{}

/*
 * Variable to establish order between CPUs while scanning.
 * Each CPU initially spins until mce_executing equals its number.
 */
static atomic_t mce_executing;

/*
 * Defines order of CPUs on entry. First CPU becomes Monarch.
 */
static atomic_t mce_callin;

/*
 * Track which CPUs entered the MCA broadcast synchronization and which did
 * not, in order to print holdouts.
 */
static cpumask_t mce_missing_cpus =;

/*
 * Check if a timeout waiting for other CPUs happened.
 */
static noinstr int mce_timed_out(u64 *t, const char *msg)
{}

/*
 * The Monarch's reign.  The Monarch is the CPU which entered
 * the machine check handler first. It waits for the others to
 * raise the exception too and then grades them. When any
 * error is fatal, it panics. Only then does it let the others continue.
 *
 * The other CPUs entering the MCE handler will be controlled by the
 * Monarch. They are called Subjects.
 *
 * This way we prevent any potential data corruption in an unrecoverable
 * case and also make sure that all CPUs' errors are examined.
 *
 * This also detects the case of a machine check event coming from outer
 * space (not detected by any CPU). In that case some external agent wants
 * us to shut down, so panic too.
 *
 * The other CPUs might still decide to panic if the handler happens
 * in an unrecoverable place, but in this case the system is in a semi-stable
 * state and won't corrupt anything by itself. It's OK to let the others
 * continue for a bit first.
 *
 * All the spin loops have timeouts; when a timeout happens a CPU
 * typically elects itself to be Monarch.
 */
static void mce_reign(void)
{}

static atomic_t global_nwo;

/*
 * Start of Monarch synchronization. This waits until all CPUs have
 * entered the exception handler and then determines if any of them
 * saw a fatal event that requires a panic. Then it lets them execute
 * in their entry order.
 * TBD double check parallel CPU hotunplug
 */
static noinstr int mce_start(int *no_way_out)
{}
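
/*
 * Illustrative sketch of the rendezvous described above: each CPU takes a
 * ticket from mce_callin, the first one becomes the Monarch, and everyone
 * spins (with a timeout) until all online CPUs have checked in. Heavily
 * simplified; the real mce_start() also aggregates no_way_out into
 * global_nwo, records holdouts in mce_missing_cpus and serializes the bank
 * scan via mce_executing. The spin budget below is an arbitrary value.
 */
#if 0	/* example only */
static int example_mce_rendezvous(void)
{
	u64 timeout = 1000000;	/* illustrative spin budget, in ndelay(1) steps */
	int order = atomic_inc_return(&mce_callin);

	/* Wait for every online CPU to show up, or give up on timeout. */
	while (atomic_read(&mce_callin) != num_online_cpus()) {
		if (!timeout--)
			return -1;	/* timed out: handle on our own */
		ndelay(1);
	}

	/* order == 1 means this CPU entered first and reigns as Monarch. */
	return order;
}
#endif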

/*
 * Synchronize between CPUs after the main scanning loop.
 * This invokes the bulk of the Monarch processing.
 */
static noinstr int mce_end(int order)
{}

static __always_inline void mce_clear_state(unsigned long *toclear)
{}

/*
 * Cases where we avoid rendezvous handler timeout:
 * 1) If this CPU is offline.
 *
 * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
 *  skip those CPUs which remain looping in the 1st kernel - see
 *  crash_nmi_callback().
 *
 * Note: there still is a small window between kexec-ing and the new,
 * kdump kernel establishing a new #MC handler where a broadcasted MCE
 * might not get handled properly.
 */
static noinstr bool mce_check_crashing_cpu(void)
{}
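
/*
 * Illustrative sketch of the checks described above: if this CPU is offline,
 * or a crash/kdump is in progress on another CPU, clear MCG_STATUS (only when
 * RIPV says it is safe to just return) and tell the caller to bail out
 * instead of joining the rendezvous. Simplified from what
 * mce_check_crashing_cpu() is described to do.
 */
#if 0	/* example only */
static bool example_check_crashing_cpu(void)
{
	unsigned int cpu = smp_processor_id();
	u64 mcgstatus;

	if (!cpu_is_offline(cpu) &&
	    (crashing_cpu == -1 || crashing_cpu == cpu))
		return false;

	mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);

	if (mcgstatus & MCG_STATUS_RIPV) {
		mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
		return true;
	}

	return false;
}
#endif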

static __always_inline int
__mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final,
		unsigned long *toclear, unsigned long *valid_banks, int no_way_out,
		int *worst)
{}

static void kill_me_now(struct callback_head *ch)
{}

static void kill_me_maybe(struct callback_head *cb)
{}

static void kill_me_never(struct callback_head *cb)
{}

static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *))
{}

/* Handle unconfigured int18 (should never happen) */
static noinstr void unexpected_machine_check(struct pt_regs *regs)
{}

/*
 * The actual machine check handler. This only handles real exceptions when
 * something got corrupted coming in through int 18.
 *
 * This is executed in #MC context not subject to normal locking rules.
 * This implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 *
 * On Intel systems this is entered on all CPUs in parallel through
 * MCE broadcast. However, some CPUs might be broken beyond repair,
 * so always be careful when synchronizing with the others.
 *
 * Tracing and kprobes are disabled: if we interrupted a kernel context
 * with IF=1, we need to minimize stack usage.  There are also recursion
 * issues: if the machine check was due to a failure of the memory
 * backing the user stack, tracing that reads the user stack will cause
 * potentially infinite recursion.
 *
 * Currently, the #MC handler calls out to a number of external facilities
 * and, therefore, allows instrumentation around them. The optimal thing to
 * have would be to do the absolutely minimal work required in #MC context
 * and have instrumentation disabled only around that. Further processing can
 * then happen in process context where instrumentation is allowed. Achieving
 * that requires careful auditing and modifications. Until then, the code
 * allows instrumentation temporarily, where required.
 */
noinstr void do_machine_check(struct pt_regs *regs)
{}
EXPORT_SYMBOL_GPL(do_machine_check);

#ifndef CONFIG_MEMORY_FAILURE
int memory_failure(unsigned long pfn, int flags)
{
	/* mce_severity() should not hand us an ACTION_REQUIRED error */
	BUG_ON(flags & MF_ACTION_REQUIRED);
	pr_err("Uncorrected memory error in page 0x%lx ignored\n"
	       "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
	       pfn);

	return 0;
}
#endif

/*
 * Periodic polling timer for "silent" machine check errors.  If the
 * poller finds an MCE, poll 2x faster.  When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */
static unsigned long check_interval =;

static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);
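
/*
 * Illustrative sketch of the interval adaptation described above: halve the
 * polling interval while errors keep being found, double it (capped at
 * check_interval seconds) while things stay quiet. The real adjustment is
 * part of mce_timer_fn() and also has to coexist with CMCI storm handling.
 */
#if 0	/* example only */
static unsigned long example_adapt_poll_interval(unsigned long iv, bool found_mce)
{
	if (found_mce)
		iv = max(iv / 2, (unsigned long)HZ);
	else
		iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));

	return iv;
}
#endif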

static void __start_timer(struct timer_list *t, unsigned long interval)
{}

static void mc_poll_banks_default(void)
{}

void (*mc_poll_banks)(void) = mc_poll_banks_default;

static void mce_timer_fn(struct timer_list *t)
{}

/*
 * When a storm starts on any bank on this CPU, switch to polling
 * once per second. When the storm ends, revert to the default
 * polling interval.
 */
void mce_timer_kick(bool storm)
{}

/* Must not be called in IRQ context where del_timer_sync() can deadlock */
static void mce_timer_delete_all(void)
{}

/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
int mce_notify_irq(void)
{}
EXPORT_SYMBOL_GPL(mce_notify_irq);

static void __mcheck_cpu_mce_banks_init(void)
{}

/*
 * Initialize Machine Checks for a CPU.
 */
static void __mcheck_cpu_cap_init(void)
{}

static void __mcheck_cpu_init_generic(void)
{}

static void __mcheck_cpu_init_clear_banks(void)
{}

/*
 * Do a final check to see if there are any unused/RAZ banks.
 *
 * This must be done after the banks have been initialized and any quirks have
 * been applied.
 *
 * Do not call this from any user-initiated flows, e.g. CPU hotplug or sysfs.
 * Otherwise, a user who disables a bank will not be able to re-enable it
 * without a system reboot.
 */
static void __mcheck_cpu_check_banks(void)
{}
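
/*
 * Illustrative sketch of the check described above: after init programmed
 * each enabled bank's CTL register, read it back; a bank whose CTL reads back
 * as zero is unused/read-as-zero, so clear its init flag and skip it from
 * then on. Simplified; the legacy MCx_CTL MSR numbering is assumed here for
 * brevity.
 */
#if 0	/* example only */
static void example_check_banks(void)
{
	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
	unsigned int i;

	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
		struct mce_bank *b = &mce_banks[i];

		if (!b->init)
			continue;

		/* A bank that ignores writes to its CTL MSR is unused. */
		b->init = !!mce_rdmsrl(MSR_IA32_MCx_CTL(i));
	}
}
#endif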

/* Add per CPU specific workarounds here */
static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
{}

static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
{}

/*
 * Init basic CPU features needed for early decoding of MCEs.
 */
static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c)
{}

static void mce_centaur_feature_init(struct cpuinfo_x86 *c)
{}

static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c)
{}

static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c)
{}

static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
{}

static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
{}

static void mce_start_timer(struct timer_list *t)
{}

static void __mcheck_cpu_setup_timer(void)
{}

static void __mcheck_cpu_init_timer(void)
{}

bool filter_mce(struct mce *m)
{}

static __always_inline void exc_machine_check_kernel(struct pt_regs *regs)
{}

static __always_inline void exc_machine_check_user(struct pt_regs *regs)
{}

#ifdef CONFIG_X86_64
/* MCE hit kernel mode */
DEFINE_IDTENTRY_MCE(exc_machine_check)
{}

/* The user mode variant. */
DEFINE_IDTENTRY_MCE_USER(exc_machine_check)
{}

#ifdef CONFIG_X86_FRED
/*
 * Depending on the ring level it occurred at, i.e., user or kernel
 * context, #MC needs to be handled on a different stack: a user #MC
 * on the current task stack, a kernel #MC on a dedicated stack.
 *
 * This is exactly how FRED event delivery invokes an exception
 * handler: a ring 3 event on the level 0 stack, i.e., the current task
 * stack; a ring 0 event on the #MC dedicated stack specified in the
 * IA32_FRED_STKLVLS MSR. So unlike the IDT, the FRED machine check
 * entry stub doesn't do a stack switch.
 */
DEFINE_FREDENTRY_MCE(exc_machine_check)
{}
#endif
#else
/* 32bit unified entry point */
DEFINE_IDTENTRY_RAW(exc_machine_check)
{
	unsigned long dr7;

	dr7 = local_db_save();
	if (user_mode(regs))
		exc_machine_check_user(regs);
	else
		exc_machine_check_kernel(regs);
	local_db_restore(dr7);
}
#endif

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off:
 */
void mcheck_cpu_init(struct cpuinfo_x86 *c)
{}

/*
 * Called for each booted CPU to clear some machine checks opt-ins
 */
void mcheck_cpu_clear(struct cpuinfo_x86 *c)
{}

static void __mce_disable_bank(void *arg)
{}

void mce_disable_bank(int bank)
{}

/*
 * mce=off Disables machine check
 * mce=no_cmci Disables CMCI
 * mce=no_lmce Disables LMCE
 * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
 * mce=print_all Print all machine check logs to console
 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
 *	monarchtimeout is how long to wait for other CPUs on machine
 *	check, or 0 to not wait
 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD Fam10h
 *	and older.
 * mce=nobootlog Don't log MCEs from before booting.
 * mce=bios_cmci_threshold Don't program the CMCI threshold
 * mce=recovery force enable copy_mc_fragile()
 */
static int __init mcheck_enable(char *str)
{}
__setup("mce", mcheck_enable);
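
/*
 * Illustrative sketch of how a couple of the boolean "mce=" tokens listed
 * above map onto mca_cfg fields. Heavily simplified; the real mcheck_enable()
 * handles every documented token, including the numeric
 * tolerance/monarchtimeout pair.
 */
#if 0	/* example only */
static int __init example_mcheck_enable(char *str)
{
	if (!strcmp(str, "dont_log_ce"))
		mca_cfg.dont_log_ce = true;
	else if (!strcmp(str, "print_all"))
		mca_cfg.print_all = true;
	else
		pr_info("mce argument %s ignored. Please use /sys\n", str);

	return 1;
}
#endif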

int __init mcheck_init(void)
{}

/*
 * mce_syscore: PM support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static void mce_disable_error_reporting(void)
{}
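
/*
 * Illustrative sketch of what "disable machine checks" amounts to: clear the
 * per-bank CTL MSRs so no further errors are signalled on this CPU.
 * Simplified; the real code only touches banks marked init and the vendor
 * wrapper skips clearing on CPUs whose bank MSRs are shared across the
 * core/socket.
 */
#if 0	/* example only */
static void example_disable_error_reporting(void)
{
	unsigned int i;

	for (i = 0; i < this_cpu_read(mce_num_banks); i++)
		mce_wrmsrl(MSR_IA32_MCx_CTL(i), 0);
}
#endif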

static void vendor_disable_error_reporting(void)
{}

static int mce_syscore_suspend(void)
{}

static void mce_syscore_shutdown(void)
{}

/*
 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 * Only one CPU is active at this time, the others get re-added later using
 * CPU hotplug:
 */
static void mce_syscore_resume(void)
{}

static struct syscore_ops mce_syscore_ops =;

/*
 * mce_device: Sysfs support
 */

static void mce_cpu_restart(void *data)
{}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{}

/* Toggle features for corrected errors */
static void mce_disable_cmci(void *data)
{}

static void mce_enable_ce(void *all)
{}

static const struct bus_type mce_subsys =;

DEFINE_PER_CPU(struct device *, mce_device);

static inline struct mce_bank_dev *attr_to_bank(struct device_attribute *attr)
{}

static ssize_t show_bank(struct device *s, struct device_attribute *attr,
			 char *buf)
{}

static ssize_t set_bank(struct device *s, struct device_attribute *attr,
			const char *buf, size_t size)
{}

static ssize_t set_ignore_ce(struct device *s,
			     struct device_attribute *attr,
			     const char *buf, size_t size)
{}

static ssize_t set_cmci_disabled(struct device *s,
				 struct device_attribute *attr,
				 const char *buf, size_t size)
{}

static ssize_t store_int_with_restart(struct device *s,
				      struct device_attribute *attr,
				      const char *buf, size_t size)
{}

static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
static DEVICE_BOOL_ATTR(print_all, 0644, mca_cfg.print_all);

static struct dev_ext_attribute dev_attr_check_interval =;

static struct dev_ext_attribute dev_attr_ignore_ce =;

static struct dev_ext_attribute dev_attr_cmci_disabled =;

static struct device_attribute *mce_device_attrs[] =;

static cpumask_var_t mce_device_initialized;

static void mce_device_release(struct device *dev)
{}

/* Per CPU device init. All of the CPUs still share the same bank device: */
static int mce_device_create(unsigned int cpu)
{}

static void mce_device_remove(unsigned int cpu)
{}

/* Make sure there are no machine checks on offlined CPUs. */
static void mce_disable_cpu(void)
{}

static void mce_reenable_cpu(void)
{}

static int mce_cpu_dead(unsigned int cpu)
{}

static int mce_cpu_online(unsigned int cpu)
{}

static int mce_cpu_pre_down(unsigned int cpu)
{}

static __init void mce_init_banks(void)
{}

/*
 * When running on XEN, this initcall is ordered against the XEN mcelog
 * initcall:
 *
 *   device_initcall(xen_late_init_mcelog);
 *   device_initcall_sync(mcheck_init_device);
 */
static __init int mcheck_init_device(void)
{}
device_initcall_sync(mcheck_init_device);

/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{}
__setup("nomce", mcheck_disable);

#ifdef CONFIG_DEBUG_FS
struct dentry *mce_get_debugfs_dir(void)
{}

static void mce_reset(void)
{}

static int fake_panic_get(void *data, u64 *val)
{}

static int fake_panic_set(void *data, u64 val)
{}

DEFINE_DEBUGFS_ATTRIBUTE(fake_panic_fops, fake_panic_get, fake_panic_set, "%llu\n");

static void __init mcheck_debugfs_init(void)
{}
#else
static void __init mcheck_debugfs_init(void) { }
#endif

static int __init mcheck_late_init(void)
{}
late_initcall(mcheck_late_init);