// SPDX-License-Identifier: GPL-2.0-only
/*
 * Machine check handler.
 *
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */

#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
#include <linux/ratelimit.h>
#include <linux/rcupdate.h>
#include <linux/kobject.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/device.h>
#include <linux/syscore_ops.h>
#include <linux/delay.h>
#include <linux/ctype.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/cpu.h>
#include <linux/ras.h>
#include <linux/smp.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/debugfs.h>
#include <linux/irq_work.h>
#include <linux/export.h>
#include <linux/set_memory.h>
#include <linux/sync_core.h>
#include <linux/task_work.h>
#include <linux/hardirq.h>
#include <linux/kexec.h>

#include <asm/fred.h>
#include <asm/cpu_device_id.h>
#include <asm/processor.h>
#include <asm/traps.h>
#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/msr.h>
#include <asm/reboot.h>
#include <asm/tdx.h>

#include "internal.h"

/* sysfs synchronization */
static DEFINE_MUTEX(mce_sysfs_mutex);

#define CREATE_TRACE_POINTS
#include <trace/events/mce.h>

#define SPINUNIT …

DEFINE_PER_CPU(unsigned, mce_exception_count);

DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks);

DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);

#define ATTR_LEN …

/* One object for each MCE bank, shared by all CPUs */
struct mce_bank_dev {
	…
};
static struct mce_bank_dev mce_bank_devs[MAX_NR_BANKS];

struct mce_vendor_flags mce_flags __read_mostly;

struct mca_config mca_cfg __read_mostly = …;

static DEFINE_PER_CPU(struct mce, mces_seen);
static unsigned long mce_need_notify;

/*
 * MCA banks polled by the periodic polling timer for corrected events.
 * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
 */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = …;

/*
 * MCA banks controlled through firmware first for corrected errors.
 * This is a global list of banks for which we won't enable CMCI and we
 * won't poll. Firmware controls these banks and is responsible for
 * reporting corrected errors through GHES. Uncorrected/recoverable
 * errors are still notified through a machine check.
 */
mce_banks_t mce_banks_ce_disabled;

static struct work_struct mce_work;
static struct irq_work mce_irq_work;

/*
 * CPU/chipset specific EDAC code can register a notifier call here to print
 * MCE errors in a human-readable form.
 */
BLOCKING_NOTIFIER_HEAD(…);

/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
	…
}

DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(…);

void mce_log(struct mce *m)
{
	…
}
EXPORT_SYMBOL_GPL(…);

void mce_register_decode_chain(struct notifier_block *nb)
{
	…
}
EXPORT_SYMBOL_GPL(…);

void mce_unregister_decode_chain(struct notifier_block *nb)
{
	…
}
EXPORT_SYMBOL_GPL(…);
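/*
 * Illustrative sketch (not part of this file): how an EDAC/decoder module
 * might hook the notifier chain registered above via
 * mce_register_decode_chain(). The callback receives the logged "struct mce"
 * through the "data" pointer. The "example_*" names are hypothetical.
 */
static int example_decode_notifier(struct notifier_block *nb, unsigned long val,
				   void *data)
{
	struct mce *m = data;

	if (!m)
		return NOTIFY_DONE;

	/* Print a minimal human-readable summary of the logged event. */
	pr_info("bank %d, status 0x%llx, addr 0x%llx (CPU %u)\n",
		m->bank, m->status, m->addr, m->extcpu);

	return NOTIFY_OK;
}

static struct notifier_block example_decode_nb = {
	.notifier_call	= example_decode_notifier,
	.priority	= MCE_PRIO_EDAC,
};

/*
 * A decoder would typically do this from its module init/exit paths:
 *	mce_register_decode_chain(&example_decode_nb);
 *	mce_unregister_decode_chain(&example_decode_nb);
 */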
static void __print_mce(struct mce *m)
{
	…
}

static void print_mce(struct mce *m)
{
	…
}

#define PANIC_TIMEOUT …

static atomic_t mce_panicked;

static int fake_panic;
static atomic_t mce_fake_panicked;

/* Panic in progress. Enable interrupts and wait for final IPI */
static void wait_for_panic(void)
{
	…
}

static const char *mce_dump_aux_info(struct mce *m)
{
	…
}

static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
{
	…
}

/* Support code for software error injection */

static int msr_to_offset(u32 msr)
{
	…
}

void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr)
{
	…
}

/* MSR access wrappers used for error injection */
noinstr u64 mce_rdmsrl(u32 msr)
{
	…
}

static noinstr void mce_wrmsrl(u32 msr, u64 v)
{
	…
}

/*
 * Collect all global (w.r.t. this processor) status about this machine
 * check into our "mce" struct so that we can use it later to assess
 * the severity of the problem as we read per-bank specific details.
 */
static noinstr void mce_gather_info(struct mce *m, struct pt_regs *regs)
{
	…
}

int mce_available(struct cpuinfo_x86 *c)
{
	…
}

static void mce_schedule_work(void)
{
	…
}

static void mce_irq_work_cb(struct irq_work *entry)
{
	…
}

bool mce_usable_address(struct mce *m)
{
	…
}
EXPORT_SYMBOL_GPL(…);

bool mce_is_memory_error(struct mce *m)
{
	…
}
EXPORT_SYMBOL_GPL(…);

static bool whole_page(struct mce *m)
{
	…
}

bool mce_is_correctable(struct mce *m)
{
	…
}
EXPORT_SYMBOL_GPL(…);

static int mce_early_notifier(struct notifier_block *nb, unsigned long val,
			      void *data)
{
	…
}

static struct notifier_block early_nb = …;

static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
			      void *data)
{
	…
}

static struct notifier_block mce_uc_nb = …;

static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
				void *data)
{
	…
}

static struct notifier_block mce_default_nb = …;

/*
 * Read ADDR and MISC registers.
 */
static noinstr void mce_read_aux(struct mce *m, int i)
{
	…
}

DEFINE_PER_CPU(unsigned, mce_poll_count);

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 *
 * Note: the spec recommends panicking for fatal unsignalled errors here.
 * However, this would be quite problematic -- we would need to reimplement
 * the Monarch handling and it would mess up the exclusion between the
 * exception handler and the poll handler -- so we skip this for now.
 * These cases should not happen anyway, or only when the CPU is already
 * totally confused. In that case it's likely it would not fully execute
 * the machine check handler either.
 */
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
	…
}
EXPORT_SYMBOL_GPL(…);

/*
 * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
 * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
 * Vol 3B Table 15-20). But this confuses both the code that determines
 * whether the machine check occurred in kernel or user mode, and also
 * the severity assessment code. Pretend that EIPV was set, and take the
 * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
 */
static __always_inline void
quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
{
	…
}

/*
 * Disable fast string copy and return from the MCE handler upon the first SRAR
 * MCE on bank 1 due to a CPU erratum on Intel Skylake/Cascade Lake/Cooper Lake
 * CPUs.
 *
 * The fast string copy instructions ("REP; MOVS*") could consume an
 * uncorrectable memory error in the cache line _right after_ the desired region
 * to copy and raise an MCE with RIP pointing to the instruction _after_ the
 * "REP; MOVS*".
 *
 * This mitigation addresses the issue completely with the caveat of performance
 * degradation on the affected CPU. This is still better than the OS crashing on
 * MCEs raised on an irrelevant process due to "REP; MOVS*" accesses from a
 * kernel context (e.g., copy_page).
 *
 * Returns true when fast string copy on the CPU has been disabled.
 */
static noinstr bool quirk_skylake_repmov(void)
{
	…
}
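/*
 * Illustrative sketch (not part of this file) of the mitigation action the
 * quirk above boils down to: clear the fast-string-enable bit in
 * IA32_MISC_ENABLE so "REP; MOVS*" no longer over-reads past the copy region.
 * The real quirk_skylake_repmov() additionally matches the erratum signature
 * in MCG_STATUS and bank 1 before doing this; that check is omitted here and
 * the "example_*" name is hypothetical.
 */
static void example_disable_fast_string_copy(void)
{
	u64 misc_enable = mce_rdmsrl(MSR_IA32_MISC_ENABLE);

	if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) {
		mce_wrmsrl(MSR_IA32_MISC_ENABLE,
			   misc_enable & ~MSR_IA32_MISC_ENABLE_FAST_STRING);
		pr_err_once("Erratum detected: fast string copy instructions disabled.\n");
	}
}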
/*
 * Some Zen-based Instruction Fetch Units set EIPV=RIPV=0 on poison consumption
 * errors. This means mce_gather_info() will not save the "ip" and "cs"
 * registers.
 *
 * However, the context is still valid, so save the "cs" register for later use.
 *
 * The "ip" register is truly unknown, so don't save it or fixup EIPV/RIPV.
 *
 * The Instruction Fetch Unit is at MCA bank 1 for all affected systems.
 */
static __always_inline void quirk_zen_ifu(int bank, struct mce *m, struct pt_regs *regs)
{
	…
}

/*
 * Do a quick check if any of the events requires a panic.
 * This decides if we keep the events around or clear them.
 */
static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
					  struct pt_regs *regs)
{
	…
}

/*
 * Variable to establish order between CPUs while scanning.
 * Each CPU spins initially until executing equals its number.
 */
static atomic_t mce_executing;

/*
 * Defines the order of CPUs on entry. The first CPU becomes the Monarch.
 */
static atomic_t mce_callin;

/*
 * Track which CPUs entered the MCA broadcast synchronization and which not in
 * order to print holdouts.
 */
static cpumask_t mce_missing_cpus = …;

/*
 * Check if a timeout waiting for other CPUs happened.
 */
static noinstr int mce_timed_out(u64 *t, const char *msg)
{
	…
}

/*
 * The Monarch's reign. The Monarch is the CPU which entered
 * the machine check handler first. It waits for the others to
 * raise the exception too and then grades them. When any
 * error is fatal, panic. Only then let the others continue.
 *
 * The other CPUs entering the MCE handler will be controlled by the
 * Monarch. They are called Subjects.
 *
 * This way we prevent any potential data corruption in an unrecoverable case
 * and also make sure that all CPUs' errors are always examined.
 *
 * Also this detects the case of a machine check event coming from outer
 * space (not detected by any CPU). In this case some external agent wants
 * us to shut down, so panic too.
 *
 * The other CPUs might still decide to panic if the handler happened
 * in an unrecoverable place, but in this case the system is in a semi-stable
 * state and won't corrupt anything by itself. It's OK to let the others
 * continue for a bit first.
 *
 * All the spin loops have timeouts; when a timeout happens a CPU
 * typically elects itself to be Monarch.
 */
static void mce_reign(void)
{
	…
}

static atomic_t global_nwo;

/*
 * Start of Monarch synchronization. This waits until all CPUs have
 * entered the exception handler and then determines if any of them
 * saw a fatal event that requires panic. Then it executes them
 * in the entry order.
 * TBD double check parallel CPU hotunplug
 */
static noinstr int mce_start(int *no_way_out)
{
	…
}

/*
 * Synchronize between CPUs after the main scanning loop.
 * This invokes the bulk of the Monarch processing.
 */
static noinstr int mce_end(int order)
{
	…
}

static __always_inline void mce_clear_state(unsigned long *toclear)
{
	…
}
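/*
 * Illustrative sketch (not part of this file) of the rendezvous pattern that
 * mce_start()/mce_end() implement with mce_callin and mce_executing: each CPU
 * atomically takes a ticket on entry, then spins with a timeout until the
 * "executing" counter reaches its ticket, so CPUs are serialized in entry
 * order and the first one in acts as the Monarch. Names, the timeout handling
 * and the Monarch bookkeeping are simplified here.
 */
static int example_rendezvous(atomic_t *callin, atomic_t *executing,
			      u64 timeout_ns)
{
	int order = atomic_inc_return(callin);	/* 1-based entry ticket */
	u64 spent = 0;

	/* The first CPU in becomes the Monarch and scans right away. */
	if (order == 1)
		return order;

	/* Subjects wait until the CPUs ahead of them have finished scanning. */
	while (atomic_read(executing) < order) {
		if (spent >= timeout_ns)
			return -1;	/* timed out: give up on the rendezvous */
		ndelay(100);
		spent += 100;
	}

	return order;
}

/*
 * Each CPU that finishes its scan would then atomic_inc(executing) to release
 * the next CPU in line, which is roughly what mce_end() does.
 */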
/*
 * Cases where we avoid rendezvous handler timeout:
 * 1) If this CPU is offline.
 *
 * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
 *    skip those CPUs which remain looping in the 1st kernel - see
 *    crash_nmi_callback().
 *
 * Note: there still is a small window between kexec-ing and the new,
 * kdump kernel establishing a new #MC handler where a broadcasted MCE
 * might not get handled properly.
 */
static noinstr bool mce_check_crashing_cpu(void)
{
	…
}

static __always_inline int
__mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final,
		unsigned long *toclear, unsigned long *valid_banks,
		int no_way_out, int *worst)
{
	…
}

static void kill_me_now(struct callback_head *ch)
{
	…
}

static void kill_me_maybe(struct callback_head *cb)
{
	…
}

static void kill_me_never(struct callback_head *cb)
{
	…
}

static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *))
{
	…
}

/* Handle unconfigured int18 (should never happen) */
static noinstr void unexpected_machine_check(struct pt_regs *regs)
{
	…
}

/*
 * The actual machine check handler. This only handles real exceptions when
 * something got corrupted coming in through int 18.
 *
 * This is executed in #MC context, not subject to normal locking rules.
 * This implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 *
 * On Intel systems this is entered on all CPUs in parallel through
 * MCE broadcast. However, some CPUs might be broken beyond repair,
 * so always be careful when synchronizing with others.
 *
 * Tracing and kprobes are disabled: if we interrupted a kernel context
 * with IF=1, we need to minimize stack usage. There are also recursion
 * issues: if the machine check was due to a failure of the memory
 * backing the user stack, tracing that reads the user stack will cause
 * potentially infinite recursion.
 *
 * Currently, the #MC handler calls out to a number of external facilities
 * and, therefore, allows instrumentation around them. The optimal thing to
 * have would be to do the absolutely minimal work required in #MC context
 * and have instrumentation disabled only around that. Further processing can
 * then happen in process context where instrumentation is allowed. Achieving
 * that requires careful auditing and modifications. Until then, the code
 * allows instrumentation temporarily, where required.
 */
noinstr void do_machine_check(struct pt_regs *regs)
{
	…
}
EXPORT_SYMBOL_GPL(…);

#ifndef CONFIG_MEMORY_FAILURE
int memory_failure(unsigned long pfn, int flags)
{
	/* mce_severity() should not hand us an ACTION_REQUIRED error */
	BUG_ON(flags & MF_ACTION_REQUIRED);
	pr_err("Uncorrected memory error in page 0x%lx ignored\n"
	       "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
	       pfn);
	return 0;
}
#endif

/*
 * Periodic polling timer for "silent" machine check errors. If the
 * poller finds an MCE, poll 2x faster. When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */
static unsigned long check_interval = …;

static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);

static void __start_timer(struct timer_list *t, unsigned long interval)
{
	…
}

static void mc_poll_banks_default(void)
{
	…
}

void (*mc_poll_banks)(void) = …;

static void mce_timer_fn(struct timer_list *t)
{
	…
}

/*
 * When a storm starts on any bank on this CPU, switch to polling
 * once per second. When the storm ends, revert to the default
 * polling interval.
 */
void mce_timer_kick(bool storm)
{
	…
}

/* Must not be called in IRQ context where del_timer_sync() can deadlock */
static void mce_timer_delete_all(void)
{
	…
}
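/*
 * Illustrative sketch (not part of this file) of the adaptive interval policy
 * described above mce_timer_fn(): halve the polling interval whenever the
 * poll found an error, otherwise double it again, clamped to the configured
 * check_interval. The real timer callback also accounts for CMCI storms; the
 * "example_*" name and the lower bound chosen here are hypothetical.
 */
static unsigned long example_next_interval(unsigned long iv, bool error_seen)
{
	if (error_seen)
		iv = max(iv / 2, (unsigned long)HZ / 100);
	else
		iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));

	return iv;
}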
/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
int mce_notify_irq(void)
{
	…
}
EXPORT_SYMBOL_GPL(…);

static void __mcheck_cpu_mce_banks_init(void)
{
	…
}

/*
 * Initialize Machine Checks for a CPU.
 */
static void __mcheck_cpu_cap_init(void)
{
	…
}

static void __mcheck_cpu_init_generic(void)
{
	…
}

static void __mcheck_cpu_init_clear_banks(void)
{
	…
}

/*
 * Do a final check to see if there are any unused/RAZ banks.
 *
 * This must be done after the banks have been initialized and any quirks have
 * been applied.
 *
 * Do not call this from any user-initiated flows, e.g. CPU hotplug or sysfs.
 * Otherwise, a user who disables a bank will not be able to re-enable it
 * without a system reboot.
 */
static void __mcheck_cpu_check_banks(void)
{
	…
}

/* Add per CPU specific workarounds here */
static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
{
	…
}

static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
{
	…
}

/*
 * Init basic CPU features needed for early decoding of MCEs.
 */
static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c)
{
	…
}

static void mce_centaur_feature_init(struct cpuinfo_x86 *c)
{
	…
}

static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c)
{
	…
}

static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c)
{
	…
}

static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
{
	…
}

static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
{
	…
}

static void mce_start_timer(struct timer_list *t)
{
	…
}

static void __mcheck_cpu_setup_timer(void)
{
	…
}

static void __mcheck_cpu_init_timer(void)
{
	…
}

bool filter_mce(struct mce *m)
{
	…
}

static __always_inline void exc_machine_check_kernel(struct pt_regs *regs)
{
	…
}

static __always_inline void exc_machine_check_user(struct pt_regs *regs)
{
	…
}

#ifdef CONFIG_X86_64
/* MCE hit kernel mode */
DEFINE_IDTENTRY_MCE(exc_machine_check)
{
	…
}

/* The user mode variant. */
DEFINE_IDTENTRY_MCE_USER(exc_machine_check)
{
	…
}

#ifdef CONFIG_X86_FRED
/*
 * Depending on the ring level it occurred at, i.e., whether it came from
 * user or kernel context, #MC needs to be handled on a different stack:
 * a user #MC on the current task stack, a kernel #MC on a dedicated stack.
 *
 * This is exactly how FRED event delivery invokes an exception handler:
 * a ring 3 event on the level 0 stack, i.e., the current task stack; a
 * ring 0 event on the #MC dedicated stack specified in the
 * IA32_FRED_STKLVLS MSR. So unlike IDT, the FRED machine check entry
 * stub doesn't do a stack switch.
 */
DEFINE_FREDENTRY_MCE(exc_machine_check)
{
	…
}
#endif
#else
/* 32bit unified entry point */
DEFINE_IDTENTRY_RAW(exc_machine_check)
{
	unsigned long dr7;

	dr7 = local_db_save();
	if (user_mode(regs))
		exc_machine_check_user(regs);
	else
		exc_machine_check_kernel(regs);
	local_db_restore(dr7);
}
#endif

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off:
 */
void mcheck_cpu_init(struct cpuinfo_x86 *c)
{
	…
}

/*
 * Called for each booted CPU to clear some machine check opt-ins
 */
void mcheck_cpu_clear(struct cpuinfo_x86 *c)
{
	…
}

static void __mce_disable_bank(void *arg)
{
	…
}

void mce_disable_bank(int bank)
{
	…
}
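/*
 * Illustrative sketch (not part of this file) of what disabling a bank
 * globally amounts to, tying mce_disable_bank() above to the
 * mce_banks_ce_disabled list documented near the top of the file: remember
 * the bank so CMCI/polling skip it, then turn it off on every online CPU.
 * Argument validation against mce_num_banks is omitted and the "example_*"
 * name is hypothetical.
 */
static void example_disable_bank_everywhere(int bank)
{
	/* Firmware-first bank: never enable CMCI on it, never poll it. */
	set_bit(bank, mce_banks_ce_disabled);

	/* Clear the bank's control bits on each online CPU. */
	on_each_cpu(__mce_disable_bank, &bank, 1);
}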
/*
 * mce=off			Disables machine check
 * mce=no_cmci			Disables CMCI
 * mce=no_lmce			Disables LMCE
 * mce=dont_log_ce		Clears corrected events silently, no log created for CEs.
 * mce=print_all		Print all machine check logs to console
 * mce=ignore_ce		Disables polling and CMCI, corrected events are not cleared.
 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
 *	monarchtimeout is how long to wait for other CPUs on machine
 *	check, or 0 to not wait
 * mce=bootlog			Log MCEs from before booting. Disabled by default
 *				on AMD Fam10h and older.
 * mce=nobootlog		Don't log MCEs from before booting.
 * mce=bios_cmci_threshold	Don't program the CMCI threshold
 * mce=recovery			force enable copy_mc_fragile()
 */
static int __init mcheck_enable(char *str)
{
	…
}
__setup(…);

int __init mcheck_init(void)
{
	…
}

/*
 * mce_syscore: PM support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static void mce_disable_error_reporting(void)
{
	…
}

static void vendor_disable_error_reporting(void)
{
	…
}

static int mce_syscore_suspend(void)
{
	…
}

static void mce_syscore_shutdown(void)
{
	…
}

/*
 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 * Only one CPU is active at this time, the others get re-added later using
 * CPU hotplug:
 */
static void mce_syscore_resume(void)
{
	…
}

static struct syscore_ops mce_syscore_ops = …;

/*
 * mce_device: Sysfs support
 */

static void mce_cpu_restart(void *data)
{
	…
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	…
}

/* Toggle features for corrected errors */
static void mce_disable_cmci(void *data)
{
	…
}

static void mce_enable_ce(void *all)
{
	…
}

static const struct bus_type mce_subsys = …;

DEFINE_PER_CPU(struct device *, mce_device);

static inline struct mce_bank_dev *attr_to_bank(struct device_attribute *attr)
{
	…
}

static ssize_t show_bank(struct device *s, struct device_attribute *attr,
			 char *buf)
{
	…
}

static ssize_t set_bank(struct device *s, struct device_attribute *attr,
			const char *buf, size_t size)
{
	…
}

static ssize_t set_ignore_ce(struct device *s, struct device_attribute *attr,
			     const char *buf, size_t size)
{
	…
}

static ssize_t set_cmci_disabled(struct device *s, struct device_attribute *attr,
				 const char *buf, size_t size)
{
	…
}

static ssize_t store_int_with_restart(struct device *s,
				      struct device_attribute *attr,
				      const char *buf, size_t size)
{
	…
}

static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
static DEVICE_BOOL_ATTR(print_all, 0644, mca_cfg.print_all);

static struct dev_ext_attribute dev_attr_check_interval = …;

static struct dev_ext_attribute dev_attr_ignore_ce = …;

static struct dev_ext_attribute dev_attr_cmci_disabled = …;

static struct device_attribute *mce_device_attrs[] = …;

static cpumask_var_t mce_device_initialized;

static void mce_device_release(struct device *dev)
{
	…
}

/* Per CPU device init. All of the CPUs still share the same bank device: */
static int mce_device_create(unsigned int cpu)
{
	…
}

static void mce_device_remove(unsigned int cpu)
{
	…
}

/* Make sure there are no machine checks on offlined CPUs. */
static void mce_disable_cpu(void)
{
	…
}

static void mce_reenable_cpu(void)
{
	…
}

static int mce_cpu_dead(unsigned int cpu)
{
	…
}

static int mce_cpu_online(unsigned int cpu)
{
	…
}

static int mce_cpu_pre_down(unsigned int cpu)
{
	…
}

static __init void mce_init_banks(void)
{
	…
}

/*
 * When running on XEN, this initcall is ordered against the XEN mcelog
 * initcall:
 *
 *	device_initcall(xen_late_init_mcelog);
 *	device_initcall_sync(mcheck_init_device);
 */
static __init int mcheck_init_device(void)
{
	…
}
device_initcall_sync(mcheck_init_device);
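/*
 * Illustrative sketch (not part of this file) of how a "mce=" style handler
 * such as mcheck_enable() above is typically wired up: match the keyword
 * options documented in the comment before it, fall back to parsing the
 * numeric "TOLERANCELEVEL[,monarchtimeout]" form, and register the handler
 * with __setup(). The "example_*" names and the config fields touched are
 * hypothetical.
 */
static struct {
	bool disabled, cmci_disabled;
	int bootlog, tolerant;
} example_cfg __initdata;

static int __init example_mce_setup(char *str)
{
	int tolerance, timeout;

	if (!str)
		return -EINVAL;

	if (!strcmp(str, "off"))
		example_cfg.disabled = true;
	else if (!strcmp(str, "no_cmci"))
		example_cfg.cmci_disabled = true;
	else if (!strcmp(str, "bootlog"))
		example_cfg.bootlog = 1;
	else if (isdigit(str[0]) &&
		 sscanf(str, "%d,%d", &tolerance, &timeout) >= 1)
		example_cfg.tolerant = tolerance;
	else
		pr_info("mce argument %s ignored. Please use /sys\n", str);

	return 1;	/* __setup() handlers return 1 when the option is consumed */
}
__setup("mce", example_mce_setup);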
/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	…
}
__setup(…);

#ifdef CONFIG_DEBUG_FS
struct dentry *mce_get_debugfs_dir(void)
{
	…
}

static void mce_reset(void)
{
	…
}

static int fake_panic_get(void *data, u64 *val)
{
	…
}

static int fake_panic_set(void *data, u64 val)
{
	…
}

DEFINE_DEBUGFS_ATTRIBUTE(…);

static void __init mcheck_debugfs_init(void)
{
	…
}
#else
static void __init mcheck_debugfs_init(void) { }
#endif

static int __init mcheck_late_init(void)
{
	…
}
late_initcall(mcheck_late_init);
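#ifdef CONFIG_DEBUG_FS
/*
 * Illustrative sketch (not part of this file) of the debugfs plumbing behind
 * the fake_panic_get()/fake_panic_set() pair above: wrap the getter/setter in
 * a simple attribute and expose it under the MCE debugfs directory. The
 * "example_*" names are hypothetical; the real file is created from
 * mcheck_debugfs_init().
 */
DEFINE_DEBUGFS_ATTRIBUTE(example_fake_panic_fops, fake_panic_get,
			 fake_panic_set, "%llu\n");

static void __init example_debugfs_init(void)
{
	struct dentry *dmce = mce_get_debugfs_dir();

	debugfs_create_file_unsafe("fake_panic", 0644, dmce, NULL,
				   &example_fake_panic_fops);
}
#endif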