traps.c | Explore in Territory

/*
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <[email protected]>, May 2000
 */

/*
 * Handle hardware traps and faults.
 */

#define pr_fmt(fmt) …

#include <linux/context_tracking.h>
#include <linux/interrupt.h>
#include <linux/kallsyms.h>
#include <linux/kmsan.h>
#include <linux/spinlock.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/kgdb.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/ptrace.h>
#include <linux/uprobes.h>
#include <linux/string.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/kexec.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/timer.h>
#include <linux/init.h>
#include <linux/bug.h>
#include <linux/nmi.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/cpu.h>
#include <linux/io.h>
#include <linux/hardirq.h>
#include <linux/atomic.h>
#include <linux/iommu.h>

#include <asm/stacktrace.h>
#include <asm/processor.h>
#include <asm/debugreg.h>
#include <asm/realmode.h>
#include <asm/text-patching.h>
#include <asm/ftrace.h>
#include <asm/traps.h>
#include <asm/desc.h>
#include <asm/fred.h>
#include <asm/fpu/api.h>
#include <asm/cpu.h>
#include <asm/cpu_entry_area.h>
#include <asm/mce.h>
#include <asm/fixmap.h>
#include <asm/mach_traps.h>
#include <asm/alternative.h>
#include <asm/fpu/xstate.h>
#include <asm/vm86.h>
#include <asm/umip.h>
#include <asm/insn.h>
#include <asm/insn-eval.h>
#include <asm/vdso.h>
#include <asm/tdx.h>
#include <asm/cfi.h>

#ifdef CONFIG_X86_64
#include <asm/x86_init.h>
#else
#include <asm/processor-flags.h>
#include <asm/setup.h>
#endif

#include <asm/proto.h>

DECLARE_BITMAP(system_vectors, NR_VECTORS);

__always_inline int is_valid_bugaddr(unsigned long addr)
{ … }

static nokprobe_inline int
do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str,
		  struct pt_regs *regs,	long error_code)
{ … }

static void show_signal(struct task_struct *tsk, int signr,
			const char *type, const char *desc,
			struct pt_regs *regs, long error_code)
{ … }

static void
do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
	long error_code, int sicode, void __user *addr)
{ … }
NOKPROBE_SYMBOL(do_trap);

static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
	unsigned long trapnr, int signr, int sicode, void __user *addr)
{ … }

/*
 * Posix requires to provide the address of the faulting instruction for
 * SIGILL (#UD) and SIGFPE (#DE) in the si_addr member of siginfo_t.
 *
 * This address is usually regs->ip, but when an uprobe moved the code out
 * of line then regs->ip points to the XOL code which would confuse
 * anything which analyzes the fault address vs. the unmodified binary. If
 * a trap happened in XOL code then uprobe maps regs->ip back to the
 * original instruction address.
 */
static __always_inline void __user *error_get_trap_addr(struct pt_regs *regs)
{ … }

DEFINE_IDTENTRY(exc_divide_error)
{ … }

DEFINE_IDTENTRY(exc_overflow)
{ … }

#ifdef CONFIG_X86_F00F_BUG
void handle_invalid_op(struct pt_regs *regs)
#else
static inline void handle_invalid_op(struct pt_regs *regs)
#endif
{ … }

static noinstr bool handle_bug(struct pt_regs *regs)
{ … }

DEFINE_IDTENTRY_RAW(exc_invalid_op)
{ … }

DEFINE_IDTENTRY(exc_coproc_segment_overrun)
{ … }

DEFINE_IDTENTRY_ERRORCODE(exc_invalid_tss)
{ … }

DEFINE_IDTENTRY_ERRORCODE(exc_segment_not_present)
{ … }

DEFINE_IDTENTRY_ERRORCODE(exc_stack_segment)
{ … }

DEFINE_IDTENTRY_ERRORCODE(exc_alignment_check)
{ … }

#ifdef CONFIG_VMAP_STACK
__visible void __noreturn handle_stack_overflow(struct pt_regs *regs,
						unsigned long fault_address,
						struct stack_info *info)
{ … }
#endif

/*
 * Runs on an IST stack for x86_64 and on a special task stack for x86_32.
 *
 * On x86_64, this is more or less a normal kernel entry.  Notwithstanding the
 * SDM's warnings about double faults being unrecoverable, returning works as
 * expected.  Presumably what the SDM actually means is that the CPU may get
 * the register state wrong on entry, so returning could be a bad idea.
 *
 * Various CPU engineers have promised that double faults due to an IRET fault
 * while the stack is read-only are, in fact, recoverable.
 *
 * On x86_32, this is entered through a task gate, and regs are synthesized
 * from the TSS.  Returning is, in principle, okay, but changes to regs will
 * be lost.  If, for some reason, we need to return to a context with modified
 * regs, the shim code could be adjusted to synchronize the registers.
 *
 * The 32bit #DF shim provides CR2 already as an argument. On 64bit it needs
 * to be read before doing anything else.
 */
DEFINE_IDTENTRY_DF(exc_double_fault)
{ … }

DEFINE_IDTENTRY(exc_bounds)
{ … }

enum kernel_gp_hint { … };

/*
 * When an uncaught #GP occurs, try to determine the memory address accessed by
 * the instruction and return that address to the caller. Also, try to figure
 * out whether any part of the access to that address was non-canonical.
 */
static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs,
						 unsigned long *addr)
{ … }

#define GPFSTR …

static bool fixup_iopl_exception(struct pt_regs *regs)
{ … }

/*
 * The unprivileged ENQCMD instruction generates #GPs if the
 * IA32_PASID MSR has not been populated.  If possible, populate
 * the MSR from a PASID previously allocated to the mm.
 */
static bool try_fixup_enqcmd_gp(void)
{ … }

static bool gp_try_fixup_and_notify(struct pt_regs *regs, int trapnr,
				    unsigned long error_code, const char *str,
				    unsigned long address)
{ … }

static void gp_user_force_sig_segv(struct pt_regs *regs, int trapnr,
				   unsigned long error_code, const char *str)
{ … }

DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
{ … }

static bool do_int3(struct pt_regs *regs)
{ … }
NOKPROBE_SYMBOL(do_int3);

static void do_int3_user(struct pt_regs *regs)
{ … }

DEFINE_IDTENTRY_RAW(exc_int3)
{ … }

#ifdef CONFIG_X86_64
/*
 * Help handler running on a per-cpu (IST or entry trampoline) stack
 * to switch to the normal thread stack if the interrupted code was in
 * user mode. The actual stack switch is done in entry_64.S
 */
asmlinkage __visible noinstr struct pt_regs *sync_regs(struct pt_regs *eregs)
{ … }

#ifdef CONFIG_AMD_MEM_ENCRYPT
asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *regs)
{ … }
#endif

asmlinkage __visible noinstr struct pt_regs *fixup_bad_iret(struct pt_regs *bad_regs)
{ … }
#endif

static bool is_sysenter_singlestep(struct pt_regs *regs)
{ … }

static __always_inline unsigned long debug_read_clear_dr6(void)
{ … }

/*
 * Our handling of the processor debug registers is non-trivial.
 * We do not clear them on entry and exit from the kernel. Therefore
 * it is possible to get a watchpoint trap here from inside the kernel.
 * However, the code in ./ptrace.c has ensured that the user can
 * only set watchpoints on userspace addresses. Therefore the in-kernel
 * watchpoint trap can only occur in code which is reading/writing
 * from user space. Such code must not hold kernel locks (since it
 * can equally take a page fault), therefore it is safe to call
 * force_sig_info even though that claims and releases locks.
 *
 * Code in ./signal.c ensures that the debug control register
 * is restored before we deliver any signal, and therefore that
 * user code runs with the correct debug control register even though
 * we clear it here.
 *
 * Being careful here means that we don't have to be as careful in a
 * lot of more complicated places (task switching can be a bit lazy
 * about restoring all the debug state, and ptrace doesn't have to
 * find every occurrence of the TF bit that could be saved away even
 * by user code)
 *
 * May run on IST stack.
 */

static bool notify_debug(struct pt_regs *regs, unsigned long *dr6)
{ … }

static noinstr void exc_debug_kernel(struct pt_regs *regs, unsigned long dr6)
{ … }

static noinstr void exc_debug_user(struct pt_regs *regs, unsigned long dr6)
{ … }

#ifdef CONFIG_X86_64
/* IST stack entry */
DEFINE_IDTENTRY_DEBUG(exc_debug)
{ … }

/* User entry, runs on regular task stack */
DEFINE_IDTENTRY_DEBUG_USER(exc_debug)
{ … }

#ifdef CONFIG_X86_FRED
/*
 * When occurred on different ring level, i.e., from user or kernel
 * context, #DB needs to be handled on different stack: User #DB on
 * current task stack, while kernel #DB on a dedicated stack.
 *
 * This is exactly how FRED event delivery invokes an exception
 * handler: ring 3 event on level 0 stack, i.e., current task stack;
 * ring 0 event on the #DB dedicated stack specified in the
 * IA32_FRED_STKLVLS MSR. So unlike IDT, the FRED debug exception
 * entry stub doesn't do stack switch.
 */
DEFINE_FREDENTRY_DEBUG(exc_debug)
{ … }
#endif /* CONFIG_X86_FRED */

#else
/* 32 bit does not have separate entry points. */
DEFINE_IDTENTRY_RAW(exc_debug)
{
	unsigned long dr6 = debug_read_clear_dr6();

	if (user_mode(regs))
		exc_debug_user(regs, dr6);
	else
		exc_debug_kernel(regs, dr6);
}
#endif

/*
 * Note that we play around with the 'TS' bit in an attempt to get
 * the correct behaviour even in the presence of the asynchronous
 * IRQ13 behaviour
 */
static void math_error(struct pt_regs *regs, int trapnr)
{ … }

DEFINE_IDTENTRY(exc_coprocessor_error)
{ … }

DEFINE_IDTENTRY(exc_simd_coprocessor_error)
{ … }

DEFINE_IDTENTRY(exc_spurious_interrupt_bug)
{ … }

static bool handle_xfd_event(struct pt_regs *regs)
{ … }

DEFINE_IDTENTRY(exc_device_not_available)
{ … }

#ifdef CONFIG_INTEL_TDX_GUEST

#define VE_FAULT_STR …

static void ve_raise_fault(struct pt_regs *regs, long error_code,
			   unsigned long address)
{ … }

/*
 * Virtualization Exceptions (#VE) are delivered to TDX guests due to
 * specific guest actions which may happen in either user space or the
 * kernel:
 *
 *  * Specific instructions (WBINVD, for example)
 *  * Specific MSR accesses
 *  * Specific CPUID leaf accesses
 *  * Access to specific guest physical addresses
 *
 * In the settings that Linux will run in, virtualization exceptions are
 * never generated on accesses to normal, TD-private memory that has been
 * accepted (by BIOS or with tdx_enc_status_changed()).
 *
 * Syscall entry code has a critical window where the kernel stack is not
 * yet set up. Any exception in this window leads to hard to debug issues
 * and can be exploited for privilege escalation. Exceptions in the NMI
 * entry code also cause issues. Returning from the exception handler with
 * IRET will re-enable NMIs and nested NMI will corrupt the NMI stack.
 *
 * For these reasons, the kernel avoids #VEs during the syscall gap and
 * the NMI entry code. Entry code paths do not access TD-shared memory,
 * MMIO regions, use #VE triggering MSRs, instructions, or CPUID leaves
 * that might generate #VE. VMM can remove memory from TD at any point,
 * but access to unaccepted (or missing) private memory leads to VM
 * termination, not to #VE.
 *
 * Similarly to page faults and breakpoints, #VEs are allowed in NMI
 * handlers once the kernel is ready to deal with nested NMIs.
 *
 * During #VE delivery, all interrupts, including NMIs, are blocked until
 * TDGETVEINFO is called. It prevents #VE nesting until the kernel reads
 * the VE info.
 *
 * If a guest kernel action which would normally cause a #VE occurs in
 * the interrupt-disabled region before TDGETVEINFO, a #DF (fault
 * exception) is delivered to the guest which will result in an oops.
 *
 * The entry code has been audited carefully for following these expectations.
 * Changes in the entry code have to be audited for correctness vs. this
 * aspect. Similarly to #PF, #VE in these places will expose kernel to
 * privilege escalation or may lead to random crashes.
 */
DEFINE_IDTENTRY(exc_virtualization_exception)
{ … }

#endif

#ifdef CONFIG_X86_32
DEFINE_IDTENTRY_SW(iret_error)
{
	local_irq_enable();
	if (notify_die(DIE_TRAP, "iret exception", regs, 0,
			X86_TRAP_IRET, SIGILL) != NOTIFY_STOP) {
		do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, 0,
			ILL_BADSTK, (void __user *)NULL);
	}
	local_irq_disable();
}
#endif

/* Do not enable FRED by default yet. */
static bool enable_fred __ro_after_init = …;

#ifdef CONFIG_X86_FRED
static int __init fred_setup(char *str)
{ … }
early_param(…);
#endif

void __init trap_init(void)
{ … }
linux/arch/x86/kernel/traps.c