uprobes.c | Explore in Territory

// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * User-space Probes (UProbes) for x86
 *
 * Copyright (C) IBM Corporation, 2008-2011
 * Authors:
 *	Srikar Dronamraju
 *	Jim Keniston
 */
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/ptrace.h>
#include <linux/uprobes.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>

#include <linux/kdebug.h>
#include <asm/processor.h>
#include <asm/insn.h>
#include <asm/mmu_context.h>

/* Post-execution fixups. */

/* Adjust IP back to vicinity of actual insn */
#define UPROBE_FIX_IP …

/* Adjust the return address of a call insn */
#define UPROBE_FIX_CALL …

/* Instruction will modify TF, don't change it */
#define UPROBE_FIX_SETF …

#define UPROBE_FIX_RIP_SI …
#define UPROBE_FIX_RIP_DI …
#define UPROBE_FIX_RIP_BX …
#define UPROBE_FIX_RIP_MASK …

#define UPROBE_TRAP_NR …

/* Adaptations for mhiramat x86 decoder v14. */
#define OPCODE1(insn) …
#define OPCODE2(insn) …
#define OPCODE3(insn) …
#define MODRM_REG(insn) …

#define W …

/*
 * Good-instruction tables for 32-bit apps.  This is non-const and volatile
 * to keep gcc from statically optimizing it out, as variable_test_bit makes
 * some versions of gcc to think only *(unsigned long*) is used.
 *
 * Opcodes we'll probably never support:
 * 6c-6f - ins,outs. SEGVs if used in userspace
 * e4-e7 - in,out imm. SEGVs if used in userspace
 * ec-ef - in,out acc. SEGVs if used in userspace
 * cc - int3. SIGTRAP if used in userspace
 * ce - into. Not used in userspace - no kernel support to make it useful. SEGVs
 *	(why we support bound (62) then? it's similar, and similarly unused...)
 * f1 - int1. SIGTRAP if used in userspace
 * f4 - hlt. SEGVs if used in userspace
 * fa - cli. SEGVs if used in userspace
 * fb - sti. SEGVs if used in userspace
 *
 * Opcodes which need some work to be supported:
 * 07,17,1f - pop es/ss/ds
 *	Normally not used in userspace, but would execute if used.
 *	Can cause GP or stack exception if tries to load wrong segment descriptor.
 *	We hesitate to run them under single step since kernel's handling
 *	of userspace single-stepping (TF flag) is fragile.
 *	We can easily refuse to support push es/cs/ss/ds (06/0e/16/1e)
 *	on the same grounds that they are never used.
 * cd - int N.
 *	Used by userspace for "int 80" syscall entry. (Other "int N"
 *	cause GP -> SEGV since their IDT gates don't allow calls from CPL 3).
 *	Not supported since kernel's handling of userspace single-stepping
 *	(TF flag) is fragile.
 * cf - iret. Normally not used in userspace. Doesn't SEGV unless arguments are bad
 */
#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
static volatile u32 good_insns_32[256 / 32] = …;
#else
#define good_insns_32 …
#endif

/* Good-instruction tables for 64-bit apps.
 *
 * Genuinely invalid opcodes:
 * 06,07 - formerly push/pop es
 * 0e - formerly push cs
 * 16,17 - formerly push/pop ss
 * 1e,1f - formerly push/pop ds
 * 27,2f,37,3f - formerly daa/das/aaa/aas
 * 60,61 - formerly pusha/popa
 * 62 - formerly bound. EVEX prefix for AVX512 (not yet supported)
 * 82 - formerly redundant encoding of Group1
 * 9a - formerly call seg:ofs
 * ce - formerly into
 * d4,d5 - formerly aam/aad
 * d6 - formerly undocumented salc
 * ea - formerly jmp seg:ofs
 *
 * Opcodes we'll probably never support:
 * 6c-6f - ins,outs. SEGVs if used in userspace
 * e4-e7 - in,out imm. SEGVs if used in userspace
 * ec-ef - in,out acc. SEGVs if used in userspace
 * cc - int3. SIGTRAP if used in userspace
 * f1 - int1. SIGTRAP if used in userspace
 * f4 - hlt. SEGVs if used in userspace
 * fa - cli. SEGVs if used in userspace
 * fb - sti. SEGVs if used in userspace
 *
 * Opcodes which need some work to be supported:
 * cd - int N.
 *	Used by userspace for "int 80" syscall entry. (Other "int N"
 *	cause GP -> SEGV since their IDT gates don't allow calls from CPL 3).
 *	Not supported since kernel's handling of userspace single-stepping
 *	(TF flag) is fragile.
 * cf - iret. Normally not used in userspace. Doesn't SEGV unless arguments are bad
 */
#if defined(CONFIG_X86_64)
static volatile u32 good_insns_64[256 / 32] = …;
#else
#define good_insns_64 …
#endif

/* Using this for both 64-bit and 32-bit apps.
 * Opcodes we don't support:
 * 0f 00 - SLDT/STR/LLDT/LTR/VERR/VERW/-/- group. System insns
 * 0f 01 - SGDT/SIDT/LGDT/LIDT/SMSW/-/LMSW/INVLPG group.
 *	Also encodes tons of other system insns if mod=11.
 *	Some are in fact non-system: xend, xtest, rdtscp, maybe more
 * 0f 05 - syscall
 * 0f 06 - clts (CPL0 insn)
 * 0f 07 - sysret
 * 0f 08 - invd (CPL0 insn)
 * 0f 09 - wbinvd (CPL0 insn)
 * 0f 0b - ud2
 * 0f 30 - wrmsr (CPL0 insn) (then why rdmsr is allowed, it's also CPL0 insn?)
 * 0f 34 - sysenter
 * 0f 35 - sysexit
 * 0f 37 - getsec
 * 0f 78 - vmread (Intel VMX. CPL0 insn)
 * 0f 79 - vmwrite (Intel VMX. CPL0 insn)
 *	Note: with prefixes, these two opcodes are
 *	extrq/insertq/AVX512 convert vector ops.
 * 0f ae - group15: [f]xsave,[f]xrstor,[v]{ld,st}mxcsr,clflush[opt],
 *	{rd,wr}{fs,gs}base,{s,l,m}fence.
 *	Why? They are all user-executable.
 */
static volatile u32 good_2byte_insns[256 / 32] = …;
#undef W

/*
 * opcodes we may need to refine support for:
 *
 *  0f - 2-byte instructions: For many of these instructions, the validity
 *  depends on the prefix and/or the reg field.  On such instructions, we
 *  just consider the opcode combination valid if it corresponds to any
 *  valid instruction.
 *
 *  8f - Group 1 - only reg = 0 is OK
 *  c6-c7 - Group 11 - only reg = 0 is OK
 *  d9-df - fpu insns with some illegal encodings
 *  f2, f3 - repnz, repz prefixes.  These are also the first byte for
 *  certain floating-point instructions, such as addsd.
 *
 *  fe - Group 4 - only reg = 0 or 1 is OK
 *  ff - Group 5 - only reg = 0-6 is OK
 *
 * others -- Do we need to support these?
 *
 *  0f - (floating-point?) prefetch instructions
 *  07, 17, 1f - pop es, pop ss, pop ds
 *  26, 2e, 36, 3e - es:, cs:, ss:, ds: segment prefixes --
 *	but 64 and 65 (fs: and gs:) seem to be used, so we support them
 *  67 - addr16 prefix
 *  ce - into
 *  f0 - lock prefix
 */

/*
 * TODO:
 * - Where necessary, examine the modrm byte and allow only valid instructions
 * in the different Groups and fpu instructions.
 */

static bool is_prefix_bad(struct insn *insn)
{ … }

static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool x86_64)
{ … }

#ifdef CONFIG_X86_64

asm …;

extern u8 uretprobe_trampoline_entry[];
extern u8 uretprobe_trampoline_end[];
extern u8 uretprobe_syscall_check[];

void *arch_uprobe_trampoline(unsigned long *psize)
{ … }

static unsigned long trampoline_check_ip(void)
{ … }

SYSCALL_DEFINE0(uretprobe)
{ … }

/*
 * If arch_uprobe->insn doesn't use rip-relative addressing, return
 * immediately.  Otherwise, rewrite the instruction so that it accesses
 * its memory operand indirectly through a scratch register.  Set
 * defparam->fixups accordingly. (The contents of the scratch register
 * will be saved before we single-step the modified instruction,
 * and restored afterward).
 *
 * We do this because a rip-relative instruction can access only a
 * relatively small area (+/- 2 GB from the instruction), and the XOL
 * area typically lies beyond that area.  At least for instructions
 * that store to memory, we can't execute the original instruction
 * and "fix things up" later, because the misdirected store could be
 * disastrous.
 *
 * Some useful facts about rip-relative instructions:
 *
 *  - There's always a modrm byte with bit layout "00 reg 101".
 *  - There's never a SIB byte.
 *  - The displacement is always 4 bytes.
 *  - REX.B=1 bit in REX prefix, which normally extends r/m field,
 *    has no effect on rip-relative mode. It doesn't make modrm byte
 *    with r/m=101 refer to register 1101 = R13.
 */
static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn)
{ … }

static inline unsigned long *
scratch_reg(struct arch_uprobe *auprobe, struct pt_regs *regs)
{ … }

/*
 * If we're emulating a rip-relative instruction, save the contents
 * of the scratch register and store the target address in that register.
 */
static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{ … }

static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{ … }
#else /* 32-bit: */
/*
 * No RIP-relative addressing on 32-bit
 */
static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn)
{
}
static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
}
static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
}
#endif /* CONFIG_X86_64 */

struct uprobe_xol_ops { … };

static inline int sizeof_long(struct pt_regs *regs)
{ … }

static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
{ … }

static int emulate_push_stack(struct pt_regs *regs, unsigned long val)
{ … }

/*
 * We have to fix things up as follows:
 *
 * Typically, the new ip is relative to the copied instruction.  We need
 * to make it relative to the original instruction (FIX_IP).  Exceptions
 * are return instructions and absolute or indirect jump or call instructions.
 *
 * If the single-stepped instruction was a call, the return address that
 * is atop the stack is the address following the copied instruction.  We
 * need to make it the address following the original instruction (FIX_CALL).
 *
 * If the original instruction was a rip-relative instruction such as
 * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent
 * instruction using a scratch register -- e.g., "movl %edx,0xnnnn(%rsi)".
 * We need to restore the contents of the scratch register
 * (FIX_RIP_reg).
 */
static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
{ … }

static void default_abort_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
{ … }

static const struct uprobe_xol_ops default_xol_ops = …;

static bool branch_is_call(struct arch_uprobe *auprobe)
{ … }

#define CASE_COND …

#define COND …

#define XF …

static bool is_cond_jmp_opcode(u8 opcode)
{ … }

static bool check_jmp_cond(struct arch_uprobe *auprobe, struct pt_regs *regs)
{ … }

#undef	XF
#undef	COND
#undef	CASE_COND

static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
{ … }

static bool push_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
{ … }

static int branch_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
{ … }

static void branch_clear_offset(struct arch_uprobe *auprobe, struct insn *insn)
{ … }

static const struct uprobe_xol_ops branch_xol_ops = …;

static const struct uprobe_xol_ops push_xol_ops = …;

/* Returns -ENOSYS if branch_xol_ops doesn't handle this insn */
static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
{ … }

/* Returns -ENOSYS if push_xol_ops doesn't handle this insn */
static int push_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
{ … }

/**
 * arch_uprobe_analyze_insn - instruction analysis including validity and fixups.
 * @auprobe: the probepoint information.
 * @mm: the probed address space.
 * @addr: virtual address at which to install the probepoint
 * Return 0 on success or a -ve number on error.
 */
int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr)
{ … }

/*
 * arch_uprobe_pre_xol - prepare to execute out of line.
 * @auprobe: the probepoint information.
 * @regs: reflects the saved user state of current task.
 */
int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{ … }

/*
 * If xol insn itself traps and generates a signal(Say,
 * SIGILL/SIGSEGV/etc), then detect the case where a singlestepped
 * instruction jumps back to its own address. It is assumed that anything
 * like do_page_fault/do_trap/etc sets thread.trap_nr != -1.
 *
 * arch_uprobe_pre_xol/arch_uprobe_post_xol save/restore thread.trap_nr,
 * arch_uprobe_xol_was_trapped() simply checks that ->trap_nr is not equal to
 * UPROBE_TRAP_NR == -1 set by arch_uprobe_pre_xol().
 */
bool arch_uprobe_xol_was_trapped(struct task_struct *t)
{ … }

/*
 * Called after single-stepping. To avoid the SMP problems that can
 * occur when we temporarily put back the original opcode to
 * single-step, we single-stepped a copy of the instruction.
 *
 * This function prepares to resume execution after the single-step.
 */
int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{ … }

/* callback routine for handling exceptions. */
int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data)
{ … }

/*
 * This function gets called when XOL instruction either gets trapped or
 * the thread has a fatal signal. Reset the instruction pointer to its
 * probed address for the potential restart or for post mortem analysis.
 */
void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{ … }

static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
{ … }

bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
{ … }

unsigned long
arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs)
{ … }

bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
				struct pt_regs *regs)
{ … }
linux/arch/x86/kernel/uprobes.c