// SPDX-License-Identifier: GPL-2.0-only
#define pr_fmt(fmt) …

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/perf_event.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/stringify.h>
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/memory.h>
#include <linux/stop_machine.h>
#include <linux/slab.h>
#include <linux/kdebug.h>
#include <linux/kprobes.h>
#include <linux/mmu_context.h>
#include <linux/bsearch.h>
#include <linux/sync_core.h>
#include <asm/text-patching.h>
#include <asm/alternative.h>
#include <asm/sections.h>
#include <asm/mce.h>
#include <asm/nmi.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/insn.h>
#include <asm/io.h>
#include <asm/fixmap.h>
#include <asm/paravirt.h>
#include <asm/asm-prototypes.h>
#include <asm/cfi.h>

int __read_mostly alternatives_patched;
EXPORT_SYMBOL_GPL(…);

#define MAX_PATCH_LEN …

#define DA_ALL …
#define DA_ALT …
#define DA_RET …
#define DA_RETPOLINE …
#define DA_ENDBR …
#define DA_SMP …

static unsigned int debug_alternative;

static int __init debug_alt(char *str) { … }
__setup(…);

static int noreplace_smp;

static int __init setup_noreplace_smp(char *str) { … }
__setup(…);

#define DPRINTK(type, fmt, args...) …
#define DUMP_BYTES(type, buf, len, fmt, args...) …

static const unsigned char x86nops[] = …;

const unsigned char * const x86_nops[ASM_NOP_MAX+1] = …;

/*
 * Nomenclature for variable names to simplify and clarify this code and ease
 * any potential staring at it:
 *
 * @instr: source address of the original instructions in the kernel text as
 * generated by the compiler.
 *
 * @buf: temporary buffer on which the patching operates. This buffer is
 * eventually text-poked into the kernel image.
 *
 * @replacement/@repl: pointer to the opcodes which are replacing @instr, located
 * in the .altinstr_replacement section.
 */

/*
 * Fill the buffer with a single effective instruction of size @len.
 *
 * In order not to issue an ORC stack depth tracking CFI entry (Call Frame Info)
 * for every single-byte NOP, try to generate the maximally available NOP of
 * size <= ASM_NOP_MAX such that only a single CFI entry is generated (vs one for
 * each single-byte NOP). If the @len to fill out is > ASM_NOP_MAX, pad with INT3
 * and *jump* over the padding instead of executing long and daft NOPs.
 */
static void add_nop(u8 *buf, unsigned int len) { … }

extern s32 __retpoline_sites[], __retpoline_sites_end[];
extern s32 __return_sites[], __return_sites_end[];
extern s32 __cfi_sites[], __cfi_sites_end[];
extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[];
extern s32 __smp_locks[], __smp_locks_end[];
void text_poke_early(void *addr, const void *opcode, size_t len);

/*
 * Matches NOP and NOPL, not any of the other possible NOPs.
 */
static bool insn_is_nop(struct insn *insn) { … }

/*
 * Find the offset of the first non-NOP instruction starting at @offset
 * but no further than @len.
 */
static int skip_nops(u8 *buf, int offset, int len) { … }

/*
 * "noinline" to cause control flow change and thus invalidate I$ and
 * cause refetch after modification.
 */
static void noinline optimize_nops(const u8 * const instr, u8 *buf, size_t len) { … }
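/*
 * Illustration of the NOP filling above (a sketch; the exact byte choices are
 * an assumption based on the usual x86-64 encodings, with ASM_NOP_MAX being 8):
 * add_nop() turns a 6-byte hole into the single NOP "66 0f 1f 44 00 00", while
 * a 12-byte hole becomes "eb 0a" (jmp +10) followed by ten "cc" (INT3) bytes,
 * so at most one instruction is ever executed for the whole fill.
 */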
/*
 * In this context, "source" is where the instructions are placed in the
 * section .altinstr_replacement, for example during kernel build by the
 * toolchain.
 * "Destination" is where the instructions are being patched in by this
 * machinery.
 *
 * The source offset is:
 *
 *   src_imm = target - src_next_ip                  (1)
 *
 * and the target offset is:
 *
 *   dst_imm = target - dst_next_ip                  (2)
 *
 * so rework (1) as an expression for target like:
 *
 *   target = src_imm + src_next_ip                  (1a)
 *
 * and substitute in (2) to get:
 *
 *   dst_imm = (src_imm + src_next_ip) - dst_next_ip (3)
 *
 * Now, since the instruction stream is 'identical' at src and dst (it
 * is being copied after all) it can be stated that:
 *
 *   src_next_ip = src + ip_offset
 *   dst_next_ip = dst + ip_offset                   (4)
 *
 * Substitute (4) in (3) and observe ip_offset being cancelled out to
 * obtain:
 *
 *   dst_imm = src_imm + (src + ip_offset) - (dst + ip_offset)
 *           = src_imm + src - dst + ip_offset - ip_offset
 *           = src_imm + src - dst                   (5)
 *
 * IOW, only the relative displacement of the code block matters.
 */
#define apply_reloc_n(n_, p_, d_) …

static __always_inline void apply_reloc(int n, void *ptr, uintptr_t diff) { … }

static __always_inline bool need_reloc(unsigned long offset, u8 *src, size_t src_len) { … }

static void __apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len) { … }

void apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len) { … }

/* Low-level backend functions usable from alternative code replacements. */
DEFINE_ASM_FUNC(…);
EXPORT_SYMBOL_GPL(…);

noinstr void BUG_func(void) { … }
EXPORT_SYMBOL(…);

#define CALL_RIP_REL_OPCODE …
#define CALL_RIP_REL_MODRM …

/*
 * Rewrite the "call BUG_func" replacement to point to the target of the
 * indirect pv_ops call "call *disp(%ip)".
 */
static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a) { … }

static inline u8 * instr_va(struct alt_instr *i) { … }

/*
 * Replace instructions with better alternatives for this CPU type. This runs
 * before SMP is initialized to avoid SMP problems with self modifying code.
 * This implies that asymmetric systems where APs have fewer capabilities than
 * the boot processor are not handled. Tough. Make sure you disable such
 * features by hand.
 *
 * Marked "noinline" to cause control flow change and thus insn cache
 * to refetch changed I$ lines.
 */
void __init_or_module noinline apply_alternatives(struct alt_instr *start, struct alt_instr *end) { … }

static inline bool is_jcc32(struct insn *insn) { … }

#if defined(CONFIG_MITIGATION_RETPOLINE) && defined(CONFIG_OBJTOOL)

/*
 * CALL/JMP *%\reg
 */
static int emit_indirect(int op, int reg, u8 *bytes) { … }

static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes) { … }

/*
 * Rewrite the compiler generated retpoline thunk calls.
 *
 * For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate
 * indirect instructions, avoiding the extra indirection.
 *
 * For example, convert:
 *
 *   CALL __x86_indirect_thunk_\reg
 *
 * into:
 *
 *   CALL *%\reg
 *
 * It also tries to inline spectre_v2=retpoline,lfence when size permits.
 */
static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) { … }

/*
 * Generated by 'objtool --retpoline'.
 */
void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { … }
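/*
 * Byte-level illustration of the retpoline rewrite above (an assumption for
 * clarity, not taken from the upstream file): with retpolines disabled, a
 * compiler generated thunk call such as
 *
 *   e8 xx xx xx xx      call __x86_indirect_thunk_r11   // 5 bytes
 *
 * can be rewritten in place as
 *
 *   41 ff d3            call *%r11                      // 3 bytes
 *   66 90               nop2                            // 2 bytes padding
 *
 * keeping the overall length at 5 bytes so no surrounding code has to move.
 */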
#ifdef CONFIG_MITIGATION_RETHUNK

/*
 * Rewrite the compiler generated return thunk tail-calls.
 *
 * For example, convert:
 *
 *   JMP __x86_return_thunk
 *
 * into:
 *
 *   RET
 */
static int patch_return(void *addr, struct insn *insn, u8 *bytes) { … }

void __init_or_module noinline apply_returns(s32 *start, s32 *end) { … }
#else
void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
#endif /* CONFIG_MITIGATION_RETHUNK */

#else /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */

void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { }
void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }

#endif /* CONFIG_MITIGATION_RETPOLINE && CONFIG_OBJTOOL */

#ifdef CONFIG_X86_KERNEL_IBT

static void poison_cfi(void *addr);

static void __init_or_module poison_endbr(void *addr, bool warn) { … }

/*
 * Generated by: objtool --ibt
 *
 * Seal the functions for indirect calls by clobbering the ENDBR instructions
 * and the kCFI hash value.
 */
void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end) { … }
#else
void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { }
#endif /* CONFIG_X86_KERNEL_IBT */

#ifdef CONFIG_CFI_AUTO_DEFAULT
#define __CFI_DEFAULT …
#elif defined(CONFIG_CFI_CLANG)
#define __CFI_DEFAULT …
#else
#define __CFI_DEFAULT …
#endif

enum cfi_mode cfi_mode __ro_after_init = …;

#ifdef CONFIG_CFI_CLANG
struct bpf_insn;

/* Must match bpf_func_t / DEFINE_BPF_PROG_RUN() */
extern unsigned int __bpf_prog_runX(const void *ctx, const struct bpf_insn *insn);

/*
 * Force a reference to the external symbol so the compiler generates
 * __kcfi_typeid.
 */
__ADDRESSABLE(__bpf_prog_runX);

/* u32 __ro_after_init cfi_bpf_hash = __kcfi_typeid___bpf_prog_runX; */
asm …;

/* Must match bpf_callback_t */
extern u64 __bpf_callback_fn(u64, u64, u64, u64, u64);

__ADDRESSABLE(__bpf_callback_fn);

/* u32 __ro_after_init cfi_bpf_subprog_hash = __kcfi_typeid___bpf_callback_fn; */
asm …;

u32 cfi_get_func_hash(void *func) { … }
#endif

#ifdef CONFIG_FINEIBT

static bool cfi_rand __ro_after_init = …;
static u32 cfi_seed __ro_after_init;
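/*
 * Background for the re-hash helper below (a hedged note, not from the
 * upstream file): the kCFI hash is embedded verbatim as a 32-bit immediate in
 * kernel text. ENDBR64 is encoded as the bytes f3 0f 1e fa, i.e. the
 * little-endian value 0xfa1e0ff3, so a (re)hashed value equal to that
 * encoding would act as an unintended IBT landing pad in the middle of an
 * instruction; the re-hash therefore has to avoid producing it.
 */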
/*
 * Re-hash the CFI hash with a boot-time seed while making sure the result is
 * not a valid ENDBR instruction.
 */
static u32 cfi_rehash(u32 hash) { … }

static __init int cfi_parse_cmdline(char *str) { … }
early_param(…);

/*
 * kCFI                                         FineIBT
 *
 * __cfi_\func:                                 __cfi_\func:
 *      movl   $0x12345678,%eax         // 5         endbr64                    // 4
 *      nop                                          subl   $0x12345678,%r10d   // 7
 *      nop                                          jz     1f                  // 2
 *      nop                                          ud2                        // 2
 *      nop                                     1:   nop                        // 1
 *      nop
 *      nop
 *      nop
 *      nop
 *      nop
 *      nop
 *      nop
 *
 *
 * caller:                                      caller:
 *      movl   $(-0x12345678),%r10d     // 6         movl   $0x12345678,%r10d   // 6
 *      addl   $-15(%r11),%r10d         // 4         sub    $16,%r11            // 4
 *      je     1f                       // 2         nop4                       // 4
 *      ud2                             // 2
 * 1:   call   __x86_indirect_thunk_r11 // 5         call   *%r11; nop2;        // 5
 *
 */
asm …;

extern u8 fineibt_preamble_start[];
extern u8 fineibt_preamble_end[];

#define fineibt_preamble_size …
#define fineibt_preamble_hash …

asm …;

extern u8 fineibt_caller_start[];
extern u8 fineibt_caller_end[];

#define fineibt_caller_size …
#define fineibt_caller_hash …
#define fineibt_caller_jmp …

static u32 decode_preamble_hash(void *addr) { … }

static u32 decode_caller_hash(void *addr) { … }

/* .retpoline_sites */
static int cfi_disable_callers(s32 *start, s32 *end) { … }

static int cfi_enable_callers(s32 *start, s32 *end) { … }

/* .cfi_sites */
static int cfi_rand_preamble(s32 *start, s32 *end) { … }

static int cfi_rewrite_preamble(s32 *start, s32 *end) { … }

static void cfi_rewrite_endbr(s32 *start, s32 *end) { … }

/* .retpoline_sites */
static int cfi_rand_callers(s32 *start, s32 *end) { … }

static int cfi_rewrite_callers(s32 *start, s32 *end) { … }

static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, s32 *start_cfi, s32 *end_cfi, bool builtin) { … }

static inline void poison_hash(void *addr) { … }

static void poison_cfi(void *addr) { … }

#else

static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, s32 *start_cfi, s32 *end_cfi, bool builtin) { }

#ifdef CONFIG_X86_KERNEL_IBT
static void poison_cfi(void *addr) { }
#endif

#endif

void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, s32 *start_cfi, s32 *end_cfi) { … }

#ifdef CONFIG_SMP
static void alternatives_smp_lock(const s32 *start, const s32 *end, u8 *text, u8 *text_end) { … }

static void alternatives_smp_unlock(const s32 *start, const s32 *end, u8 *text, u8 *text_end) { … }

struct smp_alt_module { … };

static LIST_HEAD(smp_alt_modules);
static bool uniproc_patched = …;	/* protected by text_mutex */

void __init_or_module alternatives_smp_module_add(struct module *mod, char *name, void *locks, void *locks_end, void *text, void *text_end) { … }

void __init_or_module alternatives_smp_module_del(struct module *mod) { … }

void alternatives_enable_smp(void) { … }

/*
 * Return 1 if the address range is reserved for SMP-alternatives.
 * Must hold text_mutex.
 */
int alternatives_text_reserved(void *start, void *end) { … }
#endif /* CONFIG_SMP */

/*
 * Self-test for the INT3 based CALL emulation code.
 *
 * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up
 * properly and that there is a stack gap between the INT3 frame and the
 * previous context. Without this gap doing a virtual PUSH on the interrupted
 * stack would corrupt the INT3 IRET frame.
 *
 * See entry_{32,64}.S for more details.
 */

/*
 * We define the int3_magic() function in assembly to control the calling
 * convention such that we can 'call' it from assembly.
 */
extern void int3_magic(unsigned int *ptr); /* defined in asm */

asm …;

extern void int3_selftest_ip(void); /* defined in asm below */

static int __init int3_exception_notify(struct notifier_block *self, unsigned long val, void *data) { … }
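/*
 * A worked note on the emulation exercised below (hedged summary, not copied
 * from the upstream file): when the #BP triggers, regs->ip points just past
 * the one-byte INT3. Emulating a 5-byte CALL therefore pushes
 * regs->ip - 1 + 5 = regs->ip + 4 (the address of the instruction following
 * the patched CALL) as the return address and sets regs->ip to the call
 * target, which is only safe because the entry code leaves a stack gap below
 * the IRET frame for exactly this kind of virtual PUSH.
 */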
/* Must be noinline to ensure uniqueness of int3_selftest_ip. */
static noinline void __init int3_selftest(void) { … }

static __initdata int __alt_reloc_selftest_addr;

extern void __init __alt_reloc_selftest(void *arg);
__visible noinline void __init __alt_reloc_selftest(void *arg) { … }

static noinline void __init alt_reloc_selftest(void) { … }

void __init alternative_instructions(void) { … }

/**
 * text_poke_early - Update instructions on a live kernel at boot time
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * When you use this code to patch more than one byte of an instruction
 * you need to make sure that other CPUs cannot execute this code in parallel.
 * Also no thread must be currently preempted in the middle of these
 * instructions. And on the local CPU you need to be protected against NMI or
 * MCE handlers seeing an inconsistent instruction while you patch.
 */
void __init_or_module text_poke_early(void *addr, const void *opcode, size_t len) { … }

temp_mm_state_t;

/*
 * Using a temporary mm allows setting temporary mappings that are not accessible
 * by other CPUs. Such mappings are needed to perform sensitive memory writes
 * that override the kernel memory protections (e.g., W^X), without exposing the
 * temporary page-table mappings that are required for these write operations to
 * other CPUs. Using a temporary mm also avoids TLB shootdowns when the
 * mapping is torn down.
 *
 * Context: The temporary mm needs to be used exclusively by a single core. To
 *          harden security, IRQs must be disabled while the temporary mm is
 *          loaded, thereby preventing interrupt handler bugs from overriding
 *          the kernel memory protection.
 */
static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm) { … }

static inline void unuse_temporary_mm(temp_mm_state_t prev_state) { … }

__ro_after_init struct mm_struct *poking_mm;
__ro_after_init unsigned long poking_addr;

static void text_poke_memcpy(void *dst, const void *src, size_t len) { … }

static void text_poke_memset(void *dst, const void *src, size_t len) { … }

text_poke_f;

static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t len) { … }

/**
 * text_poke - Update instructions on a live kernel
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * Only atomic text poke/set should be allowed when not doing early patching.
 * This means the size must be writable atomically and the address must be
 * aligned in a way that permits an atomic write. It also makes sure we fit
 * on a single page.
 *
 * Note that the caller must ensure that if the modified code is part of a
 * module, the module is not removed during poking. This can be achieved
 * by registering a module notifier, and ordering module removal and patching
 * through a mutex.
 */
void *text_poke(void *addr, const void *opcode, size_t len) { … }
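/*
 * Rough sketch of the flow implemented above (a hedged summary, not a spec):
 * __text_poke() maps the target page(s) writable at poking_addr inside the
 * dedicated poking_mm, switches to that mm with use_temporary_mm() while IRQs
 * are disabled, performs the memcpy/memset through the temporary alias,
 * switches back with unuse_temporary_mm(), and finally flushes the local TLB
 * so the writable alias is never visible to other CPUs.
 */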
/**
 * text_poke_kgdb - Update instructions on a live kernel by kgdb
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * Only atomic text poke/set should be allowed when not doing early patching.
 * This means the size must be writable atomically and the address must be
 * aligned in a way that permits an atomic write. It also makes sure we fit
 * on a single page.
 *
 * Context: should only be used by kgdb, which ensures no other core is running,
 *          despite the fact it does not hold the text_mutex.
 */
void *text_poke_kgdb(void *addr, const void *opcode, size_t len) { … }

void *text_poke_copy_locked(void *addr, const void *opcode, size_t len, bool core_ok) { … }

/**
 * text_poke_copy - Copy instructions into (an unused part of) RX memory
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy, could be more than 2x PAGE_SIZE
 *
 * Not safe against concurrent execution; useful for JITs to dump
 * new code blocks into unused regions of RX memory. Can be used in
 * conjunction with synchronize_rcu_tasks() to wait for existing
 * execution to quiesce after having made sure no existing function
 * pointers are live.
 */
void *text_poke_copy(void *addr, const void *opcode, size_t len) { … }

/**
 * text_poke_set - memset into (an unused part of) RX memory
 * @addr: address to modify
 * @c: the byte to fill the area with
 * @len: length to copy, could be more than 2x PAGE_SIZE
 *
 * This is useful to overwrite unused regions of RX memory with illegal
 * instructions.
 */
void *text_poke_set(void *addr, int c, size_t len) { … }

static void do_sync_core(void *info) { … }

void text_poke_sync(void) { … }

/*
 * NOTE: crazy scheme to allow patching Jcc.d32 but not increase the size of
 * this thing. When len == 6 everything is prefixed with 0x0f and we map
 * opcode to Jcc.d8, using len to distinguish.
 */
struct text_poke_loc { … };

struct bp_patching_desc { … };

static struct bp_patching_desc bp_desc;

static __always_inline struct bp_patching_desc *try_get_desc(void) { … }

static __always_inline void put_desc(void) { … }

static __always_inline void *text_poke_addr(struct text_poke_loc *tp) { … }

static __always_inline int patch_cmp(const void *key, const void *elt) { … }

noinstr int poke_int3_handler(struct pt_regs *regs) { … }

#define TP_VEC_MAX …
static struct text_poke_loc tp_vec[TP_VEC_MAX];
static int tp_vec_nr;

/**
 * text_poke_bp_batch() -- update instructions on live kernel on SMP
 * @tp: vector of instructions to patch
 * @nr_entries: number of entries in the vector
 *
 * Modify multi-byte instructions by using an int3 breakpoint on SMP.
 * We completely avoid stop_machine() here, and achieve the
 * synchronization using the int3 breakpoint.
 *
 * The way it is done:
 *  - For each entry in the vector:
 *     - add an int3 trap to the address that will be patched
 *  - sync cores
 *  - For each entry in the vector:
 *     - update all but the first byte of the patched range
 *  - sync cores
 *  - For each entry in the vector:
 *     - replace the first byte (int3) with the first byte of the
 *       replacing opcode
 *  - sync cores
 */
static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) { … }

static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, const void *opcode, size_t len, const void *emulate) { … }

/*
 * We rely hard on the tp_vec being ordered; ensure this is so by flushing
 * early if needed.
 */
static bool tp_order_fail(void *addr) { … }

static void text_poke_flush(void *addr) { … }

void text_poke_finish(void) { … }

void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate) { … }

/**
 * text_poke_bp() -- update instructions on live kernel on SMP
 * @addr: address to patch
 * @opcode: opcode of new instruction
 * @len: length to copy
 * @emulate: instruction to be emulated
 *
 * Update a single instruction using a vector on the stack, avoiding
 * dynamically allocated memory. This function should be used when it is
 * not possible to allocate memory.
 */
void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate) { … }
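/*
 * Concrete walk-through of the batching scheme above (an illustrative sketch;
 * the byte values are made up): turning a 5-byte CALL "e8 11 22 33 44" into a
 * 5-byte JMP "e9 55 66 77 88" goes through these states, with cores synced
 * between each step:
 *
 *   cc 11 22 33 44   ; int3 installed, old tail still in place
 *   cc 55 66 77 88   ; new tail written behind the int3
 *   e9 55 66 77 88   ; first byte replaced, patch complete
 *
 * A CPU that executes the site while the int3 is present traps into
 * poke_int3_handler(), which emulates the instruction being installed.
 * A hypothetical caller patching a single site could do, roughly:
 *
 *   u8 jmp[5] = { 0xe9, 0, 0, 0, 0 };
 *   *(s32 *)&jmp[1] = (s32)(target - (site + 5));
 *   text_poke_bp(site, jmp, 5, NULL);   // NULL: emulate the new opcode itself
 */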