linux/arch/x86/net/bpf_jit_comp.c

// SPDX-License-Identifier: GPL-2.0-only
/*
 * BPF JIT compiler
 *
 * Copyright (C) 2011-2013 Eric Dumazet ([email protected])
 * Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 */
#include <linux/netdevice.h>
#include <linux/filter.h>
#include <linux/if_vlan.h>
#include <linux/bpf.h>
#include <linux/memory.h>
#include <linux/sort.h>
#include <asm/extable.h>
#include <asm/ftrace.h>
#include <asm/set_memory.h>
#include <asm/nospec-branch.h>
#include <asm/text-patching.h>
#include <asm/unwind.h>
#include <asm/cfi.h>

static bool all_callee_regs_used[4] =;

static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
{}

#define EMIT(bytes, len)

#define EMIT1(b1)
#define EMIT2(b1, b2)
#define EMIT3(b1, b2, b3)
#define EMIT4(b1, b2, b3, b4)

#define EMIT1_off32(b1, off)
#define EMIT2_off32(b1, b2, off)
#define EMIT3_off32(b1, b2, b3, off)
#define EMIT4_off32(b1, b2, b3, b4, off)

#ifdef CONFIG_X86_KERNEL_IBT
#define EMIT_ENDBR()
#define EMIT_ENDBR_POISON()
#else
#define EMIT_ENDBR()
#define EMIT_ENDBR_POISON()
#endif

static bool is_imm8(int value)
{}

/*
 * Let us limit the positive offset to be <= 123.
 * This is to ensure eventual jit convergence for the following patterns:
 * ...
 * pass4, final_proglen=4391:
 *   ...
 *   20e:    48 85 ff                test   rdi,rdi
 *   211:    74 7d                   je     0x290
 *   213:    48 8b 77 00             mov    rsi,QWORD PTR [rdi+0x0]
 *   ...
 *   289:    48 85 ff                test   rdi,rdi
 *   28c:    74 17                   je     0x2a5
 *   28e:    e9 7f ff ff ff          jmp    0x212
 *   293:    bf 03 00 00 00          mov    edi,0x3
 * Note that the insn at 0x211 is a 2-byte cond jump insn for offset 0x7d (+125)
 * and the insn at 0x28e is a 5-byte jmp insn with offset -129.
 *
 * pass5, final_proglen=4392:
 *   ...
 *   20e:    48 85 ff                test   rdi,rdi
 *   211:    0f 84 80 00 00 00       je     0x297
 *   217:    48 8b 77 00             mov    rsi,QWORD PTR [rdi+0x0]
 *   ...
 *   28d:    48 85 ff                test   rdi,rdi
 *   290:    74 1a                   je     0x2ac
 *   292:    eb 84                   jmp    0x218
 *   294:    bf 03 00 00 00          mov    edi,0x3
 * Note that the insn at 0x211 is now a 6-byte cond jump insn since its offset
 * becomes 0x80 based on the previous round (0x293 - 0x213 = 0x80).
 * At the same time, the insn at 0x292 is a 2-byte jmp insn since its offset is
 * -124.
 *
 * pass6 will repeat the same code as in pass4 and this will prevent
 * eventual convergence.
 *
 * To fix this issue, we need to break the je (2->6 bytes) <-> jmp (5->2 bytes)
 * cycle above. In this example, a je offset <= 0x7c would work.
 *
 * For other cases, a je <-> je pair needs offset <= 0x7b to avoid the
 * non-convergence issue, while for jmp <-> je and jmp <-> jmp pairs a
 * jmp offset <= 0x7c avoids it.
 *
 * Overall, let us limit the positive offset for an 8-bit cond/uncond jmp insn
 * to at most 123 (0x7b). This way, the jit passes can eventually converge.
 */
static bool is_imm8_jmp_offset(int value)
{}
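
/*
 * A minimal sketch of the check implied by the comment above (the elided
 * body is authoritative): keep the full s8 negative range but cap the
 * positive side at 123 (0x7b), e.g.
 *
 *	return value <= 123 && value >= -128;
 */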

static bool is_simm32(s64 value)
{}

static bool is_uimm32(u64 value)
{}

/* mov dst, src */
#define EMIT_mov(DST, SRC)

static int bpf_size_to_x86_bytes(int bpf_size)
{}

/*
 * List of x86 cond jump opcodes (. + s8)
 * Add 0x10 (and an extra 0x0f) to generate far jumps (. + s32);
 * see the worked example after the list below.
 */
#define X86_JB
#define X86_JAE
#define X86_JE
#define X86_JNE
#define X86_JBE
#define X86_JA
#define X86_JL
#define X86_JGE
#define X86_JLE
#define X86_JG
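
/*
 * Worked example of the rule above (opcode values from the x86 ISA, given
 * here for orientation): the short form of "je" is opcode 0x74 (rel8);
 * adding 0x10 and prefixing the 0x0f escape byte yields 0x0f 0x84, the
 * rel32 form of the same condition. emit_cond_near_jump() further below
 * presumably emits it roughly as:
 *
 *	EMIT2_off32(0x0f, jmp_cond + 0x10, rel32);
 */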

/* Pick a register outside of BPF range for JIT internal work */
#define AUX_REG
#define X86_REG_R9
#define X86_REG_R12

/*
 * The following table maps BPF registers to x86-64 registers.
 *
 * x86-64 register R12 is unused, since when used as the base address
 * register in load/store instructions it always needs an extra byte of
 * encoding, and it is callee-saved.
 *
 * x86-64 register R9 is not used by BPF programs, but can be used by BPF
 * trampoline. x86-64 register R10 is used for blinding (if enabled).
 */
static const int reg2hex[] =;
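
/*
 * For orientation only (this restates the standard BPF-to-x86-64 calling
 * convention, not the elided initializer above): R0 maps to rax (return
 * value), R1..R5 map to the SysV argument registers rdi, rsi, rdx, rcx
 * and r8, R6..R9 map to the callee-saved registers rbx, r13, r14 and r15,
 * and the frame pointer R10 maps to rbp.
 */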

static const int reg2pt_regs[] =;

/*
 * is_ereg() == true if BPF register 'reg' maps to x86-64 r8..r15,
 * which need an extra byte of encoding.
 * rax,rcx,...,rbp have simpler encoding.
 */
static bool is_ereg(u32 reg)
{}

/*
 * is_ereg_8l() == true if BPF register 'reg' is mapped to access x86-64
 * lower 8-bit registers dil,sil,bpl,spl,r8b..r15b, which need an extra byte
 * of encoding. al,cl,dl,bl have simpler encoding.
 */
static bool is_ereg_8l(u32 reg)
{}

static bool is_axreg(u32 reg)
{}

/* Add modifiers if 'reg' maps to x86-64 registers R8..R15 */
static u8 add_1mod(u8 byte, u32 reg)
{}

static u8 add_2mod(u8 byte, u32 r1, u32 r2)
{}

static u8 add_3mod(u8 byte, u32 r1, u32 r2, u32 index)
{}

/* Encode 'dst_reg' register into x86-64 opcode 'byte' */
static u8 add_1reg(u8 byte, u32 dst_reg)
{}

/* Encode 'dst_reg' and 'src_reg' registers into x86-64 opcode 'byte' */
static u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg)
{}
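
/*
 * Encoding refresher for the helpers above (x86-64 ISA facts): a REX prefix
 * is 0100WRXB, where W selects 64-bit operand size, R extends the ModRM reg
 * field, X extends the SIB index and B extends ModRM r/m (or the SIB base).
 * The sketch below shows how such a helper is typically written; it is an
 * illustration, not the elided kernel body. Example: "mov rax, r8" encodes
 * as 4c 89 c0, i.e. REX.W+REX.R, opcode 0x89, ModRM 0xc0 (mod=11, reg=r8,
 * rm=rax).
 */
static inline u8 add_2mod_sketch(u8 byte, u32 r1, u32 r2)
{
	if (is_ereg(r1))
		byte |= 1;	/* REX.B: ModRM r/m (or SIB base) is r8..r15 */
	if (is_ereg(r2))
		byte |= 4;	/* REX.R: ModRM reg is r8..r15 */
	return byte;
}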

/* Some 1-byte opcodes for binary ALU operations */
static u8 simple_alu_opcodes[] =;

static void jit_fill_hole(void *area, unsigned int size)
{}

int bpf_arch_text_invalidate(void *dst, size_t len)
{}

struct jit_context {};

/* Maximum number of bytes emitted while JITing one eBPF insn */
#define BPF_MAX_INSN_SIZE
#define BPF_INSN_SAFETY

/* Number of bytes emit_patch() needs to generate instructions */
#define X86_PATCH_SIZE
/* Number of bytes that will be skipped on tailcall */
#define X86_TAIL_CALL_OFFSET

static void push_r12(u8 **pprog)
{}

static void push_callee_regs(u8 **pprog, bool *callee_regs_used)
{}

static void pop_r12(u8 **pprog)
{}

static void pop_callee_regs(u8 **pprog, bool *callee_regs_used)
{}

static void emit_nops(u8 **pprog, int len)
{}

/*
 * Emit the various CFI preambles, see asm/cfi.h and the comments about FineIBT
 * in arch/x86/kernel/alternative.c
 */

static void emit_fineibt(u8 **pprog, u32 hash)
{}

static void emit_kcfi(u8 **pprog, u32 hash)
{}

static void emit_cfi(u8 **pprog, u32 hash)
{}

static void emit_prologue_tail_call(u8 **pprog, bool is_subprog)
{}

/*
 * Emit x86-64 prologue code for BPF program.
 * bpf_tail_call helper will skip the first X86_TAIL_CALL_OFFSET bytes
 * while jumping to another program
 */
static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
			  bool tail_call_reachable, bool is_subprog,
			  bool is_exception_cb)
{}

static int emit_patch(u8 **pprog, void *func, void *ip, u8 opcode)
{}

static int emit_call(u8 **pprog, void *func, void *ip)
{}

static int emit_rsb_call(u8 **pprog, void *func, void *ip)
{}

static int emit_jump(u8 **pprog, void *func, void *ip)
{}

static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
				void *old_addr, void *new_addr)
{}

int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
		       void *old_addr, void *new_addr)
{}

#define EMIT_LFENCE()

static void emit_indirect_jump(u8 **pprog, int reg, u8 *ip)
{}

static void emit_return(u8 **pprog, u8 *ip)
{}

#define BPF_TAIL_CALL_CNT_PTR_STACK_OFF(stack)

/*
 * Generate the following code:
 *
 * ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ...
 *   if (index >= array->map.max_entries)
 *     goto out;
 *   if ((*tcc_ptr)++ >= MAX_TAIL_CALL_CNT)
 *     goto out;
 *   prog = array->ptrs[index];
 *   if (prog == NULL)
 *     goto out;
 *   goto *(prog->bpf_func + prologue_size);
 * out:
 */
static void emit_bpf_tail_call_indirect(struct bpf_prog *bpf_prog,
					u8 **pprog, bool *callee_regs_used,
					u32 stack_depth, u8 *ip,
					struct jit_context *ctx)
{}

static void emit_bpf_tail_call_direct(struct bpf_prog *bpf_prog,
				      struct bpf_jit_poke_descriptor *poke,
				      u8 **pprog, u8 *ip,
				      bool *callee_regs_used, u32 stack_depth,
				      struct jit_context *ctx)
{}

static void bpf_tail_call_direct_fixup(struct bpf_prog *prog)
{}

static void emit_mov_imm32(u8 **pprog, bool sign_propagate,
			   u32 dst_reg, const u32 imm32)
{}

static void emit_mov_imm64(u8 **pprog, u32 dst_reg,
			   const u32 imm32_hi, const u32 imm32_lo)
{}

static void emit_mov_reg(u8 **pprog, bool is64, u32 dst_reg, u32 src_reg)
{}

static void emit_movsx_reg(u8 **pprog, int num_bits, bool is64, u32 dst_reg,
			   u32 src_reg)
{}

/* Emit the suffix (ModR/M etc) for addressing *(ptr_reg + off) and val_reg */
static void emit_insn_suffix(u8 **pprog, u32 ptr_reg, u32 val_reg, int off)
{}
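
/*
 * Worked example for the addressing suffix above (x86-64 ISA facts, given
 * for orientation): addressing *(rdi + 8) with rax as the value register
 * uses ModRM 0x47 (mod=01 for disp8, reg=rax, rm=rdi) followed by the 8-bit
 * displacement, so "mov [rdi+8], rax" encodes as 48 89 47 08. Offsets that
 * do not fit in s8 fall back to mod=10 with a 32-bit displacement.
 */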

static void emit_insn_suffix_SIB(u8 **pprog, u32 ptr_reg, u32 val_reg, u32 index_reg, int off)
{}

/*
 * Emit a REX byte if it will be necessary to address these registers
 */
static void maybe_emit_mod(u8 **pprog, u32 dst_reg, u32 src_reg, bool is64)
{}

/*
 * Similar version of maybe_emit_mod() for a single register
 */
static void maybe_emit_1mod(u8 **pprog, u32 reg, bool is64)
{}

/* LDX: dst_reg = *(u8*)(src_reg + off) */
static void emit_ldx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
{}

/* LDSX: dst_reg = *(s8*)(src_reg + off) */
static void emit_ldsx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
{}

static void emit_ldx_index(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, u32 index_reg, int off)
{}

static void emit_ldx_r12(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
{}

/* STX: *(u8*)(dst_reg + off) = src_reg */
static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
{}

/* STX: *(u8*)(dst_reg + index_reg + off) = src_reg */
static void emit_stx_index(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, u32 index_reg, int off)
{}

static void emit_stx_r12(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
{}

/* ST: *(u8*)(dst_reg + index_reg + off) = imm32 */
static void emit_st_index(u8 **pprog, u32 size, u32 dst_reg, u32 index_reg, int off, int imm)
{}

static void emit_st_r12(u8 **pprog, u32 size, u32 dst_reg, int off, int imm)
{}

static int emit_atomic(u8 **pprog, u8 atomic_op,
		       u32 dst_reg, u32 src_reg, s16 off, u8 bpf_size)
{}

static int emit_atomic_index(u8 **pprog, u8 atomic_op, u32 size,
			     u32 dst_reg, u32 src_reg, u32 index_reg, int off)
{}

#define DONT_CLEAR

bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs)
{}

static void detect_reg_usage(struct bpf_insn *insn, int insn_cnt,
			     bool *regs_used)
{}

/* emit the 3-byte VEX prefix
 *
 * r: same as rex.r, extra bit for ModRM reg field
 * x: same as rex.x, extra bit for SIB index field
 * b: same as rex.b, extra bit for ModRM r/m, or SIB base
 * m: opcode map select, encoding escape bytes e.g. 0x0f38
 * w: same as rex.w (32 bit or 64 bit) or opcode specific
 * src_reg2: additional source reg (encoded as BPF reg)
 * l: vector length (128 bit or 256 bit) or reserved
 * pp: opcode prefix (none, 0x66, 0xf2 or 0xf3)
 */
static void emit_3vex(u8 **pprog, bool r, bool x, bool b, u8 m,
		      bool w, u8 src_reg2, bool l, u8 pp)
{}
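
/*
 * Byte layout of the 3-byte VEX prefix documented above (x86 ISA facts; the
 * elided body presumably assembles the bytes along these lines, with r/x/b
 * and vvvv stored bit-inverted):
 *
 *	byte 0: 0xc4
 *	byte 1: (!r << 7) | (!x << 6) | (!b << 5) | (m & 0x1f)
 *	byte 2: (w << 7) | ((~vvvv & 0xf) << 3) | (l << 2) | (pp & 3)
 *
 * where vvvv is the x86 encoding of src_reg2.
 */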

/* emit BMI2 shift instruction */
static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op)
{}

#define INSN_SZ_DIFF

#define __LOAD_TCC_PTR(off)
/* mov rax, qword ptr [rbp - rounded_stack_depth - 16] */
#define LOAD_TAIL_CALL_CNT_PTR(stack)

static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
		  int oldproglen, struct jit_context *ctx, bool jmp_padding)
{}

static void clean_stack_garbage(const struct btf_func_model *m,
				u8 **pprog, int nr_stack_slots,
				int stack_size)
{}

/* get the count of the regs that are used to pass arguments */
static int get_nr_used_regs(const struct btf_func_model *m)
{}

static void save_args(const struct btf_func_model *m, u8 **prog,
		      int stack_size, bool for_call_origin)
{}

static void restore_regs(const struct btf_func_model *m, u8 **prog,
			 int stack_size)
{}

static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
			   struct bpf_tramp_link *l, int stack_size,
			   int run_ctx_off, bool save_ret,
			   void *image, void *rw_image)
{}

static void emit_align(u8 **pprog, u32 align)
{}

static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond)
{}

static int invoke_bpf(const struct btf_func_model *m, u8 **pprog,
		      struct bpf_tramp_links *tl, int stack_size,
		      int run_ctx_off, bool save_ret,
		      void *image, void *rw_image)
{}

static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog,
			      struct bpf_tramp_links *tl, int stack_size,
			      int run_ctx_off, u8 **branches,
			      void *image, void *rw_image)
{}

/* mov rax, qword ptr [rbp - rounded_stack_depth - 8] */
#define LOAD_TRAMP_TAIL_CALL_CNT_PTR(stack)

/* Example:
 * __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev);
 * its 'struct btf_func_model' will have nr_args=2.
 * The assembly code when eth_type_trans is executing after the trampoline:
 *
 * push rbp
 * mov rbp, rsp
 * sub rsp, 16                     // space for skb and dev
 * push rbx                        // temp regs to pass start time
 * mov qword ptr [rbp - 16], rdi   // save skb pointer to stack
 * mov qword ptr [rbp - 8], rsi    // save dev pointer to stack
 * call __bpf_prog_enter           // rcu_read_lock and preempt_disable
 * mov rbx, rax                    // remember start time if bpf stats are enabled
 * lea rdi, [rbp - 16]             // R1==ctx of bpf prog
 * call addr_of_jited_FENTRY_prog
 * movabsq rdi, 64bit_addr_of_struct_bpf_prog  // unused if bpf stats are off
 * mov rsi, rbx                    // prog start time
 * call __bpf_prog_exit            // rcu_read_unlock, preempt_enable and stats math
 * mov rdi, qword ptr [rbp - 16]   // restore skb pointer from stack
 * mov rsi, qword ptr [rbp - 8]    // restore dev pointer from stack
 * pop rbx
 * leave
 * ret
 *
 * eth_type_trans has a 5-byte nop at the beginning. These 5 bytes will be
 * replaced with 'call generated_bpf_trampoline'. When the trampoline returns,
 * eth_type_trans will continue executing with the original skb and dev pointers.
 *
 * The assembly code when eth_type_trans is called from the trampoline:
 *
 * push rbp
 * mov rbp, rsp
 * sub rsp, 24                     // space for skb, dev, return value
 * push rbx                        // temp regs to pass start time
 * mov qword ptr [rbp - 24], rdi   // save skb pointer to stack
 * mov qword ptr [rbp - 16], rsi   // save dev pointer to stack
 * call __bpf_prog_enter           // rcu_read_lock and preempt_disable
 * mov rbx, rax                    // remember start time if bpf stats are enabled
 * lea rdi, [rbp - 24]             // R1==ctx of bpf prog
 * call addr_of_jited_FENTRY_prog  // bpf prog can access skb and dev
 * movabsq rdi, 64bit_addr_of_struct_bpf_prog  // unused if bpf stats are off
 * mov rsi, rbx                    // prog start time
 * call __bpf_prog_exit            // rcu_read_unlock, preempt_enable and stats math
 * mov rdi, qword ptr [rbp - 24]   // restore skb pointer from stack
 * mov rsi, qword ptr [rbp - 16]   // restore dev pointer from stack
 * call eth_type_trans+5           // execute body of eth_type_trans
 * mov qword ptr [rbp - 8], rax    // save return value
 * call __bpf_prog_enter           // rcu_read_lock and preempt_disable
 * mov rbx, rax                    // remember start time if bpf stats are enabled
 * lea rdi, [rbp - 24]             // R1==ctx of bpf prog
 * call addr_of_jited_FEXIT_prog   // bpf prog can access skb, dev, return value
 * movabsq rdi, 64bit_addr_of_struct_bpf_prog  // unused if bpf stats are off
 * mov rsi, rbx                    // prog start time
 * call __bpf_prog_exit            // rcu_read_unlock, preempt_enable and stats math
 * mov rax, qword ptr [rbp - 8]    // restore eth_type_trans's return value
 * pop rbx
 * leave
 * add rsp, 8                      // skip eth_type_trans's frame
 * ret                             // return to its caller
 */
static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_image,
					 void *rw_image_end, void *image,
					 const struct btf_func_model *m, u32 flags,
					 struct bpf_tramp_links *tlinks,
					 void *func_addr)
{}

void *arch_alloc_bpf_trampoline(unsigned int size)
{}

void arch_free_bpf_trampoline(void *image, unsigned int size)
{}

int arch_protect_bpf_trampoline(void *image, unsigned int size)
{}

int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
				const struct btf_func_model *m, u32 flags,
				struct bpf_tramp_links *tlinks,
				void *func_addr)
{}

int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
			     struct bpf_tramp_links *tlinks, void *func_addr)
{}

static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs, u8 *image, u8 *buf)
{}

static int cmp_ips(const void *a, const void *b)
{}

int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs)
{}

struct x64_jit_data {};

#define MAX_PASSES
#define PADDING_PASSES

struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
{}

bool bpf_jit_supports_kfunc_call(void)
{}

void *bpf_arch_text_copy(void *dst, void *src, size_t len)
{}

/* Indicate the JIT backend supports mixing bpf2bpf and tailcalls. */
bool bpf_jit_supports_subprog_tailcalls(void)
{}

bool bpf_jit_supports_percpu_insn(void)
{}

void bpf_jit_free(struct bpf_prog *prog)
{}

bool bpf_jit_supports_exceptions(void)
{}

void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
{}

void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
			       struct bpf_prog *new, struct bpf_prog *old)
{}

bool bpf_jit_supports_arena(void)
{}

bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena)
{}

bool bpf_jit_supports_ptr_xchg(void)
{}

/* x86-64 JIT emits its own code to filter user addresses so return 0 here */
u64 bpf_arch_uaddress_limit(void)
{}