// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ #include <linux/mm.h> #include <linux/llist.h> #include <linux/bpf.h> #include <linux/irq_work.h> #include <linux/bpf_mem_alloc.h> #include <linux/memcontrol.h> #include <asm/local.h> /* Any context (including NMI) BPF specific memory allocator. * * Tracing BPF programs can attach to kprobe and fentry. Hence they * run in unknown context where calling plain kmalloc() might not be safe. * * Front-end kmalloc() with per-cpu per-bucket cache of free elements. * Refill this cache asynchronously from irq_work. * * CPU_0 buckets * 16 32 64 96 128 196 256 512 1024 2048 4096 * ... * CPU_N buckets * 16 32 64 96 128 196 256 512 1024 2048 4096 * * The buckets are prefilled at the start. * BPF programs always run with migration disabled. * It's safe to allocate from cache of the current cpu with irqs disabled. * Free-ing is always done into bucket of the current cpu as well. * irq_work trims extra free elements from buckets with kfree * and refills them with kmalloc, so global kmalloc logic takes care * of freeing objects allocated by one cpu and freed on another. * * Every allocated objected is padded with extra 8 bytes that contains * struct llist_node. */ #define LLIST_NODE_SZ … #define BPF_MEM_ALLOC_SIZE_MAX … /* similar to kmalloc, but sizeof == 8 bucket is gone */ static u8 size_index[24] __ro_after_init = …; static int bpf_mem_cache_idx(size_t size) { … } #define NUM_CACHES … struct bpf_mem_cache { … }; struct bpf_mem_caches { … }; static const u16 sizes[NUM_CACHES] = …; static struct llist_node notrace *__llist_del_first(struct llist_head *head) { … } static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags) { … } static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c) { … } static void inc_active(struct bpf_mem_cache *c, unsigned long *flags) { … } static void dec_active(struct bpf_mem_cache *c, unsigned long *flags) { … } static void add_obj_to_free_list(struct bpf_mem_cache *c, void *obj) { … } /* Mostly runs from irq_work except __init phase. */ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node, bool atomic) { … } static void free_one(void *obj, bool percpu) { … } static int free_all(struct llist_node *llnode, bool percpu) { … } static void __free_rcu(struct rcu_head *head) { … } static void __free_rcu_tasks_trace(struct rcu_head *head) { … } static void enque_to_free(struct bpf_mem_cache *c, void *obj) { … } static void do_call_rcu_ttrace(struct bpf_mem_cache *c) { … } static void free_bulk(struct bpf_mem_cache *c) { … } static void __free_by_rcu(struct rcu_head *head) { … } static void check_free_by_rcu(struct bpf_mem_cache *c) { … } static void bpf_mem_refill(struct irq_work *work) { … } static void notrace irq_work_raise(struct bpf_mem_cache *c) { … } /* For typical bpf map case that uses bpf_mem_cache_alloc and single bucket * the freelist cache will be elem_size * 64 (or less) on each cpu. * * For bpf programs that don't have statically known allocation sizes and * assuming (low_mark + high_mark) / 2 as an average number of elements per * bucket and all buckets are used the total amount of memory in freelists * on each cpu will be: * 64*16 + 64*32 + 64*64 + 64*96 + 64*128 + 64*196 + 64*256 + 32*512 + 16*1024 + 8*2048 + 4*4096 * == ~ 116 Kbyte using below heuristic. * Initialized, but unused bpf allocator (not bpf map specific one) will * consume ~ 11 Kbyte per cpu. * Typical case will be between 11K and 116K closer to 11K. * bpf progs can and should share bpf_mem_cache when possible. * * Percpu allocation is typically rare. To avoid potential unnecessary large * memory consumption, set low_mark = 1 and high_mark = 3, resulting in c->batch = 1. */ static void init_refill_work(struct bpf_mem_cache *c) { … } static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) { … } /* When size != 0 bpf_mem_cache for each cpu. * This is typical bpf hash map use case when all elements have equal size. * * When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on * kmalloc/kfree. Max allocation size is 4096 in this case. * This is bpf_dynptr and bpf_kptr use case. */ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) { … } int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg) { … } int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size) { … } static void drain_mem_cache(struct bpf_mem_cache *c) { … } static void check_mem_cache(struct bpf_mem_cache *c) { … } static void check_leaked_objs(struct bpf_mem_alloc *ma) { … } static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma) { … } static void free_mem_alloc(struct bpf_mem_alloc *ma) { … } static void free_mem_alloc_deferred(struct work_struct *work) { … } static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress) { … } void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) { … } /* notrace is necessary here and in other functions to make sure * bpf programs cannot attach to them and cause llist corruptions. */ static void notrace *unit_alloc(struct bpf_mem_cache *c) { … } /* Though 'ptr' object could have been allocated on a different cpu * add it to the free_llist of the current cpu. * Let kfree() logic deal with it when it's later called from irq_work. */ static void notrace unit_free(struct bpf_mem_cache *c, void *ptr) { … } static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr) { … } /* Called from BPF program or from sys_bpf syscall. * In both cases migration is disabled. */ void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size) { … } void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr) { … } void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr) { … } void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma) { … } void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr) { … } void notrace bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr) { … } /* Directly does a kfree() without putting 'ptr' back to the free_llist * for reuse and without waiting for a rcu_tasks_trace gp. * The caller must first go through the rcu_tasks_trace gp for 'ptr' * before calling bpf_mem_cache_raw_free(). * It could be used when the rcu_tasks_trace callback does not have * a hold on the original bpf_mem_alloc object that allocated the * 'ptr'. This should only be used in the uncommon code path. * Otherwise, the bpf_mem_alloc's free_llist cannot be refilled * and may affect performance. */ void bpf_mem_cache_raw_free(void *ptr) { … } /* When flags == GFP_KERNEL, it signals that the caller will not cause * deadlock when using kmalloc. bpf_mem_cache_alloc_flags() will use * kmalloc if the free_llist is empty. */ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags) { … } int bpf_mem_alloc_check_size(bool percpu, size_t size) { … }