// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
#include <linux/mm.h>
#include <linux/llist.h>
#include <linux/bpf.h>
#include <linux/irq_work.h>
#include <linux/bpf_mem_alloc.h>
#include <linux/memcontrol.h>
#include <asm/local.h>

/* Any context (including NMI) BPF specific memory allocator.
 *
 * Tracing BPF programs can attach to kprobe and fentry. Hence they
 * run in an unknown context where calling plain kmalloc() might not be safe.
 *
 * Front-end kmalloc() with a per-cpu per-bucket cache of free elements.
 * Refill this cache asynchronously from irq_work.
 *
 * CPU_0 buckets
 * 16 32 64 96 128 192 256 512 1024 2048 4096
 * ...
 * CPU_N buckets
 * 16 32 64 96 128 192 256 512 1024 2048 4096
 *
 * The buckets are prefilled at the start.
 * BPF programs always run with migration disabled.
 * It's safe to allocate from the cache of the current cpu with irqs disabled.
 * Freeing is always done into the bucket of the current cpu as well.
 * irq_work trims extra free elements from the buckets with kfree
 * and refills them with kmalloc, so the global kmalloc logic takes care
 * of freeing objects allocated by one cpu and freed on another.
 *
 * Every allocated object is padded with an extra 8 bytes that contain
 * a struct llist_node.
 */
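/* Illustrative sketch only (hypothetical helper, not used by the allocator
 * below): given the bucket ladder and the 8-byte llist_node padding described
 * above, a request of 'size' bytes presumably occupies the smallest bucket
 * that can hold size plus the 8-byte freelist header. The real selection
 * logic lives in bpf_mem_cache_idx() below.
 */
static const unsigned int sketch_bucket_sizes[] = {
	16, 32, 64, 96, 128, 192, 256, 512, 1024, 2048, 4096,
};

static unsigned int __maybe_unused sketch_bucket_size(unsigned int size)
{
	unsigned int i;

	/* Room for the struct llist_node that pads every unit. */
	size += sizeof(struct llist_node);
	for (i = 0; i < ARRAY_SIZE(sketch_bucket_sizes); i++)
		if (size <= sketch_bucket_sizes[i])
			return sketch_bucket_sizes[i];
	return 0; /* larger than the biggest (4096) bucket */
}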
#define LLIST_NODE_SZ …

/* Similar to kmalloc's size lookup, but the size == 8 bucket is gone. */
static u8 size_index[24] __ro_after_init = …;

static int bpf_mem_cache_idx(size_t size) { … }

#define NUM_CACHES …

struct bpf_mem_cache { … };

struct bpf_mem_caches { … };

static const u16 sizes[NUM_CACHES] = …;

static struct llist_node notrace *__llist_del_first(struct llist_head *head) { … }

static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags) { … }

static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c) { … }

static void inc_active(struct bpf_mem_cache *c, unsigned long *flags) { … }

static void dec_active(struct bpf_mem_cache *c, unsigned long *flags) { … }

static void add_obj_to_free_list(struct bpf_mem_cache *c, void *obj) { … }

/* Mostly runs from irq_work, except during the __init phase. */
static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node, bool atomic) { … }

static void free_one(void *obj, bool percpu) { … }

static int free_all(struct llist_node *llnode, bool percpu) { … }

static void __free_rcu(struct rcu_head *head) { … }

static void __free_rcu_tasks_trace(struct rcu_head *head) { … }

static void enque_to_free(struct bpf_mem_cache *c, void *obj) { … }

static void do_call_rcu_ttrace(struct bpf_mem_cache *c) { … }

static void free_bulk(struct bpf_mem_cache *c) { … }

static void __free_by_rcu(struct rcu_head *head) { … }

static void check_free_by_rcu(struct bpf_mem_cache *c) { … }

static void bpf_mem_refill(struct irq_work *work) { … }

static void notrace irq_work_raise(struct bpf_mem_cache *c) { … }

/* For the typical bpf map case that uses bpf_mem_cache_alloc and a single
 * bucket, the freelist cache will be elem_size * 64 (or less) on each cpu.
 *
 * For bpf programs that don't have statically known allocation sizes,
 * assuming (low_mark + high_mark) / 2 as the average number of elements per
 * bucket and all buckets in use, the total amount of memory in the freelists
 * on each cpu will be:
 * 64*16 + 64*32 + 64*64 + 64*96 + 64*128 + 64*192 + 64*256 + 32*512 + 16*1024 + 8*2048 + 4*4096
 * == ~ 116 Kbyte using the heuristic below.
 * An initialized but unused bpf allocator (not a bpf map specific one) will
 * consume ~ 11 Kbyte per cpu.
 * The typical case will be between 11K and 116K, closer to 11K.
 * BPF progs can and should share a bpf_mem_cache when possible.
 *
 * Percpu allocation is typically rare. To avoid potentially unnecessary large
 * memory consumption, set low_mark = 1 and high_mark = 3, resulting in c->batch = 1.
 */
static void init_refill_work(struct bpf_mem_cache *c) { … }

static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) { … }

/* When size != 0 allocate a single bpf_mem_cache for each cpu.
 * This is the typical bpf hash map use case when all elements have equal size.
 *
 * When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on
 * kmalloc/kfree. Max allocation size is 4096 in this case.
 * This is the bpf_dynptr and bpf_kptr use case.
 */
int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) { … }

int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg) { … }

int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size) { … }

static void drain_mem_cache(struct bpf_mem_cache *c) { … }

static void check_mem_cache(struct bpf_mem_cache *c) { … }

static void check_leaked_objs(struct bpf_mem_alloc *ma) { … }

static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma) { … }

static void free_mem_alloc(struct bpf_mem_alloc *ma) { … }

static void free_mem_alloc_deferred(struct work_struct *work) { … }

static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress) { … }

void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) { … }

/* notrace is necessary here and in other functions to make sure
 * bpf programs cannot attach to them and cause llist corruptions.
 */
static void notrace *unit_alloc(struct bpf_mem_cache *c) { … }

/* Though the 'ptr' object could have been allocated on a different cpu,
 * add it to the free_llist of the current cpu.
 * Let the kfree() logic deal with it when it's later called from irq_work.
 */
static void notrace unit_free(struct bpf_mem_cache *c, void *ptr) { … }

static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr) { … }

/* Called from a BPF program or from the sys_bpf syscall.
 * In both cases migration is disabled.
 */
void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size) { … }

void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr) { … }

void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr) { … }

void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma) { … }

void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr) { … }

void notrace bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr) { … }

/* Directly does a kfree() without putting 'ptr' back to the free_llist
 * for reuse and without waiting for an rcu_tasks_trace gp.
 * The caller must first go through the rcu_tasks_trace gp for 'ptr'
 * before calling bpf_mem_cache_raw_free().
 * It could be used when the rcu_tasks_trace callback does not have
 * a hold on the original bpf_mem_alloc object that allocated the
 * 'ptr'. This should only be used in the uncommon code path.
 * Otherwise, the bpf_mem_alloc's free_llist cannot be refilled
 * and may affect performance.
 */
void bpf_mem_cache_raw_free(void *ptr) { … }

/* When flags == GFP_KERNEL, it signals that the caller will not cause
 * a deadlock when using kmalloc. bpf_mem_cache_alloc_flags() will use
 * kmalloc if the free_llist is empty.
 */
void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags) { … }
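
/* Illustrative usage sketch (hypothetical caller, not part of this file):
 * shows the two init modes described above. With size != 0 a single
 * fixed-size cache is set up per cpu and bpf_mem_cache_alloc/free are used;
 * with size == 0 all 11 buckets are set up and bpf_mem_alloc/free take an
 * explicit size of up to 4096 bytes. Real callers (BPF programs, sys_bpf)
 * run with migration disabled, which the sketch mimics explicitly.
 */
static int __maybe_unused sketch_use_bpf_mem_alloc(void)
{
	struct bpf_mem_alloc fixed = {}, var = {};
	void *elem, *buf;
	int err;

	/* Fixed-size mode: typical bpf hash map with equal-sized elements. */
	err = bpf_mem_alloc_init(&fixed, 64, false);
	if (err)
		return err;

	/* Variable-size mode: bpf_dynptr/bpf_kptr style, 11 buckets per cpu. */
	err = bpf_mem_alloc_init(&var, 0, false);
	if (err)
		goto out_fixed;

	migrate_disable();
	elem = bpf_mem_cache_alloc(&fixed);	/* no size argument needed */
	buf = bpf_mem_alloc(&var, 200);		/* size chosen at runtime */
	if (elem)
		bpf_mem_cache_free(&fixed, elem);
	if (buf)
		bpf_mem_free(&var, buf);
	migrate_enable();

	bpf_mem_alloc_destroy(&var);
out_fixed:
	bpf_mem_alloc_destroy(&fixed);
	return err;
}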