linux/kernel/bpf/devmap.c

// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
 */

/* A devmap's primary use is as a backend map for the XDP BPF helper call
 * bpf_redirect_map(). Because XDP is mostly concerned with performance, we
 * spent some effort to ensure the datapath with redirect maps does not use
 * any locking. This is a quick note on the details.
 *
 * We have three possible paths into the devmap control plane: bpf syscalls,
 * bpf programs, and driver-side xmit/flush operations. A bpf syscall will
 * invoke an update, delete, or lookup operation. To ensure updates and
 * deletes appear atomic from the datapath side, xchg() is used to modify the
 * netdev_map array. Then, because the datapath does a lookup into the
 * netdev_map array (read-only) from an RCU critical section, we use
 * call_rcu() to wait for an RCU grace period before freeing the old data
 * structures (a sketch of this pattern accompanies __dev_map_update_elem()
 * below). This ensures the datapath always has a valid copy. However, the
 * datapath does a "flush" operation that pushes any pending packets in the
 * driver outside the RCU critical section. Each bpf_dtab_netdev tracks these
 * pending operations using a per-cpu flush list. The bpf_dtab_netdev object
 * will not be destroyed until this list is empty, indicating outstanding
 * flush operations have completed.
 *
 * BPF syscalls may race with BPF program calls on any of the update, delete
 * or lookup operations. As noted above, the xchg() operation also keeps the
 * netdev_map consistent in this case. From the devmap side, BPF programs
 * calling into these operations are the same as multiple user-space threads
 * making system calls.
 *
 * Finally, any of the above may race with a netdev_unregister notifier. The
 * unregister notifier must search the map structure for entries that hold a
 * reference to the net device being removed and remove them. This is a
 * two-step process: (a) dereference the bpf_dtab_netdev object in netdev_map
 * and (b) check whether the ifindex is the same as the net_device being
 * removed. When removing the dev, a cmpxchg() is used to ensure the correct
 * dev is removed; with a concurrent update or delete operation it is possible
 * that the initially referenced dev is no longer in the map. As the notifier
 * hook walks the map we know that new dev references cannot be added by the
 * user because core infrastructure ensures dev_get_by_index() calls will
 * fail at this point.
 *
 * The devmap_hash type is a map type that interprets keys as ifindexes and
 * indexes these using a hashmap. This allows maps that use ifindex as key to
 * be densely packed instead of having holes in the lookup array for unused
 * ifindexes. The setup and packet enqueue/send code is shared between the two
 * types of devmap; only the lookup and insertion are different.
 */
#include <linux/bpf.h>
#include <net/xdp.h>
#include <linux/filter.h>
#include <trace/events/xdp.h>
#include <linux/btf_ids.h>

#define DEV_CREATE_FLAG_MASK

struct xdp_dev_bulk_queue {};

struct bpf_dtab_netdev {};

struct bpf_dtab {};

static DEFINE_SPINLOCK(dev_map_lock);
static LIST_HEAD(dev_map_list);

static struct hlist_head *dev_map_create_hash(unsigned int entries,
					      int numa_node)
{}

static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
						    int idx)
{}

static int dev_map_alloc_check(union bpf_attr *attr)
{}

static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
{}

static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
{}

static void dev_map_free(struct bpf_map *map)
{}

static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{}

/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
 * by local_bh_disable() (from XDP calls inside NAPI). The
 * rcu_read_lock_bh_held() below makes lockdep accept both.
 */
static void *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
{}
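
/* A minimal sketch of the hash lookup above, assuming struct bpf_dtab_netdev
 * is linked into its bucket via an index_hlist node keyed on an idx member,
 * and that updaters serialize on a dtab->index_lock spinlock (all assumed
 * names for the example; illustrative only, not the kernel's exact code).
 */
static struct bpf_dtab_netdev *sketch_dev_map_hash_lookup(struct bpf_dtab *dtab,
							  u32 key)
{
	struct hlist_head *head = dev_map_index_hash(dtab, key);
	struct bpf_dtab_netdev *dev;

	/* The extra lockdep condition lets the traversal run both from RCU
	 * readers (XDP and syscall lookups) and from updaters holding
	 * index_lock.
	 */
	hlist_for_each_entry_rcu(dev, head, index_hlist,
				 lockdep_is_held(&dtab->index_lock))
		if (dev->idx == key)
			return dev;

	return NULL;
}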

static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
				    void *next_key)
{}

static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
				struct xdp_frame **frames, int n,
				struct net_device *dev)
{}

static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
{}

/* __dev_flush is called from xdp_do_flush() which _must_ be signalled from the
 * driver before returning from its napi->poll() routine. See the comment above
 * xdp_do_flush() in filter.c.
 */
void __dev_flush(struct list_head *flush_list)
{}
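
/* A minimal sketch of the flush walk above, assuming each
 * struct xdp_dev_bulk_queue is linked into the per-cpu flush list through a
 * flush_node member (an assumed field name for the example).
 */
static void sketch_dev_flush(struct list_head *flush_list)
{
	struct xdp_dev_bulk_queue *bq, *tmp;

	/* The _safe variant is required because each queue is unlinked
	 * while the list is being walked.
	 */
	list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
		/* Hand every staged frame to the driver and kick the
		 * transmit path (XDP_XMIT_FLUSH).
		 */
		bq_xmit_all(bq, XDP_XMIT_FLUSH);
		list_del_init(&bq->flush_node);
	}
}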

/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
 * by local_bh_disable() (from XDP calls inside NAPI). The
 * rcu_read_lock_bh_held() below makes lockdep accept both.
 */
static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
{}
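
/* A minimal sketch of the array lookup above: a bounds check, then an
 * RCU-checked read of the netdev_map slot. rcu_read_lock_bh_held() is the
 * condition that makes lockdep accept both caller contexts mentioned in the
 * comment above (the embedded map member and the netdev_map layout are
 * assumed for the example).
 */
static struct bpf_dtab_netdev *sketch_dev_map_lookup(struct bpf_dtab *dtab,
						     u32 key)
{
	if (key >= dtab->map.max_entries)
		return NULL;

	return rcu_dereference_check(dtab->netdev_map[key],
				     rcu_read_lock_bh_held());
}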

/* Runs in NAPI, i.e., softirq under local_bh_disable(). Thus, percpu
 * variables can be accessed safely, and map elements stick around. See the
 * comment above xdp_do_flush() in filter.c.
 */
static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
		       struct net_device *dev_rx, struct bpf_prog *xdp_prog)
{}
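
/* A minimal sketch of the bulk enqueue above. Frames are staged in a per-cpu
 * queue hanging off the destination device and handed to the driver in
 * batches; on its first use after a flush the queue adds itself to the flush
 * list so __dev_flush() can drain it at NAPI end. The q/count/dev_rx members
 * are assumed field names, and the flush list is taken as a parameter here
 * rather than fetched from the real per-cpu context.
 */
static void sketch_bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
			      struct net_device *dev_rx,
			      struct list_head *flush_list)
{
	struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);

	/* Queue full: push the current batch to the driver first. */
	if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
		bq_xmit_all(bq, 0);

	/* First frame staged on this CPU since the last flush: register the
	 * queue so xdp_do_flush() will find it.
	 */
	if (!bq->dev_rx) {
		bq->dev_rx = dev_rx;
		list_add(&bq->flush_node, flush_list);
	}

	bq->q[bq->count++] = xdpf;
}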

static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
				struct net_device *dev_rx,
				struct bpf_prog *xdp_prog)
{}

static u32 dev_map_bpf_prog_run_skb(struct sk_buff *skb, struct bpf_dtab_netdev *dst)
{}

int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
		    struct net_device *dev_rx)
{}

int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf,
		    struct net_device *dev_rx)
{}

static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf)
{}

static int dev_map_enqueue_clone(struct bpf_dtab_netdev *obj,
				 struct net_device *dev_rx,
				 struct xdp_frame *xdpf)
{}

static inline bool is_ifindex_excluded(int *excluded, int num_excluded, int ifindex)
{}

/* Get ifindex of each upper device. 'indexes' must be able to hold at
 * least MAX_NEST_DEV elements.
 * Returns the number of ifindexes added.
 */
static int get_upper_ifindexes(struct net_device *dev, int *indexes)
{}
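
/* A minimal sketch of the upper-device walk above, using the
 * netdev_for_each_upper_dev_rcu() iterator from <linux/netdevice.h>; the
 * caller must be in an RCU read-side section and 'indexes' must hold at
 * least MAX_NEST_DEV entries.
 */
static int sketch_get_upper_ifindexes(struct net_device *dev, int *indexes)
{
	struct net_device *upper;
	struct list_head *iter;
	int n = 0;

	/* Record the ifindex of each directly stacked upper device,
	 * e.g. a bond or bridge on top of this one.
	 */
	netdev_for_each_upper_dev_rcu(dev, upper, iter)
		indexes[n++] = upper->ifindex;

	return n;
}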

int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx,
			  struct bpf_map *map, bool exclude_ingress)
{}

int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
			     struct bpf_prog *xdp_prog)
{}

static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst,
				  struct sk_buff *skb,
				  struct bpf_prog *xdp_prog)
{}

int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
			   struct bpf_prog *xdp_prog, struct bpf_map *map,
			   bool exclude_ingress)
{}

static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
{}

static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key)
{}

static void __dev_map_entry_free(struct rcu_head *rcu)
{}

static long dev_map_delete_elem(struct bpf_map *map, void *key)
{}

static long dev_map_hash_delete_elem(struct bpf_map *map, void *key)
{}

static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
						    struct bpf_dtab *dtab,
						    struct bpf_devmap_val *val,
						    unsigned int idx)
{}

static long __dev_map_update_elem(struct net *net, struct bpf_map *map,
				  void *key, void *value, u64 map_flags)
{}
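
/* A minimal sketch of the xchg()/call_rcu() update pattern described in the
 * header comment. Illustrative only, not the kernel's exact code: the
 * netdev_map member of struct bpf_dtab and the rcu member of
 * struct bpf_dtab_netdev are assumed field names here.
 */
static void sketch_dev_map_replace_entry(struct bpf_dtab *dtab, unsigned int i,
					 struct bpf_dtab_netdev *new_dev)
{
	struct bpf_dtab_netdev *old_dev;

	/* Publish the new entry atomically: concurrent RCU readers in the
	 * datapath observe either the old or the new pointer, never a
	 * half-updated slot.
	 */
	old_dev = unrcu_pointer(xchg(&dtab->netdev_map[i],
				     RCU_INITIALIZER(new_dev)));
	if (old_dev)
		/* Defer the free until every pre-existing RCU read-side
		 * critical section (i.e. every in-flight XDP lookup) is done.
		 */
		call_rcu(&old_dev->rcu, __dev_map_entry_free);
}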

static long dev_map_update_elem(struct bpf_map *map, void *key, void *value,
				u64 map_flags)
{}

static long __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
				       void *key, void *value, u64 map_flags)
{}

static long dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
				     u64 map_flags)
{}

static long dev_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
{}

static long dev_hash_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
{}

static u64 dev_map_mem_usage(const struct bpf_map *map)
{}

BTF_ID_LIST_SINGLE(dev_map_btf_ids, struct, bpf_dtab)
const struct bpf_map_ops dev_map_ops =;

const struct bpf_map_ops dev_map_hash_ops =;

static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab,
				       struct net_device *netdev)
{}

static int dev_map_notification(struct notifier_block *notifier,
				ulong event, void *ptr)
{}
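
/* A minimal sketch of the two-step unregister removal described in the header
 * comment: dereference the slot, match the device, then cmpxchg() the entry
 * out so a concurrent update/delete that already replaced it is left alone.
 * Runs under rcu_read_lock(); the netdev_map, dev and rcu members are assumed
 * field names for the example.
 */
static void sketch_dev_map_remove_netdev(struct bpf_dtab *dtab,
					 struct net_device *netdev)
{
	unsigned int i;

	for (i = 0; i < dtab->map.max_entries; i++) {
		struct bpf_dtab_netdev *dev, *odev;

		dev = rcu_dereference(dtab->netdev_map[i]);
		if (!dev || dev->dev != netdev)
			continue;

		/* Only clear the slot if it still holds the entry we just
		 * checked; if a concurrent update replaced it, leave the
		 * new entry alone.
		 */
		odev = unrcu_pointer(cmpxchg(&dtab->netdev_map[i],
					     RCU_INITIALIZER(dev), NULL));
		if (odev == dev)
			call_rcu(&dev->rcu, __dev_map_entry_free);
	}
}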

static struct notifier_block dev_map_notifier =;

static int __init dev_map_init(void)
{}

subsys_initcall(dev_map_init);