sch_hhf.c | Explore in Territory

// SPDX-License-Identifier: GPL-2.0-only
/* net/sched/sch_hhf.c		Heavy-Hitter Filter (HHF)
 *
 * Copyright (C) 2013 Terry Lam <[email protected]>
 * Copyright (C) 2013 Nandita Dukkipati <[email protected]>
 */

#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <linux/siphash.h>
#include <net/pkt_sched.h>
#include <net/sock.h>

/*	Heavy-Hitter Filter (HHF)
 *
 * Principles :
 * Flows are classified into two buckets: non-heavy-hitter and heavy-hitter
 * buckets. Initially, a new flow starts as non-heavy-hitter. Once classified
 * as heavy-hitter, it is immediately switched to the heavy-hitter bucket.
 * The buckets are dequeued by a Weighted Deficit Round Robin (WDRR) scheduler,
 * in which the heavy-hitter bucket is served with less weight.
 * In other words, non-heavy-hitters (e.g., short bursts of critical traffic)
 * are isolated from heavy-hitters (e.g., persistent bulk traffic) and also have
 * higher share of bandwidth.
 *
 * To capture heavy-hitters, we use the "multi-stage filter" algorithm in the
 * following paper:
 * [EV02] C. Estan and G. Varghese, "New Directions in Traffic Measurement and
 * Accounting", in ACM SIGCOMM, 2002.
 *
 * Conceptually, a multi-stage filter comprises k independent hash functions
 * and k counter arrays. Packets are indexed into k counter arrays by k hash
 * functions, respectively. The counters are then increased by the packet sizes.
 * Therefore,
 *    - For a heavy-hitter flow: *all* of its k array counters must be large.
 *    - For a non-heavy-hitter flow: some of its k array counters can be large
 *      due to hash collision with other small flows; however, with high
 *      probability, not *all* k counters are large.
 *
 * By the design of the multi-stage filter algorithm, the false negative rate
 * (heavy-hitters getting away uncaptured) is zero. However, the algorithm is
 * susceptible to false positives (non-heavy-hitters mistakenly classified as
 * heavy-hitters).
 * Therefore, we also implement the following optimizations to reduce false
 * positives by avoiding unnecessary increment of the counter values:
 *    - Optimization O1: once a heavy-hitter is identified, its bytes are not
 *        accounted in the array counters. This technique is called "shielding"
 *        in Section 3.3.1 of [EV02].
 *    - Optimization O2: conservative update of counters
 *                       (Section 3.3.2 of [EV02]),
 *        New counter value = max {old counter value,
 *                                 smallest counter value + packet bytes}
 *
 * Finally, we refresh the counters periodically since otherwise the counter
 * values will keep accumulating.
 *
 * Once a flow is classified as heavy-hitter, we also save its per-flow state
 * in an exact-matching flow table so that its subsequent packets can be
 * dispatched to the heavy-hitter bucket accordingly.
 *
 *
 * At a high level, this qdisc works as follows:
 * Given a packet p:
 *   - If the flow-id of p (e.g., TCP 5-tuple) is already in the exact-matching
 *     heavy-hitter flow table, denoted table T, then send p to the heavy-hitter
 *     bucket.
 *   - Otherwise, forward p to the multi-stage filter, denoted filter F
 *        + If F decides that p belongs to a non-heavy-hitter flow, then send p
 *          to the non-heavy-hitter bucket.
 *        + Otherwise, if F decides that p belongs to a new heavy-hitter flow,
 *          then set up a new flow entry for the flow-id of p in the table T and
 *          send p to the heavy-hitter bucket.
 *
 * In this implementation:
 *   - T is a fixed-size hash-table with 1024 entries. Hash collision is
 *     resolved by linked-list chaining.
 *   - F has four counter arrays, each array containing 1024 32-bit counters.
 *     That means 4 * 1024 * 32 bits = 16KB of memory.
 *   - Since each array in F contains 1024 counters, 10 bits are sufficient to
 *     index into each array.
 *     Hence, instead of having four hash functions, we chop the 32-bit
 *     skb-hash into three 10-bit chunks, and the remaining 10-bit chunk is
 *     computed as XOR sum of those three chunks.
 *   - We need to clear the counter arrays periodically; however, directly
 *     memsetting 16KB of memory can lead to cache eviction and unwanted delay.
 *     So by representing each counter by a valid bit, we only need to reset
 *     4K of 1 bit (i.e. 512 bytes) instead of 16KB of memory.
 *   - The Deficit Round Robin engine is taken from fq_codel implementation
 *     (net/sched/sch_fq_codel.c). Note that wdrr_bucket corresponds to
 *     fq_codel_flow in fq_codel implementation.
 *
 */

/* Non-configurable parameters */
#define HH_FLOWS_CNT …
#define HHF_ARRAYS_CNT …
#define HHF_ARRAYS_LEN …
#define HHF_BIT_MASK_LEN …
#define HHF_BIT_MASK …

#define WDRR_BUCKET_CNT …
enum wdrr_bucket_idx { … };

#define hhf_time_before(a, b) …

/* Heavy-hitter per-flow state */
struct hh_flow_state { … };

/* Weighted Deficit Round Robin (WDRR) scheduler */
struct wdrr_bucket { … };

struct hhf_sched_data { … };

static u32 hhf_time_stamp(void)
{ … }

/* Looks up a heavy-hitter flow in a chaining list of table T. */
static struct hh_flow_state *seek_list(const u32 hash,
				       struct list_head *head,
				       struct hhf_sched_data *q)
{ … }

/* Returns a flow state entry for a new heavy-hitter.  Either reuses an expired
 * entry or dynamically alloc a new entry.
 */
static struct hh_flow_state *alloc_new_hh(struct list_head *head,
					  struct hhf_sched_data *q)
{ … }

/* Assigns packets to WDRR buckets.  Implements a multi-stage filter to
 * classify heavy-hitters.
 */
static enum wdrr_bucket_idx hhf_classify(struct sk_buff *skb, struct Qdisc *sch)
{ … }

/* Removes one skb from head of bucket. */
static struct sk_buff *dequeue_head(struct wdrr_bucket *bucket)
{ … }

/* Tail-adds skb to bucket. */
static void bucket_add(struct wdrr_bucket *bucket, struct sk_buff *skb)
{ … }

static unsigned int hhf_drop(struct Qdisc *sch, struct sk_buff **to_free)
{ … }

static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch,
		       struct sk_buff **to_free)
{ … }

static struct sk_buff *hhf_dequeue(struct Qdisc *sch)
{ … }

static void hhf_reset(struct Qdisc *sch)
{ … }

static void hhf_destroy(struct Qdisc *sch)
{ … }

static const struct nla_policy hhf_policy[TCA_HHF_MAX + 1] = …;

static int hhf_change(struct Qdisc *sch, struct nlattr *opt,
		      struct netlink_ext_ack *extack)
{ … }

static int hhf_init(struct Qdisc *sch, struct nlattr *opt,
		    struct netlink_ext_ack *extack)
{ … }

static int hhf_dump(struct Qdisc *sch, struct sk_buff *skb)
{ … }

static int hhf_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{ … }

static struct Qdisc_ops hhf_qdisc_ops __read_mostly = …;
MODULE_ALIAS_NET_SCH(…) …;

static int __init hhf_module_init(void)
{ … }

static void __exit hhf_module_exit(void)
{ … }

module_init(…) …
module_exit(…)
MODULE_AUTHOR(…) …;
MODULE_AUTHOR(…) …;
MODULE_LICENSE(…) …;
MODULE_DESCRIPTION(…) …;
linux/net/sched/sch_hhf.c