// SPDX-License-Identifier: GPL-2.0-only /* net/sched/sch_hhf.c Heavy-Hitter Filter (HHF) * * Copyright (C) 2013 Terry Lam <[email protected]> * Copyright (C) 2013 Nandita Dukkipati <[email protected]> */ #include <linux/jiffies.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/vmalloc.h> #include <linux/siphash.h> #include <net/pkt_sched.h> #include <net/sock.h> /* Heavy-Hitter Filter (HHF) * * Principles : * Flows are classified into two buckets: non-heavy-hitter and heavy-hitter * buckets. Initially, a new flow starts as non-heavy-hitter. Once classified * as heavy-hitter, it is immediately switched to the heavy-hitter bucket. * The buckets are dequeued by a Weighted Deficit Round Robin (WDRR) scheduler, * in which the heavy-hitter bucket is served with less weight. * In other words, non-heavy-hitters (e.g., short bursts of critical traffic) * are isolated from heavy-hitters (e.g., persistent bulk traffic) and also have * higher share of bandwidth. * * To capture heavy-hitters, we use the "multi-stage filter" algorithm in the * following paper: * [EV02] C. Estan and G. Varghese, "New Directions in Traffic Measurement and * Accounting", in ACM SIGCOMM, 2002. * * Conceptually, a multi-stage filter comprises k independent hash functions * and k counter arrays. Packets are indexed into k counter arrays by k hash * functions, respectively. The counters are then increased by the packet sizes. * Therefore, * - For a heavy-hitter flow: *all* of its k array counters must be large. * - For a non-heavy-hitter flow: some of its k array counters can be large * due to hash collision with other small flows; however, with high * probability, not *all* k counters are large. * * By the design of the multi-stage filter algorithm, the false negative rate * (heavy-hitters getting away uncaptured) is zero. However, the algorithm is * susceptible to false positives (non-heavy-hitters mistakenly classified as * heavy-hitters). * Therefore, we also implement the following optimizations to reduce false * positives by avoiding unnecessary increment of the counter values: * - Optimization O1: once a heavy-hitter is identified, its bytes are not * accounted in the array counters. This technique is called "shielding" * in Section 3.3.1 of [EV02]. * - Optimization O2: conservative update of counters * (Section 3.3.2 of [EV02]), * New counter value = max {old counter value, * smallest counter value + packet bytes} * * Finally, we refresh the counters periodically since otherwise the counter * values will keep accumulating. * * Once a flow is classified as heavy-hitter, we also save its per-flow state * in an exact-matching flow table so that its subsequent packets can be * dispatched to the heavy-hitter bucket accordingly. * * * At a high level, this qdisc works as follows: * Given a packet p: * - If the flow-id of p (e.g., TCP 5-tuple) is already in the exact-matching * heavy-hitter flow table, denoted table T, then send p to the heavy-hitter * bucket. * - Otherwise, forward p to the multi-stage filter, denoted filter F * + If F decides that p belongs to a non-heavy-hitter flow, then send p * to the non-heavy-hitter bucket. * + Otherwise, if F decides that p belongs to a new heavy-hitter flow, * then set up a new flow entry for the flow-id of p in the table T and * send p to the heavy-hitter bucket. * * In this implementation: * - T is a fixed-size hash-table with 1024 entries. Hash collision is * resolved by linked-list chaining. * - F has four counter arrays, each array containing 1024 32-bit counters. * That means 4 * 1024 * 32 bits = 16KB of memory. * - Since each array in F contains 1024 counters, 10 bits are sufficient to * index into each array. * Hence, instead of having four hash functions, we chop the 32-bit * skb-hash into three 10-bit chunks, and the remaining 10-bit chunk is * computed as XOR sum of those three chunks. * - We need to clear the counter arrays periodically; however, directly * memsetting 16KB of memory can lead to cache eviction and unwanted delay. * So by representing each counter by a valid bit, we only need to reset * 4K of 1 bit (i.e. 512 bytes) instead of 16KB of memory. * - The Deficit Round Robin engine is taken from fq_codel implementation * (net/sched/sch_fq_codel.c). Note that wdrr_bucket corresponds to * fq_codel_flow in fq_codel implementation. * */ /* Non-configurable parameters */ #define HH_FLOWS_CNT … #define HHF_ARRAYS_CNT … #define HHF_ARRAYS_LEN … #define HHF_BIT_MASK_LEN … #define HHF_BIT_MASK … #define WDRR_BUCKET_CNT … enum wdrr_bucket_idx { … }; #define hhf_time_before(a, b) … /* Heavy-hitter per-flow state */ struct hh_flow_state { … }; /* Weighted Deficit Round Robin (WDRR) scheduler */ struct wdrr_bucket { … }; struct hhf_sched_data { … }; static u32 hhf_time_stamp(void) { … } /* Looks up a heavy-hitter flow in a chaining list of table T. */ static struct hh_flow_state *seek_list(const u32 hash, struct list_head *head, struct hhf_sched_data *q) { … } /* Returns a flow state entry for a new heavy-hitter. Either reuses an expired * entry or dynamically alloc a new entry. */ static struct hh_flow_state *alloc_new_hh(struct list_head *head, struct hhf_sched_data *q) { … } /* Assigns packets to WDRR buckets. Implements a multi-stage filter to * classify heavy-hitters. */ static enum wdrr_bucket_idx hhf_classify(struct sk_buff *skb, struct Qdisc *sch) { … } /* Removes one skb from head of bucket. */ static struct sk_buff *dequeue_head(struct wdrr_bucket *bucket) { … } /* Tail-adds skb to bucket. */ static void bucket_add(struct wdrr_bucket *bucket, struct sk_buff *skb) { … } static unsigned int hhf_drop(struct Qdisc *sch, struct sk_buff **to_free) { … } static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { … } static struct sk_buff *hhf_dequeue(struct Qdisc *sch) { … } static void hhf_reset(struct Qdisc *sch) { … } static void hhf_destroy(struct Qdisc *sch) { … } static const struct nla_policy hhf_policy[TCA_HHF_MAX + 1] = …; static int hhf_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { … } static int hhf_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { … } static int hhf_dump(struct Qdisc *sch, struct sk_buff *skb) { … } static int hhf_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { … } static struct Qdisc_ops hhf_qdisc_ops __read_mostly = …; MODULE_ALIAS_NET_SCH(…) …; static int __init hhf_module_init(void) { … } static void __exit hhf_module_exit(void) { … } module_init(…) … module_exit(…) MODULE_AUTHOR(…) …; MODULE_AUTHOR(…) …; MODULE_LICENSE(…) …; MODULE_DESCRIPTION(…) …;