// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
/* COMMON Applications Kept Enhanced (CAKE) discipline
 *
 * Copyright (C) 2014-2018 Jonathan Morton <[email protected]>
 * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <[email protected]>
 * Copyright (C) 2014-2018 Dave Täht <[email protected]>
 * Copyright (C) 2015-2018 Sebastian Moeller <[email protected]>
 * Copyright (C) 2015-2018 Kevin Darbyshire-Bryant <[email protected]>
 * Copyright (C) 2017-2018 Ryan Mounce <[email protected]>
 *
 * The CAKE Principles:
 *		(or, how to have your cake and eat it too)
 *
 * This is a combination of several shaping, AQM and FQ techniques into one
 * easy-to-use package:
 *
 * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
 *   equipment and bloated MACs.  This operates in deficit mode (as in sch_fq),
 *   eliminating the need for any sort of burst parameter (eg. token bucket
 *   depth).  Burst support is limited to that necessary to overcome scheduling
 *   latency.
 *
 * - A Diffserv-aware priority queue, giving more priority to certain classes,
 *   up to a specified fraction of bandwidth.  Above that bandwidth threshold,
 *   the priority is reduced to avoid starving other tins.
 *
 * - Each priority tin has a separate Flow Queue system, to isolate traffic
 *   flows from each other.  This prevents a burst on one flow from increasing
 *   the delay to another.  Flows are distributed to queues using a
 *   set-associative hash function.
 *
 * - Each queue is actively managed by Cobalt, which is a combination of the
 *   Codel and Blue AQM algorithms.  This serves flows fairly, and signals
 *   congestion early via ECN (if available) and/or packet drops, to keep
 *   latency low.  The codel parameters are auto-tuned based on the bandwidth
 *   setting, as is necessary at low bandwidths.
 *
 * The configuration parameters are kept deliberately simple for ease of use.
 * Everything has sane defaults.  Complete generality of configuration is *not*
 * a goal.
 *
 * The priority queue operates according to a weighted DRR scheme, combined
 * with a bandwidth tracker which reuses the shaper logic to detect which side
 * of the bandwidth sharing threshold the tin is operating on.  This determines
 * whether a priority-based weight (high) or a bandwidth-based weight (low) is
 * used for that tin in the current pass.
 *
 * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
 * granted us permission to leverage.
 */
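
/* Illustrative sketch only: the deficit-mode weighted round-robin described
 * above, reduced to its essentials.  All names here are hypothetical and the
 * real per-tin state lives in the elided struct cake_tin_data below; this is
 * just a sketch of the credit/deficit idea, assuming byte-based quanta.
 */
struct drr_tin_sketch {
	int deficit;		/* bytes this tin may still send this round */
	unsigned int quantum;	/* base credit added per scheduling round */
	unsigned int weight;	/* priority- or bandwidth-based weight */
};

/* Returns 1 if the tin may transmit a packet of head_len bytes now, charging
 * its deficit; returns 0 if the tin must first be replenished
 * (deficit += quantum * weight) and revisited on a later pass.
 */
static inline int drr_tin_try_send(struct drr_tin_sketch *t,
				   unsigned int head_len)
{
	if (t->deficit < (int)head_len)
		return 0;
	t->deficit -= (int)head_len;
	return 1;
}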
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/reciprocal_div.h>
#include <net/netlink.h>
#include <linux/if_vlan.h>
#include <net/gso.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/tcp.h>
#include <net/flow_dissector.h>

#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <net/netfilter/nf_conntrack_core.h>
#endif

#define CAKE_SET_WAYS …
#define CAKE_MAX_TINS …
#define CAKE_QUEUES …
#define CAKE_FLOW_MASK …
#define CAKE_FLOW_NAT_FLAG …

/* struct cobalt_params - contains codel and blue parameters
 * @interval:	codel initial drop rate
 * @target:	maximum persistent sojourn time & blue update rate
 * @mtu_time:	serialisation delay of maximum-size packet
 * @p_inc:	increment of blue drop probability (0.32 fxp)
 * @p_dec:	decrement of blue drop probability (0.32 fxp)
 */
struct cobalt_params { … };

/* struct cobalt_vars - contains codel and blue variables
 * @count:		codel dropping frequency
 * @rec_inv_sqrt:	reciprocal value of sqrt(count) >> 1
 * @drop_next:		time to drop next packet, or when we dropped last
 * @blue_timer:		Blue time to next drop
 * @p_drop:		BLUE drop probability (0.32 fxp)
 * @dropping:		set if in dropping state
 * @ecn_marked:		set if marked
 */
struct cobalt_vars { … };

enum { … };

struct cake_flow { … }; /* please try to keep this structure <= 64 bytes */

struct cake_host { … };

struct cake_heap_entry { … };

struct cake_tin_data { … }; /* number of tins is small, so size of this struct doesn't matter much */

struct cake_sched_data { … };

enum { … };

/* COBALT operates the Codel and BLUE algorithms in parallel, in order to
 * obtain the best features of each.  Codel is excellent on flows which
 * respond to congestion signals in a TCP-like way.  BLUE is more effective on
 * unresponsive flows.
 */

struct cobalt_skb_cb { … };

static u64 us_to_ns(u64 us) { … }

static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb) { … }

static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb) { … }

static void cobalt_set_enqueue_time(struct sk_buff *skb, ktime_t now) { … }

static u16 quantum_div[CAKE_QUEUES + 1] = …;

/* Diffserv lookup tables */

static const u8 precedence[] = …;

static const u8 diffserv8[] = …;

static const u8 diffserv4[] = …;

static const u8 diffserv3[] = …;

static const u8 besteffort[] = …;

/* tin priority order for stats dumping */

static const u8 normal_order[] = …;

static const u8 bulk_order[] = …;

#define REC_INV_SQRT_CACHE …
static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = …;

/* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
 * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
 *
 * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
 */
static void cobalt_newton_step(struct cobalt_vars *vars) { … }

static void cobalt_invsqrt(struct cobalt_vars *vars) { … }

/* There is a big difference in timing between the accurate values placed in
 * the cache and the approximations given by a single Newton step for small
 * count values, particularly when stepping from count 1 to 2 or vice versa.
 * Above 16, a single Newton step gives sufficient accuracy in either
 * direction, given the precision stored.
 *
 * The magnitude of the error when stepping up to count 2 is such as to give
 * the value that *should* have been produced at count 4.
 */
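
/* Illustrative sketch only (the bodies of cobalt_newton_step() and
 * cobalt_invsqrt() are elided above): one Newton-Raphson step for
 * rec_inv_sqrt ~= 1/sqrt(count) in Q0.32 fixed point, following
 * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2).  The helper name,
 * its signature and the exact overflow handling are assumptions made for
 * illustration.
 */
static void cobalt_newton_step_sketch(u32 count, u32 *rec_inv_sqrt)
{
	u64 invsqrt = *rec_inv_sqrt;			/* Q0.32, < 1.0 */
	u64 invsqrt2 = (invsqrt * invsqrt) >> 32;	/* invsqrt^2, Q0.32 */
	u64 val = (3ULL << 32) - count * invsqrt2;	/* 3 - count*invsqrt^2, Q0.32 */

	/* (val * invsqrt) / 2 could overflow 64 bits, so pre-shift val by 2
	 * and fold the resulting /4 into the final shift: >> (32 - 2 + 1).
	 */
	val >>= 2;
	*rec_inv_sqrt = (val * invsqrt) >> (32 - 2 + 1);
}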
static void cobalt_cache_init(void) { … }

static void cobalt_vars_init(struct cobalt_vars *vars) { … }

/* CoDel control_law is t + interval/sqrt(count)
 * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
 * both sqrt() and divide operation.  (An illustrative sketch of this
 * computation appears below, after cake_get_tcpopt().)
 */
static ktime_t cobalt_control(ktime_t t, u64 interval, u32 rec_inv_sqrt) { … }

/* Call this when a packet had to be dropped due to queue overflow.  Returns
 * true if the BLUE state was quiescent before but active after this call.
 */
static bool cobalt_queue_full(struct cobalt_vars *vars,
			      struct cobalt_params *p,
			      ktime_t now)
{ … }

/* Call this when the queue was serviced but turned out to be empty.  Returns
 * true if the BLUE state was active before but quiescent after this call.
 */
static bool cobalt_queue_empty(struct cobalt_vars *vars,
			       struct cobalt_params *p,
			       ktime_t now)
{ … }

/* Call this with a freshly dequeued packet for possible congestion marking.
 * Returns true as an instruction to drop the packet, false for delivery.
 */
static bool cobalt_should_drop(struct cobalt_vars *vars,
			       struct cobalt_params *p,
			       ktime_t now,
			       struct sk_buff *skb,
			       u32 bulk_flows)
{ … }

static bool cake_update_flowkeys(struct flow_keys *keys,
				 const struct sk_buff *skb)
{ … }

/* Cake has several subtle multiple-bit settings.  In these cases you would
 * be matching triple-isolate mode as well.
 */
static bool cake_dsrc(int flow_mode) { … }

static bool cake_ddst(int flow_mode) { … }

static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
		     int flow_mode, u16 flow_override, u16 host_override)
{ … }

/* helper functions: might be changed when/if skb uses a standard list_head */
/* remove one skb from head of slot queue */

static struct sk_buff *dequeue_head(struct cake_flow *flow) { … }

/* add skb to flow queue (tail add) */

static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb) { … }

static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
				    struct ipv6hdr *buf)
{ … }

static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
				      void *buf, unsigned int bufsize)
{ … }

static const void *cake_get_tcpopt(const struct tcphdr *tcph,
				   int code, int *oplen)
{ … }
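
/* Illustrative sketch only of the CoDel control law documented above
 * cobalt_control(): next_drop = t + interval/sqrt(count), computed with the
 * cached reciprocal square root so that neither sqrt() nor a division is
 * needed.  The function name and plain-u64 nanosecond interface are
 * assumptions; interval is assumed small enough (well under ~4 s in ns)
 * that the 64x32-bit multiply below cannot overflow.
 */
static u64 cobalt_control_sketch(u64 t_ns, u64 interval_ns, u32 rec_inv_sqrt)
{
	/* rec_inv_sqrt is Q0.32, so (interval * rec_inv_sqrt) >> 32
	 * approximates interval / sqrt(count).
	 */
	return t_ns + ((interval_ns * rec_inv_sqrt) >> 32);
}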
/* Compare two SACK sequences.  A sequence is considered greater if it SACKs
 * more bytes than the other.  In the case where both sequences ACK bytes that
 * the other doesn't, A is considered greater.  DSACKs in A also make A
 * considered greater.
 *
 * @return -1, 0 or 1 as normal compare functions
 */
static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
				  const struct tcphdr *tcph_b)
{ … }

static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
				 u32 *tsval, u32 *tsecr)
{ … }

static bool cake_tcph_may_drop(const struct tcphdr *tcph,
			       u32 tstamp_new, u32 tsecr_new)
{ … }

static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
				       struct cake_flow *flow)
{ … }

static u64 cake_ewma(u64 avg, u64 sample, u32 shift) { … }

static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off) { … }

static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb) { … }

static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j) { … }

static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i) { … }

static void cake_heapify(struct cake_sched_data *q, u16 i) { … }

static void cake_heapify_up(struct cake_sched_data *q, u16 i) { … }

static int cake_advance_shaper(struct cake_sched_data *q,
			       struct cake_tin_data *b,
			       struct sk_buff *skb,
			       ktime_t now, bool drop)
{ … }

static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) { … }

static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash) { … }

static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
					     struct sk_buff *skb)
{ … }

static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
			 struct sk_buff *skb, int flow_mode, int *qerr)
{ … }

static void cake_reconfigure(struct Qdisc *sch);

static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
			struct sk_buff **to_free)
{ … }

static struct sk_buff *cake_dequeue_one(struct Qdisc *sch) { … }

/* Discard leftover packets from a tin no longer in use. */
static void cake_clear_tin(struct Qdisc *sch, u16 tin) { … }

static struct sk_buff *cake_dequeue(struct Qdisc *sch) { … }

static void cake_reset(struct Qdisc *sch) { … }

static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = …;

static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
			  u64 target_ns, u64 rtt_est_ns)
{ … }

static int cake_config_besteffort(struct Qdisc *sch) { … }

static int cake_config_precedence(struct Qdisc *sch) { … }

/*	List of known Diffserv codepoints:
 *
 *	Default Forwarding (DF/CS0) - Best Effort
 *	Max Throughput (TOS2)
 *	Min Delay (TOS4)
 *	LLT "La" (TOS5)
 *	Assured Forwarding 1 (AF1x) - x3
 *	Assured Forwarding 2 (AF2x) - x3
 *	Assured Forwarding 3 (AF3x) - x3
 *	Assured Forwarding 4 (AF4x) - x3
 *	Precedence Class 1 (CS1)
 *	Precedence Class 2 (CS2)
 *	Precedence Class 3 (CS3)
 *	Precedence Class 4 (CS4)
 *	Precedence Class 5 (CS5)
 *	Precedence Class 6 (CS6)
 *	Precedence Class 7 (CS7)
 *	Voice Admit (VA)
 *	Expedited Forwarding (EF)
 *	Lower Effort (LE)
 *
 *	Total 26 codepoints.
 */

/*	List of traffic classes in RFC 4594, updated by RFC 8622:
 *		(roughly descending order of contended priority)
 *		(roughly ascending order of uncontended throughput)
 *
 *	Network Control (CS6,CS7)       - routing traffic
 *	Telephony (EF,VA)               - aka. VoIP streams
 *	Signalling (CS5)                - VoIP setup
 *	Multimedia Conferencing (AF4x)  - aka. video calls
 *	Realtime Interactive (CS4)      - eg. games
 *	Multimedia Streaming (AF3x)     - eg. YouTube, NetFlix, Twitch
 *	Broadcast Video (CS3)
 *	Low-Latency Data (AF2x,TOS4)    - eg. database
 *	Ops, Admin, Management (CS2)    - eg. ssh
 *	Standard Service (DF & unrecognised codepoints)
 *	High-Throughput Data (AF1x,TOS2) - eg. web traffic
 *	Low-Priority Data (LE,CS1)      - eg. BitTorrent
 *
 *	Total 12 traffic classes.
 */
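
/* Illustrative sketch only: how a 64-entry DSCP-to-tin lookup of the kind
 * held in the elided diffserv3[] table above can be applied.  The tin
 * assignments here are an assumption for illustration (EF/VA/CS6/CS7 into a
 * latency-sensitive tin, LE/CS1 into a bulk tin, everything else into best
 * effort); the authoritative mappings live in the elided tables.
 */
static u8 dscp_to_tin_sketch(u8 dscp)
{
	static const u8 tin_of_dscp[64] = {
		[0 ... 63] = 1,	/* default: best effort */
		[1]  = 0,	/* LE  -> bulk */
		[8]  = 0,	/* CS1 -> bulk */
		[44] = 2,	/* VA  -> latency sensitive */
		[46] = 2,	/* EF  -> latency sensitive */
		[48] = 2,	/* CS6 -> latency sensitive */
		[56] = 2,	/* CS7 -> latency sensitive */
	};

	return tin_of_dscp[dscp & 0x3f];
}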
static int cake_config_diffserv8(struct Qdisc *sch) { … }

static int cake_config_diffserv4(struct Qdisc *sch) { … }

static int cake_config_diffserv3(struct Qdisc *sch) { … }

static void cake_reconfigure(struct Qdisc *sch) { … }

static int cake_change(struct Qdisc *sch, struct nlattr *opt,
		       struct netlink_ext_ack *extack)
{ … }

static void cake_destroy(struct Qdisc *sch) { … }

static int cake_init(struct Qdisc *sch, struct nlattr *opt,
		     struct netlink_ext_ack *extack)
{ … }

static int cake_dump(struct Qdisc *sch, struct sk_buff *skb) { … }

static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { … }

static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg) { … }

static unsigned long cake_find(struct Qdisc *sch, u32 classid) { … }

static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent,
			       u32 classid)
{ … }

static void cake_unbind(struct Qdisc *q, unsigned long cl) { … }

static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl,
					struct netlink_ext_ack *extack)
{ … }

static int cake_dump_class(struct Qdisc *sch, unsigned long cl,
			   struct sk_buff *skb, struct tcmsg *tcm)
{ … }

static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
				 struct gnet_dump *d)
{ … }

static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg) { … }

static const struct Qdisc_class_ops cake_class_ops = …;

static struct Qdisc_ops cake_qdisc_ops __read_mostly = …;

MODULE_ALIAS_NET_SCH(…);

static int __init cake_module_init(void) { … }

static void __exit cake_module_exit(void) { … }

module_init(…)
module_exit(…)
MODULE_AUTHOR(…);
MODULE_LICENSE(…);
MODULE_DESCRIPTION(…);