// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
/* COMMON Applications Kept Enhanced (CAKE) discipline
 *
 * Copyright (C) 2014-2018 Jonathan Morton <[email protected]>
 * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <[email protected]>
 * Copyright (C) 2014-2018 Dave Täht <[email protected]>
 * Copyright (C) 2015-2018 Sebastian Moeller <[email protected]>
 * Copyright (C) 2015-2018 Kevin Darbyshire-Bryant <[email protected]>
 * Copyright (C) 2017-2018 Ryan Mounce <[email protected]>
 *
 * The CAKE Principles:
 *		(or, how to have your cake and eat it too)
 *
 * This is a combination of several shaping, AQM and FQ techniques into one
 * easy-to-use package:
 *
 * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
 *   equipment and bloated MACs.  This operates in deficit mode (as in sch_fq),
 *   eliminating the need for any sort of burst parameter (eg. token bucket
 *   depth).  Burst support is limited to that necessary to overcome scheduling
 *   latency.
 *
 * - A Diffserv-aware priority queue, giving more priority to certain classes,
 *   up to a specified fraction of bandwidth.  Above that bandwidth threshold,
 *   the priority is reduced to avoid starving other tins.
 *
 * - Each priority tin has a separate Flow Queue system, to isolate traffic
 *   flows from each other.  This prevents a burst on one flow from increasing
 *   the delay to another.  Flows are distributed to queues using a
 *   set-associative hash function.
 *
 * - Each queue is actively managed by Cobalt, which is a combination of the
 *   Codel and Blue AQM algorithms.  This serves flows fairly, and signals
 *   congestion early via ECN (if available) and/or packet drops, to keep
 *   latency low.  The codel parameters are auto-tuned based on the bandwidth
 *   setting, as is necessary at low bandwidths.
 *
 * The configuration parameters are kept deliberately simple for ease of use.
 * Everything has sane defaults.  Complete generality of configuration is *not*
 * a goal.
 *
 * The priority queue operates according to a weighted DRR scheme, combined
 * with a bandwidth tracker which reuses the shaper logic to detect which side
 * of the bandwidth sharing threshold the tin is operating on.  This determines
 * whether a priority-based weight (high) or a bandwidth-based weight (low) is
 * used for that tin in the current pass.
 *
 * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
 * granted us permission to leverage.
 */
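
/* Illustrative sketch only: the deficit-mode weighted round-robin described
 * above, reduced to its essentials.  All names here are hypothetical and the
 * real per-tin state lives in the elided struct cake_tin_data below; this is
 * just a sketch of the credit/deficit idea, assuming byte-based quanta.
 */
struct drr_tin_sketch {
	int deficit;		/* bytes this tin may still send this round */
	unsigned int quantum;	/* base credit added per scheduling round */
	unsigned int weight;	/* priority- or bandwidth-based weight */
};

/* Returns 1 if the tin may transmit a packet of head_len bytes now, charging
 * its deficit; returns 0 if the tin must first be replenished
 * (deficit += quantum * weight) and revisited on a later pass.
 */
static inline int drr_tin_try_send(struct drr_tin_sketch *t,
				   unsigned int head_len)
{
	if (t->deficit < (int)head_len)
		return 0;
	t->deficit -= (int)head_len;
	return 1;
}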
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/reciprocal_div.h>
#include <net/netlink.h>
#include <linux/if_vlan.h>
#include <net/gso.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/tcp.h>
#include <net/flow_dissector.h>

#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <net/netfilter/nf_conntrack_core.h>
#endif

#define CAKE_SET_WAYS …
#define CAKE_MAX_TINS …
#define CAKE_QUEUES …
#define CAKE_FLOW_MASK …
#define CAKE_FLOW_NAT_FLAG …

/* struct cobalt_params - contains codel and blue parameters
 * @interval:	codel initial drop rate
 * @target:	maximum persistent sojourn time & blue update rate
 * @mtu_time:	serialisation delay of maximum-size packet
 * @p_inc:	increment of blue drop probability (0.32 fxp)
 * @p_dec:	decrement of blue drop probability (0.32 fxp)
 */
struct cobalt_params { … };

/* struct cobalt_vars - contains codel and blue variables
 * @count:		codel dropping frequency
 * @rec_inv_sqrt:	reciprocal value of sqrt(count) >> 1
 * @drop_next:		time to drop next packet, or when we dropped last
 * @blue_timer:		Blue time to next drop
 * @p_drop:		BLUE drop probability (0.32 fxp)
 * @dropping:		set if in dropping state
 * @ecn_marked:		set if marked
 */
struct cobalt_vars { … };

enum { … };

struct cake_flow { … }; /* please try to keep this structure <= 64 bytes */

struct cake_host { … };

struct cake_heap_entry { … };

struct cake_tin_data { … }; /* number of tins is small, so size of this struct doesn't matter much */

struct cake_sched_data { … };

enum { … };

/* COBALT operates the Codel and BLUE algorithms in parallel, in order to
 * obtain the best features of each.  Codel is excellent on flows which
 * respond to congestion signals in a TCP-like way.  BLUE is more effective on
 * unresponsive flows.
 */

struct cobalt_skb_cb { … };

static u64 us_to_ns(u64 us) { … }

static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb) { … }

static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb) { … }

static void cobalt_set_enqueue_time(struct sk_buff *skb, ktime_t now) { … }

static u16 quantum_div[CAKE_QUEUES + 1] = …;

/* Diffserv lookup tables */

static const u8 precedence[] = …;

static const u8 diffserv8[] = …;

static const u8 diffserv4[] = …;

static const u8 diffserv3[] = …;

static const u8 besteffort[] = …;

/* tin priority order for stats dumping */

static const u8 normal_order[] = …;

static const u8 bulk_order[] = …;

#define REC_INV_SQRT_CACHE …
static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = …;

/* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
 * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
 *
 * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
 */
static void cobalt_newton_step(struct cobalt_vars *vars) { … }

static void cobalt_invsqrt(struct cobalt_vars *vars) { … }

/* There is a big difference in timing between the accurate values placed in
 * the cache and the approximations given by a single Newton step for small
 * count values, particularly when stepping from count 1 to 2 or vice versa.
 * Above 16, a single Newton step gives sufficient accuracy in either
 * direction, given the precision stored.
 *
 * The magnitude of the error when stepping up to count 2 is such as to give
 * the value that *should* have been produced at count 4.
 */
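
/* Illustrative sketch only (the bodies of cobalt_newton_step() and
 * cobalt_invsqrt() are elided above): one Newton-Raphson step for
 * rec_inv_sqrt ~= 1/sqrt(count) in Q0.32 fixed point, following
 * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2).  The helper name,
 * its signature and the exact overflow handling are assumptions made for
 * illustration.
 */
static void cobalt_newton_step_sketch(u32 count, u32 *rec_inv_sqrt)
{
	u64 invsqrt = *rec_inv_sqrt;			/* Q0.32, < 1.0 */
	u64 invsqrt2 = (invsqrt * invsqrt) >> 32;	/* invsqrt^2, Q0.32 */
	u64 val = (3ULL << 32) - count * invsqrt2;	/* 3 - count*invsqrt^2, Q0.32 */

	/* (val * invsqrt) / 2 could overflow 64 bits, so pre-shift val by 2
	 * and fold the resulting /4 into the final shift: >> (32 - 2 + 1).
	 */
	val >>= 2;
	*rec_inv_sqrt = (val * invsqrt) >> (32 - 2 + 1);
}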
static void cobalt_cache_init(void) { … }

static void cobalt_vars_init(struct cobalt_vars *vars) { … }

/* CoDel control_law is t + interval/sqrt(count)
 * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
 * both sqrt() and divide operation.  (An illustrative sketch of this
 * computation appears below, after cake_get_tcpopt().)
 */
static ktime_t cobalt_control(ktime_t t, u64 interval, u32 rec_inv_sqrt) { … }

/* Call this when a packet had to be dropped due to queue overflow.  Returns
 * true if the BLUE state was quiescent before but active after this call.
 */
static bool cobalt_queue_full(struct cobalt_vars *vars,
			      struct cobalt_params *p,
			      ktime_t now)
{ … }

/* Call this when the queue was serviced but turned out to be empty.  Returns
 * true if the BLUE state was active before but quiescent after this call.
 */
static bool cobalt_queue_empty(struct cobalt_vars *vars,
			       struct cobalt_params *p,
			       ktime_t now)
{ … }

/* Call this with a freshly dequeued packet for possible congestion marking.
 * Returns true as an instruction to drop the packet, false for delivery.
 */
static bool cobalt_should_drop(struct cobalt_vars *vars,
			       struct cobalt_params *p,
			       ktime_t now,
			       struct sk_buff *skb,
			       u32 bulk_flows)
{ … }

static bool cake_update_flowkeys(struct flow_keys *keys,
				 const struct sk_buff *skb)
{ … }

/* Cake has several subtle multiple-bit settings.  In these cases you would
 * be matching triple-isolate mode as well.
 */
static bool cake_dsrc(int flow_mode) { … }

static bool cake_ddst(int flow_mode) { … }

static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
		     int flow_mode, u16 flow_override, u16 host_override)
{ … }

/* helper functions: might be changed when/if skb uses a standard list_head */
/* remove one skb from head of slot queue */

static struct sk_buff *dequeue_head(struct cake_flow *flow) { … }

/* add skb to flow queue (tail add) */

static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb) { … }

static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
				    struct ipv6hdr *buf)
{ … }

static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
				      void *buf, unsigned int bufsize)
{ … }

static const void *cake_get_tcpopt(const struct tcphdr *tcph,
				   int code, int *oplen)
{ … }
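
/* Illustrative sketch only of the CoDel control law documented above
 * cobalt_control(): next_drop = t + interval/sqrt(count), computed with the
 * cached reciprocal square root so that neither sqrt() nor a division is
 * needed.  The function name and plain-u64 nanosecond interface are
 * assumptions; interval is assumed small enough (well under ~4 s in ns)
 * that the 64x32-bit multiply below cannot overflow.
 */
static u64 cobalt_control_sketch(u64 t_ns, u64 interval_ns, u32 rec_inv_sqrt)
{
	/* rec_inv_sqrt is Q0.32, so (interval * rec_inv_sqrt) >> 32
	 * approximates interval / sqrt(count).
	 */
	return t_ns + ((interval_ns * rec_inv_sqrt) >> 32);
}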
/* Compare two SACK sequences.  A sequence is considered greater if it SACKs
 * more bytes than the other.  In the case where both sequences ACK bytes that
 * the other doesn't, A is considered greater.  DSACKs in A also make A
 * considered greater.
 *
 * @return -1, 0 or 1 as normal compare functions
 */
static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
				  const struct tcphdr *tcph_b)
{ … }

static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
				 u32 *tsval, u32 *tsecr)
{ … }

static bool cake_tcph_may_drop(const struct tcphdr *tcph,
			       u32 tstamp_new, u32 tsecr_new)
{ … }

static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
				       struct cake_flow *flow)
{ … }

static u64 cake_ewma(u64 avg, u64 sample, u32 shift) { … }

static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off) { … }

static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb) { … }

static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j) { … }

static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i) { … }

static void cake_heapify(struct cake_sched_data *q, u16 i) { … }

static void cake_heapify_up(struct cake_sched_data *q, u16 i) { … }

static int cake_advance_shaper(struct cake_sched_data *q,
			       struct cake_tin_data *b,
			       struct sk_buff *skb,
			       ktime_t now, bool drop)
{ … }

static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) { … }

static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash) { … }

static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
					     struct sk_buff *skb)
{ … }

static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
			 struct sk_buff *skb, int flow_mode, int *qerr)
{ … }

static void cake_reconfigure(struct Qdisc *sch);

static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
			struct sk_buff **to_free)
{ … }

static struct sk_buff *cake_dequeue_one(struct Qdisc *sch) { … }

/* Discard leftover packets from a tin no longer in use. */
static void cake_clear_tin(struct Qdisc *sch, u16 tin) { … }

static struct sk_buff *cake_dequeue(struct Qdisc *sch) { … }

static void cake_reset(struct Qdisc *sch) { … }

static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = …;

static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
			  u64 target_ns, u64 rtt_est_ns)
{ … }

static int cake_config_besteffort(struct Qdisc *sch) { … }

static int cake_config_precedence(struct Qdisc *sch) { … }

/*	List of known Diffserv codepoints:
 *
 *	Default Forwarding (DF/CS0) - Best Effort
 *	Max Throughput (TOS2)
 *	Min Delay (TOS4)
 *	LLT "La" (TOS5)
 *	Assured Forwarding 1 (AF1x) - x3
 *	Assured Forwarding 2 (AF2x) - x3
 *	Assured Forwarding 3 (AF3x) - x3
 *	Assured Forwarding 4 (AF4x) - x3
 *	Precedence Class 1 (CS1)
 *	Precedence Class 2 (CS2)
 *	Precedence Class 3 (CS3)
 *	Precedence Class 4 (CS4)
 *	Precedence Class 5 (CS5)
 *	Precedence Class 6 (CS6)
 *	Precedence Class 7 (CS7)
 *	Voice Admit (VA)
 *	Expedited Forwarding (EF)
 *	Lower Effort (LE)
 *
 *	Total 26 codepoints.
 */

/*	List of traffic classes in RFC 4594, updated by RFC 8622:
 *		(roughly descending order of contended priority)
 *		(roughly ascending order of uncontended throughput)
 *
 *	Network Control (CS6,CS7)       - routing traffic
 *	Telephony (EF,VA)               - aka. VoIP streams
 *	Signalling (CS5)                - VoIP setup
 *	Multimedia Conferencing (AF4x)  - aka. video calls
 *	Realtime Interactive (CS4)      - eg. games
 *	Multimedia Streaming (AF3x)     - eg. YouTube, NetFlix, Twitch
 *	Broadcast Video (CS3)
 *	Low-Latency Data (AF2x,TOS4)    - eg. database
 *	Ops, Admin, Management (CS2)    - eg. ssh
 *	Standard Service (DF & unrecognised codepoints)
 *	High-Throughput Data (AF1x,TOS2) - eg. web traffic
 *	Low-Priority Data (LE,CS1)      - eg. BitTorrent
 *
 *	Total 12 traffic classes.
 */
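
/* Illustrative sketch only: how a 64-entry DSCP-to-tin lookup of the kind
 * held in the elided diffserv3[] table above can be applied.  The tin
 * assignments here are an assumption for illustration (EF/VA/CS6/CS7 into a
 * latency-sensitive tin, LE/CS1 into a bulk tin, everything else into best
 * effort); the authoritative mappings live in the elided tables.
 */
static u8 dscp_to_tin_sketch(u8 dscp)
{
	static const u8 tin_of_dscp[64] = {
		[0 ... 63] = 1,	/* default: best effort */
		[1]  = 0,	/* LE  -> bulk */
		[8]  = 0,	/* CS1 -> bulk */
		[44] = 2,	/* VA  -> latency sensitive */
		[46] = 2,	/* EF  -> latency sensitive */
		[48] = 2,	/* CS6 -> latency sensitive */
		[56] = 2,	/* CS7 -> latency sensitive */
	};

	return tin_of_dscp[dscp & 0x3f];
}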
static int cake_config_diffserv8(struct Qdisc *sch) { … }

static int cake_config_diffserv4(struct Qdisc *sch) { … }

static int cake_config_diffserv3(struct Qdisc *sch) { … }

static void cake_reconfigure(struct Qdisc *sch) { … }

static int cake_change(struct Qdisc *sch, struct nlattr *opt,
		       struct netlink_ext_ack *extack)
{ … }

static void cake_destroy(struct Qdisc *sch) { … }

static int cake_init(struct Qdisc *sch, struct nlattr *opt,
		     struct netlink_ext_ack *extack)
{ … }

static int cake_dump(struct Qdisc *sch, struct sk_buff *skb) { … }

static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { … }

static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg) { … }

static unsigned long cake_find(struct Qdisc *sch, u32 classid) { … }

static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent,
			       u32 classid)
{ … }

static void cake_unbind(struct Qdisc *q, unsigned long cl) { … }

static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl,
					struct netlink_ext_ack *extack)
{ … }

static int cake_dump_class(struct Qdisc *sch, unsigned long cl,
			   struct sk_buff *skb, struct tcmsg *tcm)
{ … }

static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
				 struct gnet_dump *d)
{ … }

static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg) { … }

static const struct Qdisc_class_ops cake_class_ops = …;

static struct Qdisc_ops cake_qdisc_ops __read_mostly = …;

MODULE_ALIAS_NET_SCH(…);

static int __init cake_module_init(void) { … }

static void __exit cake_module_exit(void) { … }

module_init(…)
module_exit(…)
MODULE_AUTHOR(…);
MODULE_LICENSE(…);
MODULE_DESCRIPTION(…);