// SPDX-License-Identifier: GPL-2.0-only
/*
 * net/sched/sch_qfq.c         Quick Fair Queueing Plus Scheduler.
 *
 * Copyright (c) 2009 Fabio Checconi, Luigi Rizzo, and Paolo Valente.
 * Copyright (c) 2012 Paolo Valente.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/pkt_sched.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>

/*  Quick Fair Queueing Plus
    ========================

    Sources:

    [1] Paolo Valente,
    "Reducing the Execution Time of Fair-Queueing Schedulers."
    http://algo.ing.unimo.it/people/paolo/agg-sched/agg-sched.pdf

    Sources for QFQ:

    [2] Fabio Checconi, Luigi Rizzo, and Paolo Valente: "QFQ:
    Efficient Packet Scheduling with Tight Bandwidth Distribution
    Guarantees."

    See also:
    http://retis.sssup.it/~fabio/linux/qfq/
 */

/*
  QFQ+ divides classes into aggregates of at most MAX_AGG_CLASSES
  classes. Each aggregate is timestamped with a virtual start time S
  and a virtual finish time F, and scheduled according to its
  timestamps. S and F are computed as a function of a system virtual
  time function V. The classes within each aggregate are instead
  scheduled with DRR.

  To speed up operations, QFQ+ also divides aggregates into a limited
  number of groups. Which group a class belongs to depends on the
  ratio between the maximum packet length for the class and the
  weight of the class. Groups have their own S and F. In the end,
  QFQ+ schedules groups, then aggregates within groups, then classes
  within aggregates. See [1] and [2] for a full description.

	Virtual time computations.

	S, F and V are all computed in fixed point arithmetic with
	FRAC_BITS decimal bits.

	QFQ_MAX_INDEX is the maximum index allowed for a group. We need
		one bit per index.
	QFQ_MAX_WSHIFT is the maximum power of two supported as a weight.

	The layout of the bits is as below:

	                 [ MTU_SHIFT ][      FRAC_BITS    ]
	                 [ MAX_INDEX    ][ MIN_SLOT_SHIFT ]
	                              ^.__grp->index = 0
	                              *.__grp->slot_shift

	where MIN_SLOT_SHIFT is derived by difference from the others.

	The max group index corresponds to Lmax/w_min, where
	Lmax=1<<MTU_SHIFT, w_min = 1 .
	From this, and knowing how many groups (MAX_INDEX) we want,
	we can derive the shift corresponding to each group.

	Because we often need to compute
		F = S + len/w_i  and  V = V + len/wsum
	instead of storing w_i we store the value
		inv_w = (1<<FRAC_BITS)/w_i
	so we can do F = S + len * inv_w.
	We use W_TOT in the formulas so we can easily move between
	static and adaptive weight sum.

	The per-scheduler-instance data contain all the data structures
	for the scheduler: bitmaps and bucket lists.
 */

/*
 * Maximum number of consecutive slots occupied by backlogged classes
 * inside a group.
 */
#define QFQ_MAX_SLOTS	…

/*
 * Shifts used for aggregate<->group mapping. We allow class weights that
 * are in the range [1, 2^MAX_WSHIFT], and we try to map each aggregate i
 * to the group with the smallest index that can support the L_i / r_i
 * configured for the classes in the aggregate.
 *
 * grp->index is the index of the group; and grp->slot_shift
 * is the shift for the corresponding (scaled) sigma_i.
 */
#define QFQ_MAX_INDEX		…
#define QFQ_MAX_WSHIFT		…

#define	QFQ_MAX_WEIGHT		…
#define QFQ_MAX_WSUM		…

#define FRAC_BITS		…
#define ONE_FP			…

#define QFQ_MTU_SHIFT		…
#define QFQ_MIN_LMAX		…
#define QFQ_MAX_LMAX		…

#define QFQ_MAX_AGG_CLASSES	…
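/*
 * A minimal sketch (not part of the scheduler) of the fixed-point
 * arithmetic described above: with inv_w = ONE_FP/w_i precomputed at
 * class setup, the virtual finish time F = S + len/w_i reduces to a
 * single multiply. The helper name qfq_example_finish_time is
 * hypothetical, for illustration only.
 */
static inline u64 qfq_example_finish_time(u64 S, unsigned int len, u32 inv_w)
{
	/* len * inv_w == len/w_i, scaled by 2^FRAC_BITS */
	return S + (u64)len * inv_w;
}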
/*
 * Possible group states. These values are used as indexes for the
 * bitmaps array of struct qfq_sched.
 */
enum qfq_state { … };

struct qfq_group;
struct qfq_aggregate;

struct qfq_class { … };

struct qfq_aggregate { … };

struct qfq_group { … };

struct qfq_sched { … };

/*
 * Possible reasons why the timestamps of an aggregate are updated
 * enqueue: the aggregate switches from idle to active and must be
 *	    scheduled for service
 * requeue: the aggregate finishes its budget, so it stops being served
 *	    and must be rescheduled for service
 */
enum update_reason { … };

static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid)
{ … }

static const struct netlink_range_validation lmax_range = …;

static const struct nla_policy qfq_policy[TCA_QFQ_MAX + 1] = …;

/*
 * Calculate a flow index, given its weight and maximum packet length.
 * index = log_2(maxlen/weight) but we need to apply the scaling.
 * This is used only once at flow creation.
 */
static int qfq_calc_index(u32 inv_w, unsigned int maxlen, u32 min_slot_shift)
{ … }

static void qfq_deactivate_agg(struct qfq_sched *, struct qfq_aggregate *);
static void qfq_activate_agg(struct qfq_sched *, struct qfq_aggregate *,
			     enum update_reason);

static void qfq_init_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
			 u32 lmax, u32 weight)
{ … }

static struct qfq_aggregate *qfq_find_agg(struct qfq_sched *q,
					  u32 lmax, u32 weight)
{ … }

/* Update aggregate as a function of the new number of classes. */
static void qfq_update_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
			   int new_num_classes)
{ … }

/* Add class to aggregate. */
static void qfq_add_to_agg(struct qfq_sched *q,
			   struct qfq_aggregate *agg,
			   struct qfq_class *cl)
{ … }

static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *);

static void qfq_destroy_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
{ … }

/* Deschedule class from within its parent aggregate. */
static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl)
{ … }

/* Remove class from its parent aggregate. */
static void qfq_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl)
{ … }
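/*
 * A minimal sketch of the group-index computation documented above
 * qfq_calc_index(): index = log_2(maxlen/weight), rescaled by
 * min_slot_shift. Hypothetical helper, for illustration only; the
 * real body (elided above) also adjusts the result when
 * maxlen/weight lands exactly on a power of two.
 */
static inline int qfq_example_calc_index(u32 inv_w, unsigned int maxlen,
					 u32 min_slot_shift)
{
	/* slot_size = maxlen/w in fixed point (inv_w = ONE_FP/w) */
	u64 slot_size = (u64)maxlen * inv_w;
	unsigned long size_map = slot_size >> min_slot_shift;

	/* position of the highest set bit is basically a log_2 */
	return size_map ? __fls(size_map) + 1 : 0;
}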
/* Deschedule class and remove it from its parent aggregate. */
static void qfq_deact_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl)
{ … }

/* Move class to a new aggregate, matching the new class weight and/or
 * lmax.
 */
static int qfq_change_agg(struct Qdisc *sch, struct qfq_class *cl, u32 weight,
			  u32 lmax)
{ … }

static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
			    struct nlattr **tca, unsigned long *arg,
			    struct netlink_ext_ack *extack)
{ … }

static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl)
{ … }

static int qfq_delete_class(struct Qdisc *sch, unsigned long arg,
			    struct netlink_ext_ack *extack)
{ … }

static unsigned long qfq_search_class(struct Qdisc *sch, u32 classid)
{ … }

static struct tcf_block *qfq_tcf_block(struct Qdisc *sch, unsigned long cl,
				       struct netlink_ext_ack *extack)
{ … }

static unsigned long qfq_bind_tcf(struct Qdisc *sch, unsigned long parent,
				  u32 classid)
{ … }

static void qfq_unbind_tcf(struct Qdisc *sch, unsigned long arg)
{ … }

static int qfq_graft_class(struct Qdisc *sch, unsigned long arg,
			   struct Qdisc *new, struct Qdisc **old,
			   struct netlink_ext_ack *extack)
{ … }

static struct Qdisc *qfq_class_leaf(struct Qdisc *sch, unsigned long arg)
{ … }

static int qfq_dump_class(struct Qdisc *sch, unsigned long arg,
			  struct sk_buff *skb, struct tcmsg *tcm)
{ … }

static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
				struct gnet_dump *d)
{ … }

static void qfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{ … }

static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch,
				      int *qerr)
{ … }

/* Generic comparison function, handling wraparound. */
static inline int qfq_gt(u64 a, u64 b)
{ … }

/* Round a precise timestamp to its slotted value. */
static inline u64 qfq_round_down(u64 ts, unsigned int shift)
{ … }

/* return the pointer to the group with lowest index in the bitmap */
static inline struct qfq_group *qfq_ffs(struct qfq_sched *q,
					unsigned long bitmap)
{ … }

/* Calculate a mask to mimic what would be ffs_from(). */
static inline unsigned long mask_from(unsigned long bitmap, int from)
{ … }

/*
 * The state computation relies on ER=0, IR=1, EB=2, IB=3.
 * First compute eligibility comparing grp->S, q->V,
 * then check if someone is blocking us and possibly add EB.
 */
static int qfq_calc_state(struct qfq_sched *q, const struct qfq_group *grp)
{ … }

/*
 * In principle
 *	q->bitmaps[dst] |= q->bitmaps[src] & mask;
 *	q->bitmaps[src] &= ~mask;
 * but we should make sure that src != dst
 */
static inline void qfq_move_groups(struct qfq_sched *q, unsigned long mask,
				   int src, int dst)
{ … }

static void qfq_unblock_groups(struct qfq_sched *q, int index, u64 old_F)
{ … }

/*
 * perhaps
 *	old_V ^= q->V;
 *	old_V >>= q->min_slot_shift;
 *	if (old_V) { ... }
 */
static void qfq_make_eligible(struct qfq_sched *q)
{ … }
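/*
 * A minimal sketch (hypothetical helpers, not part of the scheduler)
 * of the machinery described above: the wraparound-safe comparison
 * that qfq_gt() needs, and the ER/IR/EB/IB state encoding that
 * qfq_calc_state() computes, with both real bodies elided above.
 */
static inline int qfq_example_gt(u64 a, u64 b)
{
	/* signed difference handles wraparound of the virtual clock */
	return (s64)(a - b) > 0;
}

static inline unsigned int qfq_example_calc_state(u64 S, u64 V, bool blocked)
{
	/* bit 0: ineligible, i.e., S ahead of the system time V */
	unsigned int state = qfq_example_gt(S, V);

	/* bit 1: blocked by an eligible group with a smaller F */
	if (blocked)
		state |= 2;
	return state;	/* ER=0, IR=1, EB=2, IB=3 */
}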
/*
 * The index of the slot in which the input aggregate agg is to be
 * inserted must not be higher than QFQ_MAX_SLOTS-2. There is a '-2'
 * and not a '-1' because the start time of the group may be moved
 * backward by one slot after the aggregate has been inserted, and
 * this would cause non-empty slots to be right-shifted by one
 * position.
 *
 * QFQ+ fully satisfies this bound to the slot index if the parameters
 * of the classes are not changed dynamically, and if QFQ+ never
 * happens to postpone the service of agg unjustly, i.e., it never
 * happens that the aggregate becomes backlogged and eligible, or just
 * eligible, while an aggregate with a higher approximated finish time
 * is being served. In particular, in this case QFQ+ guarantees that
 * the timestamps of agg are low enough that the slot index is never
 * higher than 2. Unfortunately, QFQ+ cannot provide the same
 * guarantee if it happens to unjustly postpone the service of agg, or
 * if the parameters of some class are changed.
 *
 * As for the first event, i.e., an out-of-order service, the
 * upper bound to the slot index guaranteed by QFQ+ grows to
 * 2 +
 * QFQ_MAX_AGG_CLASSES * ((1<<QFQ_MTU_SHIFT)/QFQ_MIN_LMAX) *
 * (current_max_weight/current_wsum) <= 2 + 8 * 128 * 1.
 *
 * The following function deals with this problem by backward-shifting
 * the timestamps of agg, if needed, so as to guarantee that the slot
 * index is never higher than QFQ_MAX_SLOTS-2. This backward-shift may
 * cause the service of other aggregates to be postponed, yet the
 * worst-case guarantees of these aggregates are not violated. In
 * fact, in case of no out-of-order service, the timestamps of agg
 * would have been even lower than they are after the backward shift,
 * because QFQ+ would have guaranteed a maximum value equal to 2 for
 * the slot index, and 2 < QFQ_MAX_SLOTS-2. Hence the aggregates whose
 * service is postponed because of the backward-shift would in any
 * case have waited for the service of agg before being served.
 *
 * The other event that may cause the slot index to be higher than 2
 * for agg is a recent change of the parameters of some class. If the
 * weight of a class is increased or the lmax (max_pkt_size) of the
 * class is decreased, then a new aggregate with smaller slot size
 * than the original parent aggregate of the class may happen to be
 * activated. The activation of this aggregate should be properly
 * delayed to when the service of the class has finished in the ideal
 * system tracked by QFQ+. If the activation of the aggregate is not
 * delayed to this reference time instant, then this aggregate may be
 * unjustly served before other aggregates waiting for service. This
 * may cause the above bound to the slot index to be violated for some
 * of these unlucky aggregates.
 *
 * Instead of delaying the activation of the new aggregate, which is
 * quite complex, the above-discussed capping of the slot index is
 * also used to handle the consequences of a change of the parameters
 * of a class.
 */
static void qfq_slot_insert(struct qfq_group *grp, struct qfq_aggregate *agg,
			    u64 roundedS)
{ … }

/* Maybe introduce hlist_first_entry?? */
static struct qfq_aggregate *qfq_slot_head(struct qfq_group *grp)
{ … }

/*
 * remove the entry from the slot
 */
static void qfq_front_slot_remove(struct qfq_group *grp)
{ … }

/*
 * Returns the first aggregate in the first non-empty bucket of the
 * group. As a side effect, adjusts the bucket list so the first
 * non-empty bucket is at position 0 in full_slots.
 */
static struct qfq_aggregate *qfq_slot_scan(struct qfq_group *grp)
{ … }

/*
 * adjust the bucket list. When the start time of a group decreases,
 * we move the index down (modulo QFQ_MAX_SLOTS) so we don't need to
 * move the objects. The mask of occupied slots must be shifted
 * because we use ffs() to find the first non-empty slot.
 * This covers decreases in the group's start time, but what about
 * increases of the start time ?
 * Here too we should make sure that i is less than 32
 */
static void qfq_slot_rotate(struct qfq_group *grp, u64 roundedS)
{ … }

static void qfq_update_eligible(struct qfq_sched *q)
{ … }
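/*
 * A minimal sketch of the slot-index capping performed by
 * qfq_slot_insert() (body elided above): the bucket index is the
 * distance between the rounded start time of the aggregate and the
 * start time of the group, in slot units, capped to QFQ_MAX_SLOTS-2
 * as discussed in the long comment above qfq_slot_insert().
 * Hypothetical helper, for illustration only.
 */
static inline u64 qfq_example_slot_index(u64 roundedS, u64 grp_S,
					 unsigned int slot_shift)
{
	u64 i = (roundedS - grp_S) >> slot_shift;

	/* the real code also shifts agg->S/agg->F back by the excess */
	if (i > QFQ_MAX_SLOTS - 2)
		i = QFQ_MAX_SLOTS - 2;
	return i;
}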
/* Dequeue head packet of the head class in the DRR queue of the
 * aggregate.
 */
static struct sk_buff *agg_dequeue(struct qfq_aggregate *agg,
				   struct qfq_class *cl, unsigned int len)
{ … }

static inline struct sk_buff *qfq_peek_skb(struct qfq_aggregate *agg,
					   struct qfq_class **cl,
					   unsigned int *len)
{ … }

/* Update F according to the actual service received by the aggregate. */
static inline void charge_actual_service(struct qfq_aggregate *agg)
{ … }

/* Assign a reasonable start time for a new aggregate in group i.
 * Admissible values for \hat{F} are multiples of \sigma_i
 * no greater than V+\sigma_i. Larger values mean that
 * we had a wraparound, so we consider the timestamp to be stale.
 *
 * If F is not stale and F >= V then we set S = F.
 * Otherwise we should assign S = V, but this may violate
 * the ordering in EB (see [2]). So, if we have groups in ER,
 * set S to the F_j of the first group j which would be blocking us.
 * We are guaranteed not to move S backward because
 * otherwise our group i would still be blocked.
 */
static void qfq_update_start(struct qfq_sched *q, struct qfq_aggregate *agg)
{ … }

/* Update the timestamps of agg before scheduling/rescheduling it for
 * service. In particular, assign to agg->F its maximum possible
 * value, i.e., the virtual finish time with which the aggregate
 * should be labeled if it used all its budget once in service.
 */
static inline void qfq_update_agg_ts(struct qfq_sched *q,
				     struct qfq_aggregate *agg,
				     enum update_reason reason)
{ … }

static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg);

static struct sk_buff *qfq_dequeue(struct Qdisc *sch)
{ … }

static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *q)
{ … }

static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
		       struct sk_buff **to_free)
{ … }

/*
 * Schedule aggregate according to its timestamps.
 */
static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
{ … }

/* Update agg ts and schedule agg for service */
static void qfq_activate_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
			     enum update_reason reason)
{ … }

static void qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp,
			    struct qfq_aggregate *agg)
{ … }

/*
 * Called to forcibly deschedule an aggregate. If the aggregate is
 * not in the front bucket, or if the front bucket contains other
 * aggregates besides it, we can simply remove the aggregate with no
 * other side effects.
 * Otherwise we must propagate the event up.
 */
static void qfq_deactivate_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
{ … }

static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg)
{ … }

static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt,
			  struct netlink_ext_ack *extack)
{ … }

static void qfq_reset_qdisc(struct Qdisc *sch)
{ … }

static void qfq_destroy_qdisc(struct Qdisc *sch)
{ … }

static const struct Qdisc_class_ops qfq_class_ops = …;

static struct Qdisc_ops qfq_qdisc_ops __read_mostly = …;

MODULE_ALIAS_NET_SCH(…);

static int __init qfq_init(void)
{ … }

static void __exit qfq_exit(void)
{ … }

module_init(…);
module_exit(qfq_exit);
MODULE_LICENSE(…);
MODULE_DESCRIPTION(…);
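/*
 * A minimal sketch of the start-time policy described in the comment
 * above qfq_update_start(): if F is not stale and F >= V, keep
 * S = F; otherwise restart from the system virtual time V. The real
 * function (body elided above) additionally consults the ER bitmap
 * so as not to violate the ordering of blocked groups. Hypothetical
 * helper, for illustration only.
 */
static inline u64 qfq_example_update_start(u64 F, u64 V, bool stale)
{
	/* wraparound-safe "F >= V" via signed difference */
	if (!stale && (s64)(F - V) >= 0)
		return F;
	return V;
}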