// SPDX-License-Identifier: GPL-2.0-only
/*
 * net/sched/sch_qfq.c         Quick Fair Queueing Plus Scheduler.
 *
 * Copyright (c) 2009 Fabio Checconi, Luigi Rizzo, and Paolo Valente.
 * Copyright (c) 2012 Paolo Valente.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/pkt_sched.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>

/*  Quick Fair Queueing Plus
    ========================

    Sources:

    [1] Paolo Valente,
    "Reducing the Execution Time of Fair-Queueing Schedulers."
    http://algo.ing.unimo.it/people/paolo/agg-sched/agg-sched.pdf

    Sources for QFQ:

    [2] Fabio Checconi, Luigi Rizzo, and Paolo Valente: "QFQ:
    Efficient Packet Scheduling with Tight Bandwidth Distribution
    Guarantees."

    See also:
    http://retis.sssup.it/~fabio/linux/qfq/
 */

/*
  QFQ+ divides classes into aggregates of at most MAX_AGG_CLASSES
  classes. Each aggregate is timestamped with a virtual start time S
  and a virtual finish time F, and scheduled according to its
  timestamps. S and F are computed as a function of a system virtual
  time function V. The classes within each aggregate are instead
  scheduled with DRR.

  To speed up operations, QFQ+ also divides aggregates into a limited
  number of groups. Which group a class belongs to depends on the
  ratio between the maximum packet length for the class and the
  weight of the class. Groups have their own S and F. In the end,
  QFQ+ schedules groups, then aggregates within groups, then classes
  within aggregates. See [1] and [2] for a full description.

	Virtual time computations.

	S, F and V are all computed in fixed point arithmetic with
	FRAC_BITS decimal bits.

	QFQ_MAX_INDEX is the maximum index allowed for a group. We need
		one bit per index.
	QFQ_MAX_WSHIFT is the maximum power of two supported as a weight.

	The layout of the bits is as below:

	                 [ MTU_SHIFT ][      FRAC_BITS    ]
	                 [ MAX_INDEX    ][ MIN_SLOT_SHIFT ]
	                              ^.__grp->index = 0
	                              *.__grp->slot_shift

	where MIN_SLOT_SHIFT is derived by difference from the others.

	The max group index corresponds to Lmax/w_min, where
	Lmax=1<<MTU_SHIFT, w_min = 1 .
	From this, and knowing how many groups (MAX_INDEX) we want,
	we can derive the shift corresponding to each group.

	Because we often need to compute
		F = S + len/w_i  and  V = V + len/wsum
	instead of storing w_i we store the value
		inv_w = (1<<FRAC_BITS)/w_i
	so we can do F = S + len * inv_w.
	We use W_TOT in the formulas so we can easily move between
	static and adaptive weight sum.

	The per-scheduler-instance data contain all the data structures
	for the scheduler: bitmaps and bucket lists.
 */

/*
 * Maximum number of consecutive slots occupied by backlogged classes
 * inside a group.
 */
#define QFQ_MAX_SLOTS	…

/*
 * Shifts used for aggregate<->group mapping. We allow class weights that
 * are in the range [1, 2^MAX_WSHIFT], and we try to map each aggregate i
 * to the group with the smallest index that can support the L_i / r_i
 * configured for the classes in the aggregate.
 *
 * grp->index is the index of the group; and grp->slot_shift
 * is the shift for the corresponding (scaled) sigma_i.
 */
#define QFQ_MAX_INDEX		…
#define QFQ_MAX_WSHIFT		…

#define	QFQ_MAX_WEIGHT		…
#define QFQ_MAX_WSUM		…

#define FRAC_BITS		…
#define ONE_FP			…

#define QFQ_MTU_SHIFT		…
#define QFQ_MIN_LMAX		…
#define QFQ_MAX_LMAX		…

#define QFQ_MAX_AGG_CLASSES	…
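/*
 * A minimal sketch (not part of the scheduler) of the fixed-point
 * arithmetic described above: with inv_w = ONE_FP/w_i precomputed at
 * class setup, the virtual finish time F = S + len/w_i reduces to a
 * single multiply. The helper name qfq_example_finish_time is
 * hypothetical, for illustration only.
 */
static inline u64 qfq_example_finish_time(u64 S, unsigned int len, u32 inv_w)
{
	/* len * inv_w == len/w_i, scaled by 2^FRAC_BITS */
	return S + (u64)len * inv_w;
}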
/*
 * Possible group states. These values are used as indexes for the
 * bitmaps array of struct qfq_sched.
 */
enum qfq_state { … };

struct qfq_group;
struct qfq_aggregate;

struct qfq_class { … };

struct qfq_aggregate { … };

struct qfq_group { … };

struct qfq_sched { … };

/*
 * Possible reasons why the timestamps of an aggregate are updated
 * enqueue: the aggregate switches from idle to active and must be
 *	    scheduled for service
 * requeue: the aggregate finishes its budget, so it stops being served
 *	    and must be rescheduled for service
 */
enum update_reason { … };

static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid)
{ … }

static const struct netlink_range_validation lmax_range = …;

static const struct nla_policy qfq_policy[TCA_QFQ_MAX + 1] = …;

/*
 * Calculate a flow index, given its weight and maximum packet length.
 * index = log_2(maxlen/weight) but we need to apply the scaling.
 * This is used only once at flow creation.
 */
static int qfq_calc_index(u32 inv_w, unsigned int maxlen, u32 min_slot_shift)
{ … }

static void qfq_deactivate_agg(struct qfq_sched *, struct qfq_aggregate *);
static void qfq_activate_agg(struct qfq_sched *, struct qfq_aggregate *,
			     enum update_reason);

static void qfq_init_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
			 u32 lmax, u32 weight)
{ … }

static struct qfq_aggregate *qfq_find_agg(struct qfq_sched *q,
					  u32 lmax, u32 weight)
{ … }

/* Update aggregate as a function of the new number of classes. */
static void qfq_update_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
			   int new_num_classes)
{ … }

/* Add class to aggregate. */
static void qfq_add_to_agg(struct qfq_sched *q,
			   struct qfq_aggregate *agg,
			   struct qfq_class *cl)
{ … }

static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *);

static void qfq_destroy_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
{ … }

/* Deschedule class from within its parent aggregate. */
static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl)
{ … }

/* Remove class from its parent aggregate. */
static void qfq_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl)
{ … }
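/*
 * A minimal sketch of the group-index computation documented above
 * qfq_calc_index(): index = log_2(maxlen/weight), rescaled by
 * min_slot_shift. Hypothetical helper, for illustration only; the
 * real body (elided above) also adjusts the result when
 * maxlen/weight lands exactly on a power of two.
 */
static inline int qfq_example_calc_index(u32 inv_w, unsigned int maxlen,
					 u32 min_slot_shift)
{
	/* slot_size = maxlen/w in fixed point (inv_w = ONE_FP/w) */
	u64 slot_size = (u64)maxlen * inv_w;
	unsigned long size_map = slot_size >> min_slot_shift;

	/* position of the highest set bit is basically a log_2 */
	return size_map ? __fls(size_map) + 1 : 0;
}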
/* Deschedule class and remove it from its parent aggregate. */
static void qfq_deact_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl)
{ … }

/* Move class to a new aggregate, matching the new class weight and/or
 * lmax.
 */
static int qfq_change_agg(struct Qdisc *sch, struct qfq_class *cl, u32 weight,
			  u32 lmax)
{ … }

static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
			    struct nlattr **tca, unsigned long *arg,
			    struct netlink_ext_ack *extack)
{ … }

static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl)
{ … }

static int qfq_delete_class(struct Qdisc *sch, unsigned long arg,
			    struct netlink_ext_ack *extack)
{ … }

static unsigned long qfq_search_class(struct Qdisc *sch, u32 classid)
{ … }

static struct tcf_block *qfq_tcf_block(struct Qdisc *sch, unsigned long cl,
				       struct netlink_ext_ack *extack)
{ … }

static unsigned long qfq_bind_tcf(struct Qdisc *sch, unsigned long parent,
				  u32 classid)
{ … }

static void qfq_unbind_tcf(struct Qdisc *sch, unsigned long arg)
{ … }

static int qfq_graft_class(struct Qdisc *sch, unsigned long arg,
			   struct Qdisc *new, struct Qdisc **old,
			   struct netlink_ext_ack *extack)
{ … }

static struct Qdisc *qfq_class_leaf(struct Qdisc *sch, unsigned long arg)
{ … }

static int qfq_dump_class(struct Qdisc *sch, unsigned long arg,
			  struct sk_buff *skb, struct tcmsg *tcm)
{ … }

static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
				struct gnet_dump *d)
{ … }

static void qfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{ … }

static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch,
				      int *qerr)
{ … }

/* Generic comparison function, handling wraparound. */
static inline int qfq_gt(u64 a, u64 b)
{ … }

/* Round a precise timestamp to its slotted value. */
static inline u64 qfq_round_down(u64 ts, unsigned int shift)
{ … }

/* return the pointer to the group with lowest index in the bitmap */
static inline struct qfq_group *qfq_ffs(struct qfq_sched *q,
					unsigned long bitmap)
{ … }

/* Calculate a mask to mimic what would be ffs_from(). */
static inline unsigned long mask_from(unsigned long bitmap, int from)
{ … }

/*
 * The state computation relies on ER=0, IR=1, EB=2, IB=3.
 * First compute eligibility comparing grp->S, q->V,
 * then check if someone is blocking us and possibly add EB.
 */
static int qfq_calc_state(struct qfq_sched *q, const struct qfq_group *grp)
{ … }

/*
 * In principle
 *	q->bitmaps[dst] |= q->bitmaps[src] & mask;
 *	q->bitmaps[src] &= ~mask;
 * but we should make sure that src != dst
 */
static inline void qfq_move_groups(struct qfq_sched *q, unsigned long mask,
				   int src, int dst)
{ … }

static void qfq_unblock_groups(struct qfq_sched *q, int index, u64 old_F)
{ … }

/*
 * perhaps
 *	old_V ^= q->V;
 *	old_V >>= q->min_slot_shift;
 *	if (old_V) { ... }
 */
static void qfq_make_eligible(struct qfq_sched *q)
{ … }
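/*
 * A minimal sketch (hypothetical helpers, not part of the scheduler)
 * of the machinery described above: the wraparound-safe comparison
 * that qfq_gt() needs, and the ER/IR/EB/IB state encoding that
 * qfq_calc_state() computes, with both real bodies elided above.
 */
static inline int qfq_example_gt(u64 a, u64 b)
{
	/* signed difference handles wraparound of the virtual clock */
	return (s64)(a - b) > 0;
}

static inline unsigned int qfq_example_calc_state(u64 S, u64 V, bool blocked)
{
	/* bit 0: ineligible, i.e., S ahead of the system time V */
	unsigned int state = qfq_example_gt(S, V);

	/* bit 1: blocked by an eligible group with a smaller F */
	if (blocked)
		state |= 2;
	return state;	/* ER=0, IR=1, EB=2, IB=3 */
}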
/*
 * The index of the slot in which the input aggregate agg is to be
 * inserted must not be higher than QFQ_MAX_SLOTS-2. There is a '-2'
 * and not a '-1' because the start time of the group may be moved
 * backward by one slot after the aggregate has been inserted, and
 * this would cause non-empty slots to be right-shifted by one
 * position.
 *
 * QFQ+ fully satisfies this bound to the slot index if the parameters
 * of the classes are not changed dynamically, and if QFQ+ never
 * happens to postpone the service of agg unjustly, i.e., it never
 * happens that the aggregate becomes backlogged and eligible, or just
 * eligible, while an aggregate with a higher approximated finish time
 * is being served. In particular, in this case QFQ+ guarantees that
 * the timestamps of agg are low enough that the slot index is never
 * higher than 2. Unfortunately, QFQ+ cannot provide the same
 * guarantee if it happens to unjustly postpone the service of agg, or
 * if the parameters of some class are changed.
 *
 * As for the first event, i.e., an out-of-order service, the
 * upper bound to the slot index guaranteed by QFQ+ grows to
 * 2 +
 * QFQ_MAX_AGG_CLASSES * ((1<<QFQ_MTU_SHIFT)/QFQ_MIN_LMAX) *
 * (current_max_weight/current_wsum) <= 2 + 8 * 128 * 1.
 *
 * The following function deals with this problem by backward-shifting
 * the timestamps of agg, if needed, so as to guarantee that the slot
 * index is never higher than QFQ_MAX_SLOTS-2. This backward-shift may
 * cause the service of other aggregates to be postponed, yet the
 * worst-case guarantees of these aggregates are not violated. In
 * fact, in case of no out-of-order service, the timestamps of agg
 * would have been even lower than they are after the backward shift,
 * because QFQ+ would have guaranteed a maximum value equal to 2 for
 * the slot index, and 2 < QFQ_MAX_SLOTS-2. Hence the aggregates whose
 * service is postponed because of the backward-shift would in any
 * case have waited for the service of agg before being served.
 *
 * The other event that may cause the slot index to be higher than 2
 * for agg is a recent change of the parameters of some class. If the
 * weight of a class is increased or the lmax (max_pkt_size) of the
 * class is decreased, then a new aggregate with smaller slot size
 * than the original parent aggregate of the class may happen to be
 * activated. The activation of this aggregate should be properly
 * delayed to when the service of the class has finished in the ideal
 * system tracked by QFQ+. If the activation of the aggregate is not
 * delayed to this reference time instant, then this aggregate may be
 * unjustly served before other aggregates waiting for service. This
 * may cause the above bound to the slot index to be violated for some
 * of these unlucky aggregates.
 *
 * Instead of delaying the activation of the new aggregate, which is
 * quite complex, the above-discussed capping of the slot index is
 * also used to handle the consequences of a change of the parameters
 * of a class.
 */
static void qfq_slot_insert(struct qfq_group *grp, struct qfq_aggregate *agg,
			    u64 roundedS)
{ … }

/* Maybe introduce hlist_first_entry?? */
static struct qfq_aggregate *qfq_slot_head(struct qfq_group *grp)
{ … }

/*
 * remove the entry from the slot
 */
static void qfq_front_slot_remove(struct qfq_group *grp)
{ … }

/*
 * Returns the first aggregate in the first non-empty bucket of the
 * group. As a side effect, adjusts the bucket list so the first
 * non-empty bucket is at position 0 in full_slots.
 */
static struct qfq_aggregate *qfq_slot_scan(struct qfq_group *grp)
{ … }

/*
 * adjust the bucket list. When the start time of a group decreases,
 * we move the index down (modulo QFQ_MAX_SLOTS) so we don't need to
 * move the objects. The mask of occupied slots must be shifted
 * because we use ffs() to find the first non-empty slot.
 * This covers decreases in the group's start time, but what about
 * increases of the start time ?
 * Here too we should make sure that i is less than 32
 */
static void qfq_slot_rotate(struct qfq_group *grp, u64 roundedS)
{ … }

static void qfq_update_eligible(struct qfq_sched *q)
{ … }
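/*
 * A minimal sketch of the slot-index capping performed by
 * qfq_slot_insert() (body elided above): the bucket index is the
 * distance between the rounded start time of the aggregate and the
 * start time of the group, in slot units, capped to QFQ_MAX_SLOTS-2
 * as discussed in the long comment above qfq_slot_insert().
 * Hypothetical helper, for illustration only.
 */
static inline u64 qfq_example_slot_index(u64 roundedS, u64 grp_S,
					 unsigned int slot_shift)
{
	u64 i = (roundedS - grp_S) >> slot_shift;

	/* the real code also shifts agg->S/agg->F back by the excess */
	if (i > QFQ_MAX_SLOTS - 2)
		i = QFQ_MAX_SLOTS - 2;
	return i;
}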
/* Dequeue head packet of the head class in the DRR queue of the
 * aggregate.
 */
static struct sk_buff *agg_dequeue(struct qfq_aggregate *agg,
				   struct qfq_class *cl, unsigned int len)
{ … }

static inline struct sk_buff *qfq_peek_skb(struct qfq_aggregate *agg,
					   struct qfq_class **cl,
					   unsigned int *len)
{ … }

/* Update F according to the actual service received by the aggregate. */
static inline void charge_actual_service(struct qfq_aggregate *agg)
{ … }

/* Assign a reasonable start time for a new aggregate in group i.
 * Admissible values for \hat{F} are multiples of \sigma_i
 * no greater than V+\sigma_i. Larger values mean that
 * we had a wraparound, so we consider the timestamp to be stale.
 *
 * If F is not stale and F >= V then we set S = F.
 * Otherwise we should assign S = V, but this may violate
 * the ordering in EB (see [2]). So, if we have groups in ER,
 * set S to the F_j of the first group j which would be blocking us.
 * We are guaranteed not to move S backward because
 * otherwise our group i would still be blocked.
 */
static void qfq_update_start(struct qfq_sched *q, struct qfq_aggregate *agg)
{ … }

/* Update the timestamps of agg before scheduling/rescheduling it for
 * service. In particular, assign to agg->F its maximum possible
 * value, i.e., the virtual finish time with which the aggregate
 * should be labeled if it used all its budget once in service.
 */
static inline void qfq_update_agg_ts(struct qfq_sched *q,
				     struct qfq_aggregate *agg,
				     enum update_reason reason)
{ … }

static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg);

static struct sk_buff *qfq_dequeue(struct Qdisc *sch)
{ … }

static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *q)
{ … }

static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
		       struct sk_buff **to_free)
{ … }

/*
 * Schedule aggregate according to its timestamps.
 */
static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
{ … }

/* Update agg ts and schedule agg for service */
static void qfq_activate_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
			     enum update_reason reason)
{ … }

static void qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp,
			    struct qfq_aggregate *agg)
{ … }

/*
 * Called to forcibly deschedule an aggregate. If the aggregate is
 * not in the front bucket, or if the front bucket contains other
 * aggregates besides it, we can simply remove the aggregate with no
 * other side effects.
 * Otherwise we must propagate the event up.
 */
static void qfq_deactivate_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
{ … }

static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg)
{ … }

static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt,
			  struct netlink_ext_ack *extack)
{ … }

static void qfq_reset_qdisc(struct Qdisc *sch)
{ … }

static void qfq_destroy_qdisc(struct Qdisc *sch)
{ … }

static const struct Qdisc_class_ops qfq_class_ops = …;

static struct Qdisc_ops qfq_qdisc_ops __read_mostly = …;

MODULE_ALIAS_NET_SCH(…);

static int __init qfq_init(void)
{ … }

static void __exit qfq_exit(void)
{ … }

module_init(…);
module_exit(qfq_exit);
MODULE_LICENSE(…);
MODULE_DESCRIPTION(…);
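/*
 * A minimal sketch of the start-time policy described in the comment
 * above qfq_update_start(): if F is not stale and F >= V, keep
 * S = F; otherwise restart from the system virtual time V. The real
 * function (body elided above) additionally consults the ER bitmap
 * so as not to violate the ordering of blocked groups. Hypothetical
 * helper, for illustration only.
 */
static inline u64 qfq_example_update_start(u64 F, u64 V, bool stale)
{
	/* wraparound-safe "F >= V" via signed difference */
	if (!stale && (s64)(F - V) >= 0)
		return F;
	return V;
}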