linux/block/blk-iocost.c

/* SPDX-License-Identifier: GPL-2.0
 *
 * IO cost model based controller.
 *
 * Copyright (C) 2019 Tejun Heo <[email protected]>
 * Copyright (C) 2019 Andy Newell <[email protected]>
 * Copyright (C) 2019 Facebook
 *
 * One challenge of controlling IO resources is the lack of a trivially
 * observable cost metric.  This is distinguished from CPU and memory where
 * wallclock time and the number of bytes can serve as accurate enough
 * approximations.
 *
 * Bandwidth and iops are the most commonly used metrics for IO devices but
 * depending on the type and specifics of the device, different IO patterns
 * easily lead to multiple orders of magnitude variations rendering them
 * useless for the purpose of IO capacity distribution.  While on-device
 * time, with a lot of crutches, could serve as a useful approximation for
 * non-queued rotational devices, this is no longer viable with modern
 * devices, even the rotational ones.
 *
 * While there is no cost metric we can trivially observe, it isn't a
 * complete mystery.  For example, on a rotational device, seek cost
 * dominates while a contiguous transfer contributes a smaller amount
 * proportional to the size.  If we can characterize at least the relative
 * costs of these different types of IOs, it should be possible to
 * implement a reasonable work-conserving proportional IO resource
 * distribution.
 *
 * 1. IO Cost Model
 *
 * IO cost model estimates the cost of an IO given its basic parameters and
 * history (e.g. the end sector of the last IO).  The cost is measured in
 * device time.  If a given IO is estimated to cost 10ms, the device should
 * be able to process ~100 of those IOs in a second.
 *
 * Currently, there's only one builtin cost model - linear.  Each IO is
 * classified as sequential or random and given a base cost accordingly.
 * On top of that, a size cost proportional to the length of the IO is
 * added.  While simple, this model captures the operational
 * characteristics of a wide variety of devices well enough.  Default
 * parameters for several different classes of devices are provided and the
 * parameters can be configured from userspace via
 * /sys/fs/cgroup/io.cost.model.
 *
 * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate
 * device-specific coefficients.
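 *
 * As an illustration only (names hypothetical, not the in-kernel
 * implementation), the linear model boils down to:
 *
 *	u64 linear_cost(bool is_seq, u64 nr_pages)
 *	{
 *		u64 base = is_seq ? coef_seqio : coef_randio;
 *
 *		return base + nr_pages * coef_page;
 *	}
 *
 * where an IO is classified as sequential if it starts at the sector
 * where the previous IO ended.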
 *
 * 2. Control Strategy
 *
 * The device virtual time (vtime) is used as the primary control metric.
 * The control strategy is composed of the following three parts.
 *
 * 2-1. Vtime Distribution
 *
 * When a cgroup becomes active in terms of IOs, its hierarchical share is
 * calculated.  Please consider the following hierarchy where the numbers
 * inside parentheses denote the configured weights.
 *
 *           root
 *         /       \
 *      A (w:100)  B (w:300)
 *      /       \
 *  A0 (w:100)  A1 (w:100)
 *
 * If B is idle and only A0 and A1 are actively issuing IOs, as the two are
 * of equal weight, each gets 50% share.  If then B starts issuing IOs, B
 * gets 300/(100+300) or 75% share, and A0 and A1 equally split the rest,
 * 12.5% each.  The distribution mechanism only cares about these flattened
 * shares.  They're called hweights (hierarchical weights) and always add
 * up to 1 (WEIGHT_ONE).
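 *
 * A given iocg's hweight is the product of its share at each level.  For
 * the hierarchy above with all of A0, A1 and B active (a worked example,
 * WEIGHT_ONE being the fixed-point unit):
 *
 *	hweight(A0) = WEIGHT_ONE * 100 / (100 + 300)	(A's level)
 *				 * 100 / (100 + 100)	(A0's level)
 *		    = WEIGHT_ONE / 8			(ie. 12.5%)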
 *
 * A given cgroup's vtime runs slower in inverse proportion to its hweight.
 * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5)
 * against the device vtime - an IO which takes 10ms on the underlying
 * device is considered to take 80ms on A0.
 *
 * This constitutes the basis of IO capacity distribution.  Each cgroup's
 * vtime is running at a rate determined by its hweight.  A cgroup tracks
 * the vtime consumed by past IOs and can issue a new IO if doing so
 * wouldn't outrun the current device vtime.  Otherwise, the IO is
 * suspended until the vtime has progressed enough to cover it.
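 *
 * Conceptually, the resulting admission test is along the lines of the
 * following sketch (names illustrative, margins and debt omitted):
 *
 *	abs_cost = cost_model(bio);
 *	cost = abs_cost * WEIGHT_ONE / hweight;
 *	if (iocg_vtime + cost <= device_vtime)
 *		issue(bio);
 *	else
 *		wait until device_vtime covers iocg_vtime + cost;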
 *
 * 2-2. Vrate Adjustment
 *
 * It's unrealistic to expect the cost model to be perfect.  There are too
 * many devices and even on the same device the overall performance
 * fluctuates depending on numerous factors such as IO mixture and device
 * internal garbage collection.  The controller needs to adapt dynamically.
 *
 * This is achieved by adjusting the overall IO rate according to how busy
 * the device is.  If the device becomes overloaded, we're sending down too
 * many IOs and should generally slow down.  If there are waiting issuers
 * but the device isn't saturated, we're issuing too few and should
 * generally speed up.
 *
 * To slow down, we lower the vrate - the rate at which the device vtime
 * passes compared to the wall clock.  For example, if the vtime is running
 * at the vrate of 75%, all cgroups added up would only be able to issue
 * 750ms worth of IOs per second, and vice-versa for speeding up.
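 *
 * Equivalently, per millisecond of wall clock, the device vtime advances
 * by
 *
 *	vtime_delta = 1ms * vrate	(750us at a vrate of 75%)
 *
 * and every cgroup's budget replenishes proportionally slower or faster.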
 *
 * Device busyness is determined using two criteria - rq wait and
 * completion latencies.
 *
 * When a device gets saturated, the on-device and then the request queues
 * fill up and a bio which is ready to be issued has to wait for a request
 * to become available.  When this delay becomes noticeable, it's a clear
 * indication that the device is saturated and we lower the vrate.  This
 * saturation signal is fairly conservative as it only triggers when both
 * hardware and software queues are filled up, and is used as the default
 * busy signal.
 *
 * As devices can have deep queues and be unfair in how the queued commands
 * are executed, solely depending on rq wait may not result in satisfactory
 * control quality.  For a better control quality, completion latency QoS
 * parameters can be configured so that the device is considered saturated
 * if N'th percentile completion latency rises above the set point.
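 *
 * Schematically, the per-period busy determination reads something like
 * the following sketch (simplified, not the exact in-kernel logic):
 *
 *	if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
 *	    missed_ppm[READ] > ppm_rthr ||
 *	    missed_ppm[WRITE] > ppm_wthr)
 *		ioc->busy_level++;	(saturated - vrate goes down)
 *	else if (nr_shortages && !nr_lagging)
 *		ioc->busy_level--;	(starved - vrate goes up)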
 *
 * The completion latency requirements are a function of both the
 * underlying device characteristics and the desired IO latency quality of
 * service.  There is an inherent trade-off - the tighter the latency QoS,
 * the higher the bandwidth lossage.  Latency QoS is disabled by default
 * and can be set through /sys/fs/cgroup/io.cost.qos.
 *
 * 2-3. Work Conservation
 *
 * Imagine two cgroups A and B with equal weights.  A is issuing a small IO
 * periodically while B is sending out enough parallel IOs to saturate the
 * device on its own.  Let's say A's usage amounts to 100ms worth of IO
 * cost per second, i.e., 10% of the device capacity.  The naive
 * distribution of half and half would lead to 60% utilization of the
 * device, a significant reduction in the total amount of work done
 * compared to free-for-all competition.  This is too high a cost to pay
 * for IO control.
 *
 * To conserve the total amount of work done, we keep track of how much
 * each active cgroup is actually using and yield part of its weight if
 * there are other cgroups which can make use of it.  In the above case,
 * A's weight will be lowered so that it hovers above the actual usage and
 * B would be able to use the rest.
 *
 * As we don't want to penalize a cgroup for donating its weight, the
 * surplus weight adjustment factors in a margin and has an immediate
 * snapback mechanism in case the cgroup needs more IO vtime for itself.
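 *
 * In rough pseudo-code, the per-period surplus handling is (sketch only):
 *
 *	if (usage + margin < hweight_inuse)
 *		lower inuse so that hweight_inuse ~= usage + margin;
 *	if (the iocg then runs short within the period)
 *		snap inuse back toward the full active weight;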
 *
 * Note that adjusting down surplus weights has the same effect as
 * accelerating vtime for other cgroups, and work conservation could also
 * be implemented by adjusting vrate dynamically.  However, squaring away
 * who can donate and how much each should take back requires hweight
 * propagation anyway, making it easier to implement and understand as a
 * separate mechanism.
 *
 * 3. Monitoring
 *
 * Instead of debugfs or other clumsy monitoring mechanisms, this
 * controller uses a drgn based monitoring script -
 * tools/cgroup/iocost_monitor.py.  For details on drgn, please see
 * https://github.com/osandov/drgn.  The output looks like the following.
 *
 *  sdb RUN   per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12%
 *                 active      weight      hweight% inflt% dbt  delay usages%
 *  test/a              *    50/   50  33.33/ 33.33  27.65   2  0*041 033:033:033
 *  test/b              *   100/  100  66.67/ 66.67  17.56   0  0*000 066:079:077
 *
 * - per	: Timer period
 * - cur_per	: Internal wall and device vtime clock
 * - vrate	: Device virtual time rate against wall clock
 * - weight	: Surplus-adjusted and configured weights
 * - hweight	: Surplus-adjusted and configured hierarchical weights
 * - inflt	: The percentage of in-flight IO cost at the end of last period
 * - delay	: Deferred issuer delay induction level and duration
 * - usages	: Usage history
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/timer.h>
#include <linux/time64.h>
#include <linux/parser.h>
#include <linux/sched/signal.h>
#include <asm/local.h>
#include <asm/local64.h>
#include "blk-rq-qos.h"
#include "blk-stat.h"
#include "blk-wbt.h"
#include "blk-cgroup.h"

#ifdef CONFIG_TRACEPOINTS

/* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */
#define TRACE_IOCG_PATH_LEN 1024
static DEFINE_SPINLOCK(trace_iocg_path_lock);
static char trace_iocg_path[TRACE_IOCG_PATH_LEN];

#define TRACE_IOCG_PATH(type, iocg, ...)

#else	/* CONFIG_TRACEPOINTS */
#define TRACE_IOCG_PATH(type, iocg, ...)	do { } while (0)
#endif	/* CONFIG_TRACEPOINTS */

enum {};

enum {};

enum {};

enum ioc_running {};

/* io.cost.qos controls including per-dev enable of the whole controller */
enum {};

/* io.cost.qos params */
enum {};

/* io.cost.model controls */
enum {};

/* builtin linear cost model coefficients */
enum {};

enum {};

enum {};

struct ioc_params {};

struct ioc_margins {};

struct ioc_missed {};

struct ioc_pcpu_stat {};

/* per device */
struct ioc {};

struct iocg_pcpu_stat {};

struct iocg_stat {};

/* per device-cgroup pair */
struct ioc_gq {};

/* per cgroup */
struct ioc_cgrp {};

struct ioc_now {};

struct iocg_wait {};

struct iocg_wake_ctx {};

static const struct ioc_params autop[] =;

/*
 * vrate adjust percentages indexed by ioc->busy_level.  We adjust up on
 * vtime credit shortage and down on device saturation.
 */
static u32 vrate_adj_pct[] =;
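
/*
 * For illustration, each period the timer nudges vrate along these lines
 * (a sketch; the in-kernel arithmetic differs in details):
 *
 *	idx = min(abs(ioc->busy_level), ARRAY_SIZE(vrate_adj_pct) - 1);
 *	if (ioc->busy_level > 0)
 *		vrate -= vrate * vrate_adj_pct[idx] / 100;	(slow down)
 *	else if (ioc->busy_level < 0)
 *		vrate += vrate * vrate_adj_pct[idx] / 100;	(speed up)
 */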

static struct blkcg_policy blkcg_policy_iocost;

/* accessors and helpers */
static struct ioc *rqos_to_ioc(struct rq_qos *rqos)
{}

static struct ioc *q_to_ioc(struct request_queue *q)
{}

static const char __maybe_unused *ioc_name(struct ioc *ioc)
{}

static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
{}

static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg)
{}

static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg)
{}

static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
{}

/*
 * Scale @abs_cost to the inverse of @hw_inuse.  The lower the hierarchical
 * weight, the more expensive each IO.  Must round up.
 */
static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
{}
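
/*
 * A minimal sketch of the scaling above, with WEIGHT_ONE as the hweight
 * fixed-point unit:
 *
 *	return DIV64_U64_ROUND_UP(abs_cost * WEIGHT_ONE, hw_inuse);
 */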

/*
 * The inverse of abs_cost_to_cost().  Must round up.
 */
static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
{}

static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio,
			    u64 abs_cost, u64 cost)
{}

static void iocg_lock(struct ioc_gq *iocg, bool lock_ioc, unsigned long *flags)
{}

static void iocg_unlock(struct ioc_gq *iocg, bool unlock_ioc, unsigned long *flags)
{}

#define CREATE_TRACE_POINTS
#include <trace/events/iocost.h>

static void ioc_refresh_margins(struct ioc *ioc)
{}

/* latency QoS params changed, update period_us and all the dependent params */
static void ioc_refresh_period_us(struct ioc *ioc)
{}

/*
 *  ioc->rqos.disk isn't initialized when this function is called from
 *  the init path.
 */
static int ioc_autop_idx(struct ioc *ioc, struct gendisk *disk)
{}

/*
 * Take the following as input
 *
 *  @bps	maximum sequential throughput
 *  @seqiops	maximum sequential 4k iops
 *  @randiops	maximum random 4k iops
 *
 * and calculate the linear model cost coefficients.
 *
 *  *@page	per-page cost		1s / (@bps / 4096)
 *  *@seqio	base cost of a seq IO	max((1s / @seqiops) - *@page, 0)
 *  *@randio	base cost of a rand IO	max((1s / @randiops) - *@page, 0)
 */
static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
			u64 *page, u64 *seqio, u64 *randio)
{}
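
/*
 * Worked example with hypothetical device numbers, costs expressed in
 * nanoseconds for readability: given @bps = 1,200 MB/s, @seqiops = 200k
 * and @randiops = 75k,
 *
 *	*@page   = 10^9 / (1200000000 / 4096) ~= 3413
 *	*@seqio  = max(10^9 / 200000 - 3413, 0)  = 1587
 *	*@randio = max(10^9 /  75000 - 3413, 0)  = 9920
 */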

static void ioc_refresh_lcoefs(struct ioc *ioc)
{}

/*
 * struct gendisk is required as an argument because ioc->rqos.disk
 * is not properly initialized when called from the init path.
 */
static bool ioc_refresh_params_disk(struct ioc *ioc, bool force,
				    struct gendisk *disk)
{}

static bool ioc_refresh_params(struct ioc *ioc, bool force)
{}

/*
 * When an iocg accumulates too much vtime or gets deactivated, we throw away
 * some vtime, which lowers the overall device utilization. As the exact amount
 * which is being thrown away is known, we can compensate by accelerating the
 * vrate accordingly so that the extra vtime generated in the current period
 * matches what got lost.
 */
static void ioc_refresh_vrate(struct ioc *ioc, struct ioc_now *now)
{}
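
/*
 * For example, with a 100ms period and 5ms worth of thrown-away vtime:
 *
 *	vrate = base_vrate * (period + lost) / period
 *	      = base_vrate * 105 / 100
 *
 * for the current period, after which vrate reverts to base_vrate.
 */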

static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct,
				  int nr_lagging, int nr_shortages,
				  int prev_busy_level, u32 *missed_ppm)
{}

/* take a snapshot of the current [v]time and vrate */
static void ioc_now(struct ioc *ioc, struct ioc_now *now)
{}

static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
{}

/*
 * Update @iocg's `active` and `inuse` to @active and @inuse, update level
 * weight sums and propagate upwards accordingly. If @save, the current margin
 * is saved to be used as reference for later inuse in-period adjustments.
 */
static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse,
				bool save, struct ioc_now *now)
{}

static void commit_weights(struct ioc *ioc)
{}

static void propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse,
			      bool save, struct ioc_now *now)
{}

static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep)
{}

/*
 * Calculate the hweight_inuse @iocg would get with max @inuse assuming all the
 * other weights stay unchanged.
 */
static u32 current_hweight_max(struct ioc_gq *iocg)
{}

static void weight_updated(struct ioc_gq *iocg, struct ioc_now *now)
{}

static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
{}

static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now)
{}

static void iocg_incur_debt(struct ioc_gq *iocg, u64 abs_cost,
			    struct ioc_now *now)
{}

static void iocg_pay_debt(struct ioc_gq *iocg, u64 abs_vpay,
			  struct ioc_now *now)
{}

static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
			int flags, void *key)
{}

/*
 * Calculate the accumulated budget, pay debt if @pay_debt and wake up waiters
 * accordingly. When @pay_debt is %true, the caller must be holding ioc->lock in
 * addition to iocg->waitq.lock.
 */
static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt,
			    struct ioc_now *now)
{}

static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
{}

static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
{}

/* was iocg idle this period? */
static bool iocg_is_idle(struct ioc_gq *iocg)
{}

/*
 * Call this function on the target leaf @iocgs to build a pre-order traversal
 * list of all the ancestors in @inner_walk. The inner nodes are linked through
 * ->walk_list and the caller is responsible for dissolving the list after use.
 */
static void iocg_build_inner_walk(struct ioc_gq *iocg,
				  struct list_head *inner_walk)
{}

/* propagate the deltas to the parent */
static void iocg_flush_stat_upward(struct ioc_gq *iocg)
{}

/* collect per-cpu counters and propagate the deltas to the parent */
static void iocg_flush_stat_leaf(struct ioc_gq *iocg, struct ioc_now *now)
{}

/* get stat counters ready for reading on all active iocgs */
static void iocg_flush_stat(struct list_head *target_iocgs, struct ioc_now *now)
{}

/*
 * Determine what @iocg's hweight_inuse should be after donating unused
 * capacity. @hwm is the upper bound and used to signal no donation. This
 * function also throws away @iocg's excess budget.
 */
static u32 hweight_after_donation(struct ioc_gq *iocg, u32 old_hwi, u32 hwm,
				  u32 usage, struct ioc_now *now)
{}

/*
 * For work-conservation, an iocg which isn't using all of its share should
 * donate the leftover to other iocgs. There are two ways to achieve this - 1.
 * bumping up vrate accordingly 2. lowering the donating iocg's inuse weight.
 *
 * #1 is mathematically simpler but has the drawback of requiring synchronous
 * global hweight_inuse updates when idle iocg's get activated or inuse weights
 * change due to donation snapbacks as it has the possibility of grossly
 * overshooting what's allowed by the model and vrate.
 *
 * #2 is inherently safe with local operations. The donating iocg can easily
 * snap back to higher weights when needed without worrying about impacts on
 * other nodes as the impacts will be inherently correct. This also makes idle
 * iocg activations safe. The only effect activations have is decreasing
 * hweight_inuse of others, the right solution to which is for those iocgs to
 * snap back to higher weights.
 *
 * So, we go with #2. The challenge is calculating how each donating iocg's
 * inuse should be adjusted to achieve the target donation amounts. This is done
 * using Andy's method described in the following pdf.
 *
 *   https://drive.google.com/file/d/1PsJwxPFtjUnwOY1QJ5AeICCcsL7BM3bo
 *
 * Given the weights and target after-donation hweight_inuse values, Andy's
 * method determines what the proportional distribution should look like at each
 * sibling level to maintain the relative relationship between all non-donating
 * pairs. To roughly summarize, it divides the tree into donating and
 * non-donating parts, calculates global donation rate which is used to
 * determine the target hweight_inuse for each node, and then derives per-level
 * proportions.
 *
 * The following pdf shows that global distribution calculated this way can be
 * achieved by scaling inuse weights of donating leaves and propagating the
 * adjustments upwards proportionally.
 *
 *   https://drive.google.com/file/d/1vONz1-fzVO7oY5DXXsLjSxEtYYQbOvsE
 *
 * Combining the above two, we can determine how each leaf iocg's inuse should
 * be adjusted to achieve the target donation.
 *
 *   https://drive.google.com/file/d/1WcrltBOSPN0qXVdBgnKm4mdp9FhuEFQN
 *
 * The inline comments use symbols from the last pdf.
 *
 *   b is the sum of the absolute budgets in the subtree. 1 for the root node.
 *   f is the sum of the absolute budgets of non-donating nodes in the subtree.
 *   t is the sum of the absolute budgets of donating nodes in the subtree.
 *   w is the weight of the node. w = w_f + w_t
 *   w_f is the non-donating portion of w. w_f = w * f / b
 *   w_t is the donating portion of w. w_t = w * t / b
 *   s is the sum of all sibling weights. s = Sum(w) for siblings
 *   s_f and s_t are the non-donating and donating portions of s.
 *
 * Subscript p denotes the parent's counterpart and ' the adjusted value - e.g.
 * w_pt is the donating portion of the parent's weight and w'_pt the same value
 * after adjustments. Subscript r denotes the root node's values.
 */
static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now)
{}
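
/*
 * For example, a node with w = 100 whose subtree spends 60% of its budget
 * on non-donating nodes (f / b = 0.6) has w_f = 60 and w_t = 40.
 */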

/*
 * A low weight iocg can amass a large amount of debt, for example, when
 * anonymous memory gets reclaimed aggressively. If the system has a lot of
 * memory paired with a slow IO device, the debt can span multiple seconds or
 * more. If there are no other subsequent IO issuers, the in-debt iocg may end
 * up blocked paying its debt while the IO device is idle.
 *
 * The following protects against such cases. If the device has been
 * sufficiently idle for a while, the debts are halved and delays are
 * recalculated.
 */
static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors,
			      struct ioc_now *now)
{}
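
/*
 * Sketch of the forgiveness step (thresholds and delay math omitted):
 *
 *	if (the device has been idle long enough) {
 *		iocg->abs_vdebt /= 2;
 *		recalculate the issuer delay from the remaining debt;
 *	}
 */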

/*
 * Check the active iocgs' state to avoid oversleeping and deactivate
 * idle iocgs.
 *
 * Since waiters determine the sleep durations based on the vrate
 * they saw at the time of sleep, if vrate has increased, some
 * waiters could be sleeping for too long. Wake up tardy waiters
 * which should have woken up in the last period and expire idle
 * iocgs.
 */
static int ioc_check_iocgs(struct ioc *ioc, struct ioc_now *now)
{}

static void ioc_timer_fn(struct timer_list *timer)
{}

static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime,
				      u64 abs_cost, struct ioc_now *now)
{}

static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
				    bool is_merge, u64 *costp)
{}

static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge)
{}

static void calc_size_vtime_cost_builtin(struct request *rq, struct ioc *ioc,
					 u64 *costp)
{}

static u64 calc_size_vtime_cost(struct request *rq, struct ioc *ioc)
{}

static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
{}

static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
			   struct bio *bio)
{}

static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
{}

static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
{}

static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
{}

static void ioc_rqos_exit(struct rq_qos *rqos)
{}

static const struct rq_qos_ops ioc_rqos_ops =;

static int blk_iocost_init(struct gendisk *disk)
{}

static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
{}

static void ioc_cpd_free(struct blkcg_policy_data *cpd)
{}

static struct blkg_policy_data *ioc_pd_alloc(struct gendisk *disk,
		struct blkcg *blkcg, gfp_t gfp)
{}

static void ioc_pd_init(struct blkg_policy_data *pd)
{}

static void ioc_pd_free(struct blkg_policy_data *pd)
{}

static void ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s)
{}

static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
			     int off)
{}

static int ioc_weight_show(struct seq_file *sf, void *v)
{}

static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
				size_t nbytes, loff_t off)
{}

static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
			  int off)
{}

static int ioc_qos_show(struct seq_file *sf, void *v)
{}

static const match_table_t qos_ctrl_tokens =;

static const match_table_t qos_tokens =;

static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
			     size_t nbytes, loff_t off)
{}

static u64 ioc_cost_model_prfill(struct seq_file *sf,
				 struct blkg_policy_data *pd, int off)
{}

static int ioc_cost_model_show(struct seq_file *sf, void *v)
{}

static const match_table_t cost_ctrl_tokens =;

static const match_table_t i_lcoef_tokens =;

static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
				    size_t nbytes, loff_t off)
{}

static struct cftype ioc_files[] =;

static struct blkcg_policy blkcg_policy_iocost =;

static int __init ioc_init(void)
{}

static void __exit ioc_exit(void)
{}

module_init(ioc_init);
module_exit(ioc_exit);