blk-iolatency.c | Explore in Territory

// SPDX-License-Identifier: GPL-2.0
/*
 * Block rq-qos base io controller
 *
 * This works similarly to wbt, with a few exceptions:
 *
 * - It's bio based, so the latency covers the whole block layer in addition to
 *   the actual io.
 * - We will throttle all IO that comes in here if we need to.
 * - We use the mean latency over the 100ms window.  This is because writes can
 *   be particularly fast, which could give us a false sense of the impact of
 *   other workloads on our protected workload.
 * - By default there's no throttling, we set the queue_depth to UINT_MAX so
 *   that we can have as many outstanding bio's as we're allowed to.  Only at
 *   throttle time do we pay attention to the actual queue depth.
 *
 * The hierarchy works like the cpu controller does, we track the latency at
 * every configured node, and each configured node has its own independent
 * queue depth.  This means that we only care about our latency targets at the
 * peer level.  Some group at the bottom of the hierarchy isn't going to affect
 * a group at the end of some other path if we're only configured at leaf
 * level.
 *
 * Consider the following
 *
 *                   root blkg
 *             /                     \
 *        fast (target=5ms)     slow (target=10ms)
 *         /     \                  /        \
 *       a        b          normal(15ms)   unloved
 *
 * "a" and "b" have no target, but their combined io under "fast" cannot exceed
 * an average latency of 5ms.  If it does then we will throttle the "slow"
 * group.  In the case of "normal", if it exceeds its 15ms target, we will
 * throttle "unloved", but nobody else.
 *
 * In this example "fast", "slow", and "normal" will be the only groups actually
 * accounting their io latencies.  We have to walk up the hierarchy to the root
 * on every submit and complete so we can do the appropriate stat recording and
 * adjust the queue depth of ourselves if needed.
 *
 * There are 2 ways we throttle IO.
 *
 * 1) Queue depth throttling.  As we throttle down we will adjust the maximum
 * number of IO's we're allowed to have in flight.  This starts at UINT_MAX (the
 * unthrottled default) and goes down to 1.  If the group is only ever
 * submitting IO for itself then this is the only way we throttle.
 *
 * 2) Induced delay throttling.  This is for the case that a group is generating
 * IO that has to be issued by the root cg to avoid priority inversion. So think
 * REQ_META or REQ_SWAP.  If we are already at qd == 1 and we're getting a lot
 * of work done for us on behalf of the root cg and are being asked to scale
 * down more then we induce a latency at userspace return.  We accumulate the
 * total amount of time we need to be punished by doing
 *
 * total_time += min_lat_nsec - actual_io_completion
 *
 * and then at throttle time will do
 *
 * throttle_time = min(total_time, NSEC_PER_SEC)
 *
 * This induced delay will throttle back the activity that is generating the
 * root cg issued io's, whether that's some metadata intensive operation or the
 * group is using so much memory that it is pushing us into swap.
 *
 * Copyright (C) 2018 Josef Bacik
 */
#include <linux/kernel.h>
#include <linux/blk_types.h>
#include <linux/backing-dev.h>
#include <linux/module.h>
#include <linux/timer.h>
#include <linux/memcontrol.h>
#include <linux/sched/loadavg.h>
#include <linux/sched/signal.h>
#include <trace/events/block.h>
#include <linux/blk-mq.h>
#include "blk-rq-qos.h"
#include "blk-stat.h"
#include "blk-cgroup.h"
#include "blk.h"

#define DEFAULT_SCALE_COOKIE …

static struct blkcg_policy blkcg_policy_iolatency;
struct iolatency_grp;

struct blk_iolatency { … };

static inline struct blk_iolatency *BLKIOLATENCY(struct rq_qos *rqos)
{ … }

struct child_latency_info { … };

struct percentile_stats { … };

struct latency_stat { … };

struct iolatency_grp { … };

#define BLKIOLATENCY_MIN_WIN_SIZE …
#define BLKIOLATENCY_MAX_WIN_SIZE …
/*
 * These are the constants used to fake the fixed-point moving average
 * calculation just like load average.  The call to calc_load() folds
 * (FIXED_1 (2048) - exp_factor) * new_sample into lat_avg.  The sampling
 * window size is bucketed to try to approximately calculate average
 * latency such that 1/exp (decay rate) is [1 min, 2.5 min) when windows
 * elapse immediately.  Note, windows only elapse with IO activity.  Idle
 * periods extend the most recent window.
 */
#define BLKIOLATENCY_NR_EXP_FACTORS …
#define BLKIOLATENCY_EXP_BUCKET_SIZE …
static const u64 iolatency_exp_factors[BLKIOLATENCY_NR_EXP_FACTORS] = …;

static inline struct iolatency_grp *pd_to_lat(struct blkg_policy_data *pd)
{ … }

static inline struct iolatency_grp *blkg_to_lat(struct blkcg_gq *blkg)
{ … }

static inline struct blkcg_gq *lat_to_blkg(struct iolatency_grp *iolat)
{ … }

static inline void latency_stat_init(struct iolatency_grp *iolat,
				     struct latency_stat *stat)
{ … }

static inline void latency_stat_sum(struct iolatency_grp *iolat,
				    struct latency_stat *sum,
				    struct latency_stat *stat)
{ … }

static inline void latency_stat_record_time(struct iolatency_grp *iolat,
					    u64 req_time)
{ … }

static inline bool latency_sum_ok(struct iolatency_grp *iolat,
				  struct latency_stat *stat)
{ … }

static inline u64 latency_stat_samples(struct iolatency_grp *iolat,
				       struct latency_stat *stat)
{ … }

static inline void iolat_update_total_lat_avg(struct iolatency_grp *iolat,
					      struct latency_stat *stat)
{ … }

static void iolat_cleanup_cb(struct rq_wait *rqw, void *private_data)
{ … }

static bool iolat_acquire_inflight(struct rq_wait *rqw, void *private_data)
{ … }

static void __blkcg_iolatency_throttle(struct rq_qos *rqos,
				       struct iolatency_grp *iolat,
				       bool issue_as_root,
				       bool use_memdelay)
{ … }

#define SCALE_DOWN_FACTOR …
#define SCALE_UP_FACTOR …

static inline unsigned long scale_amount(unsigned long qd, bool up)
{ … }

/*
 * We scale the qd down faster than we scale up, so we need to use this helper
 * to adjust the scale_cookie accordingly so we don't prematurely get
 * scale_cookie at DEFAULT_SCALE_COOKIE and unthrottle too much.
 *
 * Each group has their own local copy of the last scale cookie they saw, so if
 * the global scale cookie goes up or down they know which way they need to go
 * based on their last knowledge of it.
 */
static void scale_cookie_change(struct blk_iolatency *blkiolat,
				struct child_latency_info *lat_info,
				bool up)
{ … }

/*
 * Change the queue depth of the iolatency_grp.  We add 1/16th of the
 * queue depth at a time so we don't get wild swings and hopefully dial in to
 * fairer distribution of the overall queue depth.  We halve the queue depth
 * at a time so we can scale down queue depth quickly from default unlimited
 * to target.
 */
static void scale_change(struct iolatency_grp *iolat, bool up)
{ … }

/* Check our parent and see if the scale cookie has changed. */
static void check_scale_change(struct iolatency_grp *iolat)
{ … }

static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio)
{ … }

static void iolatency_record_time(struct iolatency_grp *iolat,
				  struct bio_issue *issue, u64 now,
				  bool issue_as_root)
{ … }

#define BLKIOLATENCY_MIN_ADJUST_TIME …
#define BLKIOLATENCY_MIN_GOOD_SAMPLES …

static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now)
{ … }

static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
{ … }

static void blkcg_iolatency_exit(struct rq_qos *rqos)
{ … }

static const struct rq_qos_ops blkcg_iolatency_ops = …;

static void blkiolatency_timer_fn(struct timer_list *t)
{ … }

/**
 * blkiolatency_enable_work_fn - Enable or disable iolatency on the device
 * @work: enable_work of the blk_iolatency of interest
 *
 * iolatency needs to keep track of the number of in-flight IOs per cgroup. This
 * is relatively expensive as it involves walking up the hierarchy twice for
 * every IO. Thus, if iolatency is not enabled in any cgroup for the device, we
 * want to disable the in-flight tracking.
 *
 * We have to make sure that the counting is balanced - we don't want to leak
 * the in-flight counts by disabling accounting in the completion path while IOs
 * are in flight. This is achieved by ensuring that no IO is in flight by
 * freezing the queue while flipping ->enabled. As this requires a sleepable
 * context, ->enabled flipping is punted to this work function.
 */
static void blkiolatency_enable_work_fn(struct work_struct *work)
{ … }

static int blk_iolatency_init(struct gendisk *disk)
{ … }

static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val)
{ … }

static void iolatency_clear_scaling(struct blkcg_gq *blkg)
{ … }

static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
			     size_t nbytes, loff_t off)
{ … }

static u64 iolatency_prfill_limit(struct seq_file *sf,
				  struct blkg_policy_data *pd, int off)
{ … }

static int iolatency_print_limit(struct seq_file *sf, void *v)
{ … }

static void iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s)
{ … }

static void iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s)
{ … }

static struct blkg_policy_data *iolatency_pd_alloc(struct gendisk *disk,
		struct blkcg *blkcg, gfp_t gfp)
{ … }

static void iolatency_pd_init(struct blkg_policy_data *pd)
{ … }

static void iolatency_pd_offline(struct blkg_policy_data *pd)
{ … }

static void iolatency_pd_free(struct blkg_policy_data *pd)
{ … }

static struct cftype iolatency_files[] = …;

static struct blkcg_policy blkcg_policy_iolatency = …;

static int __init iolatency_init(void)
{ … }

static void __exit iolatency_exit(void)
{ … }

module_init(…) …;
module_exit(iolatency_exit);
linux/block/blk-iolatency.c