// SPDX-License-Identifier: GPL-2.0 /* * Block rq-qos base io controller * * This works similar to wbt with a few exceptions * * - It's bio based, so the latency covers the whole block layer in addition to * the actual io. * - We will throttle all IO that comes in here if we need to. * - We use the mean latency over the 100ms window. This is because writes can * be particularly fast, which could give us a false sense of the impact of * other workloads on our protected workload. * - By default there's no throttling, we set the queue_depth to UINT_MAX so * that we can have as many outstanding bio's as we're allowed to. Only at * throttle time do we pay attention to the actual queue depth. * * The hierarchy works like the cpu controller does, we track the latency at * every configured node, and each configured node has its own independent * queue depth. This means that we only care about our latency targets at the * peer level. Some group at the bottom of the hierarchy isn't going to affect * a group at the end of some other path if we're only configured at leaf level. * * Consider the following * * root blkg * / \ * fast (target=5ms) slow (target=10ms) * / \ / \ * a b normal(15ms) unloved * * "a" and "b" have no target, but their combined io under "fast" cannot exceed * an average latency of 5ms. If it does then we will throttle the "slow" * group. In the case of "normal", if it exceeds its 15ms target, we will * throttle "unloved", but nobody else. * * In this example "fast", "slow", and "normal" will be the only groups actually * accounting their io latencies. We have to walk up the hierarchy to the root * on every submit and complete so we can do the appropriate stat recording and * adjust the queue depth of ourselves if needed. * * There are 2 ways we throttle IO. * * 1) Queue depth throttling. As we throttle down we will adjust the maximum * number of IO's we're allowed to have in flight. This starts at (u64)-1 down * to 1. 
If the group is only ever submitting IO for itself then this is the * only way we throttle. * * 2) Induced delay throttling. This is for the case that a group is generating * IO that has to be issued by the root cg to avoid priority inversion. So think * REQ_META or REQ_SWAP. If we are already at qd == 1 and we're getting a lot * of work done for us on behalf of the root cg and are being asked to scale * down more then we induce a latency at userspace return. We accumulate the * total amount of time we need to be punished by doing * * total_time += min_lat_nsec - actual_io_completion * * and then at throttle time will do * * throttle_time = min(total_time, NSEC_PER_SEC) * * This induced delay will throttle back the activity that is generating the * root cg issued io's, whether that's some metadata intensive operation or the * group is using so much memory that it is pushing us into swap. * * Copyright (C) 2018 Josef Bacik */ #include <linux/kernel.h> #include <linux/blk_types.h> #include <linux/backing-dev.h> #include <linux/module.h> #include <linux/timer.h> #include <linux/memcontrol.h> #include <linux/sched/loadavg.h> #include <linux/sched/signal.h> #include <trace/events/block.h> #include <linux/blk-mq.h> #include "blk-rq-qos.h" #include "blk-stat.h" #include "blk-cgroup.h" #include "blk.h" #define DEFAULT_SCALE_COOKIE … static struct blkcg_policy blkcg_policy_iolatency; struct iolatency_grp; struct blk_iolatency { … }; static inline struct blk_iolatency *BLKIOLATENCY(struct rq_qos *rqos) { … } struct child_latency_info { … }; struct percentile_stats { … }; struct latency_stat { … }; struct iolatency_grp { … }; #define BLKIOLATENCY_MIN_WIN_SIZE … #define BLKIOLATENCY_MAX_WIN_SIZE … /* * These are the constants used to fake the fixed-point moving average * calculation just like load average. The call to calc_load() folds * (FIXED_1 (2048) - exp_factor) * new_sample into lat_avg. 
The sampling * window size is bucketed to try to approximately calculate average * latency such that 1/exp (decay rate) is [1 min, 2.5 min) when windows * elapse immediately. Note, windows only elapse with IO activity. Idle * periods extend the most recent window. */ #define BLKIOLATENCY_NR_EXP_FACTORS … #define BLKIOLATENCY_EXP_BUCKET_SIZE … static const u64 iolatency_exp_factors[BLKIOLATENCY_NR_EXP_FACTORS] = …; static inline struct iolatency_grp *pd_to_lat(struct blkg_policy_data *pd) { … } static inline struct iolatency_grp *blkg_to_lat(struct blkcg_gq *blkg) { … } static inline struct blkcg_gq *lat_to_blkg(struct iolatency_grp *iolat) { … } static inline void latency_stat_init(struct iolatency_grp *iolat, struct latency_stat *stat) { … } static inline void latency_stat_sum(struct iolatency_grp *iolat, struct latency_stat *sum, struct latency_stat *stat) { … } static inline void latency_stat_record_time(struct iolatency_grp *iolat, u64 req_time) { … } static inline bool latency_sum_ok(struct iolatency_grp *iolat, struct latency_stat *stat) { … } static inline u64 latency_stat_samples(struct iolatency_grp *iolat, struct latency_stat *stat) { … } static inline void iolat_update_total_lat_avg(struct iolatency_grp *iolat, struct latency_stat *stat) { … } static void iolat_cleanup_cb(struct rq_wait *rqw, void *private_data) { … } static bool iolat_acquire_inflight(struct rq_wait *rqw, void *private_data) { … } static void __blkcg_iolatency_throttle(struct rq_qos *rqos, struct iolatency_grp *iolat, bool issue_as_root, bool use_memdelay) { … } #define SCALE_DOWN_FACTOR … #define SCALE_UP_FACTOR … static inline unsigned long scale_amount(unsigned long qd, bool up) { … } /* * We scale the qd down faster than we scale up, so we need to use this helper * to adjust the scale_cookie accordingly so we don't prematurely get * scale_cookie at DEFAULT_SCALE_COOKIE and unthrottle too much. 
* * Each group has their own local copy of the last scale cookie they saw, so if * the global scale cookie goes up or down they know which way they need to go * based on their last knowledge of it. */ static void scale_cookie_change(struct blk_iolatency *blkiolat, struct child_latency_info *lat_info, bool up) { … } /* * Change the queue depth of the iolatency_grp. We add 1/16th of the * queue depth at a time so we don't get wild swings and hopefully dial in to * fairer distribution of the overall queue depth. We halve the queue depth * at a time so we can scale down queue depth quickly from default unlimited * to target. */ static void scale_change(struct iolatency_grp *iolat, bool up) { … } /* Check our parent and see if the scale cookie has changed. */ static void check_scale_change(struct iolatency_grp *iolat) { … } static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio) { … } static void iolatency_record_time(struct iolatency_grp *iolat, struct bio_issue *issue, u64 now, bool issue_as_root) { … } #define BLKIOLATENCY_MIN_ADJUST_TIME … #define BLKIOLATENCY_MIN_GOOD_SAMPLES … static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now) { … } static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio) { … } static void blkcg_iolatency_exit(struct rq_qos *rqos) { … } static const struct rq_qos_ops blkcg_iolatency_ops = …; static void blkiolatency_timer_fn(struct timer_list *t) { … } /** * blkiolatency_enable_work_fn - Enable or disable iolatency on the device * @work: enable_work of the blk_iolatency of interest * * iolatency needs to keep track of the number of in-flight IOs per cgroup. This * is relatively expensive as it involves walking up the hierarchy twice for * every IO. Thus, if iolatency is not enabled in any cgroup for the device, we * want to disable the in-flight tracking. 
* * We have to make sure that the counting is balanced - we don't want to leak * the in-flight counts by disabling accounting in the completion path while IOs * are in flight. This is achieved by ensuring that no IO is in flight by * freezing the queue while flipping ->enabled. As this requires a sleepable * context, ->enabled flipping is punted to this work function. */ static void blkiolatency_enable_work_fn(struct work_struct *work) { … } static int blk_iolatency_init(struct gendisk *disk) { … } static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val) { … } static void iolatency_clear_scaling(struct blkcg_gq *blkg) { … } static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { … } static u64 iolatency_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd, int off) { … } static int iolatency_print_limit(struct seq_file *sf, void *v) { … } static void iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s) { … } static void iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) { … } static struct blkg_policy_data *iolatency_pd_alloc(struct gendisk *disk, struct blkcg *blkcg, gfp_t gfp) { … } static void iolatency_pd_init(struct blkg_policy_data *pd) { … } static void iolatency_pd_offline(struct blkg_policy_data *pd) { … } static void iolatency_pd_free(struct blkg_policy_data *pd) { … } static struct cftype iolatency_files[] = …; static struct blkcg_policy blkcg_policy_iolatency = …; static int __init iolatency_init(void) { … } static void __exit iolatency_exit(void) { … } module_init(…) …; module_exit(iolatency_exit);