// SPDX-License-Identifier: GPL-2.0-or-later /* * Budget Fair Queueing (BFQ) I/O scheduler. * * Based on ideas and code from CFQ: * Copyright (C) 2003 Jens Axboe <[email protected]> * * Copyright (C) 2008 Fabio Checconi <[email protected]> * Paolo Valente <[email protected]> * * Copyright (C) 2010 Paolo Valente <[email protected]> * Arianna Avanzini <[email protected]> * * Copyright (C) 2017 Paolo Valente <[email protected]> * * BFQ is a proportional-share I/O scheduler, with some extra * low-latency capabilities. BFQ also supports full hierarchical * scheduling through cgroups. Next paragraphs provide an introduction * on BFQ inner workings. Details on BFQ benefits, usage and * limitations can be found in Documentation/block/bfq-iosched.rst. * * BFQ is a proportional-share storage-I/O scheduling algorithm based * on the slice-by-slice service scheme of CFQ. But BFQ assigns * budgets, measured in number of sectors, to processes instead of * time slices. The device is not granted to the in-service process * for a given time slice, but until it has exhausted its assigned * budget. This change from the time to the service domain enables BFQ * to distribute the device throughput among processes as desired, * without any distortion due to throughput fluctuations, or to device * internal queueing. BFQ uses an ad hoc internal scheduler, called * B-WF2Q+, to schedule processes according to their budgets. More * precisely, BFQ schedules queues associated with processes. Each * process/queue is assigned a user-configurable weight, and B-WF2Q+ * guarantees that each queue receives a fraction of the throughput * proportional to its weight. Thanks to the accurate policy of * B-WF2Q+, BFQ can afford to assign high budgets to I/O-bound * processes issuing sequential requests (to boost the throughput), * and yet guarantee a low latency to interactive and soft real-time * applications. * * In particular, to provide these low-latency guarantees, BFQ * explicitly privileges the I/O of two classes of time-sensitive * applications: interactive and soft real-time. In more detail, BFQ * behaves this way if the low_latency parameter is set (default * configuration). This feature enables BFQ to provide applications in * these classes with a very low latency. * * To implement this feature, BFQ constantly tries to detect whether * the I/O requests in a bfq_queue come from an interactive or a soft * real-time application. For brevity, in these cases, the queue is * said to be interactive or soft real-time. In both cases, BFQ * privileges the service of the queue, over that of non-interactive * and non-soft-real-time queues. This privileging is performed, * mainly, by raising the weight of the queue. So, for brevity, we * call just weight-raising periods the time periods during which a * queue is privileged, because deemed interactive or soft real-time. * * The detection of soft real-time queues/applications is described in * detail in the comments on the function * bfq_bfqq_softrt_next_start. On the other hand, the detection of an * interactive queue works as follows: a queue is deemed interactive * if it is constantly non empty only for a limited time interval, * after which it does become empty. The queue may be deemed * interactive again (for a limited time), if it restarts being * constantly non empty, provided that this happens only after the * queue has remained empty for a given minimum idle time. * * By default, BFQ computes automatically the above maximum time * interval, i.e., the time interval after which a constantly * non-empty queue stops being deemed interactive. Since a queue is * weight-raised while it is deemed interactive, this maximum time * interval happens to coincide with the (maximum) duration of the * weight-raising for interactive queues. * * Finally, BFQ also features additional heuristics for * preserving both a low latency and a high throughput on NCQ-capable, * rotational or flash-based devices, and to get the job done quickly * for applications consisting in many I/O-bound processes. * * NOTE: if the main or only goal, with a given device, is to achieve * the maximum-possible throughput at all times, then do switch off * all low-latency heuristics for that device, by setting low_latency * to 0. * * BFQ is described in [1], where also a reference to the initial, * more theoretical paper on BFQ can be found. The interested reader * can find in the latter paper full details on the main algorithm, as * well as formulas of the guarantees and formal proofs of all the * properties. With respect to the version of BFQ presented in these * papers, this implementation adds a few more heuristics, such as the * ones that guarantee a low latency to interactive and soft real-time * applications, and a hierarchical extension based on H-WF2Q+. * * B-WF2Q+ is based on WF2Q+, which is described in [2], together with * H-WF2Q+, while the augmented tree used here to implement B-WF2Q+ * with O(log N) complexity derives from the one introduced with EEVDF * in [3]. * * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O * Scheduler", Proceedings of the First Workshop on Mobile System * Technologies (MST-2015), May 2015. * http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf * * [2] Jon C.R. Bennett and H. Zhang, "Hierarchical Packet Fair Queueing * Algorithms", IEEE/ACM Transactions on Networking, 5(5):675-689, * Oct 1997. * * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz * * [3] I. Stoica and H. Abdel-Wahab, "Earliest Eligible Virtual Deadline * First: A Flexible and Accurate Mechanism for Proportional Share * Resource Allocation", technical report. * * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf */ #include <linux/module.h> #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/cgroup.h> #include <linux/ktime.h> #include <linux/rbtree.h> #include <linux/ioprio.h> #include <linux/sbitmap.h> #include <linux/delay.h> #include <linux/backing-dev.h> #include <trace/events/block.h> #include "elevator.h" #include "blk.h" #include "blk-mq.h" #include "blk-mq-sched.h" #include "bfq-iosched.h" #include "blk-wbt.h" #define BFQ_BFQQ_FNS … BFQ_BFQQ_FNS(just_created); BFQ_BFQQ_FNS(busy); BFQ_BFQQ_FNS(wait_request); BFQ_BFQQ_FNS(non_blocking_wait_rq); BFQ_BFQQ_FNS(fifo_expire); BFQ_BFQQ_FNS(has_short_ttime); BFQ_BFQQ_FNS(sync); BFQ_BFQQ_FNS(IO_bound); BFQ_BFQQ_FNS(in_large_burst); BFQ_BFQQ_FNS(coop); BFQ_BFQQ_FNS(split_coop); BFQ_BFQQ_FNS(softrt_update); #undef BFQ_BFQQ_FNS \ /* Expiration time of async (0) and sync (1) requests, in ns. */ static const u64 bfq_fifo_expire[2] = …; /* Maximum backwards seek (magic number lifted from CFQ), in KiB. */ static const int bfq_back_max = …; /* Penalty of a backwards seek, in number of sectors. */ static const int bfq_back_penalty = …; /* Idling period duration, in ns. */ static u64 bfq_slice_idle = …; /* Minimum number of assigned budgets for which stats are safe to compute. */ static const int bfq_stats_min_budgets = …; /* Default maximum budget values, in sectors and number of requests. */ static const int bfq_default_max_budget = …; /* * When a sync request is dispatched, the queue that contains that * request, and all the ancestor entities of that queue, are charged * with the number of sectors of the request. In contrast, if the * request is async, then the queue and its ancestor entities are * charged with the number of sectors of the request, multiplied by * the factor below. This throttles the bandwidth for async I/O, * w.r.t. to sync I/O, and it is done to counter the tendency of async * writes to steal I/O throughput to reads. * * The current value of this parameter is the result of a tuning with * several hardware and software configurations. We tried to find the * lowest value for which writes do not cause noticeable problems to * reads. In fact, the lower this parameter, the stabler I/O control, * in the following respect. The lower this parameter is, the less * the bandwidth enjoyed by a group decreases * - when the group does writes, w.r.t. to when it does reads; * - when other groups do reads, w.r.t. to when they do writes. */ static const int bfq_async_charge_factor = …; /* Default timeout values, in jiffies, approximating CFQ defaults. */ const int bfq_timeout = …; /* * Time limit for merging (see comments in bfq_setup_cooperator). Set * to the slowest value that, in our tests, proved to be effective in * removing false positives, while not causing true positives to miss * queue merging. * * As can be deduced from the low time limit below, queue merging, if * successful, happens at the very beginning of the I/O of the involved * cooperating processes, as a consequence of the arrival of the very * first requests from each cooperator. After that, there is very * little chance to find cooperators. */ static const unsigned long bfq_merge_time_limit = …; static struct kmem_cache *bfq_pool; /* Below this threshold (in ns), we consider thinktime immediate. */ #define BFQ_MIN_TT … /* hw_tag detection: parallel requests threshold and min samples needed. */ #define BFQ_HW_QUEUE_THRESHOLD … #define BFQ_HW_QUEUE_SAMPLES … #define BFQQ_SEEK_THR … #define BFQQ_SECT_THR_NONROT … #define BFQ_RQ_SEEKY(bfqd, last_pos, rq) … #define BFQQ_CLOSE_THR … #define BFQQ_SEEKY(bfqq) … /* * Sync random I/O is likely to be confused with soft real-time I/O, * because it is characterized by limited throughput and apparently * isochronous arrival pattern. To avoid false positives, queues * containing only random (seeky) I/O are prevented from being tagged * as soft real-time. */ #define BFQQ_TOTALLY_SEEKY(bfqq) … /* Min number of samples required to perform peak-rate update */ #define BFQ_RATE_MIN_SAMPLES … /* Min observation time interval required to perform a peak-rate update (ns) */ #define BFQ_RATE_MIN_INTERVAL … /* Target observation time interval for a peak-rate update (ns) */ #define BFQ_RATE_REF_INTERVAL … /* * Shift used for peak-rate fixed precision calculations. * With * - the current shift: 16 positions * - the current type used to store rate: u32 * - the current unit of measure for rate: [sectors/usec], or, more precisely, * [(sectors/usec) / 2^BFQ_RATE_SHIFT] to take into account the shift, * the range of rates that can be stored is * [1 / 2^BFQ_RATE_SHIFT, 2^(32 - BFQ_RATE_SHIFT)] sectors/usec = * [1 / 2^16, 2^16] sectors/usec = [15e-6, 65536] sectors/usec = * [15, 65G] sectors/sec * Which, assuming a sector size of 512B, corresponds to a range of * [7.5K, 33T] B/sec */ #define BFQ_RATE_SHIFT … /* * When configured for computing the duration of the weight-raising * for interactive queues automatically (see the comments at the * beginning of this file), BFQ does it using the following formula: * duration = (ref_rate / r) * ref_wr_duration, * where r is the peak rate of the device, and ref_rate and * ref_wr_duration are two reference parameters. In particular, * ref_rate is the peak rate of the reference storage device (see * below), and ref_wr_duration is about the maximum time needed, with * BFQ and while reading two files in parallel, to load typical large * applications on the reference device (see the comments on * max_service_from_wr below, for more details on how ref_wr_duration * is obtained). In practice, the slower/faster the device at hand * is, the more/less it takes to load applications with respect to the * reference device. Accordingly, the longer/shorter BFQ grants * weight raising to interactive applications. * * BFQ uses two different reference pairs (ref_rate, ref_wr_duration), * depending on whether the device is rotational or non-rotational. * * In the following definitions, ref_rate[0] and ref_wr_duration[0] * are the reference values for a rotational device, whereas * ref_rate[1] and ref_wr_duration[1] are the reference values for a * non-rotational device. The reference rates are not the actual peak * rates of the devices used as a reference, but slightly lower * values. The reason for using slightly lower values is that the * peak-rate estimator tends to yield slightly lower values than the * actual peak rate (it can yield the actual peak rate only if there * is only one process doing I/O, and the process does sequential * I/O). * * The reference peak rates are measured in sectors/usec, left-shifted * by BFQ_RATE_SHIFT. */ static int ref_rate[2] = …; /* * To improve readability, a conversion function is used to initialize * the following array, which entails that the array can be * initialized only in a function. */ static int ref_wr_duration[2]; /* * BFQ uses the above-detailed, time-based weight-raising mechanism to * privilege interactive tasks. This mechanism is vulnerable to the * following false positives: I/O-bound applications that will go on * doing I/O for much longer than the duration of weight * raising. These applications have basically no benefit from being * weight-raised at the beginning of their I/O. On the opposite end, * while being weight-raised, these applications * a) unjustly steal throughput to applications that may actually need * low latency; * b) make BFQ uselessly perform device idling; device idling results * in loss of device throughput with most flash-based storage, and may * increase latencies when used purposelessly. * * BFQ tries to reduce these problems, by adopting the following * countermeasure. To introduce this countermeasure, we need first to * finish explaining how the duration of weight-raising for * interactive tasks is computed. * * For a bfq_queue deemed as interactive, the duration of weight * raising is dynamically adjusted, as a function of the estimated * peak rate of the device, so as to be equal to the time needed to * execute the 'largest' interactive task we benchmarked so far. By * largest task, we mean the task for which each involved process has * to do more I/O than for any of the other tasks we benchmarked. This * reference interactive task is the start-up of LibreOffice Writer, * and in this task each process/bfq_queue needs to have at most ~110K * sectors transferred. * * This last piece of information enables BFQ to reduce the actual * duration of weight-raising for at least one class of I/O-bound * applications: those doing sequential or quasi-sequential I/O. An * example is file copy. In fact, once started, the main I/O-bound * processes of these applications usually consume the above 110K * sectors in much less time than the processes of an application that * is starting, because these I/O-bound processes will greedily devote * almost all their CPU cycles only to their target, * throughput-friendly I/O operations. This is even more true if BFQ * happens to be underestimating the device peak rate, and thus * overestimating the duration of weight raising. But, according to * our measurements, once transferred 110K sectors, these processes * have no right to be weight-raised any longer. * * Basing on the last consideration, BFQ ends weight-raising for a * bfq_queue if the latter happens to have received an amount of * service at least equal to the following constant. The constant is * set to slightly more than 110K, to have a minimum safety margin. * * This early ending of weight-raising reduces the amount of time * during which interactive false positives cause the two problems * described at the beginning of these comments. */ static const unsigned long max_service_from_wr = …; /* * Maximum time between the creation of two queues, for stable merge * to be activated (in ms) */ static const unsigned long bfq_activation_stable_merging = …; /* * Minimum time to be waited before evaluating delayed stable merge (in ms) */ static const unsigned long bfq_late_stable_merging = …; #define RQ_BIC(rq) … #define RQ_BFQQ(rq) … struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync, unsigned int actuator_idx) { … } static void bfq_put_stable_ref(struct bfq_queue *bfqq); void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync, unsigned int actuator_idx) { … } struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) { … } /** * icq_to_bic - convert iocontext queue structure to bfq_io_cq. * @icq: the iocontext queue. */ static struct bfq_io_cq *icq_to_bic(struct io_cq *icq) { … } /** * bfq_bic_lookup - search into @ioc a bic associated to @bfqd. * @q: the request queue. */ static struct bfq_io_cq *bfq_bic_lookup(struct request_queue *q) { … } /* * Scheduler run of queue, if there are requests pending and no one in the * driver that will restart queueing. */ void bfq_schedule_dispatch(struct bfq_data *bfqd) { … } #define bfq_class_idle(bfqq) … #define bfq_sample_valid(samples) … /* * Lifted from AS - choose which of rq1 and rq2 that is best served now. * We choose the request that is closer to the head right now. Distance * behind the head is penalized and only allowed to a certain extent. */ static struct request *bfq_choose_req(struct bfq_data *bfqd, struct request *rq1, struct request *rq2, sector_t last) { … } #define BFQ_LIMIT_INLINE_DEPTH … #ifdef CONFIG_BFQ_GROUP_IOSCHED static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit) { … } #else static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit) { return false; } #endif /* * Async I/O can easily starve sync I/O (both sync reads and sync * writes), by consuming all tags. Similarly, storms of sync writes, * such as those that sync(2) may trigger, can starve sync reads. * Limit depths of async I/O and sync writes so as to counter both * problems. * * Also if a bfq queue or its parent cgroup consume more tags than would be * appropriate for their weight, we trim the available tag depth to 1. This * avoids a situation where one cgroup can starve another cgroup from tags and * thus block service differentiation among cgroups. Note that because the * queue / cgroup already has many requests allocated and queued, this does not * significantly affect service guarantees coming from the BFQ scheduling * algorithm. */ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) { … } static struct bfq_queue * bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, sector_t sector, struct rb_node **ret_parent, struct rb_node ***rb_link) { … } static bool bfq_too_late_for_merging(struct bfq_queue *bfqq) { … } /* * The following function is not marked as __cold because it is * actually cold, but for the same performance goal described in the * comments on the likely() at the beginning of * bfq_setup_cooperator(). Unexpectedly, to reach an even lower * execution time for the case where this function is not invoked, we * had to add an unlikely() in each involved if(). */ void __cold bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) { … } /* * The following function returns false either if every active queue * must receive the same share of the throughput (symmetric scenario), * or, as a special case, if bfqq must receive a share of the * throughput lower than or equal to the share that every other active * queue must receive. If bfqq does sync I/O, then these are the only * two cases where bfqq happens to be guaranteed its share of the * throughput even if I/O dispatching is not plugged when bfqq remains * temporarily empty (for more details, see the comments in the * function bfq_better_to_idle()). For this reason, the return value * of this function is used to check whether I/O-dispatch plugging can * be avoided. * * The above first case (symmetric scenario) occurs when: * 1) all active queues have the same weight, * 2) all active queues belong to the same I/O-priority class, * 3) all active groups at the same level in the groups tree have the same * weight, * 4) all active groups at the same level in the groups tree have the same * number of children. * * Unfortunately, keeping the necessary state for evaluating exactly * the last two symmetry sub-conditions above would be quite complex * and time consuming. Therefore this function evaluates, instead, * only the following stronger three sub-conditions, for which it is * much easier to maintain the needed state: * 1) all active queues have the same weight, * 2) all active queues belong to the same I/O-priority class, * 3) there is at most one active group. * In particular, the last condition is always true if hierarchical * support or the cgroups interface are not enabled, thus no state * needs to be maintained in this case. */ static bool bfq_asymmetric_scenario(struct bfq_data *bfqd, struct bfq_queue *bfqq) { … } /* * If the weight-counter tree passed as input contains no counter for * the weight of the input queue, then add that counter; otherwise just * increment the existing counter. * * Note that weight-counter trees contain few nodes in mostly symmetric * scenarios. For example, if all queues have the same weight, then the * weight-counter tree for the queues may contain at most one node. * This holds even if low_latency is on, because weight-raised queues * are not inserted in the tree. * In most scenarios, the rate at which nodes are created/destroyed * should be low too. */ void bfq_weights_tree_add(struct bfq_queue *bfqq) { … } /* * Decrement the weight counter associated with the queue, and, if the * counter reaches 0, remove the counter from the tree. * See the comments to the function bfq_weights_tree_add() for considerations * about overhead. */ void bfq_weights_tree_remove(struct bfq_queue *bfqq) { … } /* * Return expired entry, or NULL to just start from scratch in rbtree. */ static struct request *bfq_check_fifo(struct bfq_queue *bfqq, struct request *last) { … } static struct request *bfq_find_next_rq(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct request *last) { … } /* see the definition of bfq_async_charge_factor for details */ static unsigned long bfq_serv_to_charge(struct request *rq, struct bfq_queue *bfqq) { … } /** * bfq_updated_next_req - update the queue after a new next_rq selection. * @bfqd: the device data the queue belongs to. * @bfqq: the queue to update. * * If the first request of a queue changes we make sure that the queue * has enough budget to serve at least its first request (if the * request has grown). We do this because if the queue has not enough * budget for its first request, it has to go through two dispatch * rounds to actually get it dispatched. */ static void bfq_updated_next_req(struct bfq_data *bfqd, struct bfq_queue *bfqq) { … } static unsigned int bfq_wr_duration(struct bfq_data *bfqd) { … } /* switch back from soft real-time to interactive weight raising */ static void switch_back_to_interactive_wr(struct bfq_queue *bfqq, struct bfq_data *bfqd) { … } static void bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, struct bfq_io_cq *bic, bool bfq_already_existing) { … } static int bfqq_process_refs(struct bfq_queue *bfqq) { … } /* Empty burst list and add just bfqq (see comments on bfq_handle_burst) */ static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq) { … } /* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) { … } /* * If many queues belonging to the same group happen to be created * shortly after each other, then the processes associated with these * queues have typically a common goal. In particular, bursts of queue * creations are usually caused by services or applications that spawn * many parallel threads/processes. Examples are systemd during boot, * or git grep. To help these processes get their job done as soon as * possible, it is usually better to not grant either weight-raising * or device idling to their queues, unless these queues must be * protected from the I/O flowing through other active queues. * * In this comment we describe, firstly, the reasons why this fact * holds, and, secondly, the next function, which implements the main * steps needed to properly mark these queues so that they can then be * treated in a different way. * * The above services or applications benefit mostly from a high * throughput: the quicker the requests of the activated queues are * cumulatively served, the sooner the target job of these queues gets * completed. As a consequence, weight-raising any of these queues, * which also implies idling the device for it, is almost always * counterproductive, unless there are other active queues to isolate * these new queues from. If there no other active queues, then * weight-raising these new queues just lowers throughput in most * cases. * * On the other hand, a burst of queue creations may be caused also by * the start of an application that does not consist of a lot of * parallel I/O-bound threads. In fact, with a complex application, * several short processes may need to be executed to start-up the * application. In this respect, to start an application as quickly as * possible, the best thing to do is in any case to privilege the I/O * related to the application with respect to all other * I/O. Therefore, the best strategy to start as quickly as possible * an application that causes a burst of queue creations is to * weight-raise all the queues created during the burst. This is the * exact opposite of the best strategy for the other type of bursts. * * In the end, to take the best action for each of the two cases, the * two types of bursts need to be distinguished. Fortunately, this * seems relatively easy, by looking at the sizes of the bursts. In * particular, we found a threshold such that only bursts with a * larger size than that threshold are apparently caused by * services or commands such as systemd or git grep. For brevity, * hereafter we call just 'large' these bursts. BFQ *does not* * weight-raise queues whose creation occurs in a large burst. In * addition, for each of these queues BFQ performs or does not perform * idling depending on which choice boosts the throughput more. The * exact choice depends on the device and request pattern at * hand. * * Unfortunately, false positives may occur while an interactive task * is starting (e.g., an application is being started). The * consequence is that the queues associated with the task do not * enjoy weight raising as expected. Fortunately these false positives * are very rare. They typically occur if some service happens to * start doing I/O exactly when the interactive task starts. * * Turning back to the next function, it is invoked only if there are * no active queues (apart from active queues that would belong to the * same, possible burst bfqq would belong to), and it implements all * the steps needed to detect the occurrence of a large burst and to * properly mark all the queues belonging to it (so that they can then * be treated in a different way). This goal is achieved by * maintaining a "burst list" that holds, temporarily, the queues that * belong to the burst in progress. The list is then used to mark * these queues as belonging to a large burst if the burst does become * large. The main steps are the following. * * . when the very first queue is created, the queue is inserted into the * list (as it could be the first queue in a possible burst) * * . if the current burst has not yet become large, and a queue Q that does * not yet belong to the burst is activated shortly after the last time * at which a new queue entered the burst list, then the function appends * Q to the burst list * * . if, as a consequence of the previous step, the burst size reaches * the large-burst threshold, then * * . all the queues in the burst list are marked as belonging to a * large burst * * . the burst list is deleted; in fact, the burst list already served * its purpose (keeping temporarily track of the queues in a burst, * so as to be able to mark them as belonging to a large burst in the * previous sub-step), and now is not needed any more * * . the device enters a large-burst mode * * . if a queue Q that does not belong to the burst is created while * the device is in large-burst mode and shortly after the last time * at which a queue either entered the burst list or was marked as * belonging to the current large burst, then Q is immediately marked * as belonging to a large burst. * * . if a queue Q that does not belong to the burst is created a while * later, i.e., not shortly after, than the last time at which a queue * either entered the burst list or was marked as belonging to the * current large burst, then the current burst is deemed as finished and: * * . the large-burst mode is reset if set * * . the burst list is emptied * * . Q is inserted in the burst list, as Q may be the first queue * in a possible new burst (then the burst list contains just Q * after this step). */ static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) { … } static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) { … } /* * If enough samples have been computed, return the current max budget * stored in bfqd, which is dynamically updated according to the * estimated disk peak rate; otherwise return the default max budget */ static int bfq_max_budget(struct bfq_data *bfqd) { … } /* * Return min budget, which is a fraction of the current or default * max budget (trying with 1/32) */ static int bfq_min_budget(struct bfq_data *bfqd) { … } /* * The next function, invoked after the input queue bfqq switches from * idle to busy, updates the budget of bfqq. The function also tells * whether the in-service queue should be expired, by returning * true. The purpose of expiring the in-service queue is to give bfqq * the chance to possibly preempt the in-service queue, and the reason * for preempting the in-service queue is to achieve one of the two * goals below. * * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has * expired because it has remained idle. In particular, bfqq may have * expired for one of the following two reasons: * * - BFQQE_NO_MORE_REQUESTS bfqq did not enjoy any device idling * and did not make it to issue a new request before its last * request was served; * * - BFQQE_TOO_IDLE bfqq did enjoy device idling, but did not issue * a new request before the expiration of the idling-time. * * Even if bfqq has expired for one of the above reasons, the process * associated with the queue may be however issuing requests greedily, * and thus be sensitive to the bandwidth it receives (bfqq may have * remained idle for other reasons: CPU high load, bfqq not enjoying * idling, I/O throttling somewhere in the path from the process to * the I/O scheduler, ...). But if, after every expiration for one of * the above two reasons, bfqq has to wait for the service of at least * one full budget of another queue before being served again, then * bfqq is likely to get a much lower bandwidth or resource time than * its reserved ones. To address this issue, two countermeasures need * to be taken. * * First, the budget and the timestamps of bfqq need to be updated in * a special way on bfqq reactivation: they need to be updated as if * bfqq did not remain idle and did not expire. In fact, if they are * computed as if bfqq expired and remained idle until reactivation, * then the process associated with bfqq is treated as if, instead of * being greedy, it stopped issuing requests when bfqq remained idle, * and restarts issuing requests only on this reactivation. In other * words, the scheduler does not help the process recover the "service * hole" between bfqq expiration and reactivation. As a consequence, * the process receives a lower bandwidth than its reserved one. In * contrast, to recover this hole, the budget must be updated as if * bfqq was not expired at all before this reactivation, i.e., it must * be set to the value of the remaining budget when bfqq was * expired. Along the same line, timestamps need to be assigned the * value they had the last time bfqq was selected for service, i.e., * before last expiration. Thus timestamps need to be back-shifted * with respect to their normal computation (see [1] for more details * on this tricky aspect). * * Secondly, to allow the process to recover the hole, the in-service * queue must be expired too, to give bfqq the chance to preempt it * immediately. In fact, if bfqq has to wait for a full budget of the * in-service queue to be completed, then it may become impossible to * let the process recover the hole, even if the back-shifted * timestamps of bfqq are lower than those of the in-service queue. If * this happens for most or all of the holes, then the process may not * receive its reserved bandwidth. In this respect, it is worth noting * that, being the service of outstanding requests unpreemptible, a * little fraction of the holes may however be unrecoverable, thereby * causing a little loss of bandwidth. * * The last important point is detecting whether bfqq does need this * bandwidth recovery. In this respect, the next function deems the * process associated with bfqq greedy, and thus allows it to recover * the hole, if: 1) the process is waiting for the arrival of a new * request (which implies that bfqq expired for one of the above two * reasons), and 2) such a request has arrived soon. The first * condition is controlled through the flag non_blocking_wait_rq, * while the second through the flag arrived_in_time. If both * conditions hold, then the function computes the budget in the * above-described special way, and signals that the in-service queue * should be expired. Timestamp back-shifting is done later in * __bfq_activate_entity. * * 2. Reduce latency. Even if timestamps are not backshifted to let * the process associated with bfqq recover a service hole, bfqq may * however happen to have, after being (re)activated, a lower finish * timestamp than the in-service queue. That is, the next budget of * bfqq may have to be completed before the one of the in-service * queue. If this is the case, then preempting the in-service queue * allows this goal to be achieved, apart from the unpreemptible, * outstanding requests mentioned above. * * Unfortunately, regardless of which of the above two goals one wants * to achieve, service trees need first to be updated to know whether * the in-service queue must be preempted. To have service trees * correctly updated, the in-service queue must be expired and * rescheduled, and bfqq must be scheduled too. This is one of the * most costly operations (in future versions, the scheduling * mechanism may be re-designed in such a way to make it possible to * know whether preemption is needed without needing to update service * trees). In addition, queue preemptions almost always cause random * I/O, which may in turn cause loss of throughput. Finally, there may * even be no in-service queue when the next function is invoked (so, * no queue to compare timestamps with). Because of these facts, the * next function adopts the following simple scheme to avoid costly * operations, too frequent preemptions and too many dependencies on * the state of the scheduler: it requests the expiration of the * in-service queue (unconditionally) only for queues that need to * recover a hole. Then it delegates to other parts of the code the * responsibility of handling the above case 2. */ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, struct bfq_queue *bfqq, bool arrived_in_time) { … } /* * Return the farthest past time instant according to jiffies * macros. */ static unsigned long bfq_smallest_from_now(void) { … } static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, struct bfq_queue *bfqq, unsigned int old_wr_coeff, bool wr_or_deserves_wr, bool interactive, bool in_burst, bool soft_rt) { … } static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd, struct bfq_queue *bfqq) { … } /* * Return true if bfqq is in a higher priority class, or has a higher * weight than the in-service queue. */ static bool bfq_bfqq_higher_class_or_weight(struct bfq_queue *bfqq, struct bfq_queue *in_serv_bfqq) { … } /* * Get the index of the actuator that will serve bio. */ static unsigned int bfq_actuator_index(struct bfq_data *bfqd, struct bio *bio) { … } static bool bfq_better_to_idle(struct bfq_queue *bfqq); static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, struct bfq_queue *bfqq, int old_wr_coeff, struct request *rq, bool *interactive) { … } static void bfq_reset_inject_limit(struct bfq_data *bfqd, struct bfq_queue *bfqq) { … } static void bfq_update_io_intensity(struct bfq_queue *bfqq, u64 now_ns) { … } /* * Detect whether bfqq's I/O seems synchronized with that of some * other queue, i.e., whether bfqq, after remaining empty, happens to * receive new I/O only right after some I/O request of the other * queue has been completed. We call waker queue the other queue, and * we assume, for simplicity, that bfqq may have at most one waker * queue. * * A remarkable throughput boost can be reached by unconditionally * injecting the I/O of the waker queue, every time a new * bfq_dispatch_request happens to be invoked while I/O is being * plugged for bfqq. In addition to boosting throughput, this * unblocks bfqq's I/O, thereby improving bandwidth and latency for * bfqq. Note that these same results may be achieved with the general * injection mechanism, but less effectively. For details on this * aspect, see the comments on the choice of the queue for injection * in bfq_select_queue(). * * Turning back to the detection of a waker queue, a queue Q is deemed as a * waker queue for bfqq if, for three consecutive times, bfqq happens to become * non empty right after a request of Q has been completed within given * timeout. In this respect, even if bfqq is empty, we do not check for a waker * if it still has some in-flight I/O. In fact, in this case bfqq is actually * still being served by the drive, and may receive new I/O on the completion * of some of the in-flight requests. In particular, on the first time, Q is * tentatively set as a candidate waker queue, while on the third consecutive * time that Q is detected, the field waker_bfqq is set to Q, to confirm that Q * is a waker queue for bfqq. These detection steps are performed only if bfqq * has a long think time, so as to make it more likely that bfqq's I/O is * actually being blocked by a synchronization. This last filter, plus the * above three-times requirement and time limit for detection, make false * positives less likely. * * NOTE * * The sooner a waker queue is detected, the sooner throughput can be * boosted by injecting I/O from the waker queue. Fortunately, * detection is likely to be actually fast, for the following * reasons. While blocked by synchronization, bfqq has a long think * time. This implies that bfqq's inject limit is at least equal to 1 * (see the comments in bfq_update_inject_limit()). So, thanks to * injection, the waker queue is likely to be served during the very * first I/O-plugging time interval for bfqq. This triggers the first * step of the detection mechanism. Thanks again to injection, the * candidate waker queue is then likely to be confirmed no later than * during the next I/O-plugging interval for bfqq. * * ISSUE * * On queue merging all waker information is lost. */ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, u64 now_ns) { … } static void bfq_add_request(struct request *rq) { … } static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, struct bio *bio, struct request_queue *q) { … } static sector_t get_sdist(sector_t last_pos, struct request *rq) { … } static void bfq_remove_request(struct request_queue *q, struct request *rq) { … } static bool bfq_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { … } static int bfq_request_merge(struct request_queue *q, struct request **req, struct bio *bio) { … } static void bfq_request_merged(struct request_queue *q, struct request *req, enum elv_merge type) { … } /* * This function is called to notify the scheduler that the requests * rq and 'next' have been merged, with 'next' going away. BFQ * exploits this hook to address the following issue: if 'next' has a * fifo_time lower that rq, then the fifo_time of rq must be set to * the value of 'next', to not forget the greater age of 'next'. * * NOTE: in this function we assume that rq is in a bfq_queue, basing * on that rq is picked from the hash table q->elevator->hash, which, * in its turn, is filled only with I/O requests present in * bfq_queues, while BFQ is in use for the request queue q. In fact, * the function that fills this hash table (elv_rqhash_add) is called * only by bfq_insert_request. */ static void bfq_requests_merged(struct request_queue *q, struct request *rq, struct request *next) { … } /* Must be called with bfqq != NULL */ static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) { … } void bfq_end_wr_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) { … } static void bfq_end_wr(struct bfq_data *bfqd) { … } static sector_t bfq_io_struct_pos(void *io_struct, bool request) { … } static int bfq_rq_close_to_sector(void *io_struct, bool request, sector_t sector) { … } static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, struct bfq_queue *bfqq, sector_t sector) { … } static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd, struct bfq_queue *cur_bfqq, sector_t sector) { … } static struct bfq_queue * bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) { … } static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) { … } static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd, struct bfq_queue *bfqq); static struct bfq_queue * bfq_setup_stable_merge(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_queue *stable_merge_bfqq, struct bfq_iocq_bfqq_data *bfqq_data) { … } /* * Attempt to schedule a merge of bfqq with the currently in-service * queue or with a close queue among the scheduled queues. Return * NULL if no merge was scheduled, a pointer to the shared bfq_queue * structure otherwise. * * The OOM queue is not allowed to participate to cooperation: in fact, since * the requests temporarily redirected to the OOM queue could be redirected * again to dedicated queues at any time, the state needed to correctly * handle merging with the OOM queue would be quite complex and expensive * to maintain. Besides, in such a critical condition as an out of memory, * the benefits of queue merging may be little relevant, or even negligible. * * WARNING: queue merging may impair fairness among non-weight raised * queues, for at least two reasons: 1) the original weight of a * merged queue may change during the merged state, 2) even being the * weight the same, a merged queue may be bloated with many more * requests than the ones produced by its originally-associated * process. */ static struct bfq_queue * bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, void *io_struct, bool request, struct bfq_io_cq *bic) { … } static void bfq_bfqq_save_state(struct bfq_queue *bfqq) { … } static void bfq_reassign_last_bfqq(struct bfq_queue *cur_bfqq, struct bfq_queue *new_bfqq) { … } void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq) { … } static void bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) { … } static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, struct bio *bio) { … } /* * Set the maximum time for the in-service queue to consume its * budget. This prevents seeky processes from lowering the throughput. * In practice, a time-slice service scheme is used with seeky * processes. */ static void bfq_set_budget_timeout(struct bfq_data *bfqd, struct bfq_queue *bfqq) { … } static void __bfq_set_in_service_queue(struct bfq_data *bfqd, struct bfq_queue *bfqq) { … } /* * Get and set a new queue for service. */ static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) { … } static void bfq_arm_slice_timer(struct bfq_data *bfqd) { … } /* * In autotuning mode, max_budget is dynamically recomputed as the * amount of sectors transferred in timeout at the estimated peak * rate. This enables BFQ to utilize a full timeslice with a full * budget, even if the in-service queue is served at peak rate. And * this maximises throughput with sequential workloads. */ static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd) { … } /* * Update parameters related to throughput and responsiveness, as a * function of the estimated peak rate. See comments on * bfq_calc_max_budget(), and on the ref_wr_duration array. */ static void update_thr_responsiveness_params(struct bfq_data *bfqd) { … } static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq) { … } static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) { … } /* * Update the read/write peak rate (the main quantity used for * auto-tuning, see update_thr_responsiveness_params()). * * It is not trivial to estimate the peak rate (correctly): because of * the presence of sw and hw queues between the scheduler and the * device components that finally serve I/O requests, it is hard to * say exactly when a given dispatched request is served inside the * device, and for how long. As a consequence, it is hard to know * precisely at what rate a given set of requests is actually served * by the device. * * On the opposite end, the dispatch time of any request is trivially * available, and, from this piece of information, the "dispatch rate" * of requests can be immediately computed. So, the idea in the next * function is to use what is known, namely request dispatch times * (plus, when useful, request completion times), to estimate what is * unknown, namely in-device request service rate. * * The main issue is that, because of the above facts, the rate at * which a certain set of requests is dispatched over a certain time * interval can vary greatly with respect to the rate at which the * same requests are then served. But, since the size of any * intermediate queue is limited, and the service scheme is lossless * (no request is silently dropped), the following obvious convergence * property holds: the number of requests dispatched MUST become * closer and closer to the number of requests completed as the * observation interval grows. This is the key property used in * the next function to estimate the peak service rate as a function * of the observed dispatch rate. The function assumes to be invoked * on every request dispatch. */ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) { … } /* * Remove request from internal lists. */ static void bfq_dispatch_remove(struct request_queue *q, struct request *rq) { … } /* * There is a case where idling does not have to be performed for * throughput concerns, but to preserve the throughput share of * the process associated with bfqq. * * To introduce this case, we can note that allowing the drive * to enqueue more than one request at a time, and hence * delegating de facto final scheduling decisions to the * drive's internal scheduler, entails loss of control on the * actual request service order. In particular, the critical * situation is when requests from different processes happen * to be present, at the same time, in the internal queue(s) * of the drive. In such a situation, the drive, by deciding * the service order of the internally-queued requests, does * determine also the actual throughput distribution among * these processes. But the drive typically has no notion or * concern about per-process throughput distribution, and * makes its decisions only on a per-request basis. Therefore, * the service distribution enforced by the drive's internal * scheduler is likely to coincide with the desired throughput * distribution only in a completely symmetric, or favorably * skewed scenario where: * (i-a) each of these processes must get the same throughput as * the others, * (i-b) in case (i-a) does not hold, it holds that the process * associated with bfqq must receive a lower or equal * throughput than any of the other processes; * (ii) the I/O of each process has the same properties, in * terms of locality (sequential or random), direction * (reads or writes), request sizes, greediness * (from I/O-bound to sporadic), and so on; * In fact, in such a scenario, the drive tends to treat the requests * of each process in about the same way as the requests of the * others, and thus to provide each of these processes with about the * same throughput. This is exactly the desired throughput * distribution if (i-a) holds, or, if (i-b) holds instead, this is an * even more convenient distribution for (the process associated with) * bfqq. * * In contrast, in any asymmetric or unfavorable scenario, device * idling (I/O-dispatch plugging) is certainly needed to guarantee * that bfqq receives its assigned fraction of the device throughput * (see [1] for details). * * The problem is that idling may significantly reduce throughput with * certain combinations of types of I/O and devices. An important * example is sync random I/O on flash storage with command * queueing. So, unless bfqq falls in cases where idling also boosts * throughput, it is important to check conditions (i-a), i(-b) and * (ii) accurately, so as to avoid idling when not strictly needed for * service guarantees. * * Unfortunately, it is extremely difficult to thoroughly check * condition (ii). And, in case there are active groups, it becomes * very difficult to check conditions (i-a) and (i-b) too. In fact, * if there are active groups, then, for conditions (i-a) or (i-b) to * become false 'indirectly', it is enough that an active group * contains more active processes or sub-groups than some other active * group. More precisely, for conditions (i-a) or (i-b) to become * false because of such a group, it is not even necessary that the * group is (still) active: it is sufficient that, even if the group * has become inactive, some of its descendant processes still have * some request already dispatched but still waiting for * completion. In fact, requests have still to be guaranteed their * share of the throughput even after being dispatched. In this * respect, it is easy to show that, if a group frequently becomes * inactive while still having in-flight requests, and if, when this * happens, the group is not considered in the calculation of whether * the scenario is asymmetric, then the group may fail to be * guaranteed its fair share of the throughput (basically because * idling may not be performed for the descendant processes of the * group, but it had to be). We address this issue with the following * bi-modal behavior, implemented in the function * bfq_asymmetric_scenario(). * * If there are groups with requests waiting for completion * (as commented above, some of these groups may even be * already inactive), then the scenario is tagged as * asymmetric, conservatively, without checking any of the * conditions (i-a), (i-b) or (ii). So the device is idled for bfqq. * This behavior matches also the fact that groups are created * exactly if controlling I/O is a primary concern (to * preserve bandwidth and latency guarantees). * * On the opposite end, if there are no groups with requests waiting * for completion, then only conditions (i-a) and (i-b) are actually * controlled, i.e., provided that conditions (i-a) or (i-b) holds, * idling is not performed, regardless of whether condition (ii) * holds. In other words, only if conditions (i-a) and (i-b) do not * hold, then idling is allowed, and the device tends to be prevented * from queueing many requests, possibly of several processes. Since * there are no groups with requests waiting for completion, then, to * control conditions (i-a) and (i-b) it is enough to check just * whether all the queues with requests waiting for completion also * have the same weight. * * Not checking condition (ii) evidently exposes bfqq to the * risk of getting less throughput than its fair share. * However, for queues with the same weight, a further * mechanism, preemption, mitigates or even eliminates this * problem. And it does so without consequences on overall * throughput. This mechanism and its benefits are explained * in the next three paragraphs. * * Even if a queue, say Q, is expired when it remains idle, Q * can still preempt the new in-service queue if the next * request of Q arrives soon (see the comments on * bfq_bfqq_update_budg_for_activation). If all queues and * groups have the same weight, this form of preemption, * combined with the hole-recovery heuristic described in the * comments on function bfq_bfqq_update_budg_for_activation, * are enough to preserve a correct bandwidth distribution in * the mid term, even without idling. In fact, even if not * idling allows the internal queues of the device to contain * many requests, and thus to reorder requests, we can rather * safely assume that the internal scheduler still preserves a * minimum of mid-term fairness. * * More precisely, this preemption-based, idleless approach * provides fairness in terms of IOPS, and not sectors per * second. This can be seen with a simple example. Suppose * that there are two queues with the same weight, but that * the first queue receives requests of 8 sectors, while the * second queue receives requests of 1024 sectors. In * addition, suppose that each of the two queues contains at * most one request at a time, which implies that each queue * always remains idle after it is served. Finally, after * remaining idle, each queue receives very quickly a new * request. It follows that the two queues are served * alternatively, preempting each other if needed. This * implies that, although both queues have the same weight, * the queue with large requests receives a service that is * 1024/8 times as high as the service received by the other * queue. * * The motivation for using preemption instead of idling (for * queues with the same weight) is that, by not idling, * service guarantees are preserved (completely or at least in * part) without minimally sacrificing throughput. And, if * there is no active group, then the primary expectation for * this device is probably a high throughput. * * We are now left only with explaining the two sub-conditions in the * additional compound condition that is checked below for deciding * whether the scenario is asymmetric. To explain the first * sub-condition, we need to add that the function * bfq_asymmetric_scenario checks the weights of only * non-weight-raised queues, for efficiency reasons (see comments on * bfq_weights_tree_add()). Then the fact that bfqq is weight-raised * is checked explicitly here. More precisely, the compound condition * below takes into account also the fact that, even if bfqq is being * weight-raised, the scenario is still symmetric if all queues with * requests waiting for completion happen to be * weight-raised. Actually, we should be even more precise here, and * differentiate between interactive weight raising and soft real-time * weight raising. * * The second sub-condition checked in the compound condition is * whether there is a fair amount of already in-flight I/O not * belonging to bfqq. If so, I/O dispatching is to be plugged, for the * following reason. The drive may decide to serve in-flight * non-bfqq's I/O requests before bfqq's ones, thereby delaying the * arrival of new I/O requests for bfqq (recall that bfqq is sync). If * I/O-dispatching is not plugged, then, while bfqq remains empty, a * basically uncontrolled amount of I/O from other queues may be * dispatched too, possibly causing the service of bfqq's I/O to be * delayed even longer in the drive. This problem gets more and more * serious as the speed and the queue depth of the drive grow, * because, as these two quantities grow, the probability to find no * queue busy but many requests in flight grows too. By contrast, * plugging I/O dispatching minimizes the delay induced by already * in-flight I/O, and enables bfqq to recover the bandwidth it may * lose because of this delay. * * As a side note, it is worth considering that the above * device-idling countermeasures may however fail in the following * unlucky scenario: if I/O-dispatch plugging is (correctly) disabled * in a time period during which all symmetry sub-conditions hold, and * therefore the device is allowed to enqueue many requests, but at * some later point in time some sub-condition stops to hold, then it * may become impossible to make requests be served in the desired * order until all the requests already queued in the device have been * served. The last sub-condition commented above somewhat mitigates * this problem for weight-raised queues. * * However, as an additional mitigation for this problem, we preserve * plugging for a special symmetric case that may suddenly turn into * asymmetric: the case where only bfqq is busy. In this case, not * expiring bfqq does not cause any harm to any other queues in terms * of service guarantees. In contrast, it avoids the following unlucky * sequence of events: (1) bfqq is expired, (2) a new queue with a * lower weight than bfqq becomes busy (or more queues), (3) the new * queue is served until a new request arrives for bfqq, (4) when bfqq * is finally served, there are so many requests of the new queue in * the drive that the pending requests for bfqq take a lot of time to * be served. In particular, event (2) may case even already * dispatched requests of bfqq to be delayed, inside the drive. So, to * avoid this series of events, the scenario is preventively declared * as asymmetric also if bfqq is the only busy queues */ static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd, struct bfq_queue *bfqq) { … } static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, enum bfqq_expiration reason) { … } /** * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior. * @bfqd: device data. * @bfqq: queue to update. * @reason: reason for expiration. * * Handle the feedback on @bfqq budget at queue expiration. * See the body for detailed comments. */ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, struct bfq_queue *bfqq, enum bfqq_expiration reason) { … } /* * Return true if the process associated with bfqq is "slow". The slow * flag is used, in addition to the budget timeout, to reduce the * amount of service provided to seeky processes, and thus reduce * their chances to lower the throughput. More details in the comments * on the function bfq_bfqq_expire(). * * An important observation is in order: as discussed in the comments * on the function bfq_update_peak_rate(), with devices with internal * queues, it is hard if ever possible to know when and for how long * an I/O request is processed by the device (apart from the trivial * I/O pattern where a new request is dispatched only after the * previous one has been completed). This makes it hard to evaluate * the real rate at which the I/O requests of each bfq_queue are * served. In fact, for an I/O scheduler like BFQ, serving a * bfq_queue means just dispatching its requests during its service * slot (i.e., until the budget of the queue is exhausted, or the * queue remains idle, or, finally, a timeout fires). But, during the * service slot of a bfq_queue, around 100 ms at most, the device may * be even still processing requests of bfq_queues served in previous * service slots. On the opposite end, the requests of the in-service * bfq_queue may be completed after the service slot of the queue * finishes. * * Anyway, unless more sophisticated solutions are used * (where possible), the sum of the sizes of the requests dispatched * during the service slot of a bfq_queue is probably the only * approximation available for the service received by the bfq_queue * during its service slot. And this sum is the quantity used in this * function to evaluate the I/O speed of a process. */ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, bool compensate, unsigned long *delta_ms) { … } /* * To be deemed as soft real-time, an application must meet two * requirements. First, the application must not require an average * bandwidth higher than the approximate bandwidth required to playback or * record a compressed high-definition video. * The next function is invoked on the completion of the last request of a * batch, to compute the next-start time instant, soft_rt_next_start, such * that, if the next request of the application does not arrive before * soft_rt_next_start, then the above requirement on the bandwidth is met. * * The second requirement is that the request pattern of the application is * isochronous, i.e., that, after issuing a request or a batch of requests, * the application stops issuing new requests until all its pending requests * have been completed. After that, the application may issue a new batch, * and so on. * For this reason the next function is invoked to compute * soft_rt_next_start only for applications that meet this requirement, * whereas soft_rt_next_start is set to infinity for applications that do * not. * * Unfortunately, even a greedy (i.e., I/O-bound) application may * happen to meet, occasionally or systematically, both the above * bandwidth and isochrony requirements. This may happen at least in * the following circumstances. First, if the CPU load is high. The * application may stop issuing requests while the CPUs are busy * serving other processes, then restart, then stop again for a while, * and so on. The other circumstances are related to the storage * device: the storage device is highly loaded or reaches a low-enough * throughput with the I/O of the application (e.g., because the I/O * is random and/or the device is slow). In all these cases, the * I/O of the application may be simply slowed down enough to meet * the bandwidth and isochrony requirements. To reduce the probability * that greedy applications are deemed as soft real-time in these * corner cases, a further rule is used in the computation of * soft_rt_next_start: the return value of this function is forced to * be higher than the maximum between the following two quantities. * * (a) Current time plus: (1) the maximum time for which the arrival * of a request is waited for when a sync queue becomes idle, * namely bfqd->bfq_slice_idle, and (2) a few extra jiffies. We * postpone for a moment the reason for adding a few extra * jiffies; we get back to it after next item (b). Lower-bounding * the return value of this function with the current time plus * bfqd->bfq_slice_idle tends to filter out greedy applications, * because the latter issue their next request as soon as possible * after the last one has been completed. In contrast, a soft * real-time application spends some time processing data, after a * batch of its requests has been completed. * * (b) Current value of bfqq->soft_rt_next_start. As pointed out * above, greedy applications may happen to meet both the * bandwidth and isochrony requirements under heavy CPU or * storage-device load. In more detail, in these scenarios, these * applications happen, only for limited time periods, to do I/O * slowly enough to meet all the requirements described so far, * including the filtering in above item (a). These slow-speed * time intervals are usually interspersed between other time * intervals during which these applications do I/O at a very high * speed. Fortunately, exactly because of the high speed of the * I/O in the high-speed intervals, the values returned by this * function happen to be so high, near the end of any such * high-speed interval, to be likely to fall *after* the end of * the low-speed time interval that follows. These high values are * stored in bfqq->soft_rt_next_start after each invocation of * this function. As a consequence, if the last value of * bfqq->soft_rt_next_start is constantly used to lower-bound the * next value that this function may return, then, from the very * beginning of a low-speed interval, bfqq->soft_rt_next_start is * likely to be constantly kept so high that any I/O request * issued during the low-speed interval is considered as arriving * to soon for the application to be deemed as soft * real-time. Then, in the high-speed interval that follows, the * application will not be deemed as soft real-time, just because * it will do I/O at a high speed. And so on. * * Getting back to the filtering in item (a), in the following two * cases this filtering might be easily passed by a greedy * application, if the reference quantity was just * bfqd->bfq_slice_idle: * 1) HZ is so low that the duration of a jiffy is comparable to or * higher than bfqd->bfq_slice_idle. This happens, e.g., on slow * devices with HZ=100. The time granularity may be so coarse * that the approximation, in jiffies, of bfqd->bfq_slice_idle * is rather lower than the exact value. * 2) jiffies, instead of increasing at a constant rate, may stop increasing * for a while, then suddenly 'jump' by several units to recover the lost * increments. This seems to happen, e.g., inside virtual machines. * To address this issue, in the filtering in (a) we do not use as a * reference time interval just bfqd->bfq_slice_idle, but * bfqd->bfq_slice_idle plus a few jiffies. In particular, we add the * minimum number of jiffies for which the filter seems to be quite * precise also in embedded systems and KVM/QEMU virtual machines. */ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, struct bfq_queue *bfqq) { … } /** * bfq_bfqq_expire - expire a queue. * @bfqd: device owning the queue. * @bfqq: the queue to expire. * @compensate: if true, compensate for the time spent idling. * @reason: the reason causing the expiration. * * If the process associated with bfqq does slow I/O (e.g., because it * issues random requests), we charge bfqq with the time it has been * in service instead of the service it has received (see * bfq_bfqq_charge_time for details on how this goal is achieved). As * a consequence, bfqq will typically get higher timestamps upon * reactivation, and hence it will be rescheduled as if it had * received more service than what it has actually received. In the * end, bfqq receives less service in proportion to how slowly its * associated process consumes its budgets (and hence how seriously it * tends to lower the throughput). In addition, this time-charging * strategy guarantees time fairness among slow processes. In * contrast, if the process associated with bfqq is not slow, we * charge bfqq exactly with the service it has received. * * Charging time to the first type of queues and the exact service to * the other has the effect of using the WF2Q+ policy to schedule the * former on a timeslice basis, without violating service domain * guarantees among the latter. */ void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, bool compensate, enum bfqq_expiration reason) { … } /* * Budget timeout is not implemented through a dedicated timer, but * just checked on request arrivals and completions, as well as on * idle timer expirations. */ static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) { … } /* * If we expire a queue that is actively waiting (i.e., with the * device idled) for the arrival of a new request, then we may incur * the timestamp misalignment problem described in the body of the * function __bfq_activate_entity. Hence we return true only if this * condition does not hold, or if the queue is slow enough to deserve * only to be kicked off for preserving a high throughput. */ static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) { … } static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd, struct bfq_queue *bfqq) { … } /* * For a queue that becomes empty, device idling is allowed only if * this function returns true for that queue. As a consequence, since * device idling plays a critical role for both throughput boosting * and service guarantees, the return value of this function plays a * critical role as well. * * In a nutshell, this function returns true only if idling is * beneficial for throughput or, even if detrimental for throughput, * idling is however necessary to preserve service guarantees (low * latency, desired throughput distribution, ...). In particular, on * NCQ-capable devices, this function tries to return false, so as to * help keep the drives' internal queues full, whenever this helps the * device boost the throughput without causing any service-guarantee * issue. * * Most of the issues taken into account to get the return value of * this function are not trivial. We discuss these issues in the two * functions providing the main pieces of information needed by this * function. */ static bool bfq_better_to_idle(struct bfq_queue *bfqq) { … } /* * If the in-service queue is empty but the function bfq_better_to_idle * returns true, then: * 1) the queue must remain in service and cannot be expired, and * 2) the device must be idled to wait for the possible arrival of a new * request for the queue. * See the comments on the function bfq_better_to_idle for the reasons * why performing device idling is the best choice to boost the throughput * and preserve service guarantees when bfq_better_to_idle itself * returns true. */ static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) { … } /* * This function chooses the queue from which to pick the next extra * I/O request to inject, if it finds a compatible queue. See the * comments on bfq_update_inject_limit() for details on the injection * mechanism, and for the definitions of the quantities mentioned * below. */ static struct bfq_queue * bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) { … } static struct bfq_queue * bfq_find_active_bfqq_for_actuator(struct bfq_data *bfqd, int idx) { … } /* * Perform a linear scan of each actuator, until an actuator is found * for which the following three conditions hold: the load of the * actuator is below the threshold (see comments on * actuator_load_threshold for details) and lower than that of the * next actuator (comments on this extra condition below), and there * is a queue that contains I/O for that actuator. On success, return * that queue. * * Performing a plain linear scan entails a prioritization among * actuators. The extra condition above breaks this prioritization and * tends to distribute injection uniformly across actuators. */ static struct bfq_queue * bfq_find_bfqq_for_underused_actuator(struct bfq_data *bfqd) { … } /* * Select a queue for service. If we have a current queue in service, * check whether to continue servicing it, or retrieve and set a new one. */ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) { … } static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) { … } /* * Dispatch next request from bfqq. */ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) { … } static bool bfq_has_work(struct blk_mq_hw_ctx *hctx) { … } static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) { … } #ifdef CONFIG_BFQ_CGROUP_DEBUG static void bfq_update_dispatch_stats(struct request_queue *q, struct request *rq, struct bfq_queue *in_serv_queue, bool idle_timer_disabled) { … } #else static inline void bfq_update_dispatch_stats(struct request_queue *q, struct request *rq, struct bfq_queue *in_serv_queue, bool idle_timer_disabled) {} #endif /* CONFIG_BFQ_CGROUP_DEBUG */ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) { … } /* * Task holds one reference to the queue, dropped when task exits. Each rq * in-flight on this queue also holds a reference, dropped when rq is freed. * * Scheduler lock must be held here. Recall not to use bfqq after calling * this function on it. */ void bfq_put_queue(struct bfq_queue *bfqq) { … } static void bfq_put_stable_ref(struct bfq_queue *bfqq) { … } void bfq_put_cooperator(struct bfq_queue *bfqq) { … } static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) { … } static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync, unsigned int actuator_idx) { … } static void _bfq_exit_icq(struct bfq_io_cq *bic, unsigned int num_actuators) { … } static void bfq_exit_icq(struct io_cq *icq) { … } /* * Update the entity prio values; note that the new values will not * be used until the next (re)activation. */ static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) { … } static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, struct bio *bio, bool is_sync, struct bfq_io_cq *bic, bool respawn); static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) { … } static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_io_cq *bic, pid_t pid, int is_sync, unsigned int act_idx) { … } static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, struct bfq_group *bfqg, int ioprio_class, int ioprio, int act_idx) { … } static struct bfq_queue * bfq_do_early_stable_merge(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_io_cq *bic, struct bfq_queue *last_bfqq_created) { … } /* * Many throughput-sensitive workloads are made of several parallel * I/O flows, with all flows generated by the same application, or * more generically by the same task (e.g., system boot). The most * counterproductive action with these workloads is plugging I/O * dispatch when one of the bfq_queues associated with these flows * remains temporarily empty. * * To avoid this plugging, BFQ has been using a burst-handling * mechanism for years now. This mechanism has proven effective for * throughput, and not detrimental for service guarantees. The * following function pushes this mechanism a little bit further, * basing on the following two facts. * * First, all the I/O flows of a the same application or task * contribute to the execution/completion of that common application * or task. So the performance figures that matter are total * throughput of the flows and task-wide I/O latency. In particular, * these flows do not need to be protected from each other, in terms * of individual bandwidth or latency. * * Second, the above fact holds regardless of the number of flows. * * Putting these two facts together, this commits merges stably the * bfq_queues associated with these I/O flows, i.e., with the * processes that generate these IO/ flows, regardless of how many the * involved processes are. * * To decide whether a set of bfq_queues is actually associated with * the I/O flows of a common application or task, and to merge these * queues stably, this function operates as follows: given a bfq_queue, * say Q2, currently being created, and the last bfq_queue, say Q1, * created before Q2, Q2 is merged stably with Q1 if * - very little time has elapsed since when Q1 was created * - Q2 has the same ioprio as Q1 * - Q2 belongs to the same group as Q1 * * Merging bfq_queues also reduces scheduling overhead. A fio test * with ten random readers on /dev/nullb shows a throughput boost of * 40%, with a quadcore. Since BFQ's execution time amounts to ~50% of * the total per-request processing time, the above throughput boost * implies that BFQ's overhead is reduced by more than 50%. * * This new mechanism most certainly obsoletes the current * burst-handling heuristics. We keep those heuristics for the moment. */ static struct bfq_queue *bfq_do_or_sched_stable_merge(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_io_cq *bic) { … } static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, struct bio *bio, bool is_sync, struct bfq_io_cq *bic, bool respawn) { … } static void bfq_update_io_thinktime(struct bfq_data *bfqd, struct bfq_queue *bfqq) { … } static void bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct request *rq) { … } static void bfq_update_has_short_ttime(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_io_cq *bic) { … } /* * Called when a new fs request (rq) is added to bfqq. Check if there's * something we should do about it. */ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct request *rq) { … } static void bfqq_request_allocated(struct bfq_queue *bfqq) { … } static void bfqq_request_freed(struct bfq_queue *bfqq) { … } /* returns true if it causes the idle timer to be disabled */ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) { … } #ifdef CONFIG_BFQ_CGROUP_DEBUG static void bfq_update_insert_stats(struct request_queue *q, struct bfq_queue *bfqq, bool idle_timer_disabled, blk_opf_t cmd_flags) { … } #else static inline void bfq_update_insert_stats(struct request_queue *q, struct bfq_queue *bfqq, bool idle_timer_disabled, blk_opf_t cmd_flags) {} #endif /* CONFIG_BFQ_CGROUP_DEBUG */ static struct bfq_queue *bfq_init_rq(struct request *rq); static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, blk_insert_t flags) { … } static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, struct list_head *list, blk_insert_t flags) { … } static void bfq_update_hw_tag(struct bfq_data *bfqd) { … } static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) { … } /* * The processes associated with bfqq may happen to generate their * cumulative I/O at a lower rate than the rate at which the device * could serve the same I/O. This is rather probable, e.g., if only * one process is associated with bfqq and the device is an SSD. It * results in bfqq becoming often empty while in service. In this * respect, if BFQ is allowed to switch to another queue when bfqq * remains empty, then the device goes on being fed with I/O requests, * and the throughput is not affected. In contrast, if BFQ is not * allowed to switch to another queue---because bfqq is sync and * I/O-dispatch needs to be plugged while bfqq is temporarily * empty---then, during the service of bfqq, there will be frequent * "service holes", i.e., time intervals during which bfqq gets empty * and the device can only consume the I/O already queued in its * hardware queues. During service holes, the device may even get to * remaining idle. In the end, during the service of bfqq, the device * is driven at a lower speed than the one it can reach with the kind * of I/O flowing through bfqq. * * To counter this loss of throughput, BFQ implements a "request * injection mechanism", which tries to fill the above service holes * with I/O requests taken from other queues. The hard part in this * mechanism is finding the right amount of I/O to inject, so as to * both boost throughput and not break bfqq's bandwidth and latency * guarantees. In this respect, the mechanism maintains a per-queue * inject limit, computed as below. While bfqq is empty, the injection * mechanism dispatches extra I/O requests only until the total number * of I/O requests in flight---i.e., already dispatched but not yet * completed---remains lower than this limit. * * A first definition comes in handy to introduce the algorithm by * which the inject limit is computed. We define as first request for * bfqq, an I/O request for bfqq that arrives while bfqq is in * service, and causes bfqq to switch from empty to non-empty. The * algorithm updates the limit as a function of the effect of * injection on the service times of only the first requests of * bfqq. The reason for this restriction is that these are the * requests whose service time is affected most, because they are the * first to arrive after injection possibly occurred. * * To evaluate the effect of injection, the algorithm measures the * "total service time" of first requests. We define as total service * time of an I/O request, the time that elapses since when the * request is enqueued into bfqq, to when it is completed. This * quantity allows the whole effect of injection to be measured. It is * easy to see why. Suppose that some requests of other queues are * actually injected while bfqq is empty, and that a new request R * then arrives for bfqq. If the device does start to serve all or * part of the injected requests during the service hole, then, * because of this extra service, it may delay the next invocation of * the dispatch hook of BFQ. Then, even after R gets eventually * dispatched, the device may delay the actual service of R if it is * still busy serving the extra requests, or if it decides to serve, * before R, some extra request still present in its queues. As a * conclusion, the cumulative extra delay caused by injection can be * easily evaluated by just comparing the total service time of first * requests with and without injection. * * The limit-update algorithm works as follows. On the arrival of a * first request of bfqq, the algorithm measures the total time of the * request only if one of the three cases below holds, and, for each * case, it updates the limit as described below: * * (1) If there is no in-flight request. This gives a baseline for the * total service time of the requests of bfqq. If the baseline has * not been computed yet, then, after computing it, the limit is * set to 1, to start boosting throughput, and to prepare the * ground for the next case. If the baseline has already been * computed, then it is updated, in case it results to be lower * than the previous value. * * (2) If the limit is higher than 0 and there are in-flight * requests. By comparing the total service time in this case with * the above baseline, it is possible to know at which extent the * current value of the limit is inflating the total service * time. If the inflation is below a certain threshold, then bfqq * is assumed to be suffering from no perceivable loss of its * service guarantees, and the limit is even tentatively * increased. If the inflation is above the threshold, then the * limit is decreased. Due to the lack of any hysteresis, this * logic makes the limit oscillate even in steady workload * conditions. Yet we opted for it, because it is fast in reaching * the best value for the limit, as a function of the current I/O * workload. To reduce oscillations, this step is disabled for a * short time interval after the limit happens to be decreased. * * (3) Periodically, after resetting the limit, to make sure that the * limit eventually drops in case the workload changes. This is * needed because, after the limit has gone safely up for a * certain workload, it is impossible to guess whether the * baseline total service time may have changed, without measuring * it again without injection. A more effective version of this * step might be to just sample the baseline, by interrupting * injection only once, and then to reset/lower the limit only if * the total service time with the current limit does happen to be * too large. * * More details on each step are provided in the comments on the * pieces of code that implement these steps: the branch handling the * transition from empty to non empty in bfq_add_request(), the branch * handling injection in bfq_select_queue(), and the function * bfq_choose_bfqq_for_injection(). These comments also explain some * exceptions, made by the injection mechanism in some special cases. */ static void bfq_update_inject_limit(struct bfq_data *bfqd, struct bfq_queue *bfqq) { … } /* * Handle either a requeue or a finish for rq. The things to do are * the same in both cases: all references to rq are to be dropped. In * particular, rq is considered completed from the point of view of * the scheduler. */ static void bfq_finish_requeue_request(struct request *rq) { … } static void bfq_finish_request(struct request *rq) { … } /* * Removes the association between the current task and bfqq, assuming * that bic points to the bfq iocontext of the task. * Returns NULL if a new bfqq should be allocated, or the old bfqq if this * was the last process referring to that bfqq. */ static struct bfq_queue * bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) { … } static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, struct bfq_io_cq *bic, struct bio *bio, bool split, bool is_sync, bool *new_queue) { … } /* * Only reset private fields. The actual request preparation will be * performed by bfq_init_rq, when rq is either inserted or merged. See * comments on bfq_init_rq for the reason behind this delayed * preparation. */ static void bfq_prepare_request(struct request *rq) { … } /* * If needed, init rq, allocate bfq data structures associated with * rq, and increment reference counters in the destination bfq_queue * for rq. Return the destination bfq_queue for rq, or NULL is rq is * not associated with any bfq_queue. * * This function is invoked by the functions that perform rq insertion * or merging. One may have expected the above preparation operations * to be performed in bfq_prepare_request, and not delayed to when rq * is inserted or merged. The rationale behind this delayed * preparation is that, after the prepare_request hook is invoked for * rq, rq may still be transformed into a request with no icq, i.e., a * request not associated with any queue. No bfq hook is invoked to * signal this transformation. As a consequence, should these * preparation operations be performed when the prepare_request hook * is invoked, and should rq be transformed one moment later, bfq * would end up in an inconsistent state, because it would have * incremented some queue counters for an rq destined to * transformation, without any chance to correctly lower these * counters back. In contrast, no transformation can still happen for * rq after rq has been inserted or merged. So, it is safe to execute * these preparation operations when rq is finally inserted or merged. */ static struct bfq_queue *bfq_init_rq(struct request *rq) { … } static void bfq_idle_slice_timer_body(struct bfq_data *bfqd, struct bfq_queue *bfqq) { … } /* * Handler of the expiration of the timer running if the in-service queue * is idling inside its time slice. */ static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) { … } static void __bfq_put_async_bfqq(struct bfq_data *bfqd, struct bfq_queue **bfqq_ptr) { … } /* * Release all the bfqg references to its async queues. If we are * deallocating the group these queues may still contain requests, so * we reparent them to the root cgroup (i.e., the only one that will * exist for sure until all the requests on a device are gone). */ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) { … } /* * See the comments on bfq_limit_depth for the purpose of * the depths set in the function. Return minimum shallow depth we'll use. */ static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt) { … } static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx) { … } static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index) { … } static void bfq_exit_queue(struct elevator_queue *e) { … } static void bfq_init_root_group(struct bfq_group *root_group, struct bfq_data *bfqd) { … } static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) { … } static void bfq_slab_kill(void) { … } static int __init bfq_slab_setup(void) { … } static ssize_t bfq_var_show(unsigned int var, char *page) { … } static int bfq_var_store(unsigned long *var, const char *page) { … } #define SHOW_FUNCTION … SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2); SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2); SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2); SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1); SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0); SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); #undef SHOW_FUNCTION #define USEC_SHOW_FUNCTION … USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle); #undef USEC_SHOW_FUNCTION #define STORE_FUNCTION … STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, INT_MAX, 2); STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, INT_MAX, 2); STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, INT_MAX, 0); STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2); #undef STORE_FUNCTION #define USEC_STORE_FUNCTION … USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0, UINT_MAX); #undef USEC_STORE_FUNCTION static ssize_t bfq_max_budget_store(struct elevator_queue *e, const char *page, size_t count) { … } /* * Leaving this name to preserve name compatibility with cfq * parameters, but this timeout is used for both sync and async. */ static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, const char *page, size_t count) { … } static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e, const char *page, size_t count) { … } static ssize_t bfq_low_latency_store(struct elevator_queue *e, const char *page, size_t count) { … } #define BFQ_ATTR(name) … static struct elv_fs_entry bfq_attrs[] = …; static struct elevator_type iosched_bfq_mq = …; MODULE_ALIAS(…) …; static int __init bfq_init(void) { … } static void __exit bfq_exit(void) { … } module_init(…) …; module_exit(bfq_exit); MODULE_AUTHOR(…) …; MODULE_LICENSE(…) …; MODULE_DESCRIPTION(…) …;