// SPDX-License-Identifier: GPL-2.0 /* * Common Block IO controller cgroup interface * * Based on ideas and code from CFQ, CFS and BFQ: * Copyright (C) 2003 Jens Axboe <[email protected]> * * Copyright (C) 2008 Fabio Checconi <[email protected]> * Paolo Valente <[email protected]> * * Copyright (C) 2009 Vivek Goyal <[email protected]> * Nauman Rafique <[email protected]> * * For policy-specific per-blkcg data: * Copyright (C) 2015 Paolo Valente <[email protected]> * Arianna Avanzini <[email protected]> */ #include <linux/ioprio.h> #include <linux/kdev_t.h> #include <linux/module.h> #include <linux/sched/signal.h> #include <linux/err.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/slab.h> #include <linux/delay.h> #include <linux/atomic.h> #include <linux/ctype.h> #include <linux/resume_user_mode.h> #include <linux/psi.h> #include <linux/part_stat.h> #include "blk.h" #include "blk-cgroup.h" #include "blk-ioprio.h" #include "blk-throttle.h" static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu); /* * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation. * blkcg_pol_register_mutex nests outside of it and synchronizes entire * policy [un]register operations including cgroup file additions / * removals. Putting cgroup file registration outside blkcg_pol_mutex * allows grabbing it from cgroup callbacks. */ static DEFINE_MUTEX(blkcg_pol_register_mutex); static DEFINE_MUTEX(blkcg_pol_mutex); struct blkcg blkcg_root; EXPORT_SYMBOL_GPL(…); struct cgroup_subsys_state * const blkcg_root_css = …; EXPORT_SYMBOL_GPL(…); static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */ bool blkcg_debug_stats = …; static DEFINE_RAW_SPINLOCK(blkg_stat_lock); #define BLKG_DESTROY_BATCH_SIZE … /* * Lockless lists for tracking IO stats update * * New IO stats are stored in the percpu iostat_cpu within blkcg_gq (blkg). * There are multiple blkg's (one for each block device) attached to each * blkcg. The rstat code keeps track of which cpu has IO stats updated, * but it doesn't know which blkg has the updated stats. If there are many * block devices in a system, the cost of iterating all the blkg's to flush * out the IO stats can be high. To reduce such overhead, a set of percpu * lockless lists (lhead) per blkcg are used to track the set of recently * updated iostat_cpu's since the last flush. An iostat_cpu will be put * onto the lockless list on the update side [blk_cgroup_bio_start()] if * not there yet and then removed when being flushed [blkcg_rstat_flush()]. * References to blkg are gotten and then put back in the process to * protect against blkg removal. * * Return: 0 if successful or -ENOMEM if allocation fails. */ static int init_blkcg_llists(struct blkcg *blkcg) { … } /** * blkcg_css - find the current css * * Find the css associated with either the kthread or the current task. * This may return a dying css, so it is up to the caller to use tryget logic * to confirm it is alive and well. */ static struct cgroup_subsys_state *blkcg_css(void) { … } static bool blkcg_policy_enabled(struct request_queue *q, const struct blkcg_policy *pol) { … } static void blkg_free_workfn(struct work_struct *work) { … } /** * blkg_free - free a blkg * @blkg: blkg to free * * Free @blkg which may be partially allocated. */ static void blkg_free(struct blkcg_gq *blkg) { … } static void __blkg_release(struct rcu_head *rcu) { … } /* * A group is RCU protected, but having an rcu lock does not mean that one * can access all the fields of blkg and assume these are valid. For * example, don't try to follow throtl_data and request queue links. * * Having a reference to blkg under an rcu allows accesses to only values * local to groups like group stats and group rate limits. */ static void blkg_release(struct percpu_ref *ref) { … } #ifdef CONFIG_BLK_CGROUP_PUNT_BIO static struct workqueue_struct *blkcg_punt_bio_wq; static void blkg_async_bio_workfn(struct work_struct *work) { … } /* * When a shared kthread issues a bio for a cgroup, doing so synchronously can * lead to priority inversions as the kthread can be trapped waiting for that * cgroup. Use this helper instead of submit_bio to punt the actual issuing to * a dedicated per-blkcg work item to avoid such priority inversions. */ void blkcg_punt_bio_submit(struct bio *bio) { … } EXPORT_SYMBOL_GPL(…); static int __init blkcg_punt_bio_init(void) { … } subsys_initcall(blkcg_punt_bio_init); #endif /* CONFIG_BLK_CGROUP_PUNT_BIO */ /** * bio_blkcg_css - return the blkcg CSS associated with a bio * @bio: target bio * * This returns the CSS for the blkcg associated with a bio, or %NULL if not * associated. Callers are expected to either handle %NULL or know association * has been done prior to calling this. */ struct cgroup_subsys_state *bio_blkcg_css(struct bio *bio) { … } EXPORT_SYMBOL_GPL(…); /** * blkcg_parent - get the parent of a blkcg * @blkcg: blkcg of interest * * Return the parent blkcg of @blkcg. Can be called anytime. */ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) { … } /** * blkg_alloc - allocate a blkg * @blkcg: block cgroup the new blkg is associated with * @disk: gendisk the new blkg is associated with * @gfp_mask: allocation mask to use * * Allocate a new blkg associating @blkcg and @disk. */ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk, gfp_t gfp_mask) { … } /* * If @new_blkg is %NULL, this function tries to allocate a new one as * necessary using %GFP_NOWAIT. @new_blkg is always consumed on return. */ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk, struct blkcg_gq *new_blkg) { … } /** * blkg_lookup_create - lookup blkg, try to create one if not there * @blkcg: blkcg of interest * @disk: gendisk of interest * * Lookup blkg for the @blkcg - @disk pair. If it doesn't exist, try to * create one. blkg creation is performed recursively from blkcg_root such * that all non-root blkg's have access to the parent blkg. This function * should be called under RCU read lock and takes @disk->queue->queue_lock. * * Returns the blkg or the closest blkg if blkg_create() fails as it walks * down from root. */ static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, struct gendisk *disk) { … } static void blkg_destroy(struct blkcg_gq *blkg) { … } static void blkg_destroy_all(struct gendisk *disk) { … } static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src) { … } static void __blkg_clear_stat(struct blkg_iostat_set *bis) { … } static void blkg_clear_stat(struct blkcg_gq *blkg) { … } static int blkcg_reset_stats(struct cgroup_subsys_state *css, struct cftype *cftype, u64 val) { … } const char *blkg_dev_name(struct blkcg_gq *blkg) { … } /** * blkcg_print_blkgs - helper for printing per-blkg data * @sf: seq_file to print to * @blkcg: blkcg of interest * @prfill: fill function to print out a blkg * @pol: policy in question * @data: data to be passed to @prfill * @show_total: to print out sum of prfill return values or not * * This function invokes @prfill on each blkg of @blkcg if pd for the * policy specified by @pol exists. @prfill is invoked with @sf, the * policy data and @data and the matching queue lock held. If @show_total * is %true, the sum of the return values from @prfill is printed with * "Total" label at the end. * * This is to be used to construct print functions for * cftype->read_seq_string method. */ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, u64 (*prfill)(struct seq_file *, struct blkg_policy_data *, int), const struct blkcg_policy *pol, int data, bool show_total) { … } EXPORT_SYMBOL_GPL(…); /** * __blkg_prfill_u64 - prfill helper for a single u64 value * @sf: seq_file to print to * @pd: policy private data of interest * @v: value to print * * Print @v to @sf for the device associated with @pd. */ u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v) { … } EXPORT_SYMBOL_GPL(…); /** * blkg_conf_init - initialize a blkg_conf_ctx * @ctx: blkg_conf_ctx to initialize * @input: input string * * Initialize @ctx which can be used to parse blkg config input string @input. * Once initialized, @ctx can be used with blkg_conf_open_bdev() and * blkg_conf_prep(), and must be cleaned up with blkg_conf_exit(). */ void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input) { … } EXPORT_SYMBOL_GPL(…); /** * blkg_conf_open_bdev - parse and open bdev for per-blkg config update * @ctx: blkg_conf_ctx initialized with blkg_conf_init() * * Parse the device node prefix part, MAJ:MIN, of per-blkg config update from * @ctx->input and get and store the matching bdev in @ctx->bdev. @ctx->body is * set to point past the device node prefix. * * This function may be called multiple times on @ctx and the extra calls become * NOOPs. blkg_conf_prep() implicitly calls this function. Use this function * explicitly if bdev access is needed without resolving the blkcg / policy part * of @ctx->input. Returns -errno on error. */ int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx) { … } /** * blkg_conf_prep - parse and prepare for per-blkg config update * @blkcg: target block cgroup * @pol: target policy * @ctx: blkg_conf_ctx initialized with blkg_conf_init() * * Parse per-blkg config update from @ctx->input and initialize @ctx * accordingly. On success, @ctx->body points to the part of @ctx->input * following MAJ:MIN, @ctx->bdev points to the target block device and * @ctx->blkg to the blkg being configured. * * blkg_conf_open_bdev() may be called on @ctx beforehand. On success, this * function returns with queue lock held and must be followed by * blkg_conf_exit(). */ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, struct blkg_conf_ctx *ctx) __acquires(&bdev->bd_queue->queue_lock) { … } EXPORT_SYMBOL_GPL(…); /** * blkg_conf_exit - clean up per-blkg config update * @ctx: blkg_conf_ctx initialized with blkg_conf_init() * * Clean up after per-blkg config update. This function must be called on all * blkg_conf_ctx's initialized with blkg_conf_init(). */ void blkg_conf_exit(struct blkg_conf_ctx *ctx) __releases(&ctx->bdev->bd_queue->queue_lock) __releases(&ctx->bdev->bd_queue->rq_qos_mutex) { … } EXPORT_SYMBOL_GPL(…); static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src) { … } static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src) { … } static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur, struct blkg_iostat *last) { … } static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu) { … } static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) { … } /* * We source root cgroup stats from the system-wide stats to avoid * tracking the same information twice and incurring overhead when no * cgroups are defined. For that reason, cgroup_rstat_flush in * blkcg_print_stat does not actually fill out the iostat in the root * cgroup's blkcg_gq. * * However, we would like to re-use the printing code between the root and * non-root cgroups to the extent possible. For that reason, we simulate * flushing the root cgroup's stats by explicitly filling in the iostat * with disk level statistics. */ static void blkcg_fill_root_iostats(void) { … } static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s) { … } static int blkcg_print_stat(struct seq_file *sf, void *v) { … } static struct cftype blkcg_files[] = …; static struct cftype blkcg_legacy_files[] = …; #ifdef CONFIG_CGROUP_WRITEBACK struct list_head *blkcg_get_cgwb_list(struct cgroup_subsys_state *css) { … } #endif /* * blkcg destruction is a three-stage process. * * 1. Destruction starts. The blkcg_css_offline() callback is invoked * which offlines writeback. Here we tie the next stage of blkg destruction * to the completion of writeback associated with the blkcg. This lets us * avoid punting potentially large amounts of outstanding writeback to root * while maintaining any ongoing policies. The next stage is triggered when * the nr_cgwbs count goes to zero. * * 2. When the nr_cgwbs count goes to zero, blkcg_destroy_blkgs() is called * and handles the destruction of blkgs. Here the css reference held by * the blkg is put back eventually allowing blkcg_css_free() to be called. * This work may occur in cgwb_release_workfn() on the cgwb_release * workqueue. Any submitted ios that fail to get the blkg ref will be * punted to the root_blkg. * * 3. Once the blkcg ref count goes to zero, blkcg_css_free() is called. * This finally frees the blkcg. */ /** * blkcg_destroy_blkgs - responsible for shooting down blkgs * @blkcg: blkcg of interest * * blkgs should be removed while holding both q and blkcg locks. As blkcg lock * is nested inside q lock, this function performs reverse double lock dancing. * Destroying the blkgs releases the reference held on the blkcg's css allowing * blkcg_css_free to eventually be called. * * This is the blkcg counterpart of ioc_release_fn(). */ static void blkcg_destroy_blkgs(struct blkcg *blkcg) { … } /** * blkcg_pin_online - pin online state * @blkcg_css: blkcg of interest * * While pinned, a blkcg is kept online. This is primarily used to * impedance-match blkg and cgwb lifetimes so that blkg doesn't go offline * while an associated cgwb is still active. */ void blkcg_pin_online(struct cgroup_subsys_state *blkcg_css) { … } /** * blkcg_unpin_online - unpin online state * @blkcg_css: blkcg of interest * * This is primarily used to impedance-match blkg and cgwb lifetimes so * that blkg doesn't go offline while an associated cgwb is still active. * When this count goes to zero, all active cgwbs have finished so the * blkcg can continue destruction by calling blkcg_destroy_blkgs(). */ void blkcg_unpin_online(struct cgroup_subsys_state *blkcg_css) { … } /** * blkcg_css_offline - cgroup css_offline callback * @css: css of interest * * This function is called when @css is about to go away. Here the cgwbs are * offlined first and only once writeback associated with the blkcg has * finished do we start step 2 (see above). */ static void blkcg_css_offline(struct cgroup_subsys_state *css) { … } static void blkcg_css_free(struct cgroup_subsys_state *css) { … } static struct cgroup_subsys_state * blkcg_css_alloc(struct cgroup_subsys_state *parent_css) { … } static int blkcg_css_online(struct cgroup_subsys_state *css) { … } void blkg_init_queue(struct request_queue *q) { … } int blkcg_init_disk(struct gendisk *disk) { … } void blkcg_exit_disk(struct gendisk *disk) { … } static void blkcg_exit(struct task_struct *tsk) { … } struct cgroup_subsys io_cgrp_subsys = …; EXPORT_SYMBOL_GPL(…); /** * blkcg_activate_policy - activate a blkcg policy on a gendisk * @disk: gendisk of interest * @pol: blkcg policy to activate * * Activate @pol on @disk. Requires %GFP_KERNEL context. @disk goes through * bypass mode to populate its blkgs with policy_data for @pol. * * Activation happens with @disk bypassed, so nobody would be accessing blkgs * from IO path. Update of each blkg is protected by both queue and blkcg * locks so that holding either lock and testing blkcg_policy_enabled() is * always enough for dereferencing policy data. * * The caller is responsible for synchronizing [de]activations and policy * [un]registerations. Returns 0 on success, -errno on failure. */ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { … } EXPORT_SYMBOL_GPL(…); /** * blkcg_deactivate_policy - deactivate a blkcg policy on a gendisk * @disk: gendisk of interest * @pol: blkcg policy to deactivate * * Deactivate @pol on @disk. Follows the same synchronization rules as * blkcg_activate_policy(). */ void blkcg_deactivate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { … } EXPORT_SYMBOL_GPL(…); static void blkcg_free_all_cpd(struct blkcg_policy *pol) { … } /** * blkcg_policy_register - register a blkcg policy * @pol: blkcg policy to register * * Register @pol with blkcg core. Might sleep and @pol may be modified on * successful registration. Returns 0 on success and -errno on failure. */ int blkcg_policy_register(struct blkcg_policy *pol) { … } EXPORT_SYMBOL_GPL(…); /** * blkcg_policy_unregister - unregister a blkcg policy * @pol: blkcg policy to unregister * * Undo blkcg_policy_register(@pol). Might sleep. */ void blkcg_policy_unregister(struct blkcg_policy *pol) { … } EXPORT_SYMBOL_GPL(…); /* * Scale the accumulated delay based on how long it has been since we updated * the delay. We only call this when we are adding delay, in case it's been a * while since we added delay, and when we are checking to see if we need to * delay a task, to account for any delays that may have occurred. */ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now) { … } /* * This is called when we want to actually walk up the hierarchy and check to * see if we need to throttle, and then actually throttle if there is some * accumulated delay. This should only be called upon return to user space so * we're not holding some lock that would induce a priority inversion. */ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) { … } /** * blkcg_maybe_throttle_current - throttle the current task if it has been marked * * This is only called if we've been marked with set_notify_resume(). Obviously * we can be set_notify_resume() for reasons other than blkcg throttling, so we * check to see if current->throttle_disk is set and if not this doesn't do * anything. This should only ever be called by the resume code, it's not meant * to be called by people willy-nilly as it will actually do the work to * throttle the task if it is setup for throttling. */ void blkcg_maybe_throttle_current(void) { … } /** * blkcg_schedule_throttle - this task needs to check for throttling * @disk: disk to throttle * @use_memdelay: do we charge this to memory delay for PSI * * This is called by the IO controller when we know there's delay accumulated * for the blkg for this task. We do not pass the blkg because there are places * we call this that may not have that information, the swapping code for * instance will only have a block_device at that point. This set's the * notify_resume for the task to check and see if it requires throttling before * returning to user space. * * We will only schedule once per syscall. You can call this over and over * again and it will only do the check once upon return to user space, and only * throttle once. If the task needs to be throttled again it'll need to be * re-set at the next time we see the task. */ void blkcg_schedule_throttle(struct gendisk *disk, bool use_memdelay) { … } /** * blkcg_add_delay - add delay to this blkg * @blkg: blkg of interest * @now: the current time in nanoseconds * @delta: how many nanoseconds of delay to add * * Charge @delta to the blkg's current delay accumulation. This is used to * throttle tasks if an IO controller thinks we need more throttling. */ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta) { … } /** * blkg_tryget_closest - try and get a blkg ref on the closet blkg * @bio: target bio * @css: target css * * As the failure mode here is to walk up the blkg tree, this ensure that the * blkg->parent pointers are always valid. This returns the blkg that it ended * up taking a reference on or %NULL if no reference was taken. */ static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio, struct cgroup_subsys_state *css) { … } /** * bio_associate_blkg_from_css - associate a bio with a specified css * @bio: target bio * @css: target css * * Associate @bio with the blkg found by combining the css's blkg and the * request_queue of the @bio. An association failure is handled by walking up * the blkg tree. Therefore, the blkg associated can be anything between @blkg * and q->root_blkg. This situation only happens when a cgroup is dying and * then the remaining bios will spill to the closest alive blkg. * * A reference will be taken on the blkg and will be released when @bio is * freed. */ void bio_associate_blkg_from_css(struct bio *bio, struct cgroup_subsys_state *css) { … } EXPORT_SYMBOL_GPL(…); /** * bio_associate_blkg - associate a bio with a blkg * @bio: target bio * * Associate @bio with the blkg found from the bio's css and request_queue. * If one is not found, bio_lookup_blkg() creates the blkg. If a blkg is * already associated, the css is reused and association redone as the * request_queue may have changed. */ void bio_associate_blkg(struct bio *bio) { … } EXPORT_SYMBOL_GPL(…); /** * bio_clone_blkg_association - clone blkg association from src to dst bio * @dst: destination bio * @src: source bio */ void bio_clone_blkg_association(struct bio *dst, struct bio *src) { … } EXPORT_SYMBOL_GPL(…); static int blk_cgroup_io_type(struct bio *bio) { … } void blk_cgroup_bio_start(struct bio *bio) { … } bool blk_cgroup_congested(void) { … } module_param(blkcg_debug_stats, bool, 0644); MODULE_PARM_DESC(…) …;