// SPDX-License-Identifier: GPL-2.0

#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/ktime.h>
#include <linux/list.h>
#include <linux/math64.h>
#include <linux/sizes.h>
#include <linux/workqueue.h>
#include "ctree.h"
#include "block-group.h"
#include "discard.h"
#include "free-space-cache.h"
#include "fs.h"

/*
 * This contains the logic to handle async discard.
 *
 * Async discard manages trimming of free space outside of transaction commit.
 * Discarding is done by managing the block_groups on an LRU list based on free
 * space recency. Two passes are used to first prioritize discarding extents
 * and then to give bitmap trimming the best opportunity to coalesce. The
 * block_groups are maintained on multiple lists to allow for multiple passes
 * with different discard filter requirements. A delayed work item is used to
 * manage discarding with a timeout determined by the max of the delay incurred
 * by the iops rate limit, the byte rate limit, and the max delay of
 * BTRFS_DISCARD_MAX_DELAY.
 *
 * Note, this only keeps track of block_groups that are explicitly for data.
 * Mixed block_groups are not supported.
 *
 * The first list is special to manage discarding of fully free block groups.
 * This is necessary because we issue a final trim for a fully free block group
 * after forgetting it. When a block group becomes unused, instead of directly
 * being added to the unused_bgs list, we add it to this first list. Then
 * from there, if it becomes fully discarded, we place it onto the unused_bgs
 * list.
 *
 * The in-memory free space cache serves as the backing state for discard.
 * Consequently this means there is no persistence. We opt to load all the
 * block groups in as not discarded, so the mount case degenerates to the
 * crashing case.
 *
 * As the free space cache uses bitmaps, there exists a tradeoff between
 * ease/efficiency for find_free_extent() and the accuracy of discard state.
 * Here we opt to let untrimmed regions merge with everything while only
 * letting trimmed regions merge with other trimmed regions. This can cause
 * overtrimming, but the coalescing benefit seems to be worth it. Additionally,
 * bitmap state is tracked as a whole. If we're able to fully trim a bitmap,
 * the trimmed flag is set on the bitmap. Otherwise, if an allocation comes in,
 * this resets the state and we will retry trimming the whole bitmap. This is a
 * tradeoff between discard state accuracy and the cost of accounting.
 */

/* This is an initial delay to give some chance for block reuse */
#define BTRFS_DISCARD_DELAY		…
#define BTRFS_DISCARD_UNUSED_DELAY	…

#define BTRFS_DISCARD_MIN_DELAY_MSEC	…
#define BTRFS_DISCARD_MAX_DELAY_MSEC	…
#define BTRFS_DISCARD_MAX_IOPS		…

/* Monotonically decreasing minimum length filters after index 0 */
static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = …;

static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl,
					  struct btrfs_block_group *block_group)
{ … }
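/*
 * Illustrative sketch (not part of the implementation above): one way the
 * size-filter routing described for discard_minlen could look. A block group
 * is promoted to the first (largest) filter that a recently freed region
 * satisfies, since the filters shrink monotonically after index 0; otherwise
 * it keeps its current index. The helper name and signature are hypothetical,
 * and the real discard_minlen values remain elided above.
 */
static int __maybe_unused sketch_filter_index(const int *minlen, int nr_lists,
					      int cur_index, u64 bytes)
{
	int i;

	/* Index 0 is reserved for fully unused block groups; start after it. */
	for (i = 1; i < cur_index && i < nr_lists; i++) {
		/* Filters shrink with the index, so the first match wins. */
		if (bytes >= minlen[i])
			return i;
	}

	/* Nothing larger matched; stay on the current list. */
	return cur_index;
}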
/*
 * Determine if async discard should be running.
 *
 * @discard_ctl: discard control
 *
 * Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set.
 */
static bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl)
{ … }

static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
				  struct btrfs_block_group *block_group)
{ … }

static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
				struct btrfs_block_group *block_group)
{ … }

static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl,
				       struct btrfs_block_group *block_group)
{ … }

static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl,
				     struct btrfs_block_group *block_group)
{ … }

/*
 * Find the block_group that's up next for discarding.
 *
 * @discard_ctl: discard control
 * @now: current time
 *
 * Iterate over the discard lists to find the next block_group up for
 * discarding, checking the discard_eligible_time of each block_group.
 */
static struct btrfs_block_group *find_next_block_group(
					struct btrfs_discard_ctl *discard_ctl,
					u64 now)
{ … }

/*
 * Look up the next block group and set it for use.
 *
 * @discard_ctl: discard control
 * @discard_state: the discard_state of the block_group after state management
 * @discard_index: the discard_index of the block_group after state management
 * @now: time when discard was invoked, in ns
 *
 * Wrap find_next_block_group() and set the block_group to be in use.
 * @discard_state's control flow is managed here. Variables related to
 * @discard_state are reset here as needed (e.g. @discard_cursor). @discard_state
 * and @discard_index are remembered as they may change while we're discarding,
 * but we want the discard to execute in the context determined here.
 */
static struct btrfs_block_group *peek_discard_list(
					struct btrfs_discard_ctl *discard_ctl,
					enum btrfs_discard_state *discard_state,
					int *discard_index, u64 now)
{ … }

/*
 * Update a block group's filters.
 *
 * @block_group: block group of interest
 * @bytes: recently freed region size after coalescing
 *
 * Async discard maintains multiple lists with progressively smaller filters
 * to prioritize discarding based on size. Should a free space region that
 * matches a larger filter be returned to the free_space_cache, prioritize that
 * discard by moving @block_group to the proper filter.
 */
void btrfs_discard_check_filter(struct btrfs_block_group *block_group,
				u64 bytes)
{ … }

/*
 * Move a block group along the discard lists.
 *
 * @discard_ctl: discard control
 * @block_group: block_group of interest
 *
 * Increment @block_group's discard_index. If it falls off the list, let it be.
 * Otherwise add it back to the appropriate list.
 */
static void btrfs_update_discard_index(struct btrfs_discard_ctl *discard_ctl,
				       struct btrfs_block_group *block_group)
{ … }

/*
 * Remove a block_group from the discard lists.
 *
 * @discard_ctl: discard control
 * @block_group: block_group of interest
 *
 * Remove @block_group from the discard lists. If necessary, wait on the
 * current work and then reschedule the delayed work.
 */
void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl,
			       struct btrfs_block_group *block_group)
{ … }

/*
 * Handle queuing of the block_groups.
 *
 * @discard_ctl: discard control
 * @block_group: block_group of interest
 *
 * Maintain the LRU order of the discard lists.
 */
void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl,
			      struct btrfs_block_group *block_group)
{ … }

static void __btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
					  u64 now, bool override)
{ … }
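/*
 * Illustrative sketch (not part of the implementation above): the delay
 * derivation described for btrfs_discard_calc_delay() and
 * btrfs_discard_schedule_work() below, i.e. a base delay derived from the
 * number of discardable extents, floored by the iops rate limit, and capped
 * at a maximum. The function name, the target_msec parameter, and the
 * explicit min/max arguments are hypothetical stand-ins for the (elided)
 * BTRFS_DISCARD_MIN_DELAY_MSEC, BTRFS_DISCARD_MAX_DELAY_MSEC, and iops limit
 * values.
 */
static u64 __maybe_unused sketch_discard_delay_msec(u64 discardable_extents,
						    u32 iops_limit,
						    u64 target_msec,
						    u64 min_msec, u64 max_msec)
{
	u64 lower = min_msec;
	u64 delay;

	/* An iops limit allows at most one discard per (1000 / iops) ms. */
	if (iops_limit)
		lower = max_t(u64, lower, MSEC_PER_SEC / iops_limit);

	/* Spread the target interval over the outstanding extents... */
	delay = div64_u64(target_msec, max_t(u64, discardable_extents, 1));

	/* ...then clamp between the rate-limit floor and the max delay. */
	return clamp(delay, lower, max_msec);
}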
/*
 * Responsible for scheduling the discard work.
 *
 * @discard_ctl: discard control
 * @override: override the current timer
 *
 * Discards are issued by a delayed workqueue item. @override is used to
 * update the current delay as the baseline delay interval is reevaluated on
 * transaction commit. This is also maxed with any other rate limit.
 */
void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
				 bool override)
{ … }

/*
 * Determine the next step of a block_group.
 *
 * @discard_ctl: discard control
 * @block_group: block_group of interest
 *
 * Determine the next step for a block group after it's finished going through
 * a pass on a discard list. If it is unused and fully trimmed, we can mark it
 * unused and send it to the unused_bgs path. Otherwise, pass it on to the
 * appropriate filter list or let it fall off.
 */
static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl,
				      struct btrfs_block_group *block_group)
{ … }

/*
 * Discard work queue callback.
 *
 * @work: work
 *
 * Find the next block_group to start discarding and then discard a single
 * region. It does this in a two-pass fashion: first extents and second
 * bitmaps. Completely discarded block groups are sent to the unused_bgs path.
 */
static void btrfs_discard_workfn(struct work_struct *work)
{ … }

/*
 * Recalculate the base delay.
 *
 * @discard_ctl: discard control
 *
 * Recalculate the base delay, which is based on the total number of
 * discardable_extents. Clamp this between the lower_limit (iops_limit or 1ms)
 * and the upper_limit (BTRFS_DISCARD_MAX_DELAY_MSEC).
 */
void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl)
{ … }

/*
 * Propagate discard counters.
 *
 * @block_group: block_group of interest
 *
 * Propagate deltas of counters up to the discard_ctl. It maintains a current
 * counter and a previous counter, passing the delta up to the global stat.
 * Then the current counter value becomes the previous counter value.
 */
void btrfs_discard_update_discardable(struct btrfs_block_group *block_group)
{ … }

/*
 * Punt the unused_bgs list to the discard lists.
 *
 * @fs_info: fs_info of interest
 *
 * The unused_bgs list needs to be punted to the discard lists because the
 * order of operations is changed. In the normal synchronous discard path, the
 * block groups are trimmed via a single large trim in transaction commit. This
 * is ultimately what we are trying to avoid with asynchronous discard. Thus,
 * it must be done before going down the unused_bgs path.
 */
void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info)
{ … }

/*
 * Purge discard lists.
 *
 * @discard_ctl: discard control
 *
 * If we are disabling async discard, we may have intercepted block groups that
 * are completely free and ready for the unused_bgs path. As discarding will
 * now happen in transaction commit or not at all, we can safely mark the
 * corresponding block groups as unused and they will be sent on their merry
 * way to the unused_bgs list.
 */
static void btrfs_discard_purge_list(struct btrfs_discard_ctl *discard_ctl)
{ … }

void btrfs_discard_resume(struct btrfs_fs_info *fs_info)
{ … }

void btrfs_discard_stop(struct btrfs_fs_info *fs_info)
{ … }

void btrfs_discard_init(struct btrfs_fs_info *fs_info)
{ … }

void btrfs_discard_cleanup(struct btrfs_fs_info *fs_info)
{ … }
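/*
 * Illustrative sketch (not part of the implementation above): the
 * delta-propagation pattern described for btrfs_discard_update_discardable().
 * A per-block-group current value is compared against the value previously
 * reported to the global counter, only the difference is applied, and the
 * reported value is then updated. The helper name and plain s64 counters are
 * hypothetical stand-ins; the real global stats live in the discard_ctl and
 * are updated atomically under the appropriate locking.
 */
static void __maybe_unused sketch_propagate_delta(s64 *global_stat, s64 cur,
						  s64 *prev)
{
	s64 delta = cur - *prev;

	if (delta) {
		/* Only the change since the last update hits the global stat. */
		*global_stat += delta;
		*prev = cur;
	}
}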