block-group.c | Explore in Territory

// SPDX-License-Identifier: GPL-2.0

#include <linux/sizes.h>
#include <linux/list_sort.h>
#include "misc.h"
#include "ctree.h"
#include "block-group.h"
#include "space-info.h"
#include "disk-io.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "volumes.h"
#include "transaction.h"
#include "ref-verify.h"
#include "sysfs.h"
#include "tree-log.h"
#include "delalloc-space.h"
#include "discard.h"
#include "raid56.h"
#include "zoned.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"

#ifdef CONFIG_BTRFS_DEBUG
int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group)
{ … }
#endif

/*
 * Return target flags in extended format or 0 if restripe for this chunk_type
 * is not in progress
 *
 * Should be called with balance_lock held
 */
static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
{ … }

/*
 * @flags: available profiles in extended format (see ctree.h)
 *
 * Return reduced profile in chunk format.  If profile changing is in progress
 * (either running or paused) picks the target profile (if it's already
 * available), otherwise falls back to plain reducing.
 */
static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
{ … }

u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
{ … }

void btrfs_get_block_group(struct btrfs_block_group *cache)
{ … }

void btrfs_put_block_group(struct btrfs_block_group *cache)
{ … }

/*
 * This adds the block group to the fs_info rb tree for the block group cache
 */
static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
				       struct btrfs_block_group *block_group)
{ … }

/*
 * This will return the block group at or after bytenr if contains is 0, else
 * it will return the block group that contains the bytenr
 */
static struct btrfs_block_group *block_group_cache_tree_search(
		struct btrfs_fs_info *info, u64 bytenr, int contains)
{ … }

/*
 * Return the block group that starts at or after bytenr
 */
struct btrfs_block_group *btrfs_lookup_first_block_group(
		struct btrfs_fs_info *info, u64 bytenr)
{ … }

/*
 * Return the block group that contains the given bytenr
 */
struct btrfs_block_group *btrfs_lookup_block_group(
		struct btrfs_fs_info *info, u64 bytenr)
{ … }

struct btrfs_block_group *btrfs_next_block_group(
		struct btrfs_block_group *cache)
{ … }

/*
 * Check if we can do a NOCOW write for a given extent.
 *
 * @fs_info:       The filesystem information object.
 * @bytenr:        Logical start address of the extent.
 *
 * Check if we can do a NOCOW write for the given extent, and increments the
 * number of NOCOW writers in the block group that contains the extent, as long
 * as the block group exists and it's currently not in read-only mode.
 *
 * Returns: A non-NULL block group pointer if we can do a NOCOW write, the caller
 *          is responsible for calling btrfs_dec_nocow_writers() later.
 *
 *          Or NULL if we can not do a NOCOW write
 */
struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info,
						  u64 bytenr)
{ … }

/*
 * Decrement the number of NOCOW writers in a block group.
 *
 * This is meant to be called after a previous call to btrfs_inc_nocow_writers(),
 * and on the block group returned by that call. Typically this is called after
 * creating an ordered extent for a NOCOW write, to prevent races with scrub and
 * relocation.
 *
 * After this call, the caller should not use the block group anymore. It it wants
 * to use it, then it should get a reference on it before calling this function.
 */
void btrfs_dec_nocow_writers(struct btrfs_block_group *bg)
{ … }

void btrfs_wait_nocow_writers(struct btrfs_block_group *bg)
{ … }

void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
					const u64 start)
{ … }

void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg)
{ … }

struct btrfs_caching_control *btrfs_get_caching_control(
		struct btrfs_block_group *cache)
{ … }

static void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
{ … }

/*
 * When we wait for progress in the block group caching, its because our
 * allocation attempt failed at least once.  So, we must sleep and let some
 * progress happen before we try again.
 *
 * This function will sleep at least once waiting for new free space to show
 * up, and then it will check the block group free space numbers for our min
 * num_bytes.  Another option is to have it go ahead and look in the rbtree for
 * a free extent of a given size, but this is a good start.
 *
 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
 * any of the information in this block group.
 */
void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
					   u64 num_bytes)
{ … }

static int btrfs_caching_ctl_wait_done(struct btrfs_block_group *cache,
				       struct btrfs_caching_control *caching_ctl)
{ … }

static int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
{ … }

#ifdef CONFIG_BTRFS_DEBUG
static void fragment_free_space(struct btrfs_block_group *block_group)
{ … }
#endif

/*
 * Add a free space range to the in memory free space cache of a block group.
 * This checks if the range contains super block locations and any such
 * locations are not added to the free space cache.
 *
 * @block_group:      The target block group.
 * @start:            Start offset of the range.
 * @end:              End offset of the range (exclusive).
 * @total_added_ret:  Optional pointer to return the total amount of space
 *                    added to the block group's free space cache.
 *
 * Returns 0 on success or < 0 on error.
 */
int btrfs_add_new_free_space(struct btrfs_block_group *block_group, u64 start,
			     u64 end, u64 *total_added_ret)
{ … }

/*
 * Get an arbitrary extent item index / max_index through the block group
 *
 * @block_group   the block group to sample from
 * @index:        the integral step through the block group to grab from
 * @max_index:    the granularity of the sampling
 * @key:          return value parameter for the item we find
 *
 * Pre-conditions on indices:
 * 0 <= index <= max_index
 * 0 < max_index
 *
 * Returns: 0 on success, 1 if the search didn't yield a useful item, negative
 * error code on error.
 */
static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ctl,
					  struct btrfs_block_group *block_group,
					  int index, int max_index,
					  struct btrfs_key *found_key)
{ … }

/*
 * Best effort attempt to compute a block group's size class while caching it.
 *
 * @block_group: the block group we are caching
 *
 * We cannot infer the size class while adding free space extents, because that
 * logic doesn't care about contiguous file extents (it doesn't differentiate
 * between a 100M extent and 100 contiguous 1M extents). So we need to read the
 * file extent items. Reading all of them is quite wasteful, because usually
 * only a handful are enough to give a good answer. Therefore, we just grab 5 of
 * them at even steps through the block group and pick the smallest size class
 * we see. Since size class is best effort, and not guaranteed in general,
 * inaccuracy is acceptable.
 *
 * To be more explicit about why this algorithm makes sense:
 *
 * If we are caching in a block group from disk, then there are three major cases
 * to consider:
 * 1. the block group is well behaved and all extents in it are the same size
 *    class.
 * 2. the block group is mostly one size class with rare exceptions for last
 *    ditch allocations
 * 3. the block group was populated before size classes and can have a totally
 *    arbitrary mix of size classes.
 *
 * In case 1, looking at any extent in the block group will yield the correct
 * result. For the mixed cases, taking the minimum size class seems like a good
 * approximation, since gaps from frees will be usable to the size class. For
 * 2., a small handful of file extents is likely to yield the right answer. For
 * 3, we can either read every file extent, or admit that this is best effort
 * anyway and try to stay fast.
 *
 * Returns: 0 on success, negative error code on error.
 */
static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl,
				       struct btrfs_block_group *block_group)
{ … }

static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
{ … }

static inline void btrfs_free_excluded_extents(const struct btrfs_block_group *bg)
{ … }

static noinline void caching_thread(struct btrfs_work *work)
{ … }

int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
{ … }

static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
{ … }

/*
 * Clear incompat bits for the following feature(s):
 *
 * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
 *            in the whole filesystem
 *
 * - RAID1C34 - same as above for RAID1C3 and RAID1C4 block groups
 */
static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
{ … }

static struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info)
{ … }

static int remove_block_group_item(struct btrfs_trans_handle *trans,
				   struct btrfs_path *path,
				   struct btrfs_block_group *block_group)
{ … }

int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
			     struct btrfs_chunk_map *map)
{ … }

struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
		struct btrfs_fs_info *fs_info, const u64 chunk_offset)
{ … }

/*
 * Mark block group @cache read-only, so later write won't happen to block
 * group @cache.
 *
 * If @force is not set, this function will only mark the block group readonly
 * if we have enough free space (1M) in other metadata/system block groups.
 * If @force is not set, this function will mark the block group readonly
 * without checking free space.
 *
 * NOTE: This function doesn't care if other block groups can contain all the
 * data in this block group. That check should be done by relocation routine,
 * not this function.
 */
static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
{ … }

static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
				 struct btrfs_block_group *bg)
{ … }

/*
 * Process the unused_bgs list and remove any that don't have any allocated
 * space inside of them.
 */
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
{ … }

void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
{ … }

/*
 * We want block groups with a low number of used bytes to be in the beginning
 * of the list, so they will get reclaimed first.
 */
static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
			   const struct list_head *b)
{ … }

static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info)
{ … }

static bool should_reclaim_block_group(struct btrfs_block_group *bg, u64 bytes_freed)
{ … }

void btrfs_reclaim_bgs_work(struct work_struct *work)
{ … }

void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
{ … }

void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
{ … }

static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
			   struct btrfs_path *path)
{ … }

static int find_first_block_group(struct btrfs_fs_info *fs_info,
				  struct btrfs_path *path,
				  struct btrfs_key *key)
{ … }

static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
{ … }

/*
 * Map a physical disk address to a list of logical addresses.
 *
 * @fs_info:       the filesystem
 * @chunk_start:   logical address of block group
 * @physical:	   physical address to map to logical addresses
 * @logical:	   return array of logical addresses which map to @physical
 * @naddrs:	   length of @logical
 * @stripe_len:    size of IO stripe for the given block group
 *
 * Maps a particular @physical disk address to a list of @logical addresses.
 * Used primarily to exclude those portions of a block group that contain super
 * block copies.
 */
int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
		     u64 physical, u64 **logical, int *naddrs, int *stripe_len)
{ … }

static int exclude_super_stripes(struct btrfs_block_group *cache)
{ … }

static struct btrfs_block_group *btrfs_create_block_group_cache(
		struct btrfs_fs_info *fs_info, u64 start)
{ … }

/*
 * Iterate all chunks and verify that each of them has the corresponding block
 * group
 */
static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
{ … }

static int read_one_block_group(struct btrfs_fs_info *info,
				struct btrfs_block_group_item *bgi,
				const struct btrfs_key *key,
				int need_clear)
{ … }

static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
{ … }

int btrfs_read_block_groups(struct btrfs_fs_info *info)
{ … }

/*
 * This function, insert_block_group_item(), belongs to the phase 2 of chunk
 * allocation.
 *
 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
 * phases.
 */
static int insert_block_group_item(struct btrfs_trans_handle *trans,
				   struct btrfs_block_group *block_group)
{ … }

static int insert_dev_extent(struct btrfs_trans_handle *trans,
			    struct btrfs_device *device, u64 chunk_offset,
			    u64 start, u64 num_bytes)
{ … }

/*
 * This function belongs to phase 2.
 *
 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
 * phases.
 */
static int insert_dev_extents(struct btrfs_trans_handle *trans,
				   u64 chunk_offset, u64 chunk_size)
{ … }

/*
 * This function, btrfs_create_pending_block_groups(), belongs to the phase 2 of
 * chunk allocation.
 *
 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
 * phases.
 */
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
{ … }

/*
 * For extent tree v2 we use the block_group_item->chunk_offset to point at our
 * global root id.  For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID.
 */
static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset)
{ … }

struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
						 u64 type,
						 u64 chunk_offset, u64 size)
{ … }

/*
 * Mark one block group RO, can be called several times for the same block
 * group.
 *
 * @cache:		the destination block group
 * @do_chunk_alloc:	whether need to do chunk pre-allocation, this is to
 * 			ensure we still have some free space after marking this
 * 			block group RO.
 */
int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
			     bool do_chunk_alloc)
{ … }

void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
{ … }

static int update_block_group_item(struct btrfs_trans_handle *trans,
				   struct btrfs_path *path,
				   struct btrfs_block_group *cache)
{ … }

static int cache_save_setup(struct btrfs_block_group *block_group,
			    struct btrfs_trans_handle *trans,
			    struct btrfs_path *path)
{ … }

int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
{ … }

/*
 * Transaction commit does final block group cache writeback during a critical
 * section where nothing is allowed to change the FS.  This is required in
 * order for the cache to actually match the block group, but can introduce a
 * lot of latency into the commit.
 *
 * So, btrfs_start_dirty_block_groups is here to kick off block group cache IO.
 * There's a chance we'll have to redo some of it if the block group changes
 * again during the commit, but it greatly reduces the commit latency by
 * getting rid of the easy block groups while we're still allowing others to
 * join the commit.
 */
int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
{ … }

int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
{ … }

int btrfs_update_block_group(struct btrfs_trans_handle *trans,
			     u64 bytenr, u64 num_bytes, bool alloc)
{ … }

/*
 * Update the block_group and space info counters.
 *
 * @cache:	The cache we are manipulating
 * @ram_bytes:  The number of bytes of file content, and will be same to
 *              @num_bytes except for the compress path.
 * @num_bytes:	The number of bytes in question
 * @delalloc:   The blocks are allocated for the delalloc write
 *
 * This is called by the allocator when it reserves space. If this is a
 * reservation and the block group has become read only we cannot make the
 * reservation and return -EAGAIN, otherwise this function always succeeds.
 */
int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
			     u64 ram_bytes, u64 num_bytes, int delalloc,
			     bool force_wrong_size_class)
{ … }

/*
 * Update the block_group and space info counters.
 *
 * @cache:      The cache we are manipulating
 * @num_bytes:  The number of bytes in question
 * @delalloc:   The blocks are allocated for the delalloc write
 *
 * This is called by somebody who is freeing space that was never actually used
 * on disk.  For example if you reserve some space for a new leaf in transaction
 * A and before transaction A commits you free that leaf, you call this with
 * reserve set to 0 in order to clear the reservation.
 */
void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
			       u64 num_bytes, int delalloc)
{ … }

static void force_metadata_allocation(struct btrfs_fs_info *info)
{ … }

static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
			      struct btrfs_space_info *sinfo, int force)
{ … }

int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
{ … }

static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
{ … }

/*
 * Chunk allocation is done in 2 phases:
 *
 * 1) Phase 1 - through btrfs_chunk_alloc() we allocate device extents for
 *    the chunk, the chunk mapping, create its block group and add the items
 *    that belong in the chunk btree to it - more specifically, we need to
 *    update device items in the chunk btree and add a new chunk item to it.
 *
 * 2) Phase 2 - through btrfs_create_pending_block_groups(), we add the block
 *    group item to the extent btree and the device extent items to the devices
 *    btree.
 *
 * This is done to prevent deadlocks. For example when COWing a node from the
 * extent btree we are holding a write lock on the node's parent and if we
 * trigger chunk allocation and attempted to insert the new block group item
 * in the extent btree right way, we could deadlock because the path for the
 * insertion can include that parent node. At first glance it seems impossible
 * to trigger chunk allocation after starting a transaction since tasks should
 * reserve enough transaction units (metadata space), however while that is true
 * most of the time, chunk allocation may still be triggered for several reasons:
 *
 * 1) When reserving metadata, we check if there is enough free space in the
 *    metadata space_info and therefore don't trigger allocation of a new chunk.
 *    However later when the task actually tries to COW an extent buffer from
 *    the extent btree or from the device btree for example, it is forced to
 *    allocate a new block group (chunk) because the only one that had enough
 *    free space was just turned to RO mode by a running scrub for example (or
 *    device replace, block group reclaim thread, etc), so we can not use it
 *    for allocating an extent and end up being forced to allocate a new one;
 *
 * 2) Because we only check that the metadata space_info has enough free bytes,
 *    we end up not allocating a new metadata chunk in that case. However if
 *    the filesystem was mounted in degraded mode, none of the existing block
 *    groups might be suitable for extent allocation due to their incompatible
 *    profile (for e.g. mounting a 2 devices filesystem, where all block groups
 *    use a RAID1 profile, in degraded mode using a single device). In this case
 *    when the task attempts to COW some extent buffer of the extent btree for
 *    example, it will trigger allocation of a new metadata block group with a
 *    suitable profile (SINGLE profile in the example of the degraded mount of
 *    the RAID1 filesystem);
 *
 * 3) The task has reserved enough transaction units / metadata space, but when
 *    it attempts to COW an extent buffer from the extent or device btree for
 *    example, it does not find any free extent in any metadata block group,
 *    therefore forced to try to allocate a new metadata block group.
 *    This is because some other task allocated all available extents in the
 *    meanwhile - this typically happens with tasks that don't reserve space
 *    properly, either intentionally or as a bug. One example where this is
 *    done intentionally is fsync, as it does not reserve any transaction units
 *    and ends up allocating a variable number of metadata extents for log
 *    tree extent buffers;
 *
 * 4) The task has reserved enough transaction units / metadata space, but right
 *    before it tries to allocate the last extent buffer it needs, a discard
 *    operation comes in and, temporarily, removes the last free space entry from
 *    the only metadata block group that had free space (discard starts by
 *    removing a free space entry from a block group, then does the discard
 *    operation and, once it's done, it adds back the free space entry to the
 *    block group).
 *
 * We also need this 2 phases setup when adding a device to a filesystem with
 * a seed device - we must create new metadata and system chunks without adding
 * any of the block group items to the chunk, extent and device btrees. If we
 * did not do it this way, we would get ENOSPC when attempting to update those
 * btrees, since all the chunks from the seed device are read-only.
 *
 * Phase 1 does the updates and insertions to the chunk btree because if we had
 * it done in phase 2 and have a thundering herd of tasks allocating chunks in
 * parallel, we risk having too many system chunks allocated by many tasks if
 * many tasks reach phase 1 without the previous ones completing phase 2. In the
 * extreme case this leads to exhaustion of the system chunk array in the
 * superblock. This is easier to trigger if using a btree node/leaf size of 64K
 * and with RAID filesystems (so we have more device items in the chunk btree).
 * This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
 * the system chunk array due to concurrent allocations") provides more details.
 *
 * Allocation of system chunks does not happen through this function. A task that
 * needs to update the chunk btree (the only btree that uses system chunks), must
 * preallocate chunk space by calling either check_system_chunk() or
 * btrfs_reserve_chunk_metadata() - the former is used when allocating a data or
 * metadata chunk or when removing a chunk, while the later is used before doing
 * a modification to the chunk btree - use cases for the later are adding,
 * removing and resizing a device as well as relocation of a system chunk.
 * See the comment below for more details.
 *
 * The reservation of system space, done through check_system_chunk(), as well
 * as all the updates and insertions into the chunk btree must be done while
 * holding fs_info->chunk_mutex. This is important to guarantee that while COWing
 * an extent buffer from the chunks btree we never trigger allocation of a new
 * system chunk, which would result in a deadlock (trying to lock twice an
 * extent buffer of the chunk btree, first time before triggering the chunk
 * allocation and the second time during chunk allocation while attempting to
 * update the chunks btree). The system chunk array is also updated while holding
 * that mutex. The same logic applies to removing chunks - we must reserve system
 * space, update the chunk btree and the system chunk array in the superblock
 * while holding fs_info->chunk_mutex.
 *
 * This function, btrfs_chunk_alloc(), belongs to phase 1.
 *
 * If @force is CHUNK_ALLOC_FORCE:
 *    - return 1 if it successfully allocates a chunk,
 *    - return errors including -ENOSPC otherwise.
 * If @force is NOT CHUNK_ALLOC_FORCE:
 *    - return 0 if it doesn't need to allocate a new chunk,
 *    - return 1 if it successfully allocates a chunk,
 *    - return errors including -ENOSPC otherwise.
 */
int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
		      enum btrfs_chunk_alloc_enum force)
{ … }

static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
{ … }

static void reserve_chunk_space(struct btrfs_trans_handle *trans,
				u64 bytes,
				u64 type)
{ … }

/*
 * Reserve space in the system space for allocating or removing a chunk.
 * The caller must be holding fs_info->chunk_mutex.
 */
void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
{ … }

/*
 * Reserve space in the system space, if needed, for doing a modification to the
 * chunk btree.
 *
 * @trans:		A transaction handle.
 * @is_item_insertion:	Indicate if the modification is for inserting a new item
 *			in the chunk btree or if it's for the deletion or update
 *			of an existing item.
 *
 * This is used in a context where we need to update the chunk btree outside
 * block group allocation and removal, to avoid a deadlock with a concurrent
 * task that is allocating a metadata or data block group and therefore needs to
 * update the chunk btree while holding the chunk mutex. After the update to the
 * chunk btree is done, btrfs_trans_release_chunk_metadata() should be called.
 *
 */
void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
				  bool is_item_insertion)
{ … }

void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
{ … }

/*
 * Must be called only after stopping all workers, since we could have block
 * group caching kthreads running, and therefore they could race with us if we
 * freed the block groups before stopping them.
 */
int btrfs_free_block_groups(struct btrfs_fs_info *info)
{ … }

void btrfs_freeze_block_group(struct btrfs_block_group *cache)
{ … }

void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
{ … }

bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg)
{ … }

void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount)
{ … }

enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size)
{ … }

/*
 * Handle a block group allocating an extent in a size class
 *
 * @bg:				The block group we allocated in.
 * @size_class:			The size class of the allocation.
 * @force_wrong_size_class:	Whether we are desperate enough to allow
 *				mismatched size classes.
 *
 * Returns: 0 if the size class was valid for this block_group, -EAGAIN in the
 * case of a race that leads to the wrong size class without
 * force_wrong_size_class set.
 *
 * find_free_extent will skip block groups with a mismatched size class until
 * it really needs to avoid ENOSPC. In that case it will set
 * force_wrong_size_class. However, if a block group is newly allocated and
 * doesn't yet have a size class, then it is possible for two allocations of
 * different sizes to race and both try to use it. The loser is caught here and
 * has to retry.
 */
int btrfs_use_block_group_size_class(struct btrfs_block_group *bg,
				     enum btrfs_block_group_size_class size_class,
				     bool force_wrong_size_class)
{ … }

bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg)
{ … }
linux/fs/btrfs/block-group.c