linux/fs/btrfs/space-info.c

// SPDX-License-Identifier: GPL-2.0

#include "linux/spinlock.h"
#include <linux/minmax.h>
#include "misc.h"
#include "ctree.h"
#include "space-info.h"
#include "sysfs.h"
#include "volumes.h"
#include "free-space-cache.h"
#include "ordered-data.h"
#include "transaction.h"
#include "block-group.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"

/*
 * HOW DOES SPACE RESERVATION WORK
 *
 * If you want to know about delalloc specifically, there is a separate comment
 * for that with the delalloc code.  This comment is about how the whole system
 * works generally.
 *
 * BASIC CONCEPTS
 *
 *   1) space_info.  This is the ultimate arbiter of how much space we can use.
 *   There's a description of the bytes_ fields with the struct declaration,
 *   refer to that for specifics on each field.  Suffice it to say that for
 *   reservations we care about total_bytes - SUM(space_info->bytes_) when
 *   determining if there is space to make an allocation.  There is a space_info
 *   for METADATA, SYSTEM, and DATA areas.
 *
 *   2) block_rsv's.  These are basically buckets for every different type of
 *   metadata reservation we have.  You can see the comment in the block_rsv
 *   code on the rules for each type, but generally block_rsv->reserved is how
 *   much space is accounted for in space_info->bytes_may_use.
 *
 *   3) btrfs_calc*_size.  These are the worst case calculations we use, based
 *   on the number of items we will want to modify.  We have one for changing
 *   items, and one for inserting new items.  Generally we use these helpers to
 *   determine the size of the block reserves, and then use the actual bytes
 *   values to adjust the space_info counters.
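 *
 *   As a rough sketch (illustrative, not the exact kernel code), the
 *   availability check behind a reservation boils down to:
 *
 *     u64 used = btrfs_space_info_used(space_info, true);
 *
 *     if (used + num_bytes <= space_info->total_bytes)
 *             space_info->bytes_may_use += num_bytes;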
 *
 * MAKING RESERVATIONS, THE NORMAL CASE
 *
 *   We call into either btrfs_reserve_data_bytes() or
 *   btrfs_reserve_metadata_bytes(), depending on which we're looking for, with
 *   num_bytes we want to reserve.
 *
 *   ->reserve
 *     space_info->bytes_may_use += num_bytes
 *
 *   ->extent allocation
 *     Call btrfs_add_reserved_bytes() which does
 *     space_info->bytes_may_use -= num_bytes
 *     space_info->bytes_reserved += extent_bytes
 *
 *   ->insert reference
 *     Call btrfs_update_block_group() which does
 *     space_info->bytes_reserved -= extent_bytes
 *     space_info->bytes_used += extent_bytes
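 *
 *   For example (illustrative numbers), a 16K reservation that is used in full
 *   moves through the counters like this:
 *
 *     ->reserve:            bytes_may_use    0 -> 16K
 *     ->extent allocation:  bytes_may_use   16K -> 0,  bytes_reserved 0 -> 16K
 *     ->insert reference:   bytes_reserved  16K -> 0,  bytes_used     0 -> 16K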
 *
 * MAKING RESERVATIONS, FLUSHING NORMALLY (non-priority)
 *
 *   Assume we are unable to simply make the reservation because we do not have
 *   enough space
 *
 *   -> __reserve_bytes
 *     create a reserve_ticket with ->bytes set to our reservation, add it to
 *     the tail of space_info->tickets, kick async flush thread
 *
 *   ->handle_reserve_ticket
 *     wait on ticket->wait for ->bytes to be reduced to 0, or ->error to be set
 *     on the ticket.
 *
 *   -> btrfs_async_reclaim_metadata_space/btrfs_async_reclaim_data_space
 *     Flushes various things attempting to free up space.
 *
 *   -> btrfs_try_granting_tickets()
 *     This is called by anything that either subtracts space from
 *     space_info->bytes_may_use, ->bytes_pinned, etc, or adds to the
 *     space_info->total_bytes.  This loops through the ->priority_tickets and
 *     then the ->tickets list checking to see if the reservation can be
 *     completed.  If it can the space is added to space_info->bytes_may_use and
 *     the ticket is woken up.
 *
 *   -> ticket wakeup
 *     Check if ->bytes == 0; if it is, we got our reservation and we can carry
 *     on; if not, return the appropriate error (ENOSPC, but it can be EINTR if
 *     we were interrupted).
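 *
 *     The idea of the wait (a sketch; the real code open codes the wait loop):
 *
 *       wait_event(ticket->wait, ticket->bytes == 0 || ticket->error);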
 *
 * MAKING RESERVATIONS, FLUSHING HIGH PRIORITY
 *
 *   Same as the above, except we add ourselves to the
 *   space_info->priority_tickets, and we do not use ticket->wait, we simply
 *   call flush_space() ourselves for the states that are safe for us to call
 *   without deadlocking and hope for the best.
 *
 * THE FLUSHING STATES
 *
 *   Generally speaking we will have two cases for each state, a "nice" state
 *   and a "ALL THE THINGS" state.  In btrfs we delay a lot of work in order to
 *   reduce the locking over head on the various trees, and even to keep from
 *   doing any work at all in the case of delayed refs.  Each of these delayed
 *   things however hold reservations, and so letting them run allows us to
 *   reclaim space so we can make new reservations.
 *
 *   FLUSH_DELAYED_ITEMS
 *     Every inode has a delayed item to update the inode.  Take a simple write
 *     for example, we would update the inode item at write time to update the
 *     mtime, and then again at finish_ordered_io() time in order to update the
 *     isize or bytes.  We keep these delayed items to coalesce these operations
 *     into a single operation done on demand.  These are an easy way to reclaim
 *     metadata space.
 *
 *   FLUSH_DELALLOC
 *     Look at the delalloc comment to get an idea of how much space is reserved
 *     for delayed allocation.  We can reclaim some of this space simply by
 *     running delalloc, but usually we need to wait for ordered extents to
 *     reclaim the bulk of this space.
 *
 *   FLUSH_DELAYED_REFS
 *     We have a block reserve for the outstanding delayed refs space, and every
 *     delayed ref operation holds a reservation.  Running these is a quick way
 *     to reclaim space, but we want to hold this until the end because COW can
 *     churn a lot and we can avoid making some extent tree modifications if we
 *     are able to delay for as long as possible.
 *
 *   ALLOC_CHUNK
 *     We will skip this the first time through space reservation, because we
 *     overcommit and we don't want to allocate a lot of useless metadata space
 *     when our worst case reservations will likely never come true.
 *
 *   RUN_DELAYED_IPUTS
 *     If we're freeing inodes we're likely freeing checksums, file extent
 *     items, and extent tree items.  Loads of space could be freed up by these
 *     operations, however they won't be usable until the transaction commits.
 *
 *   COMMIT_TRANS
 *     This will commit the transaction.  Historically we had a lot of logic
 *     surrounding whether or not we'd commit the transaction, but this was born
 *     out of a pre-tickets era where we could end up committing the transaction
 *     thousands of times in a row without making progress.  Now thanks to our
 *     ticketing system we know if we're not making progress and can error
 *     everybody out after a few commits rather than burning the disk hoping for
 *     a different answer.
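 *
 *   Conceptually, a flusher just walks an ordered list of these states,
 *   calling flush_space() for each one until the reservation is satisfied,
 *   something like (illustrative; see the flush state arrays further down for
 *   the real orderings):
 *
 *     { FLUSH_DELAYED_ITEMS, FLUSH_DELALLOC, FLUSH_DELAYED_REFS,
 *       ALLOC_CHUNK, RUN_DELAYED_IPUTS, COMMIT_TRANS }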
 *
 * OVERCOMMIT
 *
 *   Because we hold so many reservations for metadata we will allow you to
 *   reserve more space than is currently free in the currently allocated
 *   metadata space.  This only happens with metadata; data does not allow
 *   overcommitting.
 *
 *   You can see the current logic for when we allow overcommit in
 *   btrfs_can_overcommit(), but it only applies to unallocated space.  If there
 *   is no unallocated space to be had, all reservations are kept within the
 *   free space in the allocated metadata chunks.
 *
 *   Because of overcommitting, you generally want to use the
 *   btrfs_can_overcommit() logic for metadata allocations, as it does the right
 *   thing with or without extra unallocated space.
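 *
 *   A minimal sketch of the check (illustrative; the real helper also limits
 *   the available amount to a fraction of the unallocated space depending on
 *   the flush level):
 *
 *     u64 used = btrfs_space_info_used(space_info, true);
 *     u64 avail = calc_available_free_space(fs_info, space_info, flush);
 *
 *     if (used + bytes < space_info->total_bytes + avail)
 *             the reservation can overcommit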
 */

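/*
 * Return the sum of the space_info's bytes_ counters (see the BASIC CONCEPTS
 * comment above), optionally including bytes_may_use.
 */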
u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
			  bool may_use_included)
{}

/*
 * after adding space to the filesystem, we need to clear the full flags
 * on all the space infos.
 */
void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
{}

/*
 * Block groups with more than this value (percents) of unusable space will be
 * scheduled for background reclaim.
 */
#define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH

#define BTRFS_UNALLOC_BLOCK_GROUP_TARGET

/*
 * Calculate chunk size depending on volume type (regular or zoned).
 */
static u64 calc_chunk_size(const struct btrfs_fs_info *fs_info, u64 flags)
{}

/*
 * Update default chunk size.
 */
void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info,
					u64 chunk_size)
{}

static int create_space_info(struct btrfs_fs_info *info, u64 flags)
{}

int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
{}

void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
				struct btrfs_block_group *block_group)
{}

struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
					       u64 flags)
{}

static u64 calc_effective_data_chunk_size(struct btrfs_fs_info *fs_info)
{}

static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
			  struct btrfs_space_info *space_info,
			  enum btrfs_reserve_flush_enum flush)
{}

int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
			 struct btrfs_space_info *space_info, u64 bytes,
			 enum btrfs_reserve_flush_enum flush)
{}

static void remove_ticket(struct btrfs_space_info *space_info,
			  struct reserve_ticket *ticket)
{}

/*
 * This is for space we already have accounted in space_info->bytes_may_use, so
 * basically when we're returning space from block_rsv's.
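 *
 * Roughly (a sketch, not the exact loop): walk ->priority_tickets and then
 * ->tickets, and for every ticket that now fits do
 *
 *	space_info->bytes_may_use += ticket->bytes;
 *	ticket->bytes = 0;
 *	wake_up(&ticket->wait);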
 */
void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info)
{}

#define DUMP_BLOCK_RSV(fs_info, rsv_name)

static const char *space_info_flag_to_str(const struct btrfs_space_info *space_info)
{}

static void dump_global_block_rsv(struct btrfs_fs_info *fs_info)
{}

static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
				    struct btrfs_space_info *info)
{}

void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
			   struct btrfs_space_info *info, u64 bytes,
			   int dump_block_groups)
{}

static inline u64 calc_reclaim_items_nr(const struct btrfs_fs_info *fs_info,
					u64 to_reclaim)
{}

/*
 * shrink metadata reservation for delalloc
 */
static void shrink_delalloc(struct btrfs_fs_info *fs_info,
			    struct btrfs_space_info *space_info,
			    u64 to_reclaim, bool wait_ordered,
			    bool for_preempt)
{}

/*
 * Try to flush some data based on policy set by @state. This is only advisory
 * and may fail for various reasons. The caller is supposed to examine the
 * state of @space_info to detect the outcome.
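 *
 * A sketch of the dispatch (illustrative; each state maps onto one of the
 * reclaim mechanisms described in THE FLUSHING STATES comment above):
 *
 *	switch (state) {
 *	case FLUSH_DELAYED_ITEMS: run the delayed inode items
 *	case FLUSH_DELALLOC:      start writeback, wait on ordered extents
 *	case RUN_DELAYED_IPUTS:   run the delayed iputs
 *	case COMMIT_TRANS:        commit the transaction
 *	}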
 */
static void flush_space(struct btrfs_fs_info *fs_info,
		       struct btrfs_space_info *space_info, u64 num_bytes,
		       enum btrfs_flush_state state, bool for_preempt)
{}

static inline u64
btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
				 struct btrfs_space_info *space_info)
{}

static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
				    struct btrfs_space_info *space_info)
{}

static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info,
				  struct btrfs_space_info *space_info,
				  struct reserve_ticket *ticket)
{}

/*
 * We've exhausted our flushing, start failing tickets.
 *
 * @fs_info:    fs_info for this fs
 * @space_info: the space info we were flushing
 *
 * We call this when we've exhausted our flushing ability and haven't made
 * progress in satisfying tickets.  The reservation code handles tickets in
 * order, so if there is a large ticket first and then smaller ones we could
 * very well satisfy the smaller tickets.  This will attempt to wake up any
 * tickets in the list to catch this case.
 *
 * This function returns true if it was able to make progress by clearing out
 * other tickets, or if it stumbles across a ticket that was smaller than the
 * first ticket.
 */
static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
				   struct btrfs_space_info *space_info)
{}

/*
 * This is for normal flushers, we can wait all goddamned day if we want to.  We
 * will loop and continuously try to flush as long as we are making progress.
 * We count progress as clearing off tickets each time we have to loop.
 */
static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
{}

/*
 * This handles pre-flushing of metadata space before we get to the point that
 * we need to start blocking threads on tickets.  The logic here is different
 * from the other flush paths because it doesn't rely on tickets to tell us how
 * much we need to flush, instead it attempts to keep us below the 80% full
 * watermark of space by flushing whichever reservation pool is currently the
 * largest.
 */
static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
{}

/*
 * FLUSH_DELALLOC_WAIT:
 *   Space is freed from flushing delalloc in one of two ways.
 *
 *   1) compression is on and we allocate less space than we reserved
 *   2) we are overwriting existing space
 *
 *   For #1 that extra space is reclaimed as soon as the delalloc pages are
 *   COWed, by way of btrfs_add_reserved_bytes() which adds the actual extent
 *   length to ->bytes_reserved, and subtracts the reserved space from
 *   ->bytes_may_use.
 *
 *   For #2 this is trickier.  Once the ordered extent runs we will drop the
 *   extent in the range we are overwriting, which creates a delayed ref for
 *   that freed extent.  This however is not reclaimed until the transaction
 *   commits, thus the next stages.
 *
 * RUN_DELAYED_IPUTS
 *   If we are freeing inodes, we want to make sure all delayed iputs have
 *   completed, because they could have been on an inode with i_nlink == 0, and
 *   thus have been truncated and freed up space.  But again this space is not
 *   immediately re-usable, it comes in the form of a delayed ref, which must be
 *   run and then the transaction must be committed.
 *
 * COMMIT_TRANS
 *   This is where we reclaim all of the pinned space generated by running the
 *   iputs.
 *
 * ALLOC_CHUNK_FORCE
 *   For data we start with alloc chunk force, however we could have been full
 *   before, and then the transaction commit could have freed new block groups,
 *   so if we now have space to allocate do the force chunk allocation.
 */
static const enum btrfs_flush_state data_flush_states[] =;

static void btrfs_async_reclaim_data_space(struct work_struct *work)
{}

void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info)
{}

static const enum btrfs_flush_state priority_flush_states[] =;

static const enum btrfs_flush_state evict_flush_states[] =;

static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info,
				struct reserve_ticket *ticket,
				const enum btrfs_flush_state *states,
				int states_nr)
{}

static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info,
					struct btrfs_space_info *space_info,
					struct reserve_ticket *ticket)
{}

static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info,
				struct reserve_ticket *ticket)
{}

/*
 * Do the appropriate flushing and waiting for a ticket.
 *
 * @fs_info:    the filesystem
 * @space_info: space info for the reservation
 * @ticket:     ticket for the reservation
 * @start_ns:   timestamp when the reservation started
 * @orig_bytes: amount of bytes originally reserved
 * @flush:      how much we can flush
 *
 * This does the work of figuring out how to flush for the ticket, waiting for
 * the reservation, and returning the appropriate error if there is one.
 */
static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
				 struct btrfs_space_info *space_info,
				 struct reserve_ticket *ticket,
				 u64 start_ns, u64 orig_bytes,
				 enum btrfs_reserve_flush_enum flush)
{}

/*
 * This returns true if this flush state will go through the ordinary flushing
 * code.
 */
static inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush)
{}

static inline void maybe_clamp_preempt(struct btrfs_fs_info *fs_info,
				       struct btrfs_space_info *space_info)
{}

static inline bool can_steal(enum btrfs_reserve_flush_enum flush)
{}

/*
 * NO_FLUSH and FLUSH_EMERGENCY don't want to create a ticket, they just want to
 * fail as quickly as possible.
 */
static inline bool can_ticket(enum btrfs_reserve_flush_enum flush)
{}

/*
 * Try to reserve bytes from the block_rsv's space.
 *
 * @fs_info:    the filesystem
 * @space_info: space info we want to allocate from
 * @orig_bytes: number of bytes we want
 * @flush:      whether or not we can flush to make our reservation
 *
 * This will reserve orig_bytes number of bytes from the space info associated
 * with the block_rsv.  If there is not enough space it will make an attempt to
 * flush out space to make room.  It will do this by flushing delalloc if
 * possible or committing the transaction.  If @flush is BTRFS_RESERVE_NO_FLUSH
 * then no attempts to regain reservations will be made and this will fail if
 * there is not enough space already.
 */
static int __reserve_bytes(struct btrfs_fs_info *fs_info,
			   struct btrfs_space_info *space_info, u64 orig_bytes,
			   enum btrfs_reserve_flush_enum flush)
{}

/*
 * Try to reserve metadata bytes from the block_rsv's space.
 *
 * @fs_info:    the filesystem
 * @space_info: the space_info we're allocating for
 * @orig_bytes: number of bytes we want
 * @flush:      whether or not we can flush to make our reservation
 *
 * This will reserve orig_bytes number of bytes from the space info associated
 * with the block_rsv.  If there is not enough space it will make an attempt to
 * flush out space to make room.  It will do this by flushing delalloc if
 * possible or committing the transaction.  If @flush is BTRFS_RESERVE_NO_FLUSH
 * then no attempts to regain reservations will be made and this will fail if
 * there is not enough space already.
 */
int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
				 struct btrfs_space_info *space_info,
				 u64 orig_bytes,
				 enum btrfs_reserve_flush_enum flush)
{}

/*
 * Try to reserve data bytes for an allocation.
 *
 * @fs_info: the filesystem
 * @bytes:   number of bytes we need
 * @flush:   how we are allowed to flush
 *
 * This will reserve bytes from the data space info.  If there is not enough
 * space then we will attempt to flush space as specified by flush.
 */
int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
			     enum btrfs_reserve_flush_enum flush)
{}

/* Dump all the space infos when we abort a transaction due to ENOSPC. */
__cold void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info)
{}

/*
 * Account the unused space of all the readonly block groups in the space_info.
 * Takes mirrors into account.
 */
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
{}

static u64 calc_pct_ratio(u64 x, u64 y)
{}

/*
 * A reasonable buffer for unallocated space is 10 data block_groups.
 * If we claw this back repeatedly, we can still achieve efficient
 * utilization when near full, and not do too much reclaim while
 * always maintaining a solid buffer for workloads that quickly
 * allocate and pressure the unallocated space.
 */
static u64 calc_unalloc_target(struct btrfs_fs_info *fs_info)
{}

/*
 * The fundamental goal of automatic reclaim is to protect the filesystem's
 * unallocated space and thus minimize the probability of the filesystem going
 * read only when a metadata allocation failure causes a transaction abort.
 *
 * However, relocations happen into the space_info's unused space, therefore
 * automatic reclaim must also back off as that space runs low. There is no
 * value in doing trivial "relocations" of re-writing the same block group
 * into a fresh one.
 *
 * Furthermore, we want to avoid doing too much reclaim even if there are good
 * candidates. This is because the allocator is pretty good at filling up the
 * holes with writes. So we want to do just enough reclaim to try and stay
 * safe from running out of unallocated space but not be wasteful about it.
 *
 * Therefore, the dynamic reclaim threshold is calculated as follows:
 * - calculate a target unallocated amount of 10 block group sized chunks
 * - ratchet up the intensity of reclaim depending on how far we are from
 *   that target by using a formula of unalloc / target to set the threshold.
 *
 * Typically with 10 block groups as the target, the discrete values this comes
 * out to are 0, 10, 20, ... , 80, 90, and 99.
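 *
 * For example (a sketch of the arithmetic): with the 10 block group target and
 * 3 of those block groups still unallocated we are 7/10 of the way to
 * exhausting the buffer, so the threshold works out to roughly 70%.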
 */
static int calc_dynamic_reclaim_threshold(struct btrfs_space_info *space_info)
{}

int btrfs_calc_reclaim_threshold(struct btrfs_space_info *space_info)
{}

/*
 * Under "urgent" reclaim, we will reclaim even fresh block groups that have
 * recently seen successful allocations, as we are desperate to reclaim
 * whatever we can to avoid ENOSPC in a transaction leading to a readonly fs.
 */
static bool is_reclaim_urgent(struct btrfs_space_info *space_info)
{}

static int do_reclaim_sweep(struct btrfs_fs_info *fs_info,
			    struct btrfs_space_info *space_info, int raid)
{}

void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s64 bytes)
{}

void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool ready)
{}

bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info)
{}

int btrfs_reclaim_sweep(struct btrfs_fs_info *fs_info)
{}