// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2016 Oracle. All Rights Reserved. * Author: Darrick J. Wong <[email protected]> */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_btree.h" #include "xfs_bmap.h" #include "xfs_refcount_btree.h" #include "xfs_alloc.h" #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_trans.h" #include "xfs_bit.h" #include "xfs_refcount.h" #include "xfs_rmap.h" #include "xfs_ag.h" #include "xfs_health.h" #include "xfs_refcount_item.h" struct kmem_cache *xfs_refcount_intent_cache; /* Allowable refcount adjustment amounts. */ enum xfs_refc_adjust_op { … }; STATIC int __xfs_refcount_cow_alloc(struct xfs_btree_cur *rcur, xfs_agblock_t agbno, xfs_extlen_t aglen); STATIC int __xfs_refcount_cow_free(struct xfs_btree_cur *rcur, xfs_agblock_t agbno, xfs_extlen_t aglen); /* * Look up the first record less than or equal to [bno, len] in the btree * given by cur. */ int xfs_refcount_lookup_le( struct xfs_btree_cur *cur, enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat) { … } /* * Look up the first record greater than or equal to [bno, len] in the btree * given by cur. */ int xfs_refcount_lookup_ge( struct xfs_btree_cur *cur, enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat) { … } /* * Look up the first record equal to [bno, len] in the btree * given by cur. */ int xfs_refcount_lookup_eq( struct xfs_btree_cur *cur, enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat) { … } /* Convert on-disk record to in-core format. */ void xfs_refcount_btrec_to_irec( const union xfs_btree_rec *rec, struct xfs_refcount_irec *irec) { … } /* Simple checks for refcount records. */ xfs_failaddr_t xfs_refcount_check_irec( struct xfs_perag *pag, const struct xfs_refcount_irec *irec) { … } static inline int xfs_refcount_complain_bad_rec( struct xfs_btree_cur *cur, xfs_failaddr_t fa, const struct xfs_refcount_irec *irec) { … } /* * Get the data from the pointed-to record. */ int xfs_refcount_get_rec( struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, int *stat) { … } /* * Update the record referred to by cur to the value given * by [bno, len, refcount]. * This either works (return 0) or gets an EFSCORRUPTED error. */ STATIC int xfs_refcount_update( struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec) { … } /* * Insert the record referred to by cur to the value given * by [bno, len, refcount]. * This either works (return 0) or gets an EFSCORRUPTED error. */ int xfs_refcount_insert( struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, int *i) { … } /* * Remove the record referred to by cur, then set the pointer to the spot * where the record could be re-inserted, in case we want to increment or * decrement the cursor. * This either works (return 0) or gets an EFSCORRUPTED error. */ STATIC int xfs_refcount_delete( struct xfs_btree_cur *cur, int *i) { … } /* * Adjusting the Reference Count * * As stated elsewhere, the reference count btree (refcbt) stores * >1 reference counts for extents of physical blocks. In this * operation, we're either raising or lowering the reference count of * some subrange stored in the tree: * * <------ adjustment range ------> * ----+ +---+-----+ +--+--------+--------- * 2 | | 3 | 4 | |17| 55 | 10 * ----+ +---+-----+ +--+--------+--------- * X axis is physical blocks number; * reference counts are the numbers inside the rectangles * * The first thing we need to do is to ensure that there are no * refcount extents crossing either boundary of the range to be * adjusted. For any extent that does cross a boundary, split it into * two extents so that we can increment the refcount of one of the * pieces later: * * <------ adjustment range ------> * ----+ +---+-----+ +--+--------+----+---- * 2 | | 3 | 2 | |17| 55 | 10 | 10 * ----+ +---+-----+ +--+--------+----+---- * * For this next step, let's assume that all the physical blocks in * the adjustment range are mapped to a file and are therefore in use * at least once. Therefore, we can infer that any gap in the * refcount tree within the adjustment range represents a physical * extent with refcount == 1: * * <------ adjustment range ------> * ----+---+---+-----+-+--+--------+----+---- * 2 |"1"| 3 | 2 |1|17| 55 | 10 | 10 * ----+---+---+-----+-+--+--------+----+---- * ^ * * For each extent that falls within the interval range, figure out * which extent is to the left or the right of that extent. Now we * have a left, current, and right extent. If the new reference count * of the center extent enables us to merge left, center, and right * into one record covering all three, do so. If the center extent is * at the left end of the range, abuts the left extent, and its new * reference count matches the left extent's record, then merge them. * If the center extent is at the right end of the range, abuts the * right extent, and the reference counts match, merge those. In the * example, we can left merge (assuming an increment operation): * * <------ adjustment range ------> * --------+---+-----+-+--+--------+----+---- * 2 | 3 | 2 |1|17| 55 | 10 | 10 * --------+---+-----+-+--+--------+----+---- * ^ * * For all other extents within the range, adjust the reference count * or delete it if the refcount falls below 2. If we were * incrementing, the end result looks like this: * * <------ adjustment range ------> * --------+---+-----+-+--+--------+----+---- * 2 | 4 | 3 |2|18| 56 | 11 | 10 * --------+---+-----+-+--+--------+----+---- * * The result of a decrement operation looks as such: * * <------ adjustment range ------> * ----+ +---+ +--+--------+----+---- * 2 | | 2 | |16| 54 | 9 | 10 * ----+ +---+ +--+--------+----+---- * DDDD 111111DD * * The blocks marked "D" are freed; the blocks marked "1" are only * referenced once and therefore the record is removed from the * refcount btree. */ /* Next block after this extent. */ static inline xfs_agblock_t xfs_refc_next( struct xfs_refcount_irec *rc) { … } /* * Split a refcount extent that crosses agbno. */ STATIC int xfs_refcount_split_extent( struct xfs_btree_cur *cur, enum xfs_refc_domain domain, xfs_agblock_t agbno, bool *shape_changed) { … } /* * Merge the left, center, and right extents. */ STATIC int xfs_refcount_merge_center_extents( struct xfs_btree_cur *cur, struct xfs_refcount_irec *left, struct xfs_refcount_irec *center, struct xfs_refcount_irec *right, unsigned long long extlen, xfs_extlen_t *aglen) { … } /* * Merge with the left extent. */ STATIC int xfs_refcount_merge_left_extent( struct xfs_btree_cur *cur, struct xfs_refcount_irec *left, struct xfs_refcount_irec *cleft, xfs_agblock_t *agbno, xfs_extlen_t *aglen) { … } /* * Merge with the right extent. */ STATIC int xfs_refcount_merge_right_extent( struct xfs_btree_cur *cur, struct xfs_refcount_irec *right, struct xfs_refcount_irec *cright, xfs_extlen_t *aglen) { … } /* * Find the left extent and the one after it (cleft). This function assumes * that we've already split any extent crossing agbno. */ STATIC int xfs_refcount_find_left_extents( struct xfs_btree_cur *cur, struct xfs_refcount_irec *left, struct xfs_refcount_irec *cleft, enum xfs_refc_domain domain, xfs_agblock_t agbno, xfs_extlen_t aglen) { … } /* * Find the right extent and the one before it (cright). This function * assumes that we've already split any extents crossing agbno + aglen. */ STATIC int xfs_refcount_find_right_extents( struct xfs_btree_cur *cur, struct xfs_refcount_irec *right, struct xfs_refcount_irec *cright, enum xfs_refc_domain domain, xfs_agblock_t agbno, xfs_extlen_t aglen) { … } /* Is this extent valid? */ static inline bool xfs_refc_valid( const struct xfs_refcount_irec *rc) { … } static inline xfs_nlink_t xfs_refc_merge_refcount( const struct xfs_refcount_irec *irec, enum xfs_refc_adjust_op adjust) { … } static inline bool xfs_refc_want_merge_center( const struct xfs_refcount_irec *left, const struct xfs_refcount_irec *cleft, const struct xfs_refcount_irec *cright, const struct xfs_refcount_irec *right, bool cleft_is_cright, enum xfs_refc_adjust_op adjust, unsigned long long *ulenp) { … } static inline bool xfs_refc_want_merge_left( const struct xfs_refcount_irec *left, const struct xfs_refcount_irec *cleft, enum xfs_refc_adjust_op adjust) { … } static inline bool xfs_refc_want_merge_right( const struct xfs_refcount_irec *cright, const struct xfs_refcount_irec *right, enum xfs_refc_adjust_op adjust) { … } /* * Try to merge with any extents on the boundaries of the adjustment range. */ STATIC int xfs_refcount_merge_extents( struct xfs_btree_cur *cur, enum xfs_refc_domain domain, xfs_agblock_t *agbno, xfs_extlen_t *aglen, enum xfs_refc_adjust_op adjust, bool *shape_changed) { … } /* * XXX: This is a pretty hand-wavy estimate. The penalty for guessing * true incorrectly is a shutdown FS; the penalty for guessing false * incorrectly is more transaction rolls than might be necessary. * Be conservative here. */ static bool xfs_refcount_still_have_space( struct xfs_btree_cur *cur) { … } /* * Adjust the refcounts of middle extents. At this point we should have * split extents that crossed the adjustment range; merged with adjacent * extents; and updated agbno/aglen to reflect the merges. Therefore, * all we have to do is update the extents inside [agbno, agbno + aglen]. */ STATIC int xfs_refcount_adjust_extents( struct xfs_btree_cur *cur, xfs_agblock_t *agbno, xfs_extlen_t *aglen, enum xfs_refc_adjust_op adj) { … } /* Adjust the reference count of a range of AG blocks. */ STATIC int xfs_refcount_adjust( struct xfs_btree_cur *cur, xfs_agblock_t *agbno, xfs_extlen_t *aglen, enum xfs_refc_adjust_op adj) { … } /* * Set up a continuation a deferred refcount operation by updating the intent. * Checks to make sure we're not going to run off the end of the AG. */ static inline int xfs_refcount_continue_op( struct xfs_btree_cur *cur, struct xfs_refcount_intent *ri, xfs_agblock_t new_agbno) { … } /* * Process one of the deferred refcount operations. We pass back the * btree cursor to maintain our lock on the btree between calls. * This saves time and eliminates a buffer deadlock between the * superblock and the AGF because we'll always grab them in the same * order. */ int xfs_refcount_finish_one( struct xfs_trans *tp, struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur) { … } /* * Record a refcount intent for later processing. */ static void __xfs_refcount_add( struct xfs_trans *tp, enum xfs_refcount_intent_type type, xfs_fsblock_t startblock, xfs_extlen_t blockcount) { … } /* * Increase the reference count of the blocks backing a file's extent. */ void xfs_refcount_increase_extent( struct xfs_trans *tp, struct xfs_bmbt_irec *PREV) { … } /* * Decrease the reference count of the blocks backing a file's extent. */ void xfs_refcount_decrease_extent( struct xfs_trans *tp, struct xfs_bmbt_irec *PREV) { … } /* * Given an AG extent, find the lowest-numbered run of shared blocks * within that range and return the range in fbno/flen. If * find_end_of_shared is set, return the longest contiguous extent of * shared blocks; if not, just return the first extent we find. If no * shared blocks are found, fbno and flen will be set to NULLAGBLOCK * and 0, respectively. */ int xfs_refcount_find_shared( struct xfs_btree_cur *cur, xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_end_of_shared) { … } /* * Recovering CoW Blocks After a Crash * * Due to the way that the copy on write mechanism works, there's a window of * opportunity in which we can lose track of allocated blocks during a crash. * Because CoW uses delayed allocation in the in-core CoW fork, writeback * causes blocks to be allocated and stored in the CoW fork. The blocks are * no longer in the free space btree but are not otherwise recorded anywhere * until the write completes and the blocks are mapped into the file. A crash * in between allocation and remapping results in the replacement blocks being * lost. This situation is exacerbated by the CoW extent size hint because * allocations can hang around for long time. * * However, there is a place where we can record these allocations before they * become mappings -- the reference count btree. The btree does not record * extents with refcount == 1, so we can record allocations with a refcount of * 1. Blocks being used for CoW writeout cannot be shared, so there should be * no conflict with shared block records. These mappings should be created * when we allocate blocks to the CoW fork and deleted when they're removed * from the CoW fork. * * Minor nit: records for in-progress CoW allocations and records for shared * extents must never be merged, to preserve the property that (except for CoW * allocations) there are no refcount btree entries with refcount == 1. The * only time this could potentially happen is when unsharing a block that's * adjacent to CoW allocations, so we must be careful to avoid this. * * At mount time we recover lost CoW allocations by searching the refcount * btree for these refcount == 1 mappings. These represent CoW allocations * that were in progress at the time the filesystem went down, so we can free * them to get the space back. * * This mechanism is superior to creating EFIs for unmapped CoW extents for * several reasons -- first, EFIs pin the tail of the log and would have to be * periodically relogged to avoid filling up the log. Second, CoW completions * will have to file an EFD and create new EFIs for whatever remains in the * CoW fork; this partially takes care of (1) but extent-size reservations * will have to periodically relog even if there's no writeout in progress. * This can happen if the CoW extent size hint is set, which you really want. * Third, EFIs cannot currently be automatically relogged into newer * transactions to advance the log tail. Fourth, stuffing the log full of * EFIs places an upper bound on the number of CoW allocations that can be * held filesystem-wide at any given time. Recording them in the refcount * btree doesn't require us to maintain any state in memory and doesn't pin * the log. */ /* * Adjust the refcounts of CoW allocations. These allocations are "magic" * in that they're not referenced anywhere else in the filesystem, so we * stash them in the refcount btree with a refcount of 1 until either file * remapping (or CoW cancellation) happens. */ STATIC int xfs_refcount_adjust_cow_extents( struct xfs_btree_cur *cur, xfs_agblock_t agbno, xfs_extlen_t aglen, enum xfs_refc_adjust_op adj) { … } /* * Add or remove refcount btree entries for CoW reservations. */ STATIC int xfs_refcount_adjust_cow( struct xfs_btree_cur *cur, xfs_agblock_t agbno, xfs_extlen_t aglen, enum xfs_refc_adjust_op adj) { … } /* * Record a CoW allocation in the refcount btree. */ STATIC int __xfs_refcount_cow_alloc( struct xfs_btree_cur *rcur, xfs_agblock_t agbno, xfs_extlen_t aglen) { … } /* * Remove a CoW allocation from the refcount btree. */ STATIC int __xfs_refcount_cow_free( struct xfs_btree_cur *rcur, xfs_agblock_t agbno, xfs_extlen_t aglen) { … } /* Record a CoW staging extent in the refcount btree. */ void xfs_refcount_alloc_cow_extent( struct xfs_trans *tp, xfs_fsblock_t fsb, xfs_extlen_t len) { … } /* Forget a CoW staging event in the refcount btree. */ void xfs_refcount_free_cow_extent( struct xfs_trans *tp, xfs_fsblock_t fsb, xfs_extlen_t len) { … } struct xfs_refcount_recovery { … }; /* Stuff an extent on the recovery list. */ STATIC int xfs_refcount_recover_extent( struct xfs_btree_cur *cur, const union xfs_btree_rec *rec, void *priv) { … } /* Find and remove leftover CoW reservations. */ int xfs_refcount_recover_cow_leftovers( struct xfs_mount *mp, struct xfs_perag *pag) { … } /* * Scan part of the keyspace of the refcount records and tell us if the area * has no records, is fully mapped by records, or is partially filled. */ int xfs_refcount_has_records( struct xfs_btree_cur *cur, enum xfs_refc_domain domain, xfs_agblock_t bno, xfs_extlen_t len, enum xbtree_recpacking *outcome) { … } struct xfs_refcount_query_range_info { … }; /* Format btree record and pass to our callback. */ STATIC int xfs_refcount_query_range_helper( struct xfs_btree_cur *cur, const union xfs_btree_rec *rec, void *priv) { … } /* Find all refcount records between two keys. */ int xfs_refcount_query_range( struct xfs_btree_cur *cur, const struct xfs_refcount_irec *low_rec, const struct xfs_refcount_irec *high_rec, xfs_refcount_query_range_fn fn, void *priv) { … } int __init xfs_refcount_intent_init_cache(void) { … } void xfs_refcount_intent_destroy_cache(void) { … }