// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (c) 2018-2024 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <[email protected]>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_btree.h"
#include "xfs_btree_staging.h"
#include "xfs_buf_mem.h"
#include "xfs_btree_mem.h"
#include "xfs_bit.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_alloc.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
#include "xfs_bmap.h"
#include "xfs_bmap_btree.h"
#include "xfs_refcount.h"
#include "xfs_refcount_btree.h"
#include "xfs_ag.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
#include "scrub/agb_bitmap.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
#include "scrub/iscan.h"
#include "scrub/newbt.h"
#include "scrub/reap.h"

/*
 * Reverse Mapping Btree Repair
 * ============================
 *
 * This is the most involved of all the AG space btree rebuilds.  Everywhere
 * else in XFS we lock inodes and then AG data structures, but generating the
 * list of rmap records requires that we be able to scan both block mapping
 * btrees of every inode in the filesystem to see if it owns any extents in
 * this AG.  We can't tolerate any inode updates while we do this, so we
 * freeze the filesystem to lock everyone else out, and grant ourselves
 * special privileges to run transactions with regular background reclamation
 * turned off.
 *
 * We also have to be very careful not to allow inode reclaim to start a
 * transaction because all transactions (other than our own) will block.
 * Deferred inode inactivation helps us out there.
 *
 * I) Reverse mappings for all non-space metadata and file data are collected
 * according to the following algorithm:
 *
 * 1. For each fork of each inode:
 * 1.1. Create a bitmap BMBIT to track bmbt blocks if necessary.
 * 1.2. If the incore extent map isn't loaded, walk the bmbt to accumulate
 *      bmaps into rmap records (see 1.4).  Set bits in BMBIT for each btree
 *      block.
 * 1.3. If the incore extent map is loaded but the fork is in btree format,
 *      just visit the bmbt blocks to set the corresponding BMBIT areas.
 * 1.4. From the incore extent map, accumulate each bmap that falls into our
 *      target AG.  Remember, multiple bmap records can map to a single rmap
 *      record, so we cannot simply emit rmap records 1:1; a sketch of the
 *      merging rule follows this comment.
 * 1.5. Emit rmap records for each extent in BMBIT and free it.
 * 2. Create bitmaps INOBIT and ICHUNKBIT.
 * 3. For each record in the inobt, set the corresponding areas in ICHUNKBIT,
 *    and set bits in INOBIT for each btree block.  If the inobt has no
 *    records at all, we must be careful to record its root in INOBIT.
 * 4. For each block in the finobt, set the corresponding INOBIT area.
 * 5. Emit rmap records for each extent in INOBIT and ICHUNKBIT and free them.
 * 6. Create bitmaps REFCBIT and COWBIT.
 * 7. For each CoW staging extent in the refcountbt, set the corresponding
 *    areas in COWBIT.
 * 8. For each block in the refcountbt, set the corresponding REFCBIT area.
 * 9. Emit rmap records for each extent in REFCBIT and COWBIT and free them.
 * A. Emit rmap for the AG headers.
 * B. Emit rmap for the log, if there is one.
 */
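/*
 * Illustrative sketch only (not part of the repair code): one way to decide
 * whether the next bmap record extends the rmap record being accumulated, as
 * step 1.4 above requires.  The helper name is hypothetical; the real
 * accumulation logic lives in xrep_rmap_visit_bmbt() and
 * xrep_rmap_stash_accumulated() below.  @agbno is assumed to be
 * XFS_FSB_TO_AGBNO(mp, bmap->br_startblock), and the attr-fork and bmbt
 * flags are elided for brevity.
 */
static inline bool
example_bmap_extends_rmap(
	const struct xfs_rmap_irec	*rmap,	/* record accumulated so far */
	xfs_agblock_t			agbno,	/* AG block of the new bmap */
	const struct xfs_bmbt_irec	*bmap)	/* next mapping in this AG */
{
	unsigned int			flags = 0;

	if (bmap->br_state == XFS_EXT_UNWRITTEN)
		flags |= XFS_RMAP_UNWRITTEN;

	/* Merge only if physically and logically contiguous, same state. */
	return rmap->rm_startblock + rmap->rm_blockcount == agbno &&
	       rmap->rm_offset + rmap->rm_blockcount == bmap->br_startoff &&
	       rmap->rm_flags == flags;
}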
/*
 * II) The rmapbt shape and space metadata rmaps are computed as follows:
 *
 * 1. Count the rmaps collected in the previous step. (= NR)
 * 2. Estimate the number of rmapbt blocks needed to store NR records. (= RMB)
 * 3. Reserve RMB blocks through the newbt using the allocator in normap mode.
 * 4. Create bitmap AGBIT.
 * 5. For each reservation in the newbt, set the corresponding areas in AGBIT.
 * 6. For each block in the AGFL, bnobt, and cntbt, set the bits in AGBIT.
 * 7. Count the extents in AGBIT. (= AGNR)
 * 8. Estimate the number of rmapbt blocks needed for NR + AGNR rmaps. (= RMB')
 * 9. If RMB' > RMB, reserve RMB' - RMB more newbt blocks, set RMB = RMB',
 *    and clear AGBIT.  Go to step 5.
 * A. Emit rmaps for each extent in AGBIT.
 *
 * III) The rmapbt is constructed and set in place as follows:
 *
 * 1. Sort the rmap records.
 * 2. Bulk load the rmaps.
 *
 * IV) Reap the old btree blocks.
 *
 * 1. Create a bitmap OLDRMBIT.
 * 2. For each gap in the new rmapbt, set the corresponding areas of OLDRMBIT.
 * 3. For each extent in the bnobt, clear the corresponding parts of OLDRMBIT.
 * 4. Reap the extents corresponding to the set areas in OLDRMBIT.  These are
 *    the parts of the AG that the rmap didn't find during its scan of the
 *    primary metadata and aren't known to be in the free space, which implies
 *    that they were the old rmapbt blocks.
 * 5. Commit.
 *
 * We use the 'xrep_rmap' prefix for all the rmap functions.
 */

/* Context for collecting rmaps */
struct xrep_rmap {
	…
};

/* Set us up to repair reverse mapping btrees. */
int
xrep_setup_ag_rmapbt(
	struct xfs_scrub	*sc)
{
	…
}

/* Make sure there's nothing funny about this mapping. */
STATIC int
xrep_rmap_check_mapping(
	struct xfs_scrub		*sc,
	const struct xfs_rmap_irec	*rec)
{
	…
}

/* Store a reverse-mapping record. */
static inline int
xrep_rmap_stash(
	struct xrep_rmap	*rr,
	xfs_agblock_t		startblock,
	xfs_extlen_t		blockcount,
	uint64_t		owner,
	uint64_t		offset,
	unsigned int		flags)
{
	…
}

struct xrep_rmap_stash_run {
	…
};

static int
xrep_rmap_stash_run(
	uint32_t		start,
	uint32_t		len,
	void			*priv)
{
	…
}

/*
 * Emit rmaps for every extent of bits set in the bitmap.  Caller must ensure
 * that the ranges are in units of FS blocks.
 */
STATIC int
xrep_rmap_stash_bitmap(
	struct xrep_rmap		*rr,
	struct xagb_bitmap		*bitmap,
	const struct xfs_owner_info	*oinfo)
{
	…
}

/* Section (I): Finding all file and bmbt extents. */

/* Context for accumulating rmaps for an inode fork. */
struct xrep_rmap_ifork {
	…
};

/* Stash an rmap that we accumulated while walking an inode fork. */
STATIC int
xrep_rmap_stash_accumulated(
	struct xrep_rmap_ifork	*rf)
{
	…
}

/* Accumulate a bmbt record. */
STATIC int
xrep_rmap_visit_bmbt(
	struct xfs_btree_cur	*cur,
	struct xfs_bmbt_irec	*rec,
	void			*priv)
{
	…
}

/* Add a btree block to the bitmap. */
STATIC int
xrep_rmap_visit_iroot_btree_block(
	struct xfs_btree_cur	*cur,
	int			level,
	void			*priv)
{
	…
}

/*
 * Iterate a metadata btree rooted in an inode to collect rmap records for
 * anything in this fork that matches the AG.
 */
STATIC int
xrep_rmap_scan_iroot_btree(
	struct xrep_rmap_ifork	*rf,
	struct xfs_btree_cur	*cur)
{
	…
}

/*
 * Iterate the block mapping btree to collect rmap records for anything in
 * this fork that matches the AG.  Sets @mappings_done to true if we've
 * scanned the block mappings in this fork.
 */
STATIC int
xrep_rmap_scan_bmbt(
	struct xrep_rmap_ifork	*rf,
	struct xfs_inode	*ip,
	bool			*mappings_done)
{
	…
}
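/*
 * Illustrative sketch of the incore extent walk that xrep_rmap_scan_iext()
 * below performs, assuming the caller holds the ILOCK and the fork's extent
 * cache is fully loaded.  for_each_xfs_iext() is the standard iterator from
 * xfs_inode_fork.h; the accumulation step is elided.
 */
static inline int
example_scan_iext(
	struct xfs_mount	*mp,
	struct xfs_ifork	*ifp,
	xfs_agnumber_t		agno)	/* AG being repaired */
{
	struct xfs_bmbt_irec	rec;
	struct xfs_iext_cursor	icur;

	for_each_xfs_iext(ifp, &icur, &rec) {
		/* Delalloc reservations have no physical space yet. */
		if (isnullstartblock(rec.br_startblock))
			continue;
		/* Only mappings inside the target AG become rmaps. */
		if (XFS_FSB_TO_AGNO(mp, rec.br_startblock) != agno)
			continue;
		/* ...accumulate @rec into the pending rmap record... */
	}
	return 0;
}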
/*
 * Iterate the in-core extent cache to collect rmap records for anything in
 * this fork that matches the AG.
 */
STATIC int
xrep_rmap_scan_iext(
	struct xrep_rmap_ifork	*rf,
	struct xfs_ifork	*ifp)
{
	…
}

/* Find all the extents from a given AG in an inode fork. */
STATIC int
xrep_rmap_scan_ifork(
	struct xrep_rmap	*rr,
	struct xfs_inode	*ip,
	int			whichfork)
{
	…
}

/*
 * Take ILOCK on a file that we want to scan.
 *
 * Select ILOCK_EXCL if the file has an unloaded data bmbt or has an unloaded
 * attr bmbt.  Otherwise, take ILOCK_SHARED.
 */
static inline unsigned int
xrep_rmap_scan_ilock(
	struct xfs_inode	*ip)
{
	…
}

/* Record reverse mappings for a file. */
STATIC int
xrep_rmap_scan_inode(
	struct xrep_rmap	*rr,
	struct xfs_inode	*ip)
{
	…
}

/* Section (I): Find all AG metadata extents except for free space metadata. */

struct xrep_rmap_inodes {
	…
};

/* Record inode btree rmaps. */
STATIC int
xrep_rmap_walk_inobt(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_rec	*rec,
	void				*priv)
{
	…
}

/* Collect rmaps for the blocks containing inode btrees and the inode chunks. */
STATIC int
xrep_rmap_find_inode_rmaps(
	struct xrep_rmap	*rr)
{
	…
}

/* Record a CoW staging extent. */
STATIC int
xrep_rmap_walk_cowblocks(
	struct xfs_btree_cur		*cur,
	const struct xfs_refcount_irec	*irec,
	void				*priv)
{
	…
}

/*
 * Collect rmaps for the blocks containing the refcount btree, and all CoW
 * staging extents.
 */
STATIC int
xrep_rmap_find_refcount_rmaps(
	struct xrep_rmap	*rr)
{
	…
}

/* Generate rmaps for the AG headers (AGI/AGF/AGFL) */
STATIC int
xrep_rmap_find_agheader_rmaps(
	struct xrep_rmap	*rr)
{
	…
}

/* Generate rmaps for the log, if it's in this AG. */
STATIC int
xrep_rmap_find_log_rmaps(
	struct xrep_rmap	*rr)
{
	…
}

/* Check and count all the records that we gathered. */
STATIC int
xrep_rmap_check_record(
	struct xfs_btree_cur		*cur,
	const struct xfs_rmap_irec	*rec,
	void				*priv)
{
	…
}

/*
 * Generate all the reverse-mappings for this AG, a list of the old rmapbt
 * blocks, and the new btreeblks count.  Figure out if we have enough free
 * space to reconstruct the rmap btree.  The caller must clean up the lists
 * if anything goes wrong.  This implements section (I) above.
 */
STATIC int
xrep_rmap_find_rmaps(
	struct xrep_rmap	*rr)
{
	…
}

/* Section (II): Reserving space for new rmapbt and setting free space bitmap */

struct xrep_rmap_agfl {
	…
};

/* Add an AGFL block to the rmap list. */
STATIC int
xrep_rmap_walk_agfl(
	struct xfs_mount	*mp,
	xfs_agblock_t		agbno,
	void			*priv)
{
	…
}

/*
 * Run one round of reserving space for the new rmapbt and recomputing the
 * number of blocks needed to store the previously observed rmapbt records
 * and the ones we'll create for the free space metadata.  When we don't need
 * more blocks, return a bitmap of OWN_AG extents in @freesp_blocks and set
 * @done to true.
 */
STATIC int
xrep_rmap_try_reserve(
	struct xrep_rmap	*rr,
	struct xfs_btree_cur	*rmap_cur,
	struct xagb_bitmap	*freesp_blocks,
	uint64_t		*blocks_reserved,
	bool			*done)
{
	…
}

/*
 * Iteratively reserve space for rmap btree while recording OWN_AG rmaps for
 * the free space metadata.  This implements section (II) above.
 */
STATIC int
xrep_rmap_reserve_space(
	struct xrep_rmap	*rr,
	struct xfs_btree_cur	*rmap_cur)
{
	…
}

/* Section (III): Building the new rmap btree. */

/* Update the AGF counters. */
STATIC int
xrep_rmap_reset_counters(
	struct xrep_rmap	*rr)
{
	…
}

/* Retrieve rmapbt data for bulk load. */
STATIC int
xrep_rmap_get_records(
	struct xfs_btree_cur	*cur,
	unsigned int		idx,
	struct xfs_btree_block	*block,
	unsigned int		nr_wanted,
	void			*priv)
{
	…
}
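/*
 * Illustrative sketch (with the rmap-specific details elided) of the usual
 * shape of a bulk-loader get_records callback like xrep_rmap_get_records()
 * above: pull the next few sorted records out of a backing xfarray and
 * encode them into the staging block.  The context struct is hypothetical;
 * the btree and xfarray calls follow xfs_btree.h, xfs_btree_staging.h, and
 * scrub/xfarray.h.
 */
struct example_load_ctx {
	struct xfarray		*records;	/* sorted rmap records */
	xfarray_idx_t		array_cur;	/* next record to load */
};

static inline int
example_get_records(
	struct xfs_btree_cur	*cur,
	unsigned int		idx,
	struct xfs_btree_block	*block,
	unsigned int		nr_wanted,
	void			*priv)
{
	struct example_load_ctx	*el = priv;
	union xfs_btree_rec	*block_rec;
	unsigned int		loaded;
	int			error;

	for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
		/* Fetch the next sorted record from the backing store. */
		error = xfarray_load(el->records, el->array_cur++,
				&cur->bc_rec.r);
		if (error)
			return error;

		/* Encode it into slot @idx of the new btree block. */
		block_rec = xfs_btree_rec_addr(cur, idx, block);
		cur->bc_ops->init_rec_from_cur(cur, block_rec);
	}

	return loaded;
}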
/* Feed one of the new btree blocks to the bulk loader. */
STATIC int
xrep_rmap_claim_block(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr,
	void			*priv)
{
	…
}

/* Custom allocation function for new rmap btrees. */
STATIC int
xrep_rmap_alloc_vextent(
	struct xfs_scrub	*sc,
	struct xfs_alloc_arg	*args,
	xfs_fsblock_t		alloc_hint)
{
	…
}

/* Count the records in this btree. */
STATIC int
xrep_rmap_count_records(
	struct xfs_btree_cur	*cur,
	unsigned long long	*nr)
{
	…
}

/*
 * Use the collected rmap information to stage a new rmap btree.  If this is
 * successful we'll return with the new btree root information logged to the
 * repair transaction but not yet committed.  This implements section (III)
 * above.
 */
STATIC int
xrep_rmap_build_new_tree(
	struct xrep_rmap	*rr)
{
	…
}

/* Section (IV): Reaping the old btree. */

struct xrep_rmap_find_gaps {
	…
};

/* Subtract each free extent in the bnobt from the rmap gaps. */
STATIC int
xrep_rmap_find_freesp(
	struct xfs_btree_cur		*cur,
	const struct xfs_alloc_rec_incore *rec,
	void				*priv)
{
	…
}

/* Record the gaps between rmap records, as part of cleaning out the btree. */
STATIC int
xrep_rmap_find_gaps(
	struct xfs_btree_cur		*cur,
	const struct xfs_rmap_irec	*rec,
	void				*priv)
{
	…
}

/*
 * Reap the old rmapbt blocks.  Now that the rmapbt is fully rebuilt, we make
 * a list of gaps in the rmap records and a list of the extents mentioned in
 * the bnobt.  Any block that's in the new rmapbt gap list but not mentioned
 * in the bnobt is a block from the old rmapbt and can be removed.
 */
STATIC int
xrep_rmap_remove_old_tree(
	struct xrep_rmap	*rr)
{
	…
}

static inline bool
xrep_rmapbt_want_live_update(
	struct xchk_iscan		*iscan,
	const struct xfs_owner_info	*oi)
{
	…
}

/*
 * Apply an rmapbt update from the regular filesystem into our shadow btree.
 * We're running from the thread that owns the AGF buffer and is generating
 * the update, so we must be careful about which parts of the struct
 * xrep_rmap we change.
 */
static int
xrep_rmapbt_live_update(
	struct notifier_block	*nb,
	unsigned long		action,
	void			*data)
{
	…
}

/* Set up the filesystem scan components. */
STATIC int
xrep_rmap_setup_scan(
	struct xrep_rmap	*rr)
{
	…
}

/* Tear down scan components. */
STATIC void
xrep_rmap_teardown(
	struct xrep_rmap	*rr)
{
	…
}

/* Repair the rmap btree for some AG. */
int
xrep_rmapbt(
	struct xfs_scrub	*sc)
{
	…
}
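/*
 * Illustrative sketch of the section (IV) bitmap arithmetic: mark the gaps
 * between the new rmapbt's records, then knock out known free space, leaving
 * only the old rmapbt blocks to reap.  This assumes the xagb_bitmap_set()
 * and xagb_bitmap_clear() helpers from scrub/agb_bitmap.h; the real callers
 * are xrep_rmap_find_gaps() and xrep_rmap_find_freesp() above, and the
 * helper name here is hypothetical.
 */
static inline int
example_mark_old_rmapbt_blocks(
	struct xagb_bitmap	*oldrmbit,
	xfs_agblock_t		gap_bno,	/* gap between rmap records */
	xfs_extlen_t		gap_len,
	xfs_agblock_t		free_bno,	/* extent from the bnobt */
	xfs_extlen_t		free_len)
{
	int			error;

	/* Step 2: a gap in the new rmapbt might be an old rmapbt block. */
	error = xagb_bitmap_set(oldrmbit, gap_bno, gap_len);
	if (error)
		return error;

	/* Step 3: anything the bnobt calls free space is not ours to reap. */
	return xagb_bitmap_clear(oldrmbit, free_bno, free_len);
}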