// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_inode_item.h" #include "xfs_quota.h" #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_bmap_util.h" #include "xfs_dquot_item.h" #include "xfs_dquot.h" #include "xfs_reflink.h" #include "xfs_ialloc.h" #include "xfs_ag.h" #include "xfs_log_priv.h" #include "xfs_health.h" #include <linux/iversion.h> /* Radix tree tags for incore inode tree. */ /* inode is to be reclaimed */ #define XFS_ICI_RECLAIM_TAG … /* Inode has speculative preallocations (posteof or cow) to clean. */ #define XFS_ICI_BLOCKGC_TAG … /* * The goal for walking incore inodes. These can correspond with incore inode * radix tree tags when convenient. Avoid existing XFS_IWALK namespace. */ enum xfs_icwalk_goal { … }; static int xfs_icwalk(struct xfs_mount *mp, enum xfs_icwalk_goal goal, struct xfs_icwalk *icw); static int xfs_icwalk_ag(struct xfs_perag *pag, enum xfs_icwalk_goal goal, struct xfs_icwalk *icw); /* * Private inode cache walk flags for struct xfs_icwalk. Must not * coincide with XFS_ICWALK_FLAGS_VALID. */ /* Stop scanning after icw_scan_limit inodes. */ #define XFS_ICWALK_FLAG_SCAN_LIMIT … #define XFS_ICWALK_FLAG_RECLAIM_SICK … #define XFS_ICWALK_FLAG_UNION … #define XFS_ICWALK_PRIVATE_FLAGS … /* * Allocate and initialise an xfs_inode. */ struct xfs_inode * xfs_inode_alloc( struct xfs_mount *mp, xfs_ino_t ino) { … } STATIC void xfs_inode_free_callback( struct rcu_head *head) { … } static void __xfs_inode_free( struct xfs_inode *ip) { … } void xfs_inode_free( struct xfs_inode *ip) { … } /* * Queue background inode reclaim work if there are reclaimable inodes and there * isn't reclaim work already scheduled or in progress. 
*/ static void xfs_reclaim_work_queue( struct xfs_mount *mp) { … } /* * Background scanning to trim preallocated space. This is queued based on the * 'speculative_prealloc_lifetime' tunable (5m by default). */ static inline void xfs_blockgc_queue( struct xfs_perag *pag) { … } /* Set a tag on both the AG incore inode tree and the AG radix tree. */ static void xfs_perag_set_inode_tag( struct xfs_perag *pag, xfs_agino_t agino, unsigned int tag) { … } /* Clear a tag on both the AG incore inode tree and the AG radix tree. */ static void xfs_perag_clear_inode_tag( struct xfs_perag *pag, xfs_agino_t agino, unsigned int tag) { … } /* * When we recycle a reclaimable inode, we need to re-initialise the VFS inode * part of the structure. This is made more complex by the fact we store * information about the on-disk values in the VFS inode and so we can't just * overwrite the values unconditionally. Hence we save the parameters we * need to retain across reinitialisation, and rewrite them into the VFS inode * after reinitialisation even if it fails. */ static int xfs_reinit_inode( struct xfs_mount *mp, struct inode *inode) { … } /* * Carefully nudge an inode whose VFS state has been torn down back into a * usable state. Drops the i_flags_lock and the rcu read lock. */ static int xfs_iget_recycle( struct xfs_perag *pag, struct xfs_inode *ip) __releases(&ip->i_flags_lock) { … } /* * If we are allocating a new inode, then check what was returned is * actually a free, empty inode. If we are not allocating an inode, * then check we didn't find a free inode. * * Returns: * 0 if the inode free state matches the lookup context * -ENOENT if the inode is free and we are not allocating * -EFSCORRUPTED if there is any state mismatch at all */ static int xfs_iget_check_free_state( struct xfs_inode *ip, int flags) { … } /* Make all pending inactivation work start immediately. 
*/ static bool xfs_inodegc_queue_all( struct xfs_mount *mp) { … } /* Wait for all queued work and collect errors */ static int xfs_inodegc_wait_all( struct xfs_mount *mp) { … } /* * Check the validity of the inode we just found in the cache */ static int xfs_iget_cache_hit( struct xfs_perag *pag, struct xfs_inode *ip, xfs_ino_t ino, int flags, int lock_flags) __releases(RCU) { … } static int xfs_iget_cache_miss( struct xfs_mount *mp, struct xfs_perag *pag, xfs_trans_t *tp, xfs_ino_t ino, struct xfs_inode **ipp, int flags, int lock_flags) { … } /* * Look up an inode by number in the given file system. The inode is looked up * in the cache held in each AG. If the inode is found in the cache, initialise * the vfs inode if necessary. * * If it is not in core, read it in from the file system's device, add it to the * cache and initialise the vfs inode. * * The inode is locked according to the value of the lock_flags parameter. * Inode lookup is only done during metadata operations and not as part of the * data IO path. Hence we only allow locking of the XFS_ILOCK during lookup. */ int xfs_iget( struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, uint flags, uint lock_flags, struct xfs_inode **ipp) { … } /* * Grab the inode for reclaim exclusively. * * We have found this inode via a lookup under RCU, so the inode may have * already been freed, or it may be in the process of being recycled by * xfs_iget(). In both cases, the inode will have XFS_IRECLAIM set. If the inode * has been fully recycled by the time we get the i_flags_lock, XFS_IRECLAIMABLE * will not be set. Hence we need to check for both these flag conditions to * avoid inodes that are no longer reclaim candidates. * * Note: checking for other state flags here, under the i_flags_lock or not, is * racy and should be avoided. Those races should be resolved only after we have * ensured that we are able to reclaim this inode and the world can see that we * are going to reclaim it. 
* * Return true if we grabbed it, false otherwise. */ static bool xfs_reclaim_igrab( struct xfs_inode *ip, struct xfs_icwalk *icw) { … } /* * Inode reclaim is non-blocking, so the default action if progress cannot be * made is to "requeue" the inode for reclaim by unlocking it and clearing the * XFS_IRECLAIM flag. If we are in a shutdown state, we don't care about * blocking anymore and hence we can wait for the inode to be able to reclaim * it. * * We do no IO here - if callers require inodes to be cleaned they must push the * AIL first to trigger writeback of dirty inodes. This enables writeback to be * done in the background in a non-blocking manner, and enables memory reclaim * to make progress without blocking. */ static void xfs_reclaim_inode( struct xfs_inode *ip, struct xfs_perag *pag) { … } /* Reclaim sick inodes if we're unmounting or the fs went down. */ static inline bool xfs_want_reclaim_sick( struct xfs_mount *mp) { … } void xfs_reclaim_inodes( struct xfs_mount *mp) { … } /* * The shrinker infrastructure determines how many inodes we should scan for * reclaim. We want as many clean inodes ready to reclaim as possible, so we * push the AIL here. We also want to proactively free up memory if we can to * minimise the amount of work memory reclaim has to do so we kick the * background reclaim if it isn't already scheduled. */ long xfs_reclaim_inodes_nr( struct xfs_mount *mp, unsigned long nr_to_scan) { … } /* * Return the number of reclaimable inodes in the filesystem for * the shrinker to determine how much to reclaim. */ long xfs_reclaim_inodes_count( struct xfs_mount *mp) { … } STATIC bool xfs_icwalk_match_id( struct xfs_inode *ip, struct xfs_icwalk *icw) { … } /* * A union-based inode filtering algorithm. Process the inode if any of the * criteria match. This is for global/internal scans only. 
*/ STATIC bool xfs_icwalk_match_id_union( struct xfs_inode *ip, struct xfs_icwalk *icw) { … } /* * Is this inode @ip eligible for eof/cow block reclamation, given some * filtering parameters @icw? The inode is eligible if @icw is null or * if the predicate functions match. */ static bool xfs_icwalk_match( struct xfs_inode *ip, struct xfs_icwalk *icw) { … } /* * This is a fast pass over the inode cache to try to get reclaim moving on as * many inodes as possible in a short period of time. It kicks itself every few * seconds, as well as being kicked by the inode cache shrinker when memory * goes low. */ void xfs_reclaim_worker( struct work_struct *work) { … } STATIC int xfs_inode_free_eofblocks( struct xfs_inode *ip, struct xfs_icwalk *icw, unsigned int *lockflags) { … } static void xfs_blockgc_set_iflag( struct xfs_inode *ip, unsigned long iflag) { … } void xfs_inode_set_eofblocks_tag( xfs_inode_t *ip) { … } static void xfs_blockgc_clear_iflag( struct xfs_inode *ip, unsigned long iflag) { … } void xfs_inode_clear_eofblocks_tag( xfs_inode_t *ip) { … } /* * Set ourselves up to free CoW blocks from this file. If it's already clean * then we can bail out quickly, but otherwise we must back off if the file * is undergoing some kind of write. */ static bool xfs_prep_free_cowblocks( struct xfs_inode *ip) { … } /* * Automatic CoW Reservation Freeing * * These functions automatically garbage collect leftover CoW reservations * that were made on behalf of a cowextsize hint when we start to run out * of quota or when the reservations sit around for too long. If the file * has dirty pages or is undergoing writeback, its CoW reservations will * be retained. * * The actual garbage collection piggybacks off the same code that runs * the speculative EOF preallocation garbage collector. 
*/ STATIC int xfs_inode_free_cowblocks( struct xfs_inode *ip, struct xfs_icwalk *icw, unsigned int *lockflags) { … } void xfs_inode_set_cowblocks_tag( xfs_inode_t *ip) { … } void xfs_inode_clear_cowblocks_tag( xfs_inode_t *ip) { … } /* Disable post-EOF and CoW block auto-reclamation. */ void xfs_blockgc_stop( struct xfs_mount *mp) { … } /* Enable post-EOF and CoW block auto-reclamation. */ void xfs_blockgc_start( struct xfs_mount *mp) { … } /* Don't try to run block gc on an inode that's in any of these states. */ #define XFS_BLOCKGC_NOGRAB_IFLAGS … /* * Decide if the given @ip is eligible for garbage collection of speculative * preallocations, and grab it if so. Returns true if it's ready to go or * false if we should just ignore it. */ static bool xfs_blockgc_igrab( struct xfs_inode *ip) { … } /* Scan one incore inode for block preallocations that we can remove. */ static int xfs_blockgc_scan_inode( struct xfs_inode *ip, struct xfs_icwalk *icw) { … } /* Background worker that trims preallocated space. */ void xfs_blockgc_worker( struct work_struct *work) { … } /* * Try to free space in the filesystem by purging inactive inodes, eofblocks * and cowblocks. */ int xfs_blockgc_free_space( struct xfs_mount *mp, struct xfs_icwalk *icw) { … } /* * Reclaim all the free space that we can by scheduling the background blockgc * and inodegc workers immediately and waiting for them all to clear. */ int xfs_blockgc_flush_all( struct xfs_mount *mp) { … } /* * Run cow/eofblocks scans on the supplied dquots. We don't know exactly which * quota caused an allocation failure, so we make a best effort by including * each quota under low free space conditions (less than 1% free space) in the * scan. * * Callers must not hold any inode's ILOCK. If requesting a synchronous scan * (XFS_ICWALK_FLAG_SYNC), the caller also must not hold any inode's IOLOCK or * MMAPLOCK. 
*/ int xfs_blockgc_free_dquots( struct xfs_mount *mp, struct xfs_dquot *udqp, struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, unsigned int iwalk_flags) { … } /* Run cow/eofblocks scans on the quotas attached to the inode. */ int xfs_blockgc_free_quota( struct xfs_inode *ip, unsigned int iwalk_flags) { … } /* XFS Inode Cache Walking Code */ /* * The inode lookup is done in batches to keep the amount of lock traffic and * radix tree lookups to a minimum. The batch size is a trade off between * lookup reduction and stack usage. This is in the reclaim path, so we can't * be too greedy. */ #define XFS_LOOKUP_BATCH … /* * Decide if we want to grab this inode in anticipation of doing work towards * the goal. */ static inline bool xfs_icwalk_igrab( enum xfs_icwalk_goal goal, struct xfs_inode *ip, struct xfs_icwalk *icw) { … } /* * Process an inode. Each processing function must handle any state changes * made by the icwalk igrab function. Return -EAGAIN to skip an inode. */ static inline int xfs_icwalk_process_inode( enum xfs_icwalk_goal goal, struct xfs_inode *ip, struct xfs_perag *pag, struct xfs_icwalk *icw) { … } /* * For a given per-AG structure @pag and a goal, grab qualifying inodes and * process them in some manner. */ static int xfs_icwalk_ag( struct xfs_perag *pag, enum xfs_icwalk_goal goal, struct xfs_icwalk *icw) { … } /* Walk all incore inodes to achieve a given goal. */ static int xfs_icwalk( struct xfs_mount *mp, enum xfs_icwalk_goal goal, struct xfs_icwalk *icw) { … } #ifdef DEBUG static void xfs_check_delalloc( struct xfs_inode *ip, int whichfork) { … } #else #define xfs_check_delalloc … #endif /* Schedule the inode for reclaim. */ static void xfs_inodegc_set_reclaimable( struct xfs_inode *ip) { … } /* * Free all speculative preallocations and possibly even the inode itself. * This is the last chance to make changes to an otherwise unreferenced file * before incore reclamation happens. 
*/ static int xfs_inodegc_inactivate( struct xfs_inode *ip) { … } void xfs_inodegc_worker( struct work_struct *work) { … } /* * Expedite all pending inodegc work to run immediately. This does not wait for * completion of the work. */ void xfs_inodegc_push( struct xfs_mount *mp) { … } /* * Force all currently queued inode inactivation work to run immediately and * wait for the work to finish. */ int xfs_inodegc_flush( struct xfs_mount *mp) { … } /* * Flush all the pending work and then disable the inode inactivation background * workers and wait for them to stop. Caller must hold sb->s_umount to * coordinate changes in the inodegc_enabled state. */ void xfs_inodegc_stop( struct xfs_mount *mp) { … } /* * Enable the inode inactivation background workers and schedule deferred inode * inactivation work if there is any. Caller must hold sb->s_umount to * coordinate changes in the inodegc_enabled state. */ void xfs_inodegc_start( struct xfs_mount *mp) { … } #ifdef CONFIG_XFS_RT static inline bool xfs_inodegc_want_queue_rt_file( struct xfs_inode *ip) { … } #else #define xfs_inodegc_want_queue_rt_file … #endif /* CONFIG_XFS_RT */ /* * Schedule the inactivation worker when: * * - We've accumulated more than one inode cluster buffer's worth of inodes. * - There is less than 5% free space left. * - Any of the quotas for this inode are near an enforcement limit. */ static inline bool xfs_inodegc_want_queue_work( struct xfs_inode *ip, unsigned int items) { … } /* * Upper bound on the number of inodes in each AG that can be queued for * inactivation at any given time, to avoid monopolizing the workqueue. */ #define XFS_INODEGC_MAX_BACKLOG … /* * Make the frontend wait for inactivations when: * * - Memory shrinkers queued the inactivation worker and it hasn't finished. * - The queue depth exceeds the maximum allowable percpu backlog. * * Note: If we are in a NOFS context here (e.g. 
current thread is running a * transaction) then we don't want to block here as inodegc progress may require * filesystem resources we hold to make progress and that could result in a * deadlock. Hence we skip out of here if we are in a scoped NOFS context. */ static inline bool xfs_inodegc_want_flush_work( struct xfs_inode *ip, unsigned int items, unsigned int shrinker_hits) { … } /* * Queue a background inactivation worker if there are inodes that need to be * inactivated and higher level xfs code hasn't disabled the background * workers. */ static void xfs_inodegc_queue( struct xfs_inode *ip) { … } /* * We set the inode flag atomically with the radix tree tag. Once we get tag * lookups on the radix tree, this inode flag can go away. * * We always use background reclaim here because even if the inode is clean, it * still may be under IO and hence we have to wait for IO completion to occur * before we can reclaim the inode. The background reclaim path handles this * more efficiently than we can here, so simply let background reclaim tear down * all inodes. */ void xfs_inode_mark_reclaimable( struct xfs_inode *ip) { … } /* * Register a phony shrinker so that we can run background inodegc sooner when * there's memory pressure. Inactivation does not itself free any memory but * it does make inodes reclaimable, which eventually frees memory. * * The count function, seek value, and batch value are crafted to trigger the * scan function during the second round of scanning. Hopefully this means * that we reclaimed enough memory that initiating metadata transactions won't * make things worse. */ #define XFS_INODEGC_SHRINKER_COUNT … #define XFS_INODEGC_SHRINKER_BATCH … static unsigned long xfs_inodegc_shrinker_count( struct shrinker *shrink, struct shrink_control *sc) { … } static unsigned long xfs_inodegc_shrinker_scan( struct shrinker *shrink, struct shrink_control *sc) { … } /* Register a shrinker so we can accelerate inodegc and throttle queuing. 
*/ int xfs_inodegc_register_shrinker( struct xfs_mount *mp) { … }