health.c | Explore in Territory

// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2019-2023 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <[email protected]>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_ag.h"
#include "xfs_health.h"
#include "scrub/scrub.h"
#include "scrub/health.h"
#include "scrub/common.h"

/*
 * Scrub and In-Core Filesystem Health Assessments
 * ===============================================
 *
 * Online scrub and repair have the time and the ability to perform stronger
 * checks than we can do from the metadata verifiers, because they can
 * cross-reference records between data structures.  Therefore, scrub is in a
 * good position to update the online filesystem health assessments to reflect
 * the good/bad state of the data structure.
 *
 * We therefore extend scrub in the following ways to achieve this:
 *
 * 1. Create a "sick_mask" field in the scrub context.  When we're setting up a
 * scrub call, set this to the default XFS_SICK_* flag(s) for the selected
 * scrub type (call it A).  Scrub and repair functions can override the default
 * sick_mask value if they choose.
 *
 * 2. If the scrubber returns a runtime error code, we exit making no changes
 * to the incore sick state.
 *
 * 3. If the scrubber finds that A is clean, use sick_mask to clear the incore
 * sick flags before exiting.
 *
 * 4. If the scrubber finds that A is corrupt, use sick_mask to set the incore
 * sick flags.  If the user didn't want to repair then we exit, leaving the
 * metadata structure unfixed and the sick flag set.
 *
 * 5. Now we know that A is corrupt and the user wants to repair, so run the
 * repairer.  If the repairer returns an error code, we exit with that error
 * code, having made no further changes to the incore sick state.
 *
 * 6. If repair rebuilds A correctly and the subsequent re-scrub of A is clean,
 * use sick_mask to clear the incore sick flags.  This should have the effect
 * that A is no longer marked sick.
 *
 * 7. If repair rebuilds A incorrectly, the re-scrub will find it corrupt and
 * use sick_mask to set the incore sick flags.  This should have no externally
 * visible effect since we already set them in step (4).
 *
 * There are some complications to this story, however.  For certain types of
 * complementary metadata indices (e.g. inobt/finobt), it is easier to rebuild
 * both structures at the same time.  The following principles apply to this
 * type of repair strategy:
 *
 * 8. Any repair function that rebuilds multiple structures should update
 * sick_mask to reflect whatever other structures are rebuilt, and verify that
 * all the rebuilt structures can pass a scrub check.  The outcomes of 5-7
 * still apply, but with a sick_mask that covers everything being rebuilt.
 */

/* Map our scrub type to a sick mask and a set of health update functions. */

enum xchk_health_group { … };

struct xchk_health_map { … };

static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = …;

/* Return the health status mask for this scrub type. */
unsigned int
xchk_health_mask_for_scrub_type(
	__u32			scrub_type)
{ … }

/*
 * If the scrub state is clean, add @mask to the scrub sick mask to clear
 * additional sick flags from the metadata object's sick state.
 */
void
xchk_mark_healthy_if_clean(
	struct xfs_scrub	*sc,
	unsigned int		mask)
{ … }

/*
 * If we're scrubbing a piece of file metadata for the first time, does it look
 * like it has been zapped?  Skip the check if we just repaired the metadata
 * and are revalidating it.
 */
bool
xchk_file_looks_zapped(
	struct xfs_scrub	*sc,
	unsigned int		mask)
{ … }

/*
 * Scrub gave the filesystem a clean bill of health, so clear all the indirect
 * markers of past problems (at least for the fs and ags) so that we can be
 * healthy again.
 */
STATIC void
xchk_mark_all_healthy(
	struct xfs_mount	*mp)
{ … }

/*
 * Update filesystem health assessments based on what we found and did.
 *
 * If the scrubber finds errors, we mark sick whatever's mentioned in
 * sick_mask, no matter whether this is a first scan or an
 * evaluation of repair effectiveness.
 *
 * Otherwise, no direct corruption was found, so mark whatever's in
 * sick_mask as healthy.
 */
void
xchk_update_health(
	struct xfs_scrub	*sc)
{ … }

/* Is the given per-AG btree healthy enough for scanning? */
void
xchk_ag_btree_del_cursor_if_sick(
	struct xfs_scrub	*sc,
	struct xfs_btree_cur	**curp,
	unsigned int		sm_type)
{ … }

/*
 * Quick scan to double-check that there isn't any evidence of lingering
 * primary health problems.  If we're still clear, then the health update will
 * take care of clearing the indirect evidence.
 */
int
xchk_health_record(
	struct xfs_scrub	*sc)
{ … }
linux/fs/xfs/scrub/health.c