// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C) 2022-2023 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <[email protected]>
*/
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_btree.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_inode.h"
#include "xfs_inode_fork.h"
#include "xfs_alloc.h"
#include "xfs_bmap.h"
#include "xfs_rmap.h"
#include "xfs_refcount.h"
#include "xfs_quota.h"
#include "xfs_ialloc.h"
#include "xfs_ag.h"
#include "xfs_error.h"
#include "xfs_errortag.h"
#include "xfs_icache.h"
#include "xfs_refcount_btree.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
#include "scrub/off_bitmap.h"
#include "scrub/fsb_bitmap.h"
#include "scrub/reap.h"
/*
* CoW Fork Mapping Repair
* =======================
*
* Although CoW staging extents are owned by incore CoW inode forks, on disk
* they are owned by the refcount btree. The ondisk metadata does not record
* any ownership information, which limits what we can do to repair the
* mappings in the CoW fork. At most, we can replace ifork mappings that lack
* an entry in the refcount btree or are described by a reverse mapping record
* whose owner is not OWN_COW.
*
* Replacing extents is also tricky -- we can't touch written CoW fork extents
* since they are undergoing writeback, and delalloc extents do not require
* repair since they only exist incore. Hence the most we can do is find the
* bad parts of unwritten mappings, allocate a replacement set of blocks, and
* replace the incore mapping. We use the regular reaping process to unmap
* or free the discarded blocks, as appropriate.
*/
/*
 * State carried through one CoW fork repair.  It is allocated by
 * xrep_bmap_cow and handed to the btree query callbacks and the bitmap walk
 * callback through their private-data pointers.
 */
struct xrep_cow {
/* Scrub context of the repair in progress. */
struct xfs_scrub *sc;
/*
 * Bitmap of file offset ranges that need replacing.  Filled in by
 * xrep_cow_mark_file_range and walked by xrep_bmap_cow.
 */
struct xoff_bitmap bad_fileoffs;
/*
 * Bitmap of fsblocks that were removed from the CoW fork.  Filled in by
 * xrep_cow_replace_range and reaped at the end of xrep_bmap_cow.
 */
struct xfsb_bitmap old_cowfork_fsblocks;
/* CoW fork mappings used to scan for bad CoW staging extents. */
struct xfs_bmbt_irec irec;
/*
 * refcount btree block number of irec.br_startblock, i.e. the AG block
 * number computed by XFS_FSB_TO_AGBNO in xrep_cow_find_bad.
 */
unsigned int irec_startbno;
/*
 * refcount btree block number of the next refcount record we expect.
 * Starts at irec_startbno and is advanced past each CoW staging record
 * seen by xrep_cow_mark_missing_staging.
 */
unsigned int next_bno;
};
/*
 * CoW staging extent.  xrep_cow_alloc fills this out to describe the
 * replacement space that it allocated.
 */
struct xrep_cow_extent {
/* First filesystem block of the extent. */
xfs_fsblock_t fsbno;
/* Length of the extent, in blocks. */
xfs_extlen_t len;
};
/*
 * Flag the file offsets that are backed by the given run of physical blocks
 * as needing replacement.  The physical range is converted to a file offset
 * range by its displacement from the start of xc->irec, so the caller must
 * ensure that the physical range lies within that mapping.
 *
 * Returns the result of recording the range in xc->bad_fileoffs.
 */
STATIC int
xrep_cow_mark_file_range(
	struct xrep_cow		*xc,
	xfs_fsblock_t		startblock,
	xfs_filblks_t		blockcount)
{
	/* Distance of the physical range from the start of the mapping. */
	xfs_fsblock_t		delta = startblock - xc->irec.br_startblock;
	xfs_fileoff_t		fileoff = xc->irec.br_startoff + delta;

	trace_xrep_cow_mark_file_range(xc->sc->ip, startblock, fileoff,
			blockcount);

	return xoff_bitmap_set(&xc->bad_fileoffs, fileoff, blockcount);
}
/*
* Trim @src to fit within the CoW fork mapping being examined, and put the
* result in @dst.
*/
static inline void
xrep_cow_trim_refcount(
struct xrep_cow *xc,
struct xfs_refcount_irec *dst,
const struct xfs_refcount_irec *src)
{
unsigned int adj;
memcpy(dst, src, sizeof(*dst));
if (dst->rc_startblock < xc->irec_startbno) {
adj = xc->irec_startbno - dst->rc_startblock;
dst->rc_blockcount -= adj;
dst->rc_startblock += adj;
}
if (dst->rc_startblock + dst->rc_blockcount >
xc->irec_startbno + xc->irec.br_blockcount) {
adj = (dst->rc_startblock + dst->rc_blockcount) -
(xc->irec_startbno + xc->irec.br_blockcount);
dst->rc_blockcount -= adj;
}
}
/* Mark any shared CoW staging extents. */
STATIC int
xrep_cow_mark_shared_staging(
struct xfs_btree_cur *cur,
const struct xfs_refcount_irec *rec,
void *priv)
{
struct xrep_cow *xc = priv;
struct xfs_refcount_irec rrec;
xfs_fsblock_t fsbno;
if (!xfs_refcount_check_domain(rec) ||
rec->rc_domain != XFS_REFC_DOMAIN_SHARED)
return -EFSCORRUPTED;
xrep_cow_trim_refcount(xc, &rrec, rec);
fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno,
rrec.rc_startblock);
return xrep_cow_mark_file_range(xc, fsbno, rrec.rc_blockcount);
}
/*
* Mark any portion of the CoW fork file offset range where there is not a CoW
* staging extent record in the refcountbt, and keep a record of where we did
* find correct refcountbt records. Staging records are always cleaned out at
* mount time, so any two inodes trying to map the same staging area would have
* already taken the fs down due to refcount btree verifier errors. Hence this
* inode should be the sole creator of the staging extent records ondisk.
*/
STATIC int
xrep_cow_mark_missing_staging(
struct xfs_btree_cur *cur,
const struct xfs_refcount_irec *rec,
void *priv)
{
struct xrep_cow *xc = priv;
struct xfs_refcount_irec rrec;
int error;
if (!xfs_refcount_check_domain(rec) ||
rec->rc_domain != XFS_REFC_DOMAIN_COW)
return -EFSCORRUPTED;
xrep_cow_trim_refcount(xc, &rrec, rec);
if (xc->next_bno >= rrec.rc_startblock)
goto next;
error = xrep_cow_mark_file_range(xc,
XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno,
xc->next_bno),
rrec.rc_startblock - xc->next_bno);
if (error)
return error;
next:
xc->next_bno = rrec.rc_startblock + rrec.rc_blockcount;
return 0;
}
/*
 * Reverse mapping btree query callback: mark any part of the CoW fork mapping
 * that is described by an rmap record whose owner is not OWN_COW.  These are
 * cross-linked areas that must be avoided.  Records owned by OWN_COW are
 * skipped.  @priv is the struct xrep_cow.
 */
STATIC int
xrep_cow_mark_missing_staging_rmap(
	struct xfs_btree_cur		*cur,
	const struct xfs_rmap_irec	*rec,
	void				*priv)
{
	struct xrep_cow			*xc = priv;
	xfs_fsblock_t			fsbno;
	xfs_agblock_t			bno;
	xfs_extlen_t			len;
	unsigned int			trim;

	if (rec->rm_owner == XFS_RMAP_OWN_COW)
		return 0;

	bno = rec->rm_startblock;
	len = rec->rm_blockcount;

	/* Clip the record to the start of the CoW fork mapping... */
	if (xc->irec_startbno > bno) {
		trim = xc->irec_startbno - bno;
		bno += trim;
		len -= trim;
	}

	/* ...and to the end of it. */
	if (bno + len > xc->irec_startbno + xc->irec.br_blockcount) {
		trim = (bno + len) -
		       (xc->irec_startbno + xc->irec.br_blockcount);
		len -= trim;
	}

	fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno, bno);
	return xrep_cow_mark_file_range(xc, fsbno, len);
}
/*
* Find any part of the CoW fork mapping that isn't a single-owner CoW staging
* extent and mark the corresponding part of the file range in the bitmap.
*/
STATIC int
xrep_cow_find_bad(
struct xrep_cow *xc)
{
struct xfs_refcount_irec rc_low = { 0 };
struct xfs_refcount_irec rc_high = { 0 };
struct xfs_rmap_irec rm_low = { 0 };
struct xfs_rmap_irec rm_high = { 0 };
struct xfs_perag *pag;
struct xfs_scrub *sc = xc->sc;
xfs_agnumber_t agno;
int error;
agno = XFS_FSB_TO_AGNO(sc->mp, xc->irec.br_startblock);
xc->irec_startbno = XFS_FSB_TO_AGBNO(sc->mp, xc->irec.br_startblock);
pag = xfs_perag_get(sc->mp, agno);
if (!pag)
return -EFSCORRUPTED;
error = xrep_ag_init(sc, pag, &sc->sa);
if (error)
goto out_pag;
/* Mark any CoW fork extents that are shared. */
rc_low.rc_startblock = xc->irec_startbno;
rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1;
rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_SHARED;
error = xfs_refcount_query_range(sc->sa.refc_cur, &rc_low, &rc_high,
xrep_cow_mark_shared_staging, xc);
if (error)
goto out_sa;
/* Make sure there are CoW staging extents for the whole mapping. */
rc_low.rc_startblock = xc->irec_startbno;
rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1;
rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_COW;
xc->next_bno = xc->irec_startbno;
error = xfs_refcount_query_range(sc->sa.refc_cur, &rc_low, &rc_high,
xrep_cow_mark_missing_staging, xc);
if (error)
goto out_sa;
if (xc->next_bno < xc->irec_startbno + xc->irec.br_blockcount) {
error = xrep_cow_mark_file_range(xc,
XFS_AGB_TO_FSB(sc->mp, pag->pag_agno,
xc->next_bno),
xc->irec_startbno + xc->irec.br_blockcount -
xc->next_bno);
if (error)
goto out_sa;
}
/* Mark any area has an rmap that isn't a COW staging extent. */
rm_low.rm_startblock = xc->irec_startbno;
memset(&rm_high, 0xFF, sizeof(rm_high));
rm_high.rm_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1;
error = xfs_rmap_query_range(sc->sa.rmap_cur, &rm_low, &rm_high,
xrep_cow_mark_missing_staging_rmap, xc);
if (error)
goto out_sa;
/*
* If userspace is forcing us to rebuild the CoW fork or someone turned
* on the debugging knob, replace everything in the CoW fork.
*/
if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) ||
XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) {
error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock,
xc->irec.br_blockcount);
if (error)
return error;
}
out_sa:
xchk_ag_free(sc, &sc->sa);
out_pag:
xfs_perag_put(pag);
return 0;
}
/*
 * Allocate a replacement CoW staging extent of between one and @maxlen
 * blocks, and describe it in @repl.  The allocation may be shorter than
 * @maxlen; callers must use repl->len.
 *
 * xfs_trans_reserve_more is called with @maxlen before allocating, the
 * allocation search is started from the filesystem block of the inode, and
 * the new extent is passed to xfs_refcount_alloc_cow_extent.
 *
 * Returns 0 on success, -ENOSPC if the allocator came back with no blocks, or
 * another negative errno.
 */
STATIC int
xrep_cow_alloc(
	struct xfs_scrub	*sc,
	xfs_extlen_t		maxlen,
	struct xrep_cow_extent	*repl)
{
	struct xfs_alloc_arg	args = {
		.mp		= sc->mp,
		.tp		= sc->tp,
		.minlen		= 1,
		.maxlen		= maxlen,
		.prod		= 1,
		.datatype	= XFS_ALLOC_USERDATA,
		.resv		= XFS_AG_RESV_NONE,
		.oinfo		= XFS_RMAP_OINFO_SKIP_UPDATE,
	};
	int			error;

	error = xfs_trans_reserve_more(sc->tp, maxlen, 0);
	if (!error)
		error = xfs_alloc_vextent_start_ag(&args,
				XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino));
	if (error)
		return error;

	/* The allocator can succeed without finding any space. */
	if (args.fsbno == NULLFSBLOCK)
		return -ENOSPC;

	xfs_refcount_alloc_cow_extent(sc->tp, args.fsbno, args.len);

	repl->len = args.len;
	repl->fsbno = args.fsbno;
	return 0;
}
/*
 * Look up the current CoW fork mapping at @startoff and return it in @got,
 * with @icur positioned at it, so that we only allocate enough to replace a
 * single mapping.
 *
 * The lookup must find a mapping that begins at or before @startoff, has a
 * nonzero length, is not a delalloc reservation, and is not in the written
 * state.  Anything else means that something is seriously wrong, since we
 * didn't drop the ILOCK; in that case we trip an assertion and return
 * -EFSCORRUPTED.  Returns 0 otherwise.
 */
static inline int
xrep_cow_find_mapping(
	struct xrep_cow		*xc,
	struct xfs_iext_cursor	*icur,
	xfs_fileoff_t		startoff,
	struct xfs_bmbt_irec	*got)
{
	struct xfs_inode	*ip = xc->sc->ip;
	struct xfs_ifork	*cowfp = xfs_ifork_ptr(ip, XFS_COW_FORK);

	if (!xfs_iext_lookup_extent(ip, cowfp, startoff, icur, got) ||
	    got->br_startoff > startoff ||
	    got->br_blockcount == 0 ||
	    isnullstartblock(got->br_startblock) ||
	    xfs_bmap_is_written_extent(got)) {
		ASSERT(0);
		return -EFSCORRUPTED;
	}

	return 0;
}
/*
 * Bit flags that presumably select which side of a mapping gets replaced.
 * NOTE(review): nothing in the visible part of this file references either
 * flag -- confirm whether they are still needed.
 */
#define REPLACE_LEFT_SIDE (1U << 0)
#define REPLACE_RIGHT_SIDE (1U << 1)
/*
* Given a CoW fork mapping @got and a replacement mapping @repl, remap the
* beginning of @got with the space described by @rep.
*/
static inline void
xrep_cow_replace_mapping(
struct xfs_inode *ip,
struct xfs_iext_cursor *icur,
const struct xfs_bmbt_irec *got,
const struct xrep_cow_extent *repl)
{
struct xfs_bmbt_irec new = *got; /* struct copy */
ASSERT(repl->len > 0);
ASSERT(!isnullstartblock(got->br_startblock));
trace_xrep_cow_replace_mapping(ip, got, repl->fsbno, repl->len);
if (got->br_blockcount == repl->len) {
/*
* The new extent is a complete replacement for the existing
* extent. Update the COW fork record.
*/
new.br_startblock = repl->fsbno;
xfs_iext_update_extent(ip, BMAP_COWFORK, icur, &new);
return;
}
/*
* The new extent can replace the beginning of the COW fork record.
* Move the left side of @got upwards, then insert the new record.
*/
new.br_startoff += repl->len;
new.br_startblock += repl->len;
new.br_blockcount -= repl->len;
xfs_iext_update_extent(ip, BMAP_COWFORK, icur, &new);
new.br_startoff = got->br_startoff;
new.br_startblock = repl->fsbno;
new.br_blockcount = repl->len;
xfs_iext_insert(ip, icur, &new, BMAP_COWFORK);
}
/*
 * Replace the unwritten CoW staging extent backing the given file range with a
 * new space extent that isn't as problematic.
 *
 * @startoff is the first file offset to replace.  On entry, *@blockcount is
 * the number of blocks that the caller would like replaced; on successful
 * return it is the number of blocks actually replaced (the length of the
 * replacement allocation), which may be smaller.  At most one CoW fork
 * mapping is touched per call.  Returns 0 or a negative errno.
 *
 * NOTE(review): xrep_cow_replace_mapping remaps the *beginning* of @got, and
 * the old blocks are recorded starting at got.br_startblock, so this appears
 * to assume that @startoff equals got.br_startoff -- confirm that a bad range
 * can never begin in the middle of a mapping.
 */
STATIC int
xrep_cow_replace_range(
struct xrep_cow *xc,
xfs_fileoff_t startoff,
xfs_extlen_t *blockcount)
{
struct xfs_iext_cursor icur;
struct xrep_cow_extent repl;
struct xfs_bmbt_irec got;
struct xfs_scrub *sc = xc->sc;
xfs_fileoff_t nextoff;
xfs_extlen_t alloc_len;
int error;
/*
 * Put the existing CoW fork mapping in @got. If @got ends before the
 * requested range does, stop at the end of @got so that we only replace
 * one extent mapping at a time.
 */
error = xrep_cow_find_mapping(xc, &icur, startoff, &got);
if (error)
return error;
nextoff = min(startoff + *blockcount,
got.br_startoff + got.br_blockcount);
/*
 * Allocate a replacement extent. If we don't fill all the blocks,
 * shorten the quantity that will be deleted in this step.
 */
alloc_len = min_t(xfs_fileoff_t, XFS_MAX_BMBT_EXTLEN,
nextoff - startoff);
error = xrep_cow_alloc(sc, alloc_len, &repl);
if (error)
return error;
/*
 * Replace the old mapping with the new one, and commit the metadata
 * changes made so far.
 */
xrep_cow_replace_mapping(sc->ip, &icur, &got, &repl);
xfs_inode_set_cowblocks_tag(sc->ip);
error = xfs_defer_finish(&sc->tp);
if (error)
return error;
/*
 * Note the old CoW staging extents; we'll reap them all later. Only
 * repl.len blocks were replaced, so only that many are recorded.
 */
error = xfsb_bitmap_set(&xc->old_cowfork_fsblocks, got.br_startblock,
repl.len);
if (error)
return error;
/* Tell the caller how far we actually got. */
*blockcount = repl.len;
return 0;
}
/*
 * Bitmap walk callback: replace a bad part of an unwritten CoW staging extent
 * with newly allocated staging blocks.  @priv is the struct xrep_cow.
 *
 * The range is worked through in pieces of at most XFS_MAX_BMBT_EXTLEN
 * blocks.  Each call to xrep_cow_replace_range reports how many blocks it
 * actually replaced, and the loop advances by that amount until the whole
 * range is done or an error occurs.  Returns 0 or a negative errno.
 */
STATIC int
xrep_cow_replace(
	uint64_t		startoff,
	uint64_t		blockcount,
	void			*priv)
{
	struct xrep_cow		*xc = priv;
	xfs_extlen_t		len;
	int			error;

	for (; blockcount > 0; blockcount -= len, startoff += len) {
		len = min_t(xfs_filblks_t, blockcount, XFS_MAX_BMBT_EXTLEN);

		error = xrep_cow_replace_range(xc, startoff, &len);
		if (error)
			return error;
	}

	return 0;
}
/*
 * Repair an inode's CoW fork.  The CoW fork is an in-core structure, so
 * there's no btree to rebuild.  Instead, we replace any mappings that are
 * cross-linked or lack ondisk CoW fork records in the refcount btree, and
 * then reap the blocks that the old mappings pointed to.
 *
 * Returns 0 on success (including when the inode has no CoW fork, or when a
 * fork that was not in extents format has been reset to empty), -EOPNOTSUPP
 * if the filesystem lacks rmapbt or reflink or the file is a realtime file,
 * -ENOMEM if the repair context cannot be allocated, or another negative
 * errno.
 */
int
xrep_bmap_cow(
	struct xfs_scrub	*sc)
{
	struct xfs_iext_cursor	icur;
	struct xfs_ifork	*ifp = xfs_ifork_ptr(sc->ip, XFS_COW_FORK);
	struct xrep_cow		*xc;
	int			error;

	if (!(xfs_has_rmapbt(sc->mp) && xfs_has_reflink(sc->mp)))
		return -EOPNOTSUPP;

	/* No CoW fork, nothing to repair. */
	if (!ifp)
		return 0;

	/* realtime files aren't supported yet */
	if (XFS_IS_REALTIME_INODE(sc->ip))
		return -EOPNOTSUPP;

	/*
	 * If we're somehow not in extents format, then reinitialize it to
	 * an empty extent mapping fork and exit.
	 */
	if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) {
		ifp->if_nextents = 0;
		ifp->if_format = XFS_DINODE_FMT_EXTENTS;
		return 0;
	}

	xc = kzalloc(sizeof(*xc), XCHK_GFP_FLAGS);
	if (!xc)
		return -ENOMEM;

	xfs_trans_ijoin(sc->tp, sc->ip, 0);

	xc->sc = sc;
	xoff_bitmap_init(&xc->bad_fileoffs);
	xfsb_bitmap_init(&xc->old_cowfork_fsblocks);

	/* Pass 1: work out which file offsets have bad staging extents. */
	for_each_xfs_iext(ifp, &icur, &xc->irec) {
		if (xchk_should_terminate(sc, &error))
			goto out_bitmap;

		/*
		 * delalloc reservations only exist incore, so there is no
		 * ondisk metadata that we can examine.  COW fork extents are
		 * only in the written state if writeback is actively writing
		 * to disk; we cannot restart the write at a different disk
		 * address since we've already issued the IO.  Hence we leave
		 * both kinds alone and hope for the best.
		 */
		if (isnullstartblock(xc->irec.br_startblock) ||
		    xfs_bmap_is_written_extent(&xc->irec))
			continue;

		error = xrep_cow_find_bad(xc);
		if (error)
			goto out_bitmap;
	}

	/* Pass 2: replace any bad unwritten mappings with fresh blocks. */
	error = xoff_bitmap_walk(&xc->bad_fileoffs, xrep_cow_replace, xc);
	if (error)
		goto out_bitmap;

	/*
	 * Reap as many of the old CoW blocks as we can.  They are owned ondisk
	 * by the refcount btree, not the inode, so it is correct to treat them
	 * like inode metadata.  Whatever this returns is our return value.
	 */
	error = xrep_reap_fsblocks(sc, &xc->old_cowfork_fsblocks,
			&XFS_RMAP_OINFO_COW);

out_bitmap:
	xfsb_bitmap_destroy(&xc->old_cowfork_fsblocks);
	xoff_bitmap_destroy(&xc->bad_fileoffs);
	kfree(xc);
	return error;
}