linux/fs/bcachefs/sb-downgrade.c

// SPDX-License-Identifier: GPL-2.0

/*
 * Superblock section that contains a list of recovery passes to run when
 * downgrading past a given version
 */

#include "bcachefs.h"
#include "darray.h"
#include "recovery_passes.h"
#include "sb-downgrade.h"
#include "sb-errors.h"
#include "super-io.h"

#define RECOVERY_PASS_ALL_FSCK		BIT_ULL(63)

/*
 * Upgrade, downgrade tables - run certain recovery passes, fix certain errors
 *
 * x(version, recovery_passes, errors...)
 */
#define UPGRADE_TABLE()						\
	x(backpointers,						\
	  RECOVERY_PASS_ALL_FSCK)				\
	x(inode_v3,						\
	  RECOVERY_PASS_ALL_FSCK)				\
	x(unwritten_extents,					\
	  RECOVERY_PASS_ALL_FSCK)				\
	x(bucket_gens,						\
	  BIT_ULL(BCH_RECOVERY_PASS_bucket_gens_init)|		\
	  RECOVERY_PASS_ALL_FSCK)				\
	x(lru_v2,						\
	  RECOVERY_PASS_ALL_FSCK)				\
	x(fragmentation_lru,					\
	  RECOVERY_PASS_ALL_FSCK)				\
	x(no_bps_in_alloc_keys,					\
	  RECOVERY_PASS_ALL_FSCK)				\
	x(snapshot_trees,					\
	  RECOVERY_PASS_ALL_FSCK)				\
	x(snapshot_skiplists,					\
	  BIT_ULL(BCH_RECOVERY_PASS_check_snapshots),		\
	  BCH_FSCK_ERR_snapshot_bad_depth,			\
	  BCH_FSCK_ERR_snapshot_bad_skiplist)			\
	x(deleted_inodes,					\
	  BIT_ULL(BCH_RECOVERY_PASS_check_inodes),		\
	  BCH_FSCK_ERR_unlinked_inode_not_on_deleted_list)	\
	x(rebalance_work,					\
	  BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance))	\
	x(subvolume_fs_parent,					\
	  BIT_ULL(BCH_RECOVERY_PASS_check_dirents),		\
	  BCH_FSCK_ERR_subvol_fs_path_parent_wrong)		\
	x(btree_subvolume_children,				\
	  BIT_ULL(BCH_RECOVERY_PASS_check_subvols),		\
	  BCH_FSCK_ERR_subvol_children_not_set)			\
	x(mi_btree_bitmap,					\
	  BIT_ULL(BCH_RECOVERY_PASS_check_allocations),		\
	  BCH_FSCK_ERR_btree_bitmap_not_marked)			\
	x(disk_accounting_v2,					\
	  BIT_ULL(BCH_RECOVERY_PASS_check_allocations),		\
	  BCH_FSCK_ERR_bkey_version_in_future,			\
	  BCH_FSCK_ERR_dev_usage_buckets_wrong,			\
	  BCH_FSCK_ERR_dev_usage_sectors_wrong,			\
	  BCH_FSCK_ERR_dev_usage_fragmented_wrong,		\
	  BCH_FSCK_ERR_accounting_mismatch)			\
	x(disk_accounting_v3,					\
	  BIT_ULL(BCH_RECOVERY_PASS_check_allocations),		\
	  BCH_FSCK_ERR_bkey_version_in_future,			\
	  BCH_FSCK_ERR_dev_usage_buckets_wrong,			\
	  BCH_FSCK_ERR_dev_usage_sectors_wrong,			\
	  BCH_FSCK_ERR_dev_usage_fragmented_wrong,		\
	  BCH_FSCK_ERR_accounting_mismatch,			\
	  BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0,	\
	  BCH_FSCK_ERR_accounting_key_replicas_nr_required_bad,	\
	  BCH_FSCK_ERR_accounting_key_replicas_devs_unsorted,	\
	  BCH_FSCK_ERR_accounting_key_junk_at_end)		\
	x(disk_accounting_inum,					\
	  BIT_ULL(BCH_RECOVERY_PASS_check_allocations),		\
	  BCH_FSCK_ERR_accounting_mismatch)			\
	x(rebalance_work_acct_fix,				\
	  BIT_ULL(BCH_RECOVERY_PASS_check_allocations),		\
	  BCH_FSCK_ERR_accounting_mismatch)

#define DOWNGRADE_TABLE()					\
	x(bucket_stripe_sectors,				\
	  0)							\
	x(disk_accounting_v2,					\
	  BIT_ULL(BCH_RECOVERY_PASS_check_allocations),		\
	  BCH_FSCK_ERR_dev_usage_buckets_wrong,			\
	  BCH_FSCK_ERR_dev_usage_sectors_wrong,			\
	  BCH_FSCK_ERR_dev_usage_fragmented_wrong,		\
	  BCH_FSCK_ERR_fs_usage_hidden_wrong,			\
	  BCH_FSCK_ERR_fs_usage_btree_wrong,			\
	  BCH_FSCK_ERR_fs_usage_data_wrong,			\
	  BCH_FSCK_ERR_fs_usage_cached_wrong,			\
	  BCH_FSCK_ERR_fs_usage_reserved_wrong,			\
	  BCH_FSCK_ERR_fs_usage_nr_inodes_wrong,		\
	  BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong,	\
	  BCH_FSCK_ERR_fs_usage_replicas_wrong,			\
	  BCH_FSCK_ERR_bkey_version_in_future)			\
	x(disk_accounting_v3,					\
	  BIT_ULL(BCH_RECOVERY_PASS_check_allocations),		\
	  BCH_FSCK_ERR_dev_usage_buckets_wrong,			\
	  BCH_FSCK_ERR_dev_usage_sectors_wrong,			\
	  BCH_FSCK_ERR_dev_usage_fragmented_wrong,		\
	  BCH_FSCK_ERR_fs_usage_hidden_wrong,			\
	  BCH_FSCK_ERR_fs_usage_btree_wrong,			\
	  BCH_FSCK_ERR_fs_usage_data_wrong,			\
	  BCH_FSCK_ERR_fs_usage_cached_wrong,			\
	  BCH_FSCK_ERR_fs_usage_reserved_wrong,			\
	  BCH_FSCK_ERR_fs_usage_nr_inodes_wrong,		\
	  BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong,	\
	  BCH_FSCK_ERR_fs_usage_replicas_wrong,			\
	  BCH_FSCK_ERR_accounting_replicas_not_marked,		\
	  BCH_FSCK_ERR_bkey_version_in_future)			\
	x(rebalance_work_acct_fix,				\
	  BIT_ULL(BCH_RECOVERY_PASS_check_allocations),		\
	  BCH_FSCK_ERR_accounting_mismatch)

struct upgrade_downgrade_entry {
	u64		recovery_passes;
	u16		version;
	u16		nr_errors;
	const u16	*errors;
};

#define x(ver, passes, ...) static const u16 upgrade_##ver##_errors[] = { __VA_ARGS__ };
UPGRADE_TABLE()
#undef x

static const struct upgrade_downgrade_entry upgrade_table[] = {
#define x(ver, passes, ...) {					\
	.recovery_passes	= passes,			\
	.version		= bcachefs_metadata_version_##ver,\
	.nr_errors		= ARRAY_SIZE(upgrade_##ver##_errors),	\
	.errors			= upgrade_##ver##_errors,	\
},
UPGRADE_TABLE()
#undef x
};

static int have_stripes(struct bch_fs *c)
{
	return !btree_node_fake(c->btree_roots_known[BTREE_ID_stripes].b);
}

int bch2_sb_set_upgrade_extra(struct bch_fs *c)
{
	unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version;
	unsigned new_version = c->sb.version;
	bool write_sb = false;
	int ret = 0;

	mutex_lock(&c->sb_lock);
	struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);

	if (old_version <  bcachefs_metadata_version_bucket_stripe_sectors &&
	    new_version >= bcachefs_metadata_version_bucket_stripe_sectors &&
	    (ret = have_stripes(c) > 0)) {
		__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required);
		__set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
		__set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_sectors_wrong, ext->errors_silent);
		write_sb = true;
	}

	if (write_sb)
		bch2_write_super(c);
	mutex_unlock(&c->sb_lock);

	return ret < 0 ? ret : 0;
}

void bch2_sb_set_upgrade(struct bch_fs *c,
			 unsigned old_version,
			 unsigned new_version)
{
	lockdep_assert_held(&c->sb_lock);

	struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);

	for (const struct upgrade_downgrade_entry *i = upgrade_table;
	     i < upgrade_table + ARRAY_SIZE(upgrade_table);
	     i++)
		if (i->version > old_version && i->version <= new_version) {
			u64 passes = i->recovery_passes;

			if (passes & RECOVERY_PASS_ALL_FSCK)
				passes |= bch2_fsck_recovery_passes();
			passes &= ~RECOVERY_PASS_ALL_FSCK;

			ext->recovery_passes_required[0] |=
				cpu_to_le64(bch2_recovery_passes_to_stable(passes));

			for (const u16 *e = i->errors; e < i->errors + i->nr_errors; e++)
				__set_bit_le64(*e, ext->errors_silent);
		}
}

#define x(ver, passes, ...) static const u16 downgrade_##ver##_errors[] = { __VA_ARGS__ };
DOWNGRADE_TABLE()
#undef x

static const struct upgrade_downgrade_entry downgrade_table[] = {
#define x(ver, passes, ...) {					\
	.recovery_passes	= passes,			\
	.version		= bcachefs_metadata_version_##ver,\
	.nr_errors		= ARRAY_SIZE(downgrade_##ver##_errors),	\
	.errors			= downgrade_##ver##_errors,	\
},
DOWNGRADE_TABLE()
#undef x
};

static int downgrade_table_extra(struct bch_fs *c, darray_char *table)
{
	struct bch_sb_field_downgrade_entry *dst = (void *) &darray_top(*table);
	unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * le16_to_cpu(dst->nr_errors);
	int ret = 0;

	unsigned nr_errors = le16_to_cpu(dst->nr_errors);

	switch (le16_to_cpu(dst->version)) {
	case bcachefs_metadata_version_bucket_stripe_sectors:
		if (have_stripes(c)) {
			bytes += sizeof(dst->errors[0]) * 2;

			ret = darray_make_room(table, bytes);
			if (ret)
				return ret;

			/* open coded __set_bit_le64, as dst is packed and
			 * dst->recovery_passes is misaligned */
			unsigned b = BCH_RECOVERY_PASS_STABLE_check_allocations;
			dst->recovery_passes[b / 64] |= cpu_to_le64(BIT_ULL(b % 64));

			dst->errors[nr_errors++] = cpu_to_le16(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong);
		}
		break;
	}

	dst->nr_errors = cpu_to_le16(nr_errors);
	return ret;
}

static inline const struct bch_sb_field_downgrade_entry *
downgrade_entry_next_c(const struct bch_sb_field_downgrade_entry *e)
{
	return (void *) &e->errors[le16_to_cpu(e->nr_errors)];
}

#define for_each_downgrade_entry(_d, _i)						\
	for (const struct bch_sb_field_downgrade_entry *_i = (_d)->entries;		\
	     (void *) _i	< vstruct_end(&(_d)->field) &&				\
	     (void *) &_i->errors[0] <= vstruct_end(&(_d)->field) &&			\
	     (void *) downgrade_entry_next_c(_i) <= vstruct_end(&(_d)->field);		\
	     _i = downgrade_entry_next_c(_i))

static int bch2_sb_downgrade_validate(struct bch_sb *sb, struct bch_sb_field *f,
				      enum bch_validate_flags flags, struct printbuf *err)
{
	struct bch_sb_field_downgrade *e = field_to_type(f, downgrade);

	for (const struct bch_sb_field_downgrade_entry *i = e->entries;
	     (void *) i	< vstruct_end(&e->field);
	     i = downgrade_entry_next_c(i)) {
		/*
		 * Careful: sb_field_downgrade_entry is only 2 byte aligned, but
		 * section sizes are 8 byte aligned - an empty entry spanning
		 * the end of the section is allowed (and ignored):
		 */
		if ((void *) &i->errors[0] > vstruct_end(&e->field))
			break;

		if (flags & BCH_VALIDATE_write &&
		    (void *) downgrade_entry_next_c(i) > vstruct_end(&e->field)) {
			prt_printf(err, "downgrade entry overruns end of superblock section");
			return -BCH_ERR_invalid_sb_downgrade;
		}

		if (BCH_VERSION_MAJOR(le16_to_cpu(i->version)) !=
		    BCH_VERSION_MAJOR(le16_to_cpu(sb->version))) {
			prt_printf(err, "downgrade entry with mismatched major version (%u != %u)",
				   BCH_VERSION_MAJOR(le16_to_cpu(i->version)),
				   BCH_VERSION_MAJOR(le16_to_cpu(sb->version)));
			return -BCH_ERR_invalid_sb_downgrade;
		}
	}

	return 0;
}

static void bch2_sb_downgrade_to_text(struct printbuf *out, struct bch_sb *sb,
				      struct bch_sb_field *f)
{
	struct bch_sb_field_downgrade *e = field_to_type(f, downgrade);

	if (out->nr_tabstops <= 1)
		printbuf_tabstop_push(out, 16);

	for_each_downgrade_entry(e, i) {
		prt_str(out, "version:\t");
		bch2_version_to_text(out, le16_to_cpu(i->version));
		prt_newline(out);

		prt_str(out, "recovery passes:\t");
		prt_bitflags(out, bch2_recovery_passes,
			     bch2_recovery_passes_from_stable(le64_to_cpu(i->recovery_passes[0])));
		prt_newline(out);

		prt_str(out, "errors:\t");
		bool first = true;
		for (unsigned j = 0; j < le16_to_cpu(i->nr_errors); j++) {
			if (!first)
				prt_char(out, ',');
			first = false;
			bch2_sb_error_id_to_text(out, le16_to_cpu(i->errors[j]));
		}
		prt_newline(out);
	}
}

const struct bch_sb_field_ops bch_sb_field_ops_downgrade = {
	.validate	= bch2_sb_downgrade_validate,
	.to_text	= bch2_sb_downgrade_to_text,
};

int bch2_sb_downgrade_update(struct bch_fs *c)
{
	if (!test_bit(BCH_FS_btree_running, &c->flags))
		return 0;

	darray_char table = {};
	int ret = 0;

	for (const struct upgrade_downgrade_entry *src = downgrade_table;
	     src < downgrade_table + ARRAY_SIZE(downgrade_table);
	     src++) {
		if (BCH_VERSION_MAJOR(src->version) != BCH_VERSION_MAJOR(le16_to_cpu(c->disk_sb.sb->version)))
			continue;

		struct bch_sb_field_downgrade_entry *dst;
		unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * src->nr_errors;

		ret = darray_make_room(&table, bytes);
		if (ret)
			goto out;

		dst = (void *) &darray_top(table);
		dst->version = cpu_to_le16(src->version);
		dst->recovery_passes[0]	= cpu_to_le64(bch2_recovery_passes_to_stable(src->recovery_passes));
		dst->recovery_passes[1]	= 0;
		dst->nr_errors		= cpu_to_le16(src->nr_errors);
		for (unsigned i = 0; i < src->nr_errors; i++)
			dst->errors[i] = cpu_to_le16(src->errors[i]);

		ret = downgrade_table_extra(c, &table);
		if (ret)
			goto out;

		if (!dst->recovery_passes[0] &&
		    !dst->recovery_passes[1] &&
		    !dst->nr_errors)
			continue;

		table.nr += sizeof(*dst) + sizeof(dst->errors[0]) * le16_to_cpu(dst->nr_errors);
	}

	struct bch_sb_field_downgrade *d = bch2_sb_field_get(c->disk_sb.sb, downgrade);

	unsigned sb_u64s = DIV_ROUND_UP(sizeof(*d) + table.nr, sizeof(u64));

	if (d && le32_to_cpu(d->field.u64s) > sb_u64s)
		goto out;

	d = bch2_sb_field_resize(&c->disk_sb, downgrade, sb_u64s);
	if (!d) {
		ret = -BCH_ERR_ENOSPC_sb_downgrade;
		goto out;
	}

	memcpy(d->entries, table.data, table.nr);
	memset_u64s_tail(d->entries, 0, table.nr);
out:
	darray_exit(&table);
	return ret;
}

void bch2_sb_set_downgrade(struct bch_fs *c, unsigned new_minor, unsigned old_minor)
{
	struct bch_sb_field_downgrade *d = bch2_sb_field_get(c->disk_sb.sb, downgrade);
	if (!d)
		return;

	struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);

	for_each_downgrade_entry(d, i) {
		unsigned minor = BCH_VERSION_MINOR(le16_to_cpu(i->version));
		if (new_minor < minor && minor <= old_minor) {
			ext->recovery_passes_required[0] |= i->recovery_passes[0];
			ext->recovery_passes_required[1] |= i->recovery_passes[1];

			for (unsigned j = 0; j < le16_to_cpu(i->nr_errors); j++) {
				unsigned e = le16_to_cpu(i->errors[j]);
				if (e < BCH_FSCK_ERR_MAX)
					__set_bit(e, c->sb.errors_silent);
				if (e < sizeof(ext->errors_silent) * 8)
					__set_bit_le64(e, ext->errors_silent);
			}
		}
	}
}