bcachefs_format.h | Explore in Territory

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_FORMAT_H
#define _BCACHEFS_FORMAT_H

/*
 * bcachefs on disk data structures
 *
 * OVERVIEW:
 *
 * There are three main types of on disk data structures in bcachefs (this is
 * reduced from 5 in bcache)
 *
 *  - superblock
 *  - journal
 *  - btree
 *
 * The btree is the primary structure; most metadata exists as keys in the
 * various btrees. There are only a small number of btrees, they're not
 * sharded - we have one btree for extents, another for inodes, et cetera.
 *
 * SUPERBLOCK:
 *
 * The superblock contains the location of the journal, the list of devices in
 * the filesystem, and in general any metadata we need in order to decide
 * whether we can start a filesystem or prior to reading the journal/btree
 * roots.
 *
 * The superblock is extensible, and most of the contents of the superblock are
 * in variable length, type tagged fields; see struct bch_sb_field.
 *
 * Backup superblocks do not reside in a fixed location; also, superblocks do
 * not have a fixed size. To locate backup superblocks we have struct
 * bch_sb_layout; we store a copy of this inside every superblock, and also
 * before the first superblock.
 *
 * JOURNAL:
 *
 * The journal primarily records btree updates in the order they occurred;
 * journal replay consists of just iterating over all the keys in the open
 * journal entries and re-inserting them into the btrees.
 *
 * The journal also contains entry types for the btree roots, and blacklisted
 * journal sequence numbers (see journal_seq_blacklist.c).
 *
 * BTREE:
 *
 * bcachefs btrees are copy on write b+ trees, where nodes are big (typically
 * 128k-256k) and log structured. We use struct btree_node for writing the first
 * entry in a given node (offset 0), and struct btree_node_entry for all
 * subsequent writes.
 *
 * After the header, btree node entries contain a list of keys in sorted order.
 * Values are stored inline with the keys; since values are variable length (and
 * keys effectively are variable length too, due to packing) we can't do random
 * access without building up additional in memory tables in the btree node read
 * path.
 *
 * BTREE KEYS (struct bkey):
 *
 * The various btrees share a common format for the key - so as to avoid
 * switching in fastpath lookup/comparison code - but define their own
 * structures for the key values.
 *
 * The size of a key/value pair is stored as a u8 in units of u64s, so the max
 * size is just under 2k. The common part also contains a type tag for the
 * value, and a format field indicating whether the key is packed or not (and
 * also meant to allow adding new key fields in the future, if desired).
 *
 * bkeys, when stored within a btree node, may also be packed. In that case, the
 * bkey_format in that node is used to unpack it. Packed bkeys mean that we can
 * be generous with field sizes in the common part of the key format (64 bit
 * inode number, 64 bit offset, 96 bit version field, etc.) for negligible cost.
 */

#include <asm/types.h>
#include <asm/byteorder.h>
#include <linux/kernel.h>
#include <linux/uuid.h>
#include <uapi/linux/magic.h>
#include "vstructs.h"

#ifdef __KERNEL__
__uuid_t;
#endif

#define BITMASK(name, type, field, offset, end) …

#define LE_BITMASK(_bits, name, type, field, offset, end) …

#define LE16_BITMASK(n, t, f, o, e) …
#define LE32_BITMASK(n, t, f, o, e) …
#define LE64_BITMASK(n, t, f, o, e) …

struct bkey_format { … };

/* Btree keys - all units are in sectors */

struct bpos { … } __packed
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
__aligned(…)
#endif
;

#define KEY_INODE_MAX …
#define KEY_OFFSET_MAX …
#define KEY_SNAPSHOT_MAX …
#define KEY_SIZE_MAX …

static inline struct bpos SPOS(__u64 inode, __u64 offset, __u32 snapshot)
{ … }

#define POS_MIN …
#define POS_MAX …
#define SPOS_MAX …
#define POS(_inode, _offset) …

/* Empty placeholder struct, for container_of() */
struct bch_val { … };

struct bversion { … } __packed
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
__aligned(…)
#endif
;

struct bkey { … } __packed
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
/*
 * The big-endian version of bkey can't be compiled by rustc with the "aligned"
 * attr since it doesn't allow types to have both "packed" and "aligned" attrs.
 * So for Rust compatibility, don't include this. It can be included in the LE
 * version because the "packed" attr is redundant in that case.
 *
 * History: (quoting Kent)
 *
 * Specifically, when i was designing bkey, I wanted the header to be no
 * bigger than necessary so that bkey_packed could use the rest. That means that
 * decently offten extent keys will fit into only 8 bytes, instead of spilling over
 * to 16.
 *
 * But packed_bkey treats the part after the header - the packed section -
 * as a single multi word, variable length integer. And bkey, the unpacked
 * version, is just a special case version of a bkey_packed; all the packed
 * bkey code will work on keys in any packed format, the in-memory
 * representation of an unpacked key also is just one type of packed key...
 *
 * So that constrains the key part of a bkig endian bkey to start right
 * after the header.
 *
 * If we ever do a bkey_v2 and need to expand the hedaer by another byte for
 * some reason - that will clean up this wart.
 */
__aligned(…)
#endif
;

struct bkey_packed { … } __packed __aligned(…);

bch_le128;

#define BKEY_U64s …
#define BKEY_U64s_MAX …
#define BKEY_VAL_U64s_MAX …

#define KEY_PACKED_BITS_START …

#define KEY_FORMAT_LOCAL_BTREE …
#define KEY_FORMAT_CURRENT …

enum bch_bkey_fields { … };

#define bkey_format_field(name, field) …

#define BKEY_FORMAT_CURRENT …

/* bkey with inline value */
struct bkey_i { … };

#define POS_KEY(_pos) …

#define KEY(_inode, _offset, _size) …

static inline void bkey_init(struct bkey *k)
{ … }

#define bkey_bytes(_k) …

#define __BKEY_PADDED(key, pad) …

/*
 * - DELETED keys are used internally to mark keys that should be ignored but
 *   override keys in composition order.  Their version number is ignored.
 *
 * - DISCARDED keys indicate that the data is all 0s because it has been
 *   discarded. DISCARDs may have a version; if the version is nonzero the key
 *   will be persistent, otherwise the key will be dropped whenever the btree
 *   node is rewritten (like DELETED keys).
 *
 * - ERROR: any read of the data returns a read error, as the data was lost due
 *   to a failing device. Like DISCARDED keys, they can be removed (overridden)
 *   by new writes or cluster-wide GC. Node repair can also overwrite them with
 *   the same or a more recent version number, but not with an older version
 *   number.
 *
 * - WHITEOUT: for hash table btrees
 */
#define BCH_BKEY_TYPES() …

enum bch_bkey_type { … };

struct bch_deleted { … };

struct bch_whiteout { … };

struct bch_error { … };

struct bch_cookie { … };

struct bch_hash_whiteout { … };

struct bch_set { … };

/* 128 bits, sufficient for cryptographic MACs: */
struct bch_csum { … } __packed __aligned(…);

struct bch_backpointer { … } __packed __aligned(…);

/* Optional/variable size superblock sections: */

struct bch_sb_field { … };

#define BCH_SB_FIELDS() …

#include "alloc_background_format.h"
#include "dirent_format.h"
#include "disk_accounting_format.h"
#include "disk_groups_format.h"
#include "extents_format.h"
#include "ec_format.h"
#include "dirent_format.h"
#include "disk_groups_format.h"
#include "inode_format.h"
#include "journal_seq_blacklist_format.h"
#include "logged_ops_format.h"
#include "lru_format.h"
#include "quota_format.h"
#include "reflink_format.h"
#include "replicas_format.h"
#include "snapshot_format.h"
#include "subvolume_format.h"
#include "sb-counters_format.h"
#include "sb-downgrade_format.h"
#include "sb-errors_format.h"
#include "sb-members_format.h"
#include "xattr_format.h"

enum bch_sb_field_type { … };

/*
 * Most superblock fields are replicated in all device's superblocks - a few are
 * not:
 */
#define BCH_SINGLE_DEVICE_SB_FIELDS …

/* BCH_SB_FIELD_journal: */

struct bch_sb_field_journal { … };

struct bch_sb_field_journal_v2 { … };

/* BCH_SB_FIELD_crypt: */

struct nonce { … };

struct bch_key { … };

#define BCH_KEY_MAGIC …

struct bch_encrypted_key { … };

/*
 * If this field is present in the superblock, it stores an encryption key which
 * is used encrypt all other data/metadata. The key will normally be encrypted
 * with the key userspace provides, but if encryption has been turned off we'll
 * just store the master key unencrypted in the superblock so we can access the
 * previously encrypted data.
 */
struct bch_sb_field_crypt { … };

LE64_BITMASK(BCH_CRYPT_KDF_TYPE,	struct bch_sb_field_crypt, flags, 0, 4);

enum bch_kdf_types { … };

/* stored as base 2 log of scrypt params: */
LE64_BITMASK(BCH_KDF_SCRYPT_N,	struct bch_sb_field_crypt, kdf_flags,  0, 16);
LE64_BITMASK(BCH_KDF_SCRYPT_R,	struct bch_sb_field_crypt, kdf_flags, 16, 32);
LE64_BITMASK(BCH_KDF_SCRYPT_P,	struct bch_sb_field_crypt, kdf_flags, 32, 48);

/*
 * On clean shutdown, store btree roots and current journal sequence number in
 * the superblock:
 */
struct jset_entry { … };

struct bch_sb_field_clean { … };

struct bch_sb_field_ext { … };

/* Superblock: */

/*
 * New versioning scheme:
 * One common version number for all on disk data structures - superblock, btree
 * nodes, journal entries
 */
#define BCH_VERSION_MAJOR(_v) …
#define BCH_VERSION_MINOR(_v) …
#define BCH_VERSION(_major, _minor) …

/*
 * field 1:		version name
 * field 2:		BCH_VERSION(major, minor)
 * field 3:		recovery passess required on upgrade
 */
#define BCH_METADATA_VERSIONS() …

enum bcachefs_metadata_version { … };

static const __maybe_unused
unsigned bcachefs_metadata_required_upgrade_below = …;

#define bcachefs_metadata_version_current …

#define BCH_SB_SECTOR …

#define BCH_SB_LAYOUT_SIZE_BITS_MAX …

struct bch_sb_layout { … } __packed __aligned(…);

#define BCH_SB_LAYOUT_SECTOR …

/*
 * @offset	- sector where this sb was written
 * @version	- on disk format version
 * @version_min	- Oldest metadata version this filesystem contains; so we can
 *		  safely drop compatibility code and refuse to mount filesystems
 *		  we'd need it for
 * @magic	- identifies as a bcachefs superblock (BCHFS_MAGIC)
 * @seq		- incremented each time superblock is written
 * @uuid	- used for generating various magic numbers and identifying
 *                member devices, never changes
 * @user_uuid	- user visible UUID, may be changed
 * @label	- filesystem label
 * @seq		- identifies most recent superblock, incremented each time
 *		  superblock is written
 * @features	- enabled incompatible features
 */
struct bch_sb { … } __packed __aligned(…);

/*
 * Flags:
 * BCH_SB_INITALIZED	- set on first mount
 * BCH_SB_CLEAN		- did we shut down cleanly? Just a hint, doesn't affect
 *			  behaviour of mount/recovery path:
 * BCH_SB_INODE_32BIT	- limit inode numbers to 32 bits
 * BCH_SB_128_BIT_MACS	- 128 bit macs instead of 80
 * BCH_SB_ENCRYPTION_TYPE - if nonzero encryption is enabled; overrides
 *			   DATA/META_CSUM_TYPE. Also indicates encryption
 *			   algorithm in use, if/when we get more than one
 */

LE16_BITMASK(BCH_SB_BLOCK_SIZE,		struct bch_sb, block_size, 0, 16);

LE64_BITMASK(BCH_SB_INITIALIZED,	struct bch_sb, flags[0],  0,  1);
LE64_BITMASK(BCH_SB_CLEAN,		struct bch_sb, flags[0],  1,  2);
LE64_BITMASK(BCH_SB_CSUM_TYPE,		struct bch_sb, flags[0],  2,  8);
LE64_BITMASK(BCH_SB_ERROR_ACTION,	struct bch_sb, flags[0],  8, 12);

LE64_BITMASK(BCH_SB_BTREE_NODE_SIZE,	struct bch_sb, flags[0], 12, 28);

LE64_BITMASK(BCH_SB_GC_RESERVE,		struct bch_sb, flags[0], 28, 33);
LE64_BITMASK(BCH_SB_ROOT_RESERVE,	struct bch_sb, flags[0], 33, 40);

LE64_BITMASK(BCH_SB_META_CSUM_TYPE,	struct bch_sb, flags[0], 40, 44);
LE64_BITMASK(BCH_SB_DATA_CSUM_TYPE,	struct bch_sb, flags[0], 44, 48);

LE64_BITMASK(BCH_SB_META_REPLICAS_WANT,	struct bch_sb, flags[0], 48, 52);
LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT,	struct bch_sb, flags[0], 52, 56);

LE64_BITMASK(BCH_SB_POSIX_ACL,		struct bch_sb, flags[0], 56, 57);
LE64_BITMASK(BCH_SB_USRQUOTA,		struct bch_sb, flags[0], 57, 58);
LE64_BITMASK(BCH_SB_GRPQUOTA,		struct bch_sb, flags[0], 58, 59);
LE64_BITMASK(BCH_SB_PRJQUOTA,		struct bch_sb, flags[0], 59, 60);

LE64_BITMASK(BCH_SB_HAS_ERRORS,		struct bch_sb, flags[0], 60, 61);
LE64_BITMASK(BCH_SB_HAS_TOPOLOGY_ERRORS,struct bch_sb, flags[0], 61, 62);

LE64_BITMASK(BCH_SB_BIG_ENDIAN,		struct bch_sb, flags[0], 62, 63);

LE64_BITMASK(BCH_SB_STR_HASH_TYPE,	struct bch_sb, flags[1],  0,  4);
LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_LO,struct bch_sb, flags[1],  4,  8);
LE64_BITMASK(BCH_SB_INODE_32BIT,	struct bch_sb, flags[1],  8,  9);

LE64_BITMASK(BCH_SB_128_BIT_MACS,	struct bch_sb, flags[1],  9, 10);
LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE,	struct bch_sb, flags[1], 10, 14);

/*
 * Max size of an extent that may require bouncing to read or write
 * (checksummed, compressed): 64k
 */
LE64_BITMASK(BCH_SB_ENCODED_EXTENT_MAX_BITS,
					struct bch_sb, flags[1], 14, 20);

LE64_BITMASK(BCH_SB_META_REPLICAS_REQ,	struct bch_sb, flags[1], 20, 24);
LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ,	struct bch_sb, flags[1], 24, 28);

LE64_BITMASK(BCH_SB_PROMOTE_TARGET,	struct bch_sb, flags[1], 28, 40);
LE64_BITMASK(BCH_SB_FOREGROUND_TARGET,	struct bch_sb, flags[1], 40, 52);
LE64_BITMASK(BCH_SB_BACKGROUND_TARGET,	struct bch_sb, flags[1], 52, 64);

LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO,
					struct bch_sb, flags[2],  0,  4);
LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES,	struct bch_sb, flags[2],  4, 64);

LE64_BITMASK(BCH_SB_ERASURE_CODE,	struct bch_sb, flags[3],  0, 16);
LE64_BITMASK(BCH_SB_METADATA_TARGET,	struct bch_sb, flags[3], 16, 28);
LE64_BITMASK(BCH_SB_SHARD_INUMS,	struct bch_sb, flags[3], 28, 29);
LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30);
LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62);
LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63);
LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32);
LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33);
LE64_BITMASK(BCH_SB_NOCOW,		struct bch_sb, flags[4], 33, 34);
LE64_BITMASK(BCH_SB_WRITE_BUFFER_SIZE,	struct bch_sb, flags[4], 34, 54);
LE64_BITMASK(BCH_SB_VERSION_UPGRADE,	struct bch_sb, flags[4], 54, 56);

LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_HI,struct bch_sb, flags[4], 56, 60);
LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI,
					struct bch_sb, flags[4], 60, 64);

LE64_BITMASK(BCH_SB_VERSION_UPGRADE_COMPLETE,
					struct bch_sb, flags[5],  0, 16);
LE64_BITMASK(BCH_SB_ALLOCATOR_STUCK_TIMEOUT,
					struct bch_sb, flags[5], 16, 32);

static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)
{ … }

static inline void SET_BCH_SB_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v)
{ … }

static inline __u64 BCH_SB_BACKGROUND_COMPRESSION_TYPE(const struct bch_sb *sb)
{ … }

static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v)
{ … }

/*
 * Features:
 *
 * journal_seq_blacklist_v3:	gates BCH_SB_FIELD_journal_seq_blacklist
 * reflink:			gates KEY_TYPE_reflink
 * inline_data:			gates KEY_TYPE_inline_data
 * new_siphash:			gates BCH_STR_HASH_siphash
 * new_extent_overwrite:	gates BTREE_NODE_NEW_EXTENT_OVERWRITE
 */
#define BCH_SB_FEATURES() …

#define BCH_SB_FEATURES_ALWAYS …

#define BCH_SB_FEATURES_ALL …

enum bch_sb_feature { … };

#define BCH_SB_COMPAT() …

enum bch_sb_compat { … };

/* options: */

#define BCH_VERSION_UPGRADE_OPTS() …

enum bch_version_upgrade_opts { … };

#define BCH_REPLICAS_MAX …

#define BCH_BKEY_PTRS_MAX …

#define BCH_ERROR_ACTIONS() …

enum bch_error_actions { … };

#define BCH_STR_HASH_TYPES() …

enum bch_str_hash_type { … };

#define BCH_STR_HASH_OPTS() …

enum bch_str_hash_opts { … };

#define BCH_CSUM_TYPES() …

enum bch_csum_type { … };

static const __maybe_unused unsigned bch_crc_bytes[] = …;

static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type)
{ … }

#define BCH_CSUM_OPTS() …

enum bch_csum_opts { … };

#define BCH_COMPRESSION_TYPES() …

enum bch_compression_type { … };

#define BCH_COMPRESSION_OPTS() …

enum bch_compression_opts { … };

/*
 * Magic numbers
 *
 * The various other data structures have their own magic numbers, which are
 * xored with the first part of the cache set's UUID
 */

#define BCACHE_MAGIC …
#define BCHFS_MAGIC …

#define BCACHEFS_STATFS_MAGIC …

#define JSET_MAGIC …
#define BSET_MAGIC …

static inline __le64 __bch2_sb_magic(struct bch_sb *sb)
{ … }

static inline __u64 __jset_magic(struct bch_sb *sb)
{ … }

static inline __u64 __bset_magic(struct bch_sb *sb)
{ … }

/* Journal */

#define JSET_KEYS_U64s …

#define BCH_JSET_ENTRY_TYPES() …

enum bch_jset_entry_type { … };

static inline bool jset_entry_is_key(struct jset_entry *e)
{ … }

/*
 * Journal sequence numbers can be blacklisted: bsets record the max sequence
 * number of all the journal entries they contain updates for, so that on
 * recovery we can ignore those bsets that contain index updates newer that what
 * made it into the journal.
 *
 * This means that we can't reuse that journal_seq - we have to skip it, and
 * then record that we skipped it so that the next time we crash and recover we
 * don't think there was a missing journal entry.
 */
struct jset_entry_blacklist { … };

struct jset_entry_blacklist_v2 { … };

#define BCH_FS_USAGE_TYPES() …

enum bch_fs_usage_type { … };

struct jset_entry_usage { … } __packed;

struct jset_entry_data_usage { … } __packed;

struct jset_entry_clock { … } __packed;

struct jset_entry_dev_usage_type { … } __packed;

struct jset_entry_dev_usage { … };

static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage *u)
{ … }

struct jset_entry_log { … } __packed __aligned(…);

struct jset_entry_datetime { … } __packed __aligned(…);

/*
 * On disk format for a journal entry:
 * seq is monotonically increasing; every journal entry has its own unique
 * sequence number.
 *
 * last_seq is the oldest journal entry that still has keys the btree hasn't
 * flushed to disk yet.
 *
 * version is for on disk format changes.
 */
struct jset { … } __packed __aligned(…);

LE32_BITMASK(JSET_CSUM_TYPE,	struct jset, flags, 0, 4);
LE32_BITMASK(JSET_BIG_ENDIAN,	struct jset, flags, 4, 5);
LE32_BITMASK(JSET_NO_FLUSH,	struct jset, flags, 5, 6);

#define BCH_JOURNAL_BUCKETS_MIN …

/* Btree: */

enum btree_id_flags { … };

#define BCH_BTREE_IDS() …						\

enum btree_id { … };

/*
 * Maximum number of btrees that we will _ever_ have under the current scheme,
 * where we refer to them with 64 bit bitfields - and we also need a bit for
 * the interior btree node type:
 */
#define BTREE_ID_NR_MAX …

static inline bool btree_id_is_alloc(enum btree_id id)
{ … }

#define BTREE_MAX_DEPTH …

/* Btree nodes */

/*
 * Btree nodes
 *
 * On disk a btree node is a list/log of these; within each set the keys are
 * sorted
 */
struct bset { … } __packed __aligned(…);

LE32_BITMASK(BSET_CSUM_TYPE,	struct bset, flags, 0, 4);

LE32_BITMASK(BSET_BIG_ENDIAN,	struct bset, flags, 4, 5);
LE32_BITMASK(BSET_SEPARATE_WHITEOUTS,
				struct bset, flags, 5, 6);

/* Sector offset within the btree node: */
LE32_BITMASK(BSET_OFFSET,	struct bset, flags, 16, 32);

struct btree_node { … } __packed __aligned(…);

LE64_BITMASK(BTREE_NODE_ID_LO,	struct btree_node, flags,  0,  4);
LE64_BITMASK(BTREE_NODE_LEVEL,	struct btree_node, flags,  4,  8);
LE64_BITMASK(BTREE_NODE_NEW_EXTENT_OVERWRITE,
				struct btree_node, flags,  8,  9);
LE64_BITMASK(BTREE_NODE_ID_HI,	struct btree_node, flags,  9, 25);
/* 25-32 unused */
LE64_BITMASK(BTREE_NODE_SEQ,	struct btree_node, flags, 32, 64);

static inline __u64 BTREE_NODE_ID(struct btree_node *n)
{ … }

static inline void SET_BTREE_NODE_ID(struct btree_node *n, __u64 v)
{ … }

struct btree_node_entry { … } __packed __aligned(…);

#endif /* _BCACHEFS_FORMAT_H */
linux/fs/bcachefs/bcachefs_format.h