bcachefs.h | Explore in Territory

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_H
#define _BCACHEFS_H

/*
 * SOME HIGH LEVEL CODE DOCUMENTATION:
 *
 * Bcache mostly works with cache sets, cache devices, and backing devices.
 *
 * Support for multiple cache devices hasn't quite been finished off yet, but
 * it's about 95% plumbed through. A cache set and its cache devices is sort of
 * like a md raid array and its component devices. Most of the code doesn't care
 * about individual cache devices, the main abstraction is the cache set.
 *
 * Multiple cache devices is intended to give us the ability to mirror dirty
 * cached data and metadata, without mirroring clean cached data.
 *
 * Backing devices are different, in that they have a lifetime independent of a
 * cache set. When you register a newly formatted backing device it'll come up
 * in passthrough mode, and then you can attach and detach a backing device from
 * a cache set at runtime - while it's mounted and in use. Detaching implicitly
 * invalidates any cached data for that backing device.
 *
 * A cache set can have multiple (many) backing devices attached to it.
 *
 * There's also flash only volumes - this is the reason for the distinction
 * between struct cached_dev and struct bcache_device. A flash only volume
 * works much like a bcache device that has a backing device, except the
 * "cached" data is always dirty. The end result is that we get thin
 * provisioning with very little additional code.
 *
 * Flash only volumes work but they're not production ready because the moving
 * garbage collector needs more work. More on that later.
 *
 * BUCKETS/ALLOCATION:
 *
 * Bcache is primarily designed for caching, which means that in normal
 * operation all of our available space will be allocated. Thus, we need an
 * efficient way of deleting things from the cache so we can write new things to
 * it.
 *
 * To do this, we first divide the cache device up into buckets. A bucket is the
 * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+
 * works efficiently.
 *
 * Each bucket has a 16 bit priority, and an 8 bit generation associated with
 * it. The gens and priorities for all the buckets are stored contiguously and
 * packed on disk (in a linked list of buckets - aside from the superblock, all
 * of bcache's metadata is stored in buckets).
 *
 * The priority is used to implement an LRU. We reset a bucket's priority when
 * we allocate it or on cache it, and every so often we decrement the priority
 * of each bucket. It could be used to implement something more sophisticated,
 * if anyone ever gets around to it.
 *
 * The generation is used for invalidating buckets. Each pointer also has an 8
 * bit generation embedded in it; for a pointer to be considered valid, its gen
 * must match the gen of the bucket it points into.  Thus, to reuse a bucket all
 * we have to do is increment its gen (and write its new gen to disk; we batch
 * this up).
 *
 * Bcache is entirely COW - we never write twice to a bucket, even buckets that
 * contain metadata (including btree nodes).
 *
 * THE BTREE:
 *
 * Bcache is in large part design around the btree.
 *
 * At a high level, the btree is just an index of key -> ptr tuples.
 *
 * Keys represent extents, and thus have a size field. Keys also have a variable
 * number of pointers attached to them (potentially zero, which is handy for
 * invalidating the cache).
 *
 * The key itself is an inode:offset pair. The inode number corresponds to a
 * backing device or a flash only volume. The offset is the ending offset of the
 * extent within the inode - not the starting offset; this makes lookups
 * slightly more convenient.
 *
 * Pointers contain the cache device id, the offset on that device, and an 8 bit
 * generation number. More on the gen later.
 *
 * Index lookups are not fully abstracted - cache lookups in particular are
 * still somewhat mixed in with the btree code, but things are headed in that
 * direction.
 *
 * Updates are fairly well abstracted, though. There are two different ways of
 * updating the btree; insert and replace.
 *
 * BTREE_INSERT will just take a list of keys and insert them into the btree -
 * overwriting (possibly only partially) any extents they overlap with. This is
 * used to update the index after a write.
 *
 * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is
 * overwriting a key that matches another given key. This is used for inserting
 * data into the cache after a cache miss, and for background writeback, and for
 * the moving garbage collector.
 *
 * There is no "delete" operation; deleting things from the index is
 * accomplished by either by invalidating pointers (by incrementing a bucket's
 * gen) or by inserting a key with 0 pointers - which will overwrite anything
 * previously present at that location in the index.
 *
 * This means that there are always stale/invalid keys in the btree. They're
 * filtered out by the code that iterates through a btree node, and removed when
 * a btree node is rewritten.
 *
 * BTREE NODES:
 *
 * Our unit of allocation is a bucket, and we can't arbitrarily allocate and
 * free smaller than a bucket - so, that's how big our btree nodes are.
 *
 * (If buckets are really big we'll only use part of the bucket for a btree node
 * - no less than 1/4th - but a bucket still contains no more than a single
 * btree node. I'd actually like to change this, but for now we rely on the
 * bucket's gen for deleting btree nodes when we rewrite/split a node.)
 *
 * Anyways, btree nodes are big - big enough to be inefficient with a textbook
 * btree implementation.
 *
 * The way this is solved is that btree nodes are internally log structured; we
 * can append new keys to an existing btree node without rewriting it. This
 * means each set of keys we write is sorted, but the node is not.
 *
 * We maintain this log structure in memory - keeping 1Mb of keys sorted would
 * be expensive, and we have to distinguish between the keys we have written and
 * the keys we haven't. So to do a lookup in a btree node, we have to search
 * each sorted set. But we do merge written sets together lazily, so the cost of
 * these extra searches is quite low (normally most of the keys in a btree node
 * will be in one big set, and then there'll be one or two sets that are much
 * smaller).
 *
 * This log structure makes bcache's btree more of a hybrid between a
 * conventional btree and a compacting data structure, with some of the
 * advantages of both.
 *
 * GARBAGE COLLECTION:
 *
 * We can't just invalidate any bucket - it might contain dirty data or
 * metadata. If it once contained dirty data, other writes might overwrite it
 * later, leaving no valid pointers into that bucket in the index.
 *
 * Thus, the primary purpose of garbage collection is to find buckets to reuse.
 * It also counts how much valid data it each bucket currently contains, so that
 * allocation can reuse buckets sooner when they've been mostly overwritten.
 *
 * It also does some things that are really internal to the btree
 * implementation. If a btree node contains pointers that are stale by more than
 * some threshold, it rewrites the btree node to avoid the bucket's generation
 * wrapping around. It also merges adjacent btree nodes if they're empty enough.
 *
 * THE JOURNAL:
 *
 * Bcache's journal is not necessary for consistency; we always strictly
 * order metadata writes so that the btree and everything else is consistent on
 * disk in the event of an unclean shutdown, and in fact bcache had writeback
 * caching (with recovery from unclean shutdown) before journalling was
 * implemented.
 *
 * Rather, the journal is purely a performance optimization; we can't complete a
 * write until we've updated the index on disk, otherwise the cache would be
 * inconsistent in the event of an unclean shutdown. This means that without the
 * journal, on random write workloads we constantly have to update all the leaf
 * nodes in the btree, and those writes will be mostly empty (appending at most
 * a few keys each) - highly inefficient in terms of amount of metadata writes,
 * and it puts more strain on the various btree resorting/compacting code.
 *
 * The journal is just a log of keys we've inserted; on startup we just reinsert
 * all the keys in the open journal entries. That means that when we're updating
 * a node in the btree, we can wait until a 4k block of keys fills up before
 * writing them out.
 *
 * For simplicity, we only journal updates to leaf nodes; updates to parent
 * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth
 * the complexity to deal with journalling them (in particular, journal replay)
 * - updates to non leaf nodes just happen synchronously (see btree_split()).
 */

#undef pr_fmt
#ifdef __KERNEL__
#define pr_fmt(fmt) …
#else
#define pr_fmt …
#endif

#include <linux/backing-dev-defs.h>
#include <linux/bug.h>
#include <linux/bio.h>
#include <linux/closure.h>
#include <linux/kobject.h>
#include <linux/list.h>
#include <linux/math64.h>
#include <linux/mutex.h>
#include <linux/percpu-refcount.h>
#include <linux/percpu-rwsem.h>
#include <linux/refcount.h>
#include <linux/rhashtable.h>
#include <linux/rwsem.h>
#include <linux/semaphore.h>
#include <linux/seqlock.h>
#include <linux/shrinker.h>
#include <linux/srcu.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <linux/zstd.h>

#include "bcachefs_format.h"
#include "disk_accounting_types.h"
#include "errcode.h"
#include "fifo.h"
#include "nocow_locking_types.h"
#include "opts.h"
#include "recovery_passes_types.h"
#include "sb-errors_types.h"
#include "seqmutex.h"
#include "time_stats.h"
#include "util.h"

#ifdef CONFIG_BCACHEFS_DEBUG
#define BCH_WRITE_REF_DEBUG
#endif

#ifndef dynamic_fault
#define dynamic_fault(...) …
#endif

#define race_fault(...) …

#define count_event(_c, _name) …

#define trace_and_count(_c, _name, ...) …

#define bch2_fs_init_fault(name) …
#define bch2_meta_read_fault(name) …
#define bch2_meta_write_fault(name) …

#ifdef __KERNEL__
#define BCACHEFS_LOG_PREFIX
#endif

#ifdef BCACHEFS_LOG_PREFIX

#define bch2_log_msg(_c, fmt) …
#define bch2_fmt_dev(_ca, fmt) …
#define bch2_fmt_dev_offset(_ca, _offset, fmt) …
#define bch2_fmt_inum(_c, _inum, fmt) …
#define bch2_fmt_inum_offset(_c, _inum, _offset, fmt) …

#else

#define bch2_log_msg …
#define bch2_fmt_dev …
#define bch2_fmt_dev_offset …
#define bch2_fmt_inum …
#define bch2_fmt_inum_offset …

#endif

#define bch2_fmt(_c, fmt) …

void bch2_print_str(struct bch_fs *, const char *);

__printf(2, 3)
void bch2_print_opts(struct bch_opts *, const char *, ...);

__printf(2, 3)
void __bch2_print(struct bch_fs *c, const char *fmt, ...);

#define maybe_dev_to_fs(_c) …

#define bch2_print(_c, ...) …

#define bch2_print_ratelimited(_c, ...) …

#define bch_info(c, fmt, ...) …
#define bch_notice(c, fmt, ...) …
#define bch_warn(c, fmt, ...) …
#define bch_warn_ratelimited(c, fmt, ...) …

#define bch_err(c, fmt, ...) …
#define bch_err_dev(ca, fmt, ...) …
#define bch_err_dev_offset(ca, _offset, fmt, ...) …
#define bch_err_inum(c, _inum, fmt, ...) …
#define bch_err_inum_offset(c, _inum, _offset, fmt, ...) …

#define bch_err_ratelimited(c, fmt, ...) …
#define bch_err_dev_ratelimited(ca, fmt, ...) …
#define bch_err_dev_offset_ratelimited(ca, _offset, fmt, ...) …
#define bch_err_inum_ratelimited(c, _inum, fmt, ...) …
#define bch_err_inum_offset_ratelimited(c, _inum, _offset, fmt, ...) …

static inline bool should_print_err(int err)
{ … }

#define bch_err_fn(_c, _ret) …

#define bch_err_fn_ratelimited(_c, _ret) …

#define bch_err_msg(_c, _ret, _msg, ...) …

#define bch_verbose(c, fmt, ...) …

#define pr_verbose_init(opts, fmt, ...) …

/* Parameters that are useful for debugging, but should always be compiled in: */
#define BCH_DEBUG_PARAMS_ALWAYS() …

/* Parameters that should only be compiled in debug mode: */
#define BCH_DEBUG_PARAMS_DEBUG() …

#define BCH_DEBUG_PARAMS_ALL() …

#ifdef CONFIG_BCACHEFS_DEBUG
#define BCH_DEBUG_PARAMS() …
#else
#define BCH_DEBUG_PARAMS …
#endif

#define BCH_DEBUG_PARAM …
BCH_DEBUG_PARAMS(…)
#undef BCH_DEBUG_PARAM

#ifndef CONFIG_BCACHEFS_DEBUG
#define BCH_DEBUG_PARAM …
BCH_DEBUG_PARAMS_DEBUG()
#undef BCH_DEBUG_PARAM
#endif

#define BCH_TIME_STATS() …

enum bch_time_stats { … };

#include "alloc_types.h"
#include "btree_gc_types.h"
#include "btree_types.h"
#include "btree_node_scan_types.h"
#include "btree_write_buffer_types.h"
#include "buckets_types.h"
#include "buckets_waiting_for_journal_types.h"
#include "clock_types.h"
#include "disk_groups_types.h"
#include "ec_types.h"
#include "journal_types.h"
#include "keylist_types.h"
#include "quota_types.h"
#include "rebalance_types.h"
#include "replicas_types.h"
#include "sb-members_types.h"
#include "subvolume_types.h"
#include "super_types.h"
#include "thread_with_file_types.h"

/* Number of nodes btree coalesce will try to coalesce at once */
#define GC_MERGE_NODES …

/* Maximum number of nodes we might need to allocate atomically: */
#define BTREE_RESERVE_MAX …

/* Size of the freelist we allocate btree nodes from: */
#define BTREE_NODE_RESERVE …

#define BTREE_NODE_OPEN_BUCKET_RESERVE …

struct btree;

struct io_count { … };

struct discard_in_flight { … };

struct bch_dev { … };

/*
 * initial_gc_unfixed
 * error
 * topology error
 */

#define BCH_FS_FLAGS() …

enum bch_fs_flags { … };

struct btree_debug { … };

#define BCH_TRANSACTIONS_NR …

struct btree_transaction_stats { … };

struct bch_fs_pcpu { … };

struct journal_seq_blacklist_table { … };

struct journal_keys { … };

struct btree_trans_buf { … };

#define BCACHEFS_ROOT_SUBVOL_INUM …

#define BCH_WRITE_REFS() …

enum bch_write_ref { … };

struct bch_fs { … };

extern struct wait_queue_head bch2_read_only_wait;

static inline void bch2_write_ref_get(struct bch_fs *c, enum bch_write_ref ref)
{ … }

static inline bool __bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref)
{ … }

static inline bool bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref)
{ … }

static inline void bch2_write_ref_put(struct bch_fs *c, enum bch_write_ref ref)
{ … }

static inline bool bch2_ro_ref_tryget(struct bch_fs *c)
{ … }

static inline void bch2_ro_ref_put(struct bch_fs *c)
{ … }

static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)
{ … }

static inline unsigned bucket_bytes(const struct bch_dev *ca)
{ … }

static inline unsigned block_bytes(const struct bch_fs *c)
{ … }

static inline unsigned block_sectors(const struct bch_fs *c)
{ … }

static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree)
{ … }

static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time)
{ … }

static inline s64 timespec_to_bch2_time(const struct bch_fs *c, struct timespec64 ts)
{ … }

static inline s64 bch2_current_time(const struct bch_fs *c)
{ … }

static inline u64 bch2_current_io_time(const struct bch_fs *c, int rw)
{ … }

static inline struct stdio_redirect *bch2_fs_stdio_redirect(struct bch_fs *c)
{ … }

static inline unsigned metadata_replicas_required(struct bch_fs *c)
{ … }

static inline unsigned data_replicas_required(struct bch_fs *c)
{ … }

#define BKEY_PADDED_ONSTACK(key, pad) …

#endif /* _BCACHEFS_H */
linux/fs/bcachefs/bcachefs.h