/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_H #define _BCACHEFS_H /* * SOME HIGH LEVEL CODE DOCUMENTATION: * * Bcache mostly works with cache sets, cache devices, and backing devices. * * Support for multiple cache devices hasn't quite been finished off yet, but * it's about 95% plumbed through. A cache set and its cache devices is sort of * like a md raid array and its component devices. Most of the code doesn't care * about individual cache devices, the main abstraction is the cache set. * * Multiple cache devices is intended to give us the ability to mirror dirty * cached data and metadata, without mirroring clean cached data. * * Backing devices are different, in that they have a lifetime independent of a * cache set. When you register a newly formatted backing device it'll come up * in passthrough mode, and then you can attach and detach a backing device from * a cache set at runtime - while it's mounted and in use. Detaching implicitly * invalidates any cached data for that backing device. * * A cache set can have multiple (many) backing devices attached to it. * * There's also flash only volumes - this is the reason for the distinction * between struct cached_dev and struct bcache_device. A flash only volume * works much like a bcache device that has a backing device, except the * "cached" data is always dirty. The end result is that we get thin * provisioning with very little additional code. * * Flash only volumes work but they're not production ready because the moving * garbage collector needs more work. More on that later. * * BUCKETS/ALLOCATION: * * Bcache is primarily designed for caching, which means that in normal * operation all of our available space will be allocated. Thus, we need an * efficient way of deleting things from the cache so we can write new things to * it. * * To do this, we first divide the cache device up into buckets. A bucket is the * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+ * works efficiently. * * Each bucket has a 16 bit priority, and an 8 bit generation associated with * it. The gens and priorities for all the buckets are stored contiguously and * packed on disk (in a linked list of buckets - aside from the superblock, all * of bcache's metadata is stored in buckets). * * The priority is used to implement an LRU. We reset a bucket's priority when * we allocate it or on cache it, and every so often we decrement the priority * of each bucket. It could be used to implement something more sophisticated, * if anyone ever gets around to it. * * The generation is used for invalidating buckets. Each pointer also has an 8 * bit generation embedded in it; for a pointer to be considered valid, its gen * must match the gen of the bucket it points into. Thus, to reuse a bucket all * we have to do is increment its gen (and write its new gen to disk; we batch * this up). * * Bcache is entirely COW - we never write twice to a bucket, even buckets that * contain metadata (including btree nodes). * * THE BTREE: * * Bcache is in large part design around the btree. * * At a high level, the btree is just an index of key -> ptr tuples. * * Keys represent extents, and thus have a size field. Keys also have a variable * number of pointers attached to them (potentially zero, which is handy for * invalidating the cache). * * The key itself is an inode:offset pair. The inode number corresponds to a * backing device or a flash only volume. The offset is the ending offset of the * extent within the inode - not the starting offset; this makes lookups * slightly more convenient. * * Pointers contain the cache device id, the offset on that device, and an 8 bit * generation number. More on the gen later. * * Index lookups are not fully abstracted - cache lookups in particular are * still somewhat mixed in with the btree code, but things are headed in that * direction. * * Updates are fairly well abstracted, though. There are two different ways of * updating the btree; insert and replace. * * BTREE_INSERT will just take a list of keys and insert them into the btree - * overwriting (possibly only partially) any extents they overlap with. This is * used to update the index after a write. * * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is * overwriting a key that matches another given key. This is used for inserting * data into the cache after a cache miss, and for background writeback, and for * the moving garbage collector. * * There is no "delete" operation; deleting things from the index is * accomplished by either by invalidating pointers (by incrementing a bucket's * gen) or by inserting a key with 0 pointers - which will overwrite anything * previously present at that location in the index. * * This means that there are always stale/invalid keys in the btree. They're * filtered out by the code that iterates through a btree node, and removed when * a btree node is rewritten. * * BTREE NODES: * * Our unit of allocation is a bucket, and we can't arbitrarily allocate and * free smaller than a bucket - so, that's how big our btree nodes are. * * (If buckets are really big we'll only use part of the bucket for a btree node * - no less than 1/4th - but a bucket still contains no more than a single * btree node. I'd actually like to change this, but for now we rely on the * bucket's gen for deleting btree nodes when we rewrite/split a node.) * * Anyways, btree nodes are big - big enough to be inefficient with a textbook * btree implementation. * * The way this is solved is that btree nodes are internally log structured; we * can append new keys to an existing btree node without rewriting it. This * means each set of keys we write is sorted, but the node is not. * * We maintain this log structure in memory - keeping 1Mb of keys sorted would * be expensive, and we have to distinguish between the keys we have written and * the keys we haven't. So to do a lookup in a btree node, we have to search * each sorted set. But we do merge written sets together lazily, so the cost of * these extra searches is quite low (normally most of the keys in a btree node * will be in one big set, and then there'll be one or two sets that are much * smaller). * * This log structure makes bcache's btree more of a hybrid between a * conventional btree and a compacting data structure, with some of the * advantages of both. * * GARBAGE COLLECTION: * * We can't just invalidate any bucket - it might contain dirty data or * metadata. If it once contained dirty data, other writes might overwrite it * later, leaving no valid pointers into that bucket in the index. * * Thus, the primary purpose of garbage collection is to find buckets to reuse. * It also counts how much valid data it each bucket currently contains, so that * allocation can reuse buckets sooner when they've been mostly overwritten. * * It also does some things that are really internal to the btree * implementation. If a btree node contains pointers that are stale by more than * some threshold, it rewrites the btree node to avoid the bucket's generation * wrapping around. It also merges adjacent btree nodes if they're empty enough. * * THE JOURNAL: * * Bcache's journal is not necessary for consistency; we always strictly * order metadata writes so that the btree and everything else is consistent on * disk in the event of an unclean shutdown, and in fact bcache had writeback * caching (with recovery from unclean shutdown) before journalling was * implemented. * * Rather, the journal is purely a performance optimization; we can't complete a * write until we've updated the index on disk, otherwise the cache would be * inconsistent in the event of an unclean shutdown. This means that without the * journal, on random write workloads we constantly have to update all the leaf * nodes in the btree, and those writes will be mostly empty (appending at most * a few keys each) - highly inefficient in terms of amount of metadata writes, * and it puts more strain on the various btree resorting/compacting code. * * The journal is just a log of keys we've inserted; on startup we just reinsert * all the keys in the open journal entries. That means that when we're updating * a node in the btree, we can wait until a 4k block of keys fills up before * writing them out. * * For simplicity, we only journal updates to leaf nodes; updates to parent * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth * the complexity to deal with journalling them (in particular, journal replay) * - updates to non leaf nodes just happen synchronously (see btree_split()). */ #undef pr_fmt #ifdef __KERNEL__ #define pr_fmt(fmt) … #else #define pr_fmt … #endif #include <linux/backing-dev-defs.h> #include <linux/bug.h> #include <linux/bio.h> #include <linux/closure.h> #include <linux/kobject.h> #include <linux/list.h> #include <linux/math64.h> #include <linux/mutex.h> #include <linux/percpu-refcount.h> #include <linux/percpu-rwsem.h> #include <linux/refcount.h> #include <linux/rhashtable.h> #include <linux/rwsem.h> #include <linux/semaphore.h> #include <linux/seqlock.h> #include <linux/shrinker.h> #include <linux/srcu.h> #include <linux/types.h> #include <linux/workqueue.h> #include <linux/zstd.h> #include "bcachefs_format.h" #include "disk_accounting_types.h" #include "errcode.h" #include "fifo.h" #include "nocow_locking_types.h" #include "opts.h" #include "recovery_passes_types.h" #include "sb-errors_types.h" #include "seqmutex.h" #include "time_stats.h" #include "util.h" #ifdef CONFIG_BCACHEFS_DEBUG #define BCH_WRITE_REF_DEBUG #endif #ifndef dynamic_fault #define dynamic_fault(...) … #endif #define race_fault(...) … #define count_event(_c, _name) … #define trace_and_count(_c, _name, ...) … #define bch2_fs_init_fault(name) … #define bch2_meta_read_fault(name) … #define bch2_meta_write_fault(name) … #ifdef __KERNEL__ #define BCACHEFS_LOG_PREFIX #endif #ifdef BCACHEFS_LOG_PREFIX #define bch2_log_msg(_c, fmt) … #define bch2_fmt_dev(_ca, fmt) … #define bch2_fmt_dev_offset(_ca, _offset, fmt) … #define bch2_fmt_inum(_c, _inum, fmt) … #define bch2_fmt_inum_offset(_c, _inum, _offset, fmt) … #else #define bch2_log_msg … #define bch2_fmt_dev … #define bch2_fmt_dev_offset … #define bch2_fmt_inum … #define bch2_fmt_inum_offset … #endif #define bch2_fmt(_c, fmt) … void bch2_print_str(struct bch_fs *, const char *); __printf(2, 3) void bch2_print_opts(struct bch_opts *, const char *, ...); __printf(2, 3) void __bch2_print(struct bch_fs *c, const char *fmt, ...); #define maybe_dev_to_fs(_c) … #define bch2_print(_c, ...) … #define bch2_print_ratelimited(_c, ...) … #define bch_info(c, fmt, ...) … #define bch_notice(c, fmt, ...) … #define bch_warn(c, fmt, ...) … #define bch_warn_ratelimited(c, fmt, ...) … #define bch_err(c, fmt, ...) … #define bch_err_dev(ca, fmt, ...) … #define bch_err_dev_offset(ca, _offset, fmt, ...) … #define bch_err_inum(c, _inum, fmt, ...) … #define bch_err_inum_offset(c, _inum, _offset, fmt, ...) … #define bch_err_ratelimited(c, fmt, ...) … #define bch_err_dev_ratelimited(ca, fmt, ...) … #define bch_err_dev_offset_ratelimited(ca, _offset, fmt, ...) … #define bch_err_inum_ratelimited(c, _inum, fmt, ...) … #define bch_err_inum_offset_ratelimited(c, _inum, _offset, fmt, ...) … static inline bool should_print_err(int err) { … } #define bch_err_fn(_c, _ret) … #define bch_err_fn_ratelimited(_c, _ret) … #define bch_err_msg(_c, _ret, _msg, ...) … #define bch_verbose(c, fmt, ...) … #define pr_verbose_init(opts, fmt, ...) … /* Parameters that are useful for debugging, but should always be compiled in: */ #define BCH_DEBUG_PARAMS_ALWAYS() … /* Parameters that should only be compiled in debug mode: */ #define BCH_DEBUG_PARAMS_DEBUG() … #define BCH_DEBUG_PARAMS_ALL() … #ifdef CONFIG_BCACHEFS_DEBUG #define BCH_DEBUG_PARAMS() … #else #define BCH_DEBUG_PARAMS … #endif #define BCH_DEBUG_PARAM … BCH_DEBUG_PARAMS(…) #undef BCH_DEBUG_PARAM #ifndef CONFIG_BCACHEFS_DEBUG #define BCH_DEBUG_PARAM … BCH_DEBUG_PARAMS_DEBUG() #undef BCH_DEBUG_PARAM #endif #define BCH_TIME_STATS() … enum bch_time_stats { … }; #include "alloc_types.h" #include "btree_gc_types.h" #include "btree_types.h" #include "btree_node_scan_types.h" #include "btree_write_buffer_types.h" #include "buckets_types.h" #include "buckets_waiting_for_journal_types.h" #include "clock_types.h" #include "disk_groups_types.h" #include "ec_types.h" #include "journal_types.h" #include "keylist_types.h" #include "quota_types.h" #include "rebalance_types.h" #include "replicas_types.h" #include "sb-members_types.h" #include "subvolume_types.h" #include "super_types.h" #include "thread_with_file_types.h" /* Number of nodes btree coalesce will try to coalesce at once */ #define GC_MERGE_NODES … /* Maximum number of nodes we might need to allocate atomically: */ #define BTREE_RESERVE_MAX … /* Size of the freelist we allocate btree nodes from: */ #define BTREE_NODE_RESERVE … #define BTREE_NODE_OPEN_BUCKET_RESERVE … struct btree; struct io_count { … }; struct discard_in_flight { … }; struct bch_dev { … }; /* * initial_gc_unfixed * error * topology error */ #define BCH_FS_FLAGS() … enum bch_fs_flags { … }; struct btree_debug { … }; #define BCH_TRANSACTIONS_NR … struct btree_transaction_stats { … }; struct bch_fs_pcpu { … }; struct journal_seq_blacklist_table { … }; struct journal_keys { … }; struct btree_trans_buf { … }; #define BCACHEFS_ROOT_SUBVOL_INUM … #define BCH_WRITE_REFS() … enum bch_write_ref { … }; struct bch_fs { … }; extern struct wait_queue_head bch2_read_only_wait; static inline void bch2_write_ref_get(struct bch_fs *c, enum bch_write_ref ref) { … } static inline bool __bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref) { … } static inline bool bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref) { … } static inline void bch2_write_ref_put(struct bch_fs *c, enum bch_write_ref ref) { … } static inline bool bch2_ro_ref_tryget(struct bch_fs *c) { … } static inline void bch2_ro_ref_put(struct bch_fs *c) { … } static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages) { … } static inline unsigned bucket_bytes(const struct bch_dev *ca) { … } static inline unsigned block_bytes(const struct bch_fs *c) { … } static inline unsigned block_sectors(const struct bch_fs *c) { … } static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree) { … } static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time) { … } static inline s64 timespec_to_bch2_time(const struct bch_fs *c, struct timespec64 ts) { … } static inline s64 bch2_current_time(const struct bch_fs *c) { … } static inline u64 bch2_current_io_time(const struct bch_fs *c, int rw) { … } static inline struct stdio_redirect *bch2_fs_stdio_redirect(struct bch_fs *c) { … } static inline unsigned metadata_replicas_required(struct bch_fs *c) { … } static inline unsigned data_replicas_required(struct bch_fs *c) { … } #define BKEY_PADDED_ONSTACK(key, pad) … #endif /* _BCACHEFS_H */