linux/drivers/md/bcache/btree.c

// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2010 Kent Overstreet <[email protected]>
 *
 * Uses a block device as cache for other block devices; optimized for SSDs.
 * All allocation is done in buckets, which should match the erase block size
 * of the device.
 *
 * Buckets containing cached data are kept on a heap sorted by priority;
 * bucket priority is increased on cache hit, and periodically all the buckets
 * on the heap have their priority scaled down. Currently this is just used as
 * an LRU, but in the future it should allow for more intelligent heuristics.
 *
 * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
 * counter. Garbage collection is used to remove stale pointers.
 *
 * Indexing is done via a btree; nodes are not necessarily fully sorted. Rather,
 * as keys are inserted we only sort the pages that have not yet been written.
 * When garbage collection is run, we re-sort the entire node.
 *
 * All configuration is done via sysfs; see Documentation/admin-guide/bcache.rst.
 */

#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "extents.h"

#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/hash.h>
#include <linux/kthread.h>
#include <linux/prefetch.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/sched/clock.h>
#include <linux/rculist.h>
#include <linux/delay.h>
#include <trace/events/bcache.h>
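
/*
 * Illustrative sketch of the generation scheme described in the header
 * comment above; the struct and helper names are hypothetical and not part
 * of this file. Each bucket carries an 8 bit generation, each pointer
 * records the generation it was created with, and a pointer becomes stale
 * (and reclaimable by garbage collection) once the bucket's generation has
 * been incremented past it.
 */
struct example_bucket {
	uint8_t			gen;	/* bumped to invalidate all pointers */
};

struct example_ptr {
	uint64_t		offset;	/* sector offset into the cache device */
	uint8_t			gen;	/* generation the pointer was created with */
};

static inline bool example_ptr_stale(const struct example_bucket *b,
				     const struct example_ptr *p)
{
	/* signed 8 bit difference tolerates generation wraparound */
	return (int8_t)(b->gen - p->gen) > 0;
}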

/*
 * Todo:
 * register_bcache: Return errors out to userspace correctly
 *
 * Writeback: don't undirty key until after a cache flush
 *
 * Create an iterator for key pointers
 *
 * On btree write error, mark bucket such that it won't be freed from the cache
 *
 * Journalling:
 *   Check for bad keys in replay
 *   Propagate barriers
 *   Refcount journal entries in journal_replay
 *
 * Garbage collection:
 *   Finish incremental gc
 *   Gc should free old UUIDs, data for invalid UUIDs
 *
 * Provide a way to list backing device UUIDs we have data cached for, and
 * probably how long it's been since we've seen them, and a way to invalidate
 * dirty data for devices that will never be attached again
 *
 * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so
 * that based on that and how much dirty data we have we can keep writeback
 * from being starved
 *
 * Add a tracepoint or somesuch to watch for writeback starvation
 *
 * When btree depth > 1 and splitting an interior node, we have to make sure
 * alloc_bucket() cannot fail. This should be true but is not completely
 * obvious.
 *
 * Plugging?
 *
 * If data write is less than hard sector size of ssd, round up offset in open
 * bucket to the next whole sector
 *
 * Superblock needs to be fleshed out for multiple cache devices
 *
 * Add a sysfs tunable for the number of writeback IOs in flight
 *
 * Add a sysfs tunable for the number of open data buckets
 *
 * IO tracking: Can we track when one process is doing io on behalf of another?
 * IO tracking: Don't use just an average, weigh more recent stuff higher
 *
 * Test module load/unload
 */

#define MAX_NEED_GC
#define MAX_SAVE_PRIO
#define MAX_GC_TIMES
#define MIN_GC_NODES
#define GC_SLEEP_MS

#define PTR_DIRTY_BIT

#define PTR_HASH(c, k)

static struct workqueue_struct *btree_io_wq;

#define insert_lock(s, b)

static inline struct bset *write_block(struct btree *b)
{}

static void bch_btree_init_next(struct btree *b)
{}

/* Btree key manipulation */

void bkey_put(struct cache_set *c, struct bkey *k)
{}

/* Btree IO */

static uint64_t btree_csum_set(struct btree *b, struct bset *i)
{}

void bch_btree_node_read_done(struct btree *b)
{}

static void btree_node_read_endio(struct bio *bio)
{}

static void bch_btree_node_read(struct btree *b)
{}

static void btree_complete_write(struct btree *b, struct btree_write *w)
{}

static CLOSURE_CALLBACK(btree_node_write_unlock)
{}

static CLOSURE_CALLBACK(__btree_node_write_done)
{}

static CLOSURE_CALLBACK(btree_node_write_done)
{}

static void btree_node_write_endio(struct bio *bio)
{}

static void do_btree_node_write(struct btree *b)
{}

void __bch_btree_node_write(struct btree *b, struct closure *parent)
{}

void bch_btree_node_write(struct btree *b, struct closure *parent)
{}

static void bch_btree_node_write_sync(struct btree *b)
{}

static void btree_node_write_work(struct work_struct *w)
{}

static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
{}

/*
 * Btree in memory cache - allocation/freeing
 * mca -> memory cache
 */

#define mca_reserve(c)
#define mca_can_free(c)

static void mca_data_free(struct btree *b)
{}

static void mca_bucket_free(struct btree *b)
{}

static unsigned int btree_order(struct bkey *k)
{}

static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
{}

#define cmp_int(l, r)

#ifdef CONFIG_PROVE_LOCKING
static int btree_lock_cmp_fn(const struct lockdep_map *_a,
			     const struct lockdep_map *_b)
{}

static void btree_lock_print_fn(const struct lockdep_map *map)
{}
#endif

static struct btree *mca_bucket_alloc(struct cache_set *c,
				      struct bkey *k, gfp_t gfp)
{}

static int mca_reap(struct btree *b, unsigned int min_order, bool flush)
{}

static unsigned long bch_mca_scan(struct shrinker *shrink,
				  struct shrink_control *sc)
{}

static unsigned long bch_mca_count(struct shrinker *shrink,
				   struct shrink_control *sc)
{}

void bch_btree_cache_free(struct cache_set *c)
{}

int bch_btree_cache_alloc(struct cache_set *c)
{}

/* Btree in memory cache - hash table */

static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k)
{}

static struct btree *mca_find(struct cache_set *c, struct bkey *k)
{}

static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op)
{}

static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op,
				     struct bkey *k)
{}

/*
 * We can only have one thread cannibalizing other cached btree nodes at a time,
 * or we'll deadlock. We use an open-coded mutex to ensure that, which
 * mca_cannibalize_lock() takes. This means every time we unlock the root of
 * the btree, we need to release this lock if we hold it.
 */
void bch_cannibalize_unlock(struct cache_set *c)
{}
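
/*
 * Illustrative sketch of the "open-coded mutex" idea from the comment above:
 * a single owner pointer claimed with cmpxchg(). The struct and function
 * names here are hypothetical; a real lock would also need a way for losers
 * to wait and proper memory ordering, which this sketch omits.
 */
struct example_cannibalize_lock {
	struct task_struct	*owner;		/* NULL when unlocked */
};

static inline bool example_cannibalize_trylock(struct example_cannibalize_lock *l)
{
	struct task_struct *old = cmpxchg(&l->owner, NULL, current);

	/* success if the lock was free, or if we already held it */
	return old == NULL || old == current;
}

static inline void example_cannibalize_unlock(struct example_cannibalize_lock *l)
{
	if (l->owner == current)
		l->owner = NULL;
}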

static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op,
			       struct bkey *k, int level)
{}

/*
 * bch_btree_node_get - find a btree node in the cache and lock it, reading it
 * in from disk if necessary.
 *
 * If IO is necessary and running under submit_bio_noacct, returns -EAGAIN.
 *
 * The btree node will have either a read or a write lock held, depending on
 * level and op->lock.
 *
 * Note: Only an error code or a btree pointer will be returned; it is
 *       unnecessary for callers to check for a NULL pointer.
 */
struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op,
				 struct bkey *k, int level, bool write,
				 struct btree *parent)
{}
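
/*
 * Hypothetical caller sketch for the contract documented above: the return
 * value is either a locked node or an ERR_PTR(), never NULL, so IS_ERR() is
 * the only check needed, and -EAGAIN means the lookup must be retried once
 * we are no longer under submit_bio_noacct(). Everything here other than
 * bch_btree_node_get() and rw_unlock() is made up for illustration.
 */
static int __maybe_unused example_walk_to_node(struct cache_set *c,
					       struct btree_op *op,
					       struct bkey *k, int level,
					       struct btree *parent)
{
	struct btree *b = bch_btree_node_get(c, op, k, level, true, parent);

	if (IS_ERR(b))
		return PTR_ERR(b);	/* may be -EAGAIN; caller retries */

	/* ... operate on b under the write lock taken above ... */

	rw_unlock(true, b);
	return 0;
}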

static void btree_node_prefetch(struct btree *parent, struct bkey *k)
{}

/* Btree alloc */

static void btree_node_free(struct btree *b)
{}

/*
 * Only an error code or a btree pointer will be returned; it is unnecessary
 * for callers to check for a NULL pointer.
 */
struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
				     int level, bool wait,
				     struct btree *parent)
{}

static struct btree *bch_btree_node_alloc(struct cache_set *c,
					  struct btree_op *op, int level,
					  struct btree *parent)
{}

static struct btree *btree_node_alloc_replacement(struct btree *b,
						  struct btree_op *op)
{}

static void make_btree_freeing_key(struct btree *b, struct bkey *k)
{}

static int btree_check_reserve(struct btree *b, struct btree_op *op)
{}

/* Garbage collection */

static uint8_t __bch_btree_mark_key(struct cache_set *c, int level,
				    struct bkey *k)
{}

#define btree_mark_key(b, k)

void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k)
{}

void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats)
{}

static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
{}

#define GC_MERGE_NODES

struct gc_merge_info {};

static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
				 struct keylist *insert_keys,
				 atomic_t *journal_ref,
				 struct bkey *replace_key);

static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
			     struct gc_stat *gc, struct gc_merge_info *r)
{}

static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op,
				 struct btree *replace)
{}

static unsigned int btree_gc_count_keys(struct btree *b)
{}

static size_t btree_gc_min_nodes(struct cache_set *c)
{}

static int btree_gc_recurse(struct btree *b, struct btree_op *op,
			    struct closure *writes, struct gc_stat *gc)
{}

static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
			     struct closure *writes, struct gc_stat *gc)
{}

static void btree_gc_start(struct cache_set *c)
{}

static void bch_btree_gc_finish(struct cache_set *c)
{}

static void bch_btree_gc(struct cache_set *c)
{}

static bool gc_should_run(struct cache_set *c)
{}

static int bch_gc_thread(void *arg)
{}

int bch_gc_thread_start(struct cache_set *c)
{}

/* Initial partial gc */

static int bch_btree_check_recurse(struct btree *b, struct btree_op *op)
{}

static int bch_btree_check_thread(void *arg)
{}

static int bch_btree_chkthread_nr(void)
{}

int bch_btree_check(struct cache_set *c)
{}

void bch_initial_gc_finish(struct cache_set *c)
{}

/* Btree insertion */

static bool btree_insert_key(struct btree *b, struct bkey *k,
			     struct bkey *replace_key)
{}

static size_t insert_u64s_remaining(struct btree *b)
{}

static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
				  struct keylist *insert_keys,
				  struct bkey *replace_key)
{}

static int btree_split(struct btree *b, struct btree_op *op,
		       struct keylist *insert_keys,
		       struct bkey *replace_key)
{}

static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
				 struct keylist *insert_keys,
				 atomic_t *journal_ref,
				 struct bkey *replace_key)
{}

int bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
			       struct bkey *check_key)
{}

struct btree_insert_op {};

static int btree_insert_fn(struct btree_op *b_op, struct btree *b)
{}

int bch_btree_insert(struct cache_set *c, struct keylist *keys,
		     atomic_t *journal_ref, struct bkey *replace_key)
{}

void bch_btree_set_root(struct btree *b)
{}

/* Map across nodes or keys */

static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op,
				       struct bkey *from,
				       btree_map_nodes_fn *fn, int flags)
{}

int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c,
			  struct bkey *from, btree_map_nodes_fn *fn, int flags)
{}

int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
				      struct bkey *from, btree_map_keys_fn *fn,
				      int flags)
{}

int bch_btree_map_keys(struct btree_op *op, struct cache_set *c,
		       struct bkey *from, btree_map_keys_fn *fn, int flags)
{}

/* Keybuf code */

static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r)
{}

static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
					    struct keybuf_key *r)
{}

struct refill {};

static int refill_keybuf_fn(struct btree_op *op, struct btree *b,
			    struct bkey *k)
{}

void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
		       struct bkey *end, keybuf_pred_fn *pred)
{}

static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
{}

void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
{}

bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start,
				  struct bkey *end)
{}

struct keybuf_key *bch_keybuf_next(struct keybuf *buf)
{}

struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
					  struct keybuf *buf,
					  struct bkey *end,
					  keybuf_pred_fn *pred)
{}

void bch_keybuf_init(struct keybuf *buf)
{}

void bch_btree_exit(void)
{}

int __init bch_btree_init(void)
{}