// SPDX-License-Identifier: GPL-2.0
/*
 * Primary bucket allocation code
 *
 * Copyright 2012 Google, Inc.
 *
 * Allocation in bcache is done in terms of buckets:
 *
 * Each bucket has an associated 8 bit gen; this gen corresponds to the gen in
 * btree pointers - they must match for the pointer to be considered valid.
 *
 * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a
 * bucket simply by incrementing its gen.
 *
 * The gens (along with the priorities; it's really the gens that are important
 * but the code is named as if it's the priorities) are written in an arbitrary
 * list of buckets on disk, with a pointer to them in the journal header.
 *
 * When we invalidate a bucket, we have to write its new gen to disk and wait
 * for that write to complete before we use it - otherwise after a crash we
 * could have pointers that appeared to be good but pointed to data that had
 * been overwritten.
 *
 * Since the gens and priorities are all stored contiguously on disk, we can
 * batch this up: We fill up the free_inc list with freshly invalidated buckets,
 * call prio_write(), and when prio_write() finishes we pull buckets off the
 * free_inc list and optionally discard them.
 *
 * free_inc isn't the only freelist - if it was, we'd often have to sleep while
 * priorities and gens were being written before we could allocate. c->free is a
 * smaller freelist, and buckets on that list are always ready to be used.
 *
 * If we've got discards enabled, that happens when a bucket moves from the
 * free_inc list to the free list.
 *
 * There is another freelist, because sometimes we have buckets that we know
 * have nothing pointing into them - these we can reuse without waiting for
 * priorities to be rewritten. These come from freed btree nodes and buckets
 * that garbage collection discovered no longer had valid keys pointing into
 * them (because they were overwritten). That's the unused list - buckets on the
 * unused list move to the free list, optionally being discarded in the process.
 *
 * It's also important to ensure that gens don't wrap around - with respect to
 * either the oldest gen in the btree or the gen on disk. This is quite
 * difficult to do in practice, but we explicitly guard against it anyway - if
 * a bucket is in danger of wrapping around we simply skip invalidating it that
 * time around, and we garbage collect or rewrite the priorities sooner than we
 * would have otherwise.
 *
 * bch_bucket_alloc() allocates a single bucket from a specific cache.
 *
 * bch_bucket_alloc_set() allocates one bucket from different caches
 * out of a cache set.
 *
 * free_some_buckets() drives all the processes described above. It's called
 * from bch_bucket_alloc() and a few other places that need to make sure free
 * buckets are ready.
 *
 * invalidate_buckets_(lru|fifo)() find buckets that are available to be
 * invalidated, and then invalidate them and stick them on the free_inc list -
 * in either lru or fifo order.
 */

#include "bcache.h"
#include "btree.h"

#include <linux/blkdev.h>
#include <linux/kthread.h>
#include <linux/random.h>
#include <trace/events/bcache.h>

#define MAX_OPEN_BUCKETS …

/* Bucket heap / gen */

uint8_t bch_inc_gen(struct cache *ca, struct bucket *b) { … }

void bch_rescale_priorities(struct cache_set *c, int sectors) { … }
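
/*
 * Illustrative sketch, not part of the original file: roughly how the gen
 * match and the wraparound guard described in the header comment fit
 * together.  sketch_ptr_stale(), sketch_try_invalidate() and
 * SKETCH_GC_GEN_MAX are hypothetical stand-ins for the real helpers elided
 * elsewhere in this file; the value 96 is only an example amount of
 * headroom, and the bucket's gen is assumed to be reachable as b->gen.
 */
#define SKETCH_GC_GEN_MAX	96

/* A pointer is only valid while its gen matches the bucket's current gen. */
static inline bool sketch_ptr_stale(const struct bucket *b, uint8_t ptr_gen)
{
        return b->gen != ptr_gen;
}

/*
 * Reusing a clean bucket is just bumping its gen, which stales every
 * existing pointer into it.  If the gen has already moved too far ahead of
 * what garbage collection last saw, skip the bucket this time around so the
 * 8 bit counter can't wrap past live pointers.
 */
static inline bool sketch_try_invalidate(struct bucket *b, uint8_t last_gc_gen)
{
        if ((uint8_t)(b->gen - last_gc_gen) >= SKETCH_GC_GEN_MAX)
                return false;   /* let gc / the prio rewrite catch up first */

        b->gen++;
        return true;
}
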
/*
 * Background allocation thread: scans for buckets to be invalidated,
 * invalidates them, rewrites prios/gens (marking them as invalidated on disk),
 * then optionally issues discard commands to the newly free buckets, then puts
 * them on the various freelists.
 */

static inline bool can_inc_bucket_gen(struct bucket *b) { … }

bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b) { … }

void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b) { … }

static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b) { … }

/*
 * Determines what order we're going to reuse buckets, smallest bucket_prio()
 * first: we also take into account the number of sectors of live data in that
 * bucket, and in order for that multiply to make sense we have to scale the
 * bucket priorities.
 *
 * Thus, we scale the bucket priorities so that the bucket with the smallest
 * prio is worth 1/8th of what INITIAL_PRIO is worth.
 */
static inline unsigned int new_bucket_prio(struct cache *ca, struct bucket *b) { … }

static inline bool new_bucket_max_cmp(const void *l, const void *r, void *args) { … }

static inline bool new_bucket_min_cmp(const void *l, const void *r, void *args) { … }

static inline void new_bucket_swap(void *l, void *r, void __always_unused *args) { … }

static void invalidate_buckets_lru(struct cache *ca) { … }

static void invalidate_buckets_fifo(struct cache *ca) { … }

static void invalidate_buckets_random(struct cache *ca) { … }

static void invalidate_buckets(struct cache *ca) { … }

#define allocator_wait(ca, cond) …

static int bch_allocator_push(struct cache *ca, long bucket) { … }

static int bch_allocator_thread(void *arg) { … }

/* Allocation */

long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait) { … }

void __bch_bucket_free(struct cache *ca, struct bucket *b) { … }

void bch_bucket_free(struct cache_set *c, struct bkey *k) { … }

int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
			   struct bkey *k, bool wait) { … }

int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
			 struct bkey *k, bool wait) { … }

/* Sector allocator */

struct open_bucket { … };

/*
 * We keep multiple buckets open for writes, and try to segregate different
 * write streams for better cache utilization: first we try to segregate flash
 * only volume write streams from cached devices, secondly we look for a bucket
 * where the last write to it was sequential with the current write, and
 * failing that we look for a bucket that was last used by the same task.
 *
 * The idea is that if you've got multiple tasks pulling data into the cache at
 * the same time, you'll get better cache utilization if you try to segregate
 * their data and preserve locality.
 *
 * For example, dirty sectors of a flash only volume are not reclaimable; if
 * they are mixed into a bucket with dirty sectors of a cached device, that
 * bucket stays marked as dirty and won't be reclaimed, even after the cached
 * device's dirty data has been written back to the backing device.
 *
 * And say you've started Firefox at the same time you're copying a
 * bunch of files. Firefox will likely end up being fairly hot and stay in the
 * cache awhile, but the data you copied might not be; if you wrote all that
 * data to the same buckets it'd get invalidated at the same time.
 *
 * Both of those tasks will be doing fairly random IO so we can't rely on
 * detecting sequential IO to segregate their data, but going off of the task
 * should be a sane heuristic.
 */
static struct open_bucket *pick_data_bucket(struct cache_set *c,
					    const struct bkey *search,
					    unsigned int write_point,
					    struct bkey *alloc)
{ … }
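
/*
 * Illustrative sketch, not the elided pick_data_bucket() above: the selection
 * order the preceding comment describes, using a hypothetical
 * sketch_open_bucket that carries only the fields the heuristic needs.  (The
 * flash-only vs. cached-device segregation mentioned first is omitted here
 * for brevity.)
 */
struct sketch_open_bucket {
        uint64_t        last_offset;    /* sector where the last write to it ended */
        unsigned int    write_point;    /* write stream that last used it */
};

static struct sketch_open_bucket *
sketch_pick_bucket(struct sketch_open_bucket *obs, size_t nr,
                   uint64_t start, unsigned int write_point)
{
        struct sketch_open_bucket *same_task = NULL, *any = NULL;
        size_t i;

        for (i = 0; i < nr; i++) {
                struct sketch_open_bucket *b = &obs[i];

                /* 1) prefer a bucket whose last write ends where this one starts */
                if (b->last_offset == start)
                        return b;

                /* 2) failing that, one last used by the same task/write stream */
                if (b->write_point == write_point)
                        same_task = b;
                else if (!any)
                        any = b;        /* 3) otherwise settle for any open bucket */
        }

        return same_task ?: any;
}
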
/*
 * Allocates some space in the cache to write to, and updates k to point to the
 * newly allocated space: KEY_SIZE(k) and KEY_OFFSET(k) are updated (KEY_OFFSET(k)
 * points to the end of the newly allocated space).
 *
 * May allocate fewer sectors than @sectors, KEY_SIZE(k) indicates how many
 * sectors were actually allocated.
 *
 * If s->writeback is true, will not fail.
 */
bool bch_alloc_sectors(struct cache_set *c, struct bkey *k,
		       unsigned int sectors, unsigned int write_point,
		       unsigned int write_prio, bool wait)
{ … }

/* Init */

void bch_open_buckets_free(struct cache_set *c) { … }

int bch_open_buckets_alloc(struct cache_set *c) { … }

int bch_cache_allocator_start(struct cache *ca) { … }
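
/*
 * Illustrative sketch, not part of the original file: one way a caller might
 * drive bch_alloc_sectors() given that it may be granted fewer sectors than
 * asked for (see the comment above bch_alloc_sectors()).  sketch_write() and
 * sketch_submit_chunk() are hypothetical; @k is assumed to have room for the
 * pointers the allocator fills in (e.g. a key sitting on a keylist), and
 * bkey_init()/SET_KEY_OFFSET()/KEY_SIZE() are used the way the rest of bcache
 * uses them (KEY_INODE etc. omitted here).
 */

/* Hypothetical placeholder: submit I/O for the extent described by *k. */
static inline void sketch_submit_chunk(struct bkey *k) { }

static int sketch_write(struct cache_set *c, struct bkey *k, uint64_t start,
                        unsigned int sectors, unsigned int write_point,
                        unsigned int write_prio, bool wait)
{
        while (sectors) {
                bkey_init(k);
                SET_KEY_OFFSET(k, start);

                /* May grant fewer than @sectors; KEY_SIZE(k) says how many. */
                if (!bch_alloc_sectors(c, k, sectors, write_point,
                                       write_prio, wait))
                        return -ENOSPC;

                sketch_submit_chunk(k);

                start   += KEY_SIZE(k);
                sectors -= KEY_SIZE(k);
        }

        return 0;
}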