// SPDX-License-Identifier: GPL-2.0
/*
 * Primary bucket allocation code
 *
 * Copyright 2012 Google, Inc.
 *
 * Allocation in bcache is done in terms of buckets:
 *
 * Each bucket has an associated 8 bit gen; this gen corresponds to the gen in
 * btree pointers - they must match for the pointer to be considered valid.
 *
 * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a
 * bucket simply by incrementing its gen.
 *
 * The gens (along with the priorities; it's really the gens that are important
 * but the code is named as if it's the priorities) are written in an arbitrary
 * list of buckets on disk, with a pointer to them in the journal header.
 *
 * When we invalidate a bucket, we have to write its new gen to disk and wait
 * for that write to complete before we use it - otherwise after a crash we
 * could have pointers that appeared to be good but pointed to data that had
 * been overwritten.
 *
 * Since the gens and priorities are all stored contiguously on disk, we can
 * batch this up: We fill up the free_inc list with freshly invalidated buckets,
 * call prio_write(), and when prio_write() finishes we pull buckets off the
 * free_inc list and optionally discard them.
 *
 * free_inc isn't the only freelist - if it was, we'd often have to sleep while
 * priorities and gens were being written before we could allocate. c->free is a
 * smaller freelist, and buckets on that list are always ready to be used.
 *
 * If we've got discards enabled, that happens when a bucket moves from the
 * free_inc list to the free list.
 *
 * There is another freelist, because sometimes we have buckets that we know
 * have nothing pointing into them - these we can reuse without waiting for
 * priorities to be rewritten. These come from freed btree nodes and buckets
 * that garbage collection discovered no longer had valid keys pointing into
 * them (because they were overwritten). That's the unused list - buckets on the
 * unused list move to the free list, optionally being discarded in the process.
 *
 * It's also important to ensure that gens don't wrap around - with respect to
 * either the oldest gen in the btree or the gen on disk. This is quite
 * difficult to do in practice, but we explicitly guard against it anyway - if
 * a bucket is in danger of wrapping around we simply skip invalidating it that
 * time around, and we garbage collect or rewrite the priorities sooner than we
 * would have otherwise.
 *
 * bch_bucket_alloc() allocates a single bucket from a specific cache.
 *
 * bch_bucket_alloc_set() allocates one bucket from different caches
 * out of a cache set.
 *
 * free_some_buckets() drives all the processes described above. It's called
 * from bch_bucket_alloc() and a few other places that need to make sure free
 * buckets are ready.
 *
 * invalidate_buckets_(lru|fifo)() find buckets that are available to be
 * invalidated, and then invalidate them and stick them on the free_inc list -
 * in either lru or fifo order.
 */

#include "bcache.h"
#include "btree.h"

#include <linux/blkdev.h>
#include <linux/kthread.h>
#include <linux/random.h>
#include <trace/events/bcache.h>

#define MAX_OPEN_BUCKETS …

/* Bucket heap / gen */

uint8_t bch_inc_gen(struct cache *ca, struct bucket *b) { … }

void bch_rescale_priorities(struct cache_set *c, int sectors) { … }
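
/*
 * Illustrative sketch, not part of the original file: roughly how the gen
 * match and the wraparound guard described in the header comment fit
 * together.  sketch_ptr_stale(), sketch_try_invalidate() and
 * SKETCH_GC_GEN_MAX are hypothetical stand-ins for the real helpers elided
 * elsewhere in this file; the value 96 is only an example amount of
 * headroom, and the bucket's gen is assumed to be reachable as b->gen.
 */
#define SKETCH_GC_GEN_MAX	96

/* A pointer is only valid while its gen matches the bucket's current gen. */
static inline bool sketch_ptr_stale(const struct bucket *b, uint8_t ptr_gen)
{
        return b->gen != ptr_gen;
}

/*
 * Reusing a clean bucket is just bumping its gen, which stales every
 * existing pointer into it.  If the gen has already moved too far ahead of
 * what garbage collection last saw, skip the bucket this time around so the
 * 8 bit counter can't wrap past live pointers.
 */
static inline bool sketch_try_invalidate(struct bucket *b, uint8_t last_gc_gen)
{
        if ((uint8_t)(b->gen - last_gc_gen) >= SKETCH_GC_GEN_MAX)
                return false;   /* let gc / the prio rewrite catch up first */

        b->gen++;
        return true;
}
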
/*
 * Background allocation thread: scans for buckets to be invalidated,
 * invalidates them, rewrites prios/gens (marking them as invalidated on disk),
 * then optionally issues discard commands to the newly free buckets, then puts
 * them on the various freelists.
 */

static inline bool can_inc_bucket_gen(struct bucket *b) { … }

bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b) { … }

void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b) { … }

static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b) { … }

/*
 * Determines what order we're going to reuse buckets, smallest bucket_prio()
 * first: we also take into account the number of sectors of live data in that
 * bucket, and in order for that multiply to make sense we have to scale the
 * bucket priorities.
 *
 * Thus, we scale the bucket priorities so that the bucket with the smallest
 * prio is worth 1/8th of what INITIAL_PRIO is worth.
 */
static inline unsigned int new_bucket_prio(struct cache *ca, struct bucket *b) { … }

static inline bool new_bucket_max_cmp(const void *l, const void *r, void *args) { … }

static inline bool new_bucket_min_cmp(const void *l, const void *r, void *args) { … }

static inline void new_bucket_swap(void *l, void *r, void __always_unused *args) { … }

static void invalidate_buckets_lru(struct cache *ca) { … }

static void invalidate_buckets_fifo(struct cache *ca) { … }

static void invalidate_buckets_random(struct cache *ca) { … }

static void invalidate_buckets(struct cache *ca) { … }

#define allocator_wait(ca, cond) …

static int bch_allocator_push(struct cache *ca, long bucket) { … }

static int bch_allocator_thread(void *arg) { … }

/* Allocation */

long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait) { … }

void __bch_bucket_free(struct cache *ca, struct bucket *b) { … }

void bch_bucket_free(struct cache_set *c, struct bkey *k) { … }

int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
			   struct bkey *k, bool wait) { … }

int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
			 struct bkey *k, bool wait) { … }

/* Sector allocator */

struct open_bucket { … };

/*
 * We keep multiple buckets open for writes, and try to segregate different
 * write streams for better cache utilization: first we try to segregate flash
 * only volume write streams from cached devices, secondly we look for a bucket
 * where the last write to it was sequential with the current write, and
 * failing that we look for a bucket that was last used by the same task.
 *
 * The idea is that if you've got multiple tasks pulling data into the cache at
 * the same time, you'll get better cache utilization if you try to segregate
 * their data and preserve locality.
 *
 * For example, dirty sectors of a flash only volume are not reclaimable; if
 * they are mixed into a bucket with dirty sectors of a cached device, that
 * bucket stays marked as dirty and won't be reclaimed, even after the cached
 * device's dirty data has been written back to the backing device.
 *
 * And say you've started Firefox at the same time you're copying a
 * bunch of files. Firefox will likely end up being fairly hot and stay in the
 * cache awhile, but the data you copied might not be; if you wrote all that
 * data to the same buckets it'd get invalidated at the same time.
 *
 * Both of those tasks will be doing fairly random IO so we can't rely on
 * detecting sequential IO to segregate their data, but going off of the task
 * should be a sane heuristic.
 */
static struct open_bucket *pick_data_bucket(struct cache_set *c,
					    const struct bkey *search,
					    unsigned int write_point,
					    struct bkey *alloc)
{ … }
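
/*
 * Illustrative sketch, not the elided pick_data_bucket() above: the selection
 * order the preceding comment describes, using a hypothetical
 * sketch_open_bucket that carries only the fields the heuristic needs.  (The
 * flash-only vs. cached-device segregation mentioned first is omitted here
 * for brevity.)
 */
struct sketch_open_bucket {
        uint64_t        last_offset;    /* sector where the last write to it ended */
        unsigned int    write_point;    /* write stream that last used it */
};

static struct sketch_open_bucket *
sketch_pick_bucket(struct sketch_open_bucket *obs, size_t nr,
                   uint64_t start, unsigned int write_point)
{
        struct sketch_open_bucket *same_task = NULL, *any = NULL;
        size_t i;

        for (i = 0; i < nr; i++) {
                struct sketch_open_bucket *b = &obs[i];

                /* 1) prefer a bucket whose last write ends where this one starts */
                if (b->last_offset == start)
                        return b;

                /* 2) failing that, one last used by the same task/write stream */
                if (b->write_point == write_point)
                        same_task = b;
                else if (!any)
                        any = b;        /* 3) otherwise settle for any open bucket */
        }

        return same_task ?: any;
}
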
/*
 * Allocates some space in the cache to write to, and updates k to point to the
 * newly allocated space: KEY_SIZE(k) and KEY_OFFSET(k) are updated (KEY_OFFSET(k)
 * points to the end of the newly allocated space).
 *
 * May allocate fewer sectors than @sectors, KEY_SIZE(k) indicates how many
 * sectors were actually allocated.
 *
 * If s->writeback is true, will not fail.
 */
bool bch_alloc_sectors(struct cache_set *c, struct bkey *k,
		       unsigned int sectors, unsigned int write_point,
		       unsigned int write_prio, bool wait)
{ … }

/* Init */

void bch_open_buckets_free(struct cache_set *c) { … }

int bch_open_buckets_alloc(struct cache_set *c) { … }

int bch_cache_allocator_start(struct cache *ca) { … }
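
/*
 * Illustrative sketch, not part of the original file: one way a caller might
 * drive bch_alloc_sectors() given that it may be granted fewer sectors than
 * asked for (see the comment above bch_alloc_sectors()).  sketch_write() and
 * sketch_submit_chunk() are hypothetical; @k is assumed to have room for the
 * pointers the allocator fills in (e.g. a key sitting on a keylist), and
 * bkey_init()/SET_KEY_OFFSET()/KEY_SIZE() are used the way the rest of bcache
 * uses them (KEY_INODE etc. omitted here).
 */

/* Hypothetical placeholder: submit I/O for the extent described by *k. */
static inline void sketch_submit_chunk(struct bkey *k) { }

static int sketch_write(struct cache_set *c, struct bkey *k, uint64_t start,
                        unsigned int sectors, unsigned int write_point,
                        unsigned int write_prio, bool wait)
{
        while (sectors) {
                bkey_init(k);
                SET_KEY_OFFSET(k, start);

                /* May grant fewer than @sectors; KEY_SIZE(k) says how many. */
                if (!bch_alloc_sectors(c, k, sectors, write_point,
                                       write_prio, wait))
                        return -ENOSPC;

                sketch_submit_chunk(k);

                start   += KEY_SIZE(k);
                sectors -= KEY_SIZE(k);
        }

        return 0;
}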