// SPDX-License-Identifier: GPL-2.0-only /* drbd_bitmap.c This file is part of DRBD by Philipp Reisner and Lars Ellenberg. Copyright (C) 2004-2008, LINBIT Information Technologies GmbH. Copyright (C) 2004-2008, Philipp Reisner <[email protected]>. Copyright (C) 2004-2008, Lars Ellenberg <[email protected]>. */ #define pr_fmt(fmt) … #include <linux/bitmap.h> #include <linux/vmalloc.h> #include <linux/string.h> #include <linux/drbd.h> #include <linux/slab.h> #include <linux/highmem.h> #include "drbd_int.h" /* OPAQUE outside this file! * interface defined in drbd_int.h * convention: * function name drbd_bm_... => used elsewhere, "public". * function name bm_... => internal to implementation, "private". */ /* * LIMITATIONS: * We want to support >= peta byte of backend storage, while for now still using * a granularity of one bit per 4KiB of storage. * 1 << 50 bytes backend storage (1 PiB) * 1 << (50 - 12) bits needed * 38 --> we need u64 to index and count bits * 1 << (38 - 3) bitmap bytes needed * 35 --> we still need u64 to index and count bytes * (that's 32 GiB of bitmap for 1 PiB storage) * 1 << (35 - 2) 32bit longs needed * 33 --> we'd even need u64 to index and count 32bit long words. * 1 << (35 - 3) 64bit longs needed * 32 --> we could get away with a 32bit unsigned int to index and count * 64bit long words, but I rather stay with unsigned long for now. * We probably should neither count nor point to bytes or long words * directly, but either by bitnumber, or by page index and offset. * 1 << (35 - 12) * 23 --> we need that many 4KiB pages of bitmap. * 1 << (23 + 3) --> on a 64bit arch, * we need 64 MiB to store the array of page pointers. * * Because I'm lazy, and because the resulting patch was too large, too ugly * and still incomplete, on 32bit we still "only" support 16 TiB (minus some), * (1 << 32) bits * 4k storage. * * bitmap storage and IO: * Bitmap is stored little endian on disk, and is kept little endian in * core memory.
Currently we still hold the full bitmap in core as long * as we are "attached" to a local disk, which at 32 GiB for 1PiB storage * seems excessive. * * We plan to reduce the amount of in-core bitmap pages by paging them in * and out against their on-disk location as necessary, but need to make * sure we don't cause too much meta data IO, and must not deadlock in * tight memory situations. This needs some more work. */ /* * NOTE * Access to the *bm_pages is protected by bm_lock. * It is safe to read the other members within the lock. * * drbd_bm_set_bits is called from bio_endio callbacks, * We may be called with irq already disabled, * so we need spin_lock_irqsave(). * And we need the kmap_atomic. */ struct drbd_bitmap { … }; #define bm_print_lock_info(m) … static void __bm_print_lock_info(struct drbd_device *device, const char *func) { … } void drbd_bm_lock(struct drbd_device *device, char *why, enum bm_flag flags) { … } void drbd_bm_unlock(struct drbd_device *device) { … } /* we store some "meta" info about our pages in page->private */ /* at a granularity of 4k storage per bitmap bit: * one peta byte storage: 1<<50 byte, 1<<38 * 4k storage blocks * 1<<38 bits, * 1<<23 4k bitmap pages. * Use 24 bits as page index, covers 2 peta byte storage * at a granularity of 4k per bit. * Used to report the failed page idx on io error from the endio handlers. */ #define BM_PAGE_IDX_MASK … /* this page is currently read in, or written back */ #define BM_PAGE_IO_LOCK … /* if there has been an IO error for this page */ #define BM_PAGE_IO_ERROR … /* this is to be able to intelligently skip disk IO, * set if bits have been set since last IO. */ #define BM_PAGE_NEED_WRITEOUT … /* to mark for lazy writeout once syncer cleared all clearable bits, * set if bits have been cleared since last IO.
*/ #define BM_PAGE_LAZY_WRITEOUT … /* pages marked with this "HINT" will be considered for writeout * on activity log transactions */ #define BM_PAGE_HINT_WRITEOUT … /* bm_store_page_idx uses non-atomic assignment. It is only used directly after * allocating the page. All other bm_set_page_* and bm_clear_page_* need to * use atomic bit manipulation, as set_out_of_sync (and therefore bitmap * changes) may happen from various contexts, and wait_on_bit/wake_up_bit * requires it all to be atomic as well. */ static void bm_store_page_idx(struct page *page, unsigned long idx) { … } static unsigned long bm_page_to_idx(struct page *page) { … } /* As it is very unlikely that the same page is under IO from more than one * context, we can get away with a bit per page and one wait queue per bitmap. */ static void bm_page_lock_io(struct drbd_device *device, int page_nr) { … } static void bm_page_unlock_io(struct drbd_device *device, int page_nr) { … } /* set _before_ submit_io, so it may be reset due to being changed * while this page is in flight... will get submitted later again */ static void bm_set_page_unchanged(struct page *page) { … } static void bm_set_page_need_writeout(struct page *page) { … } void drbd_bm_reset_al_hints(struct drbd_device *device) { … } /** * drbd_bm_mark_for_writeout() - mark a page with a "hint" to be considered for writeout * @device: DRBD device. * @page_nr: the bitmap page to mark with the "hint" flag * * From within an activity log transaction, we mark a few pages with these * hints, then call drbd_bm_write_hinted(), which will only write out changed * pages which are flagged with this mark.
*/ void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr) { … } static int bm_test_page_unchanged(struct page *page) { … } static void bm_set_page_io_err(struct page *page) { … } static void bm_clear_page_io_err(struct page *page) { … } static void bm_set_page_lazy_writeout(struct page *page) { … } static int bm_test_page_lazy_writeout(struct page *page) { … } /* on a 32bit box, this would allow for exactly (2<<38) bits. */ static unsigned int bm_word_to_page_idx(struct drbd_bitmap *b, unsigned long long_nr) { … } static unsigned int bm_bit_to_page_idx(struct drbd_bitmap *b, u64 bitnr) { … } static unsigned long *__bm_map_pidx(struct drbd_bitmap *b, unsigned int idx) { … } static unsigned long *bm_map_pidx(struct drbd_bitmap *b, unsigned int idx) { … } /* release a mapping by handing p_addr to kunmap_atomic(); * presumably the counterpart of __bm_map_pidx() -- its body is not visible here, confirm */ static void __bm_unmap(unsigned long *p_addr) { kunmap_atomic(p_addr); } static void bm_unmap(unsigned long *p_addr) { … } /* long word offset of _bitmap_ sector */ #define S2W(s) … /* word offset from start of bitmap to word number _in_page_ * modulo longs per page #define MLPP(X) ((X) % (PAGE_SIZE/sizeof(long))) hm, well, Philipp thinks gcc might not optimize the % into & (... - 1) so do it explicitly: */ #define MLPP(X) … /* Long words per page */ #define LWPP … /* * actually most functions herein should take a struct drbd_bitmap*, not a * struct drbd_device*, but for the debug macros I like to have the device around * to be able to report device specific. */ static void bm_free_pages(struct page **pages, unsigned long number) { … } static inline void bm_vk_free(void *ptr) { … } /* * "have" and "want" are NUMBER OF PAGES. */ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want) { … } /* * allocates the drbd_bitmap and stores it in device->bitmap. */ int drbd_bm_init(struct drbd_device *device) { … } sector_t drbd_bm_capacity(struct drbd_device *device) { … } /* called on driver unload. TODO: call when a device is destroyed.
*/ void drbd_bm_cleanup(struct drbd_device *device) { … } /* * since (b->bm_bits % BITS_PER_LONG) != 0, * this masks out the remaining bits. * Returns the number of bits cleared. */ #ifndef BITS_PER_PAGE #define BITS_PER_PAGE … #define BITS_PER_PAGE_MASK … #else # if BITS_PER_PAGE != (1UL << (PAGE_SHIFT + 3)) # error "ambiguous BITS_PER_PAGE" # endif #endif #define BITS_PER_LONG_MASK … static int bm_clear_surplus(struct drbd_bitmap *b) { … } static void bm_set_surplus(struct drbd_bitmap *b) { … } /* you better not modify the bitmap while this is running, * or its results will be stale */ static unsigned long bm_count_bits(struct drbd_bitmap *b) { … } /* offset and len in long words.*/ static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len) { … } /* For the layout, see comment above drbd_md_set_sector_offsets(). */ static u64 drbd_md_on_disk_bits(struct drbd_backing_dev *ldev) { … } /* * make sure the bitmap has enough room for the attached storage, * if necessary, resize. * called whenever we may have changed the device size. * returns -ENOMEM if we could not allocate enough memory, 0 on success. * In case this is actually a resize, we copy the old bitmap into the new one. * Otherwise, the bitmap is initialized to all bits set. */ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bits) { … } /* inherently racy: * if not protected by other means, return value may be out of date when * leaving this function... * we still need to lock it, since it is important that this returns * bm_set == 0 precisely. * * maybe bm_set should be atomic_t ? */ unsigned long _drbd_bm_total_weight(struct drbd_device *device) { … } unsigned long drbd_bm_total_weight(struct drbd_device *device) { … } size_t drbd_bm_words(struct drbd_device *device) { … } unsigned long drbd_bm_bits(struct drbd_device *device) { … } /* merge number words from buffer into the bitmap starting at offset. 
* buffer[i] is expected to be little endian unsigned long. * bitmap must be locked by drbd_bm_lock. * currently only used from receive_bitmap. */ void drbd_bm_merge_lel(struct drbd_device *device, size_t offset, size_t number, unsigned long *buffer) { … } /* copy number words from the bitmap starting at offset into the buffer. * buffer[i] will be little endian unsigned long. */ void drbd_bm_get_lel(struct drbd_device *device, size_t offset, size_t number, unsigned long *buffer) { … } /* set all bits in the bitmap */ void drbd_bm_set_all(struct drbd_device *device) { … } /* clear all bits in the bitmap */ void drbd_bm_clear_all(struct drbd_device *device) { … } static void drbd_bm_aio_ctx_destroy(struct kref *kref) { … } /* bv_page may be a copy, or may be the original */ static void drbd_bm_endio(struct bio *bio) { … } /* For the layout, see comment above drbd_md_set_sector_offsets(). */ static inline sector_t drbd_md_last_bitmap_sector(struct drbd_backing_dev *bdev) { … } static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_hold(local) { … } /* * bm_rw: read/write the whole bitmap from/to its on disk location. */ static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned lazy_writeout_upper_idx) __must_hold(local) { … } /** * drbd_bm_read() - Read the whole bitmap from its on disk location. * @device: DRBD device. */ int drbd_bm_read(struct drbd_device *device, struct drbd_peer_device *peer_device) __must_hold(local) { … } /** * drbd_bm_write() - Write the whole bitmap to its on disk location. * @device: DRBD device. * * Will only write pages that have changed since last IO. */ int drbd_bm_write(struct drbd_device *device, struct drbd_peer_device *peer_device) __must_hold(local) { … } /** * drbd_bm_write_all() - Write the whole bitmap to its on disk location. * @device: DRBD device. * * Will write all pages. 
*/ int drbd_bm_write_all(struct drbd_device *device, struct drbd_peer_device *peer_device) __must_hold(local) { … } /** * drbd_bm_write_lazy() - Write bitmap pages 0 to @upper_idx-1, if they have changed. * @device: DRBD device. * @upper_idx: 0: write all changed pages; +ve: page index to stop scanning for changed pages */ int drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_hold(local) { … } /** * drbd_bm_write_copy_pages() - Write the whole bitmap to its on disk location. * @device: DRBD device. * * Will only write pages that have changed since last IO. * In contrast to drbd_bm_write(), this will copy the bitmap pages * to temporary writeout pages. It is intended to trigger a full write-out * while still allowing the bitmap to change, for example if a resync or online * verify is aborted due to a failed peer disk, while local IO continues, or * pending resync acks are still being processed. */ int drbd_bm_write_copy_pages(struct drbd_device *device, struct drbd_peer_device *peer_device) __must_hold(local) { … } /** * drbd_bm_write_hinted() - Write bitmap pages with "hint" marks, if they have changed. * @device: DRBD device. */ int drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local) { … } /* NOTE * find_first_bit returns int, we return unsigned long. * For this to work on 32bit arch with bitnumbers > (1<<32), * we'd need to return u64, and get a whole lot of other places * fixed where we still use unsigned long. * * this returns a bit number, NOT a sector! */ static unsigned long __bm_find_next(struct drbd_device *device, unsigned long bm_fo, const int find_zero_bit) { … } static unsigned long bm_find_next(struct drbd_device *device, unsigned long bm_fo, const int find_zero_bit) { … } unsigned long drbd_bm_find_next(struct drbd_device *device, unsigned long bm_fo) { … } #if 0 /* not yet needed for anything. 
*/ unsigned long drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo) { return bm_find_next(device, bm_fo, 1); } #endif /* does not spin_lock_irqsave. * you must take drbd_bm_lock() first */ unsigned long _drbd_bm_find_next(struct drbd_device *device, unsigned long bm_fo) { … } unsigned long _drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo) { … } /* returns number of bits actually changed. * for val != 0, we change 0 -> 1, return code positive * for val == 0, we change 1 -> 0, return code negative * wants bitnr, not sector. * expected to be called for only a few bits (e - s about BITS_PER_LONG). * Must hold bitmap lock already. */ static int __bm_change_bits_to(struct drbd_device *device, const unsigned long s, unsigned long e, int val) { … } /* returns number of bits actually changed. * for val != 0, we change 0 -> 1, return code positive * for val == 0, we change 1 -> 0, return code negative * wants bitnr, not sector */ static int bm_change_bits_to(struct drbd_device *device, const unsigned long s, const unsigned long e, int val) { … } /* returns number of bits changed 0 -> 1 */ int drbd_bm_set_bits(struct drbd_device *device, const unsigned long s, const unsigned long e) { … } /* returns number of bits changed 1 -> 0 */ int drbd_bm_clear_bits(struct drbd_device *device, const unsigned long s, const unsigned long e) { … } /* sets all bits in full words, * from first_word up to, but not including, last_word */ static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b, int page_nr, int first_word, int last_word) { … } /* Same thing as drbd_bm_set_bits, * but more efficient for a large bit range. * You must first drbd_bm_lock(). * Can be called to set the whole bitmap in one go. * Sets bits from s to e _inclusive_. */ void _drbd_bm_set_bits(struct drbd_device *device, const unsigned long s, const unsigned long e) { … } /* returns bit state * wants bitnr, NOT sector. * inherently racy... 
area needs to be locked by means of {al,rs}_lru * 1 ... bit set * 0 ... bit not set * -1 ... first out of bounds access, stop testing for bits! */ int drbd_bm_test_bit(struct drbd_device *device, const unsigned long bitnr) { … } /* returns number of bits set in the range [s, e] */ int drbd_bm_count_bits(struct drbd_device *device, const unsigned long s, const unsigned long e) { … } /* inherently racy... * return value may be already out-of-date when this function returns. * but the general usage is that this is only used during a cstate when bits are * only cleared, not set, and we typically only care for the case when the return * value is zero, or we already "locked" this "bitmap extent" by other means. * * enr is bm-extent number, since we chose to name one sector (512 bytes) * worth of the bitmap a "bitmap extent". * * TODO * I think since we use it like a reference count, we should use the real * reference count of some bitmap extent element from some lru instead... * */ int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr) { … }