// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. */ #include <linux/sched.h> #include <linux/sched/signal.h> #include <linux/pagemap.h> #include <linux/writeback.h> #include <linux/blkdev.h> #include <linux/sort.h> #include <linux/rcupdate.h> #include <linux/kthread.h> #include <linux/slab.h> #include <linux/ratelimit.h> #include <linux/percpu_counter.h> #include <linux/lockdep.h> #include <linux/crc32c.h> #include "ctree.h" #include "extent-tree.h" #include "transaction.h" #include "disk-io.h" #include "print-tree.h" #include "volumes.h" #include "raid56.h" #include "locking.h" #include "free-space-cache.h" #include "free-space-tree.h" #include "qgroup.h" #include "ref-verify.h" #include "space-info.h" #include "block-rsv.h" #include "discard.h" #include "zoned.h" #include "dev-replace.h" #include "fs.h" #include "accessors.h" #include "root-tree.h" #include "file-item.h" #include "orphan.h" #include "tree-checker.h" #include "raid-stripe-tree.h" #undef SCRAMBLE_DELAYED_REFS static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extra_op); static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, struct extent_buffer *leaf, struct btrfs_extent_item *ei); static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, u64 parent, u64 root_objectid, u64 flags, u64 owner, u64 offset, struct btrfs_key *ins, int ref_mod, u64 oref_root); static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op); static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key); static int block_group_bits(struct btrfs_block_group *cache, u64 bits) { … } /* simple helper to search for an existing data extent at a given offset */ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) { … } /* * helper function to lookup reference count and flags of a tree block. * * the head node for delayed ref is used to store the sum of all the * reference count modifications queued up in the rbtree. the head * node may also store the extent flags to set. This way you can check * to see what the reference count and extent flags would be if all of * the delayed refs are not processed. */ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 offset, int metadata, u64 *refs, u64 *flags, u64 *owning_root) { … } /* * Back reference rules. Back refs have three main goals: * * 1) differentiate between all holders of references to an extent so that * when a reference is dropped we can make sure it was a valid reference * before freeing the extent. * * 2) Provide enough information to quickly find the holders of an extent * if we notice a given block is corrupted or bad. * * 3) Make it easy to migrate blocks for FS shrinking or storage pool * maintenance. This is actually the same as #2, but with a slightly * different use case. * * There are two kinds of back refs. The implicit back refs is optimized * for pointers in non-shared tree blocks. For a given pointer in a block, * back refs of this kind provide information about the block's owner tree * and the pointer's key. These information allow us to find the block by * b-tree searching. The full back refs is for pointers in tree blocks not * referenced by their owner trees. The location of tree block is recorded * in the back refs. Actually the full back refs is generic, and can be * used in all cases the implicit back refs is used. The major shortcoming * of the full back refs is its overhead. Every time a tree block gets * COWed, we have to update back refs entry for all pointers in it. * * For a newly allocated tree block, we use implicit back refs for * pointers in it. This means most tree related operations only involve * implicit back refs. For a tree block created in old transaction, the * only way to drop a reference to it is COW it. So we can detect the * event that tree block loses its owner tree's reference and do the * back refs conversion. * * When a tree block is COWed through a tree, there are four cases: * * The reference count of the block is one and the tree is the block's * owner tree. Nothing to do in this case. * * The reference count of the block is one and the tree is not the * block's owner tree. In this case, full back refs is used for pointers * in the block. Remove these full back refs, add implicit back refs for * every pointers in the new block. * * The reference count of the block is greater than one and the tree is * the block's owner tree. In this case, implicit back refs is used for * pointers in the block. Add full back refs for every pointers in the * block, increase lower level extents' reference counts. The original * implicit back refs are entailed to the new block. * * The reference count of the block is greater than one and the tree is * not the block's owner tree. Add implicit back refs for every pointer in * the new block, increase lower level extents' reference count. * * Back Reference Key composing: * * The key objectid corresponds to the first byte in the extent, * The key type is used to differentiate between types of back refs. * There are different meanings of the key offset for different types * of back refs. * * File extents can be referenced by: * * - multiple snapshots, subvolumes, or different generations in one subvol * - different files inside a single subvolume * - different offsets inside a file (bookend extents in file.c) * * The extent ref structure for the implicit back refs has fields for: * * - Objectid of the subvolume root * - objectid of the file holding the reference * - original offset in the file * - how many bookend extents * * The key offset for the implicit back refs is hash of the first * three fields. * * The extent ref structure for the full back refs has field for: * * - number of pointers in the tree leaf * * The key offset for the implicit back refs is the first byte of * the tree leaf * * When a file extent is allocated, The implicit back refs is used. * the fields are filled in: * * (root_key.objectid, inode objectid, offset in file, 1) * * When a file extent is removed file truncation, we find the * corresponding implicit back refs and check the following fields: * * (btrfs_header_owner(leaf), inode objectid, offset in file) * * Btree extents can be referenced by: * * - Different subvolumes * * Both the implicit back refs and the full back refs for tree blocks * only consist of key. The key offset for the implicit back refs is * objectid of block's owner tree. The key offset for the full back refs * is the first byte of parent block. * * When implicit back refs is used, information about the lowest key and * level of the tree block are required. These information are stored in * tree block info structure. */ /* * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required, * is_data == BTRFS_REF_TYPE_DATA, data type is requiried, * is_data == BTRFS_REF_TYPE_ANY, either type is OK. */ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, struct btrfs_extent_inline_ref *iref, enum btrfs_inline_ref_type is_data) { … } u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) { … } static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, struct btrfs_extent_data_ref *ref) { … } static int match_extent_data_ref(struct extent_buffer *leaf, struct btrfs_extent_data_ref *ref, u64 root_objectid, u64 owner, u64 offset) { … } static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 bytenr, u64 parent, u64 root_objectid, u64 owner, u64 offset) { … } static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_delayed_ref_node *node, u64 bytenr) { … } static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int refs_to_drop) { … } static noinline u32 extent_data_ref_count(struct btrfs_path *path, struct btrfs_extent_inline_ref *iref) { … } static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 bytenr, u64 parent, u64 root_objectid) { … } static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_delayed_ref_node *node, u64 bytenr) { … } static inline int extent_ref_type(u64 parent, u64 owner) { … } static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key) { … } /* * look for inline back ref. if back ref is found, *ref_ret is set * to the address of inline back ref, and 0 is returned. * * if back ref isn't found, *ref_ret is set to the address where it * should be inserted, and -ENOENT is returned. * * if insert is true and there are too many inline back refs, the path * points to the extent item, and -EAGAIN is returned. * * NOTE: inline back refs are ordered in the same way that back ref * items in the tree are ordered. */ static noinline_for_stack int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_extent_inline_ref **ref_ret, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset, int insert) { … } /* * helper to add new inline back ref */ static noinline_for_stack void setup_inline_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, u64 parent, u64 root_objectid, u64 owner, u64 offset, int refs_to_add, struct btrfs_delayed_extent_op *extent_op) { … } static int lookup_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_extent_inline_ref **ref_ret, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset) { … } /* * helper to update/remove inline back ref */ static noinline_for_stack int update_inline_extent_backref( struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, int refs_to_mod, struct btrfs_delayed_extent_op *extent_op) { … } static noinline_for_stack int insert_inline_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset, int refs_to_add, struct btrfs_delayed_extent_op *extent_op) { … } static int remove_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, int refs_to_drop, int is_data) { … } static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, u64 *discarded_bytes) { … } static int do_discard_extent(struct btrfs_discard_stripe *stripe, u64 *bytes) { … } int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 *actual_bytes) { … } /* Can return -ENOMEM */ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref) { … } /* * Insert backreference for a given extent. * * The counterpart is in __btrfs_free_extent(), with examples and more details * how it works. * * @trans: Handle of transaction * * @node: The delayed ref node used to get the bytenr/length for * extent whose references are incremented. * * @extent_op Pointer to a structure, holding information necessary when * updating a tree block's flags * */ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op) { … } static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head *href) { … } static int run_delayed_data_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, bool insert_reserved) { … } static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, struct extent_buffer *leaf, struct btrfs_extent_item *ei) { … } static int run_delayed_extent_op(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head, struct btrfs_delayed_extent_op *extent_op) { … } static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, bool insert_reserved) { … } /* helper function to actually process a single delayed ref entry */ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, bool insert_reserved) { … } static inline struct btrfs_delayed_ref_node * select_delayed_ref(struct btrfs_delayed_ref_head *head) { … } static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head) { … } static struct btrfs_delayed_extent_op *cleanup_extent_op( struct btrfs_delayed_ref_head *head) { … } static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head) { … } u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head) { … } static int cleanup_ref_head(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head, u64 *bytes_released) { … } static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head( struct btrfs_trans_handle *trans) { … } static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *locked_ref, u64 *bytes_released) { … } /* * Returns 0 on success or if called with an already aborted transaction. * Returns -ENOMEM or -EIO on failure and will abort the transaction. */ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, u64 min_bytes) { … } #ifdef SCRAMBLE_DELAYED_REFS /* * Normally delayed refs get processed in ascending bytenr order. This * correlates in most cases to the order added. To expose dependencies on this * order, we start to process the tree in the middle instead of the beginning */ static u64 find_middle(struct rb_root *root) { struct rb_node *n = root->rb_node; struct btrfs_delayed_ref_node *entry; int alt = 1; u64 middle; u64 first = 0, last = 0; n = rb_first(root); if (n) { entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); first = entry->bytenr; } n = rb_last(root); if (n) { entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); last = entry->bytenr; } n = root->rb_node; while (n) { entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); WARN_ON(!entry->in_tree); middle = entry->bytenr; if (alt) n = n->rb_left; else n = n->rb_right; alt = 1 - alt; } return middle; } #endif /* * Start processing the delayed reference count updates and extent insertions * we have queued up so far. * * @trans: Transaction handle. * @min_bytes: How many bytes of delayed references to process. After this * many bytes we stop processing delayed references if there are * any more. If 0 it means to run all existing delayed references, * but not new ones added after running all existing ones. * Use (u64)-1 (U64_MAX) to run all existing delayed references * plus any new ones that are added. * * Returns 0 on success or if called with an aborted transaction * Returns <0 on error and aborts the transaction */ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, u64 min_bytes) { … } int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct extent_buffer *eb, u64 flags) { … } static noinline int check_delayed_ref(struct btrfs_root *root, struct btrfs_path *path, u64 objectid, u64 offset, u64 bytenr) { … } static noinline int check_committed_ref(struct btrfs_root *root, struct btrfs_path *path, u64 objectid, u64 offset, u64 bytenr, bool strict) { … } int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, u64 bytenr, bool strict, struct btrfs_path *path) { … } static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, int full_backref, int inc) { … } int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, int full_backref) { … } int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, int full_backref) { … } static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data) { … } static u64 first_logical_byte(struct btrfs_fs_info *fs_info) { … } static int pin_down_extent(struct btrfs_trans_handle *trans, struct btrfs_block_group *cache, u64 bytenr, u64 num_bytes, int reserved) { … } int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, int reserved) { … } int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, const struct extent_buffer *eb) { … } static int __exclude_logged_extent(struct btrfs_fs_info *fs_info, u64 start, u64 num_bytes) { … } int btrfs_exclude_logged_extents(struct extent_buffer *eb) { … } static void btrfs_inc_block_group_reservations(struct btrfs_block_group *bg) { … } /* * Returns the free cluster for the given space info and sets empty_cluster to * what it should be based on the mount options. */ static struct btrfs_free_cluster * fetch_cluster_info(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 *empty_cluster) { … } static int unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end, const bool return_free_space) { … } int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) { … } /* * Parse an extent item's inline extents looking for a simple quotas owner ref. * * @fs_info: the btrfs_fs_info for this mount * @leaf: a leaf in the extent tree containing the extent item * @slot: the slot in the leaf where the extent item is found * * Returns the objectid of the root that originally allocated the extent item * if the inline owner ref is expected and present, otherwise 0. * * If an extent item has an owner ref item, it will be the first inline ref * item. Therefore the logic is to check whether there are any inline ref * items, then check the type of the first one. */ u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, int slot) { … } static int do_free_extent_accounting(struct btrfs_trans_handle *trans, u64 bytenr, struct btrfs_squota_delta *delta) { … } #define abort_and_dump(trans, path, fmt, args...) … /* * Drop one or more refs of @node. * * 1. Locate the extent refs. * It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item. * Locate it, then reduce the refs number or remove the ref line completely. * * 2. Update the refs count in EXTENT/METADATA_ITEM * * Inline backref case: * * in extent tree we have: * * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82 * refs 2 gen 6 flags DATA * extent data backref root FS_TREE objectid 258 offset 0 count 1 * extent data backref root FS_TREE objectid 257 offset 0 count 1 * * This function gets called with: * * node->bytenr = 13631488 * node->num_bytes = 1048576 * root_objectid = FS_TREE * owner_objectid = 257 * owner_offset = 0 * refs_to_drop = 1 * * Then we should get some like: * * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82 * refs 1 gen 6 flags DATA * extent data backref root FS_TREE objectid 258 offset 0 count 1 * * Keyed backref case: * * in extent tree we have: * * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24 * refs 754 gen 6 flags DATA * [...] * item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28 * extent data backref root FS_TREE objectid 866 offset 0 count 1 * * This function get called with: * * node->bytenr = 13631488 * node->num_bytes = 1048576 * root_objectid = FS_TREE * owner_objectid = 866 * owner_offset = 0 * refs_to_drop = 1 * * Then we should get some like: * * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24 * refs 753 gen 6 flags DATA * * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed. */ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op) { … } /* * when we free an block, it is possible (and likely) that we free the last * delayed ref for that extent as well. This searches the delayed ref tree for * a given extent, and if there are no other delayed refs to be processed, it * removes it from the tree. */ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, u64 bytenr) { … } int btrfs_free_tree_block(struct btrfs_trans_handle *trans, u64 root_id, struct extent_buffer *buf, u64 parent, int last_ref) { … } /* Can return -ENOMEM */ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) { … } enum btrfs_loop_type { … }; static inline void btrfs_lock_block_group(struct btrfs_block_group *cache, int delalloc) { … } static inline void btrfs_grab_block_group(struct btrfs_block_group *cache, int delalloc) { … } static struct btrfs_block_group *btrfs_lock_cluster( struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster, int delalloc) __acquires(&cluster->refill_lock) { … } static inline void btrfs_release_block_group(struct btrfs_block_group *cache, int delalloc) { … } /* * Helper function for find_free_extent(). * * Return -ENOENT to inform caller that we need fallback to unclustered mode. * Return >0 to inform caller that we find nothing * Return 0 means we have found a location and set ffe_ctl->found_offset. */ static int find_free_extent_clustered(struct btrfs_block_group *bg, struct find_free_extent_ctl *ffe_ctl, struct btrfs_block_group **cluster_bg_ret) { … } /* * Return >0 to inform caller that we find nothing * Return 0 when we found an free extent and set ffe_ctrl->found_offset */ static int find_free_extent_unclustered(struct btrfs_block_group *bg, struct find_free_extent_ctl *ffe_ctl) { … } static int do_allocation_clustered(struct btrfs_block_group *block_group, struct find_free_extent_ctl *ffe_ctl, struct btrfs_block_group **bg_ret) { … } /* * Tree-log block group locking * ============================ * * fs_info::treelog_bg_lock protects the fs_info::treelog_bg which * indicates the starting address of a block group, which is reserved only * for tree-log metadata. * * Lock nesting * ============ * * space_info::lock * block_group::lock * fs_info::treelog_bg_lock */ /* * Simple allocator for sequential-only block group. It only allows sequential * allocation. No need to play with trees. This function also reserves the * bytes as in btrfs_add_reserved_bytes. */ static int do_allocation_zoned(struct btrfs_block_group *block_group, struct find_free_extent_ctl *ffe_ctl, struct btrfs_block_group **bg_ret) { … } static int do_allocation(struct btrfs_block_group *block_group, struct find_free_extent_ctl *ffe_ctl, struct btrfs_block_group **bg_ret) { … } static void release_block_group(struct btrfs_block_group *block_group, struct find_free_extent_ctl *ffe_ctl, int delalloc) { … } static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl, struct btrfs_key *ins) { … } static void found_extent(struct find_free_extent_ctl *ffe_ctl, struct btrfs_key *ins) { … } static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl) { … } static int can_allocate_chunk(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl) { … } /* * Return >0 means caller needs to re-search for free extent * Return 0 means we have the needed free extent. * Return <0 means we failed to locate any free extent. */ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, struct btrfs_key *ins, struct find_free_extent_ctl *ffe_ctl, bool full_search) { … } static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_ctl, struct btrfs_block_group *bg) { … } static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl, struct btrfs_space_info *space_info, struct btrfs_key *ins) { … } static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl) { … } static int prepare_allocation(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl, struct btrfs_space_info *space_info, struct btrfs_key *ins) { … } /* * walks the btree of allocated extents and find a hole of a given size. * The key ins is changed to record the hole: * ins->objectid == start position * ins->flags = BTRFS_EXTENT_ITEM_KEY * ins->offset == the size of the hole. * Any available blocks before search_start are skipped. * * If there is no suitable free space, we will record the max size of * the free space extent currently. * * The overall logic and call chain: * * find_free_extent() * |- Iterate through all block groups * | |- Get a valid block group * | |- Try to do clustered allocation in that block group * | |- Try to do unclustered allocation in that block group * | |- Check if the result is valid * | | |- If valid, then exit * | |- Jump to next block group * | * |- Push harder to find free extents * |- If not found, re-iterate all block groups */ static noinline int find_free_extent(struct btrfs_root *root, struct btrfs_key *ins, struct find_free_extent_ctl *ffe_ctl) { … } /* * Entry point to the extent allocator. Tries to find a hole that is at least * as big as @num_bytes. * * @root - The root that will contain this extent * * @ram_bytes - The amount of space in ram that @num_bytes take. This * is used for accounting purposes. This value differs * from @num_bytes only in the case of compressed extents. * * @num_bytes - Number of bytes to allocate on-disk. * * @min_alloc_size - Indicates the minimum amount of space that the * allocator should try to satisfy. In some cases * @num_bytes may be larger than what is required and if * the filesystem is fragmented then allocation fails. * However, the presence of @min_alloc_size gives a * chance to try and satisfy the smaller allocation. * * @empty_size - A hint that you plan on doing more COW. This is the * size in bytes the allocator should try to find free * next to the block it returns. This is just a hint and * may be ignored by the allocator. * * @hint_byte - Hint to the allocator to start searching above the byte * address passed. It might be ignored. * * @ins - This key is modified to record the found hole. It will * have the following values: * ins->objectid == start position * ins->flags = BTRFS_EXTENT_ITEM_KEY * ins->offset == the size of the hole. * * @is_data - Boolean flag indicating whether an extent is * allocated for data (true) or metadata (false) * * @delalloc - Boolean flag indicating whether this allocation is for * delalloc or not. If 'true' data_rwsem of block groups * is going to be acquired. * * * Returns 0 when an allocation succeeded or < 0 when an error occurred. In * case -ENOSPC is returned then @ins->offset will contain the size of the * largest available hole the allocator managed to find. */ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, struct btrfs_key *ins, int is_data, int delalloc) { … } int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len, int delalloc) { … } int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, const struct extent_buffer *eb) { … } static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes) { … } static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, u64 parent, u64 root_objectid, u64 flags, u64 owner, u64 offset, struct btrfs_key *ins, int ref_mod, u64 oref_root) { … } static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op) { … } int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 owner, u64 offset, u64 ram_bytes, struct btrfs_key *ins) { … } /* * this is used by the tree logging recovery code. It records that * an extent has been allocated and makes sure to clear the free * space cache bits as well */ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, u64 root_objectid, u64 owner, u64 offset, struct btrfs_key *ins) { … } #ifdef CONFIG_BTRFS_DEBUG /* * Extra safety check in case the extent tree is corrupted and extent allocator * chooses to use a tree block which is already used and locked. */ static bool check_eb_lock_owner(const struct extent_buffer *eb) { … } #else static bool check_eb_lock_owner(struct extent_buffer *eb) { return false; } #endif static struct extent_buffer * btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, int level, u64 owner, enum btrfs_lock_nesting nest) { … } /* * finds a free extent and does all the dirty work required for allocation * returns the tree buffer or an ERR_PTR on error. */ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, const struct btrfs_disk_key *key, int level, u64 hint, u64 empty_size, u64 reloc_src_root, enum btrfs_lock_nesting nest) { … } struct walk_control { … }; /* * This is our normal stage. We are traversing blocks the current snapshot owns * and we are dropping any of our references to any children we are able to, and * then freeing the block once we've processed all of the children. */ #define DROP_REFERENCE … /* * We enter this stage when we have to walk into a child block (meaning we can't * simply drop our reference to it from our current parent node) and there are * more than one reference on it. If we are the owner of any of the children * blocks from the current parent node then we have to do the FULL_BACKREF dance * on them in order to drop our normal ref and add the shared ref. */ #define UPDATE_BACKREF … /* * Decide if we need to walk down into this node to adjust the references. * * @root: the root we are currently deleting * @wc: the walk control for this deletion * @eb: the parent eb that we're currently visiting * @refs: the number of refs for wc->level - 1 * @flags: the flags for wc->level - 1 * @slot: the slot in the eb that we're currently checking * * This is meant to be called when we're evaluating if a node we point to at * wc->level should be read and walked into, or if we can simply delete our * reference to it. We return true if we should walk into the node, false if we * can skip it. * * We have assertions in here to make sure this is called correctly. We assume * that sanity checking on the blocks read to this point has been done, so any * corrupted file systems must have been caught before calling this function. */ static bool visit_node_for_delete(struct btrfs_root *root, struct walk_control *wc, struct extent_buffer *eb, u64 refs, u64 flags, int slot) { … } static noinline void reada_walk_down(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct walk_control *wc, struct btrfs_path *path) { … } /* * helper to process tree block while walking down the tree. * * when wc->stage == UPDATE_BACKREF, this function updates * back refs for pointers in the block. * * NOTE: return value 1 means we should stop walking down. */ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct walk_control *wc) { … } /* * This is used to verify a ref exists for this root to deal with a bug where we * would have a drop_progress key that hadn't been updated properly. */ static int check_ref_exists(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 parent, int level) { … } /* * We may not have an uptodate block, so if we are going to walk down into this * block we need to drop the lock, read it off of the disk, re-lock it and * return to continue dropping the snapshot. */ static int check_next_block_uptodate(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct walk_control *wc, struct extent_buffer *next) { … } /* * If we determine that we don't have to visit wc->level - 1 then we need to * determine if we can drop our reference. * * If we are UPDATE_BACKREF then we will not, we need to update our backrefs. * * If we are DROP_REFERENCE this will figure out if we need to drop our current * reference, skipping it if we dropped it from a previous incompleted drop, or * dropping it if we still have a reference to it. */ static int maybe_drop_reference(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct walk_control *wc, struct extent_buffer *next, u64 owner_root) { … } /* * helper to process tree block pointer. * * when wc->stage == DROP_REFERENCE, this function checks * reference count of the block pointed to. if the block * is shared and we need update back refs for the subtree * rooted at the block, this function changes wc->stage to * UPDATE_BACKREF. if the block is shared and there is no * need to update back, this function drops the reference * to the block. * * NOTE: return value 1 means we should stop walking down. */ static noinline int do_walk_down(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct walk_control *wc) { … } /* * helper to process tree block while walking up the tree. * * when wc->stage == DROP_REFERENCE, this function drops * reference count on the block. * * when wc->stage == UPDATE_BACKREF, this function changes * wc->stage back to DROP_REFERENCE if we changed wc->stage * to UPDATE_BACKREF previously while processing the block. * * NOTE: return value 1 means we should stop walking up. */ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct walk_control *wc) { … } /* * walk_down_tree consists of two steps. * * walk_down_proc(). Look up the reference count and reference of our current * wc->level. At this point path->nodes[wc->level] should be populated and * uptodate, and in most cases should already be locked. If we are in * DROP_REFERENCE and our refcount is > 1 then we've entered a shared node and * we can walk back up the tree. If we are UPDATE_BACKREF we have to set * FULL_BACKREF on this node if it's not already set, and then do the * FULL_BACKREF conversion dance, which is to drop the root reference and add * the shared reference to all of this nodes children. * * do_walk_down(). This is where we actually start iterating on the children of * our current path->nodes[wc->level]. For DROP_REFERENCE that means dropping * our reference to the children that return false from visit_node_for_delete(), * which has various conditions where we know we can just drop our reference * without visiting the node. For UPDATE_BACKREF we will skip any children that * visit_node_for_delete() returns false for, only walking down when necessary. * The bulk of the work for UPDATE_BACKREF occurs in the walk_up_tree() part of * snapshot deletion. */ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct walk_control *wc) { … } /* * walk_up_tree() is responsible for making sure we visit every slot on our * current node, and if we're at the end of that node then we call * walk_up_proc() on our current node which will do one of a few things based on * our stage. * * UPDATE_BACKREF. If we wc->level is currently less than our wc->shared_level * then we need to walk back up the tree, and then going back down into the * other slots via walk_down_tree to update any other children from our original * wc->shared_level. Once we're at or above our wc->shared_level we can switch * back to DROP_REFERENCE, lookup the current nodes refs and flags, and carry on. * * DROP_REFERENCE. If our refs == 1 then we're going to free this tree block. * If we're level 0 then we need to btrfs_dec_ref() on all of the data extents * in our current leaf. After that we call btrfs_free_tree_block() on the * current node and walk up to the next node to walk down the next slot. */ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct walk_control *wc, int max_level) { … } /* * drop a subvolume tree. * * this function traverses the tree freeing any blocks that only * referenced by the tree. * * when a shared tree block is found. this function decreases its * reference count by one. if update_ref is true, this function * also make sure backrefs for the shared block and all lower level * blocks are properly updated. * * If called with for_reloc == 0, may exit early with -EAGAIN */ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) { … } /* * drop subtree rooted at tree block 'node'. * * NOTE: this function will unlock and release tree block 'node' * only used by relocation code */ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *node, struct extent_buffer *parent) { … } /* * Unpin the extent range in an error context and don't add the space back. * Errors are not propagated further. */ void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end) { … } /* * It used to be that old block groups would be left around forever. * Iterating over them would be enough to trim unused space. Since we * now automatically remove them, we also need to iterate over unallocated * space. * * We don't want a transaction for this since the discard may take a * substantial amount of time. We don't require that a transaction be * running, but we do need to take a running transaction into account * to ensure that we're not discarding chunks that were released or * allocated in the current transaction. * * Holding the chunks lock will prevent other threads from allocating * or releasing chunks, but it won't prevent a running transaction * from committing and releasing the memory that the pending chunks * list head uses. For that, we need to take a reference to the * transaction and hold the commit root sem. We only need to hold * it while performing the free space search since we have already * held back allocations. */ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) { … } /* * Trim the whole filesystem by: * 1) trimming the free space in each block group * 2) trimming the unallocated space on each device * * This will also continue trimming even if a block group or device encounters * an error. The return value will be the last error, or 0 if nothing bad * happens. */ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) { … }