// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2008 Oracle. All rights reserved. */ #include <linux/sched.h> #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/list_sort.h> #include <linux/iversion.h> #include "misc.h" #include "ctree.h" #include "tree-log.h" #include "disk-io.h" #include "locking.h" #include "backref.h" #include "compression.h" #include "qgroup.h" #include "block-group.h" #include "space-info.h" #include "inode-item.h" #include "fs.h" #include "accessors.h" #include "extent-tree.h" #include "root-tree.h" #include "dir-item.h" #include "file-item.h" #include "file.h" #include "orphan.h" #include "tree-checker.h" #define MAX_CONFLICT_INODES … /* magic values for the inode_only field in btrfs_log_inode: * * LOG_INODE_ALL means to log everything * LOG_INODE_EXISTS means to log just enough to recreate the inode * during log replay */ enum { … }; /* * directory trouble cases * * 1) on rename or unlink, if the inode being unlinked isn't in the fsync * log, we must force a full commit before doing an fsync of the directory * where the unlink was done. * ---> record transid of last unlink/rename per directory * * mkdir foo/some_dir * normal commit * rename foo/some_dir foo2/some_dir * mkdir foo/some_dir * fsync foo/some_dir/some_file * * The fsync above will unlink the original some_dir without recording * it in its new location (foo2). After a crash, some_dir will be gone * unless the fsync of some_file forces a full commit * * 2) we must log any new names for any file or dir that is in the fsync * log. ---> check inode while renaming/linking. * * 2a) we must log any new names for any file or dir during rename * when the directory they are being removed from was logged. * ---> check inode and old parent dir during rename * * 2a is actually the more important variant. 
With the extra logging * a crash might unlink the old name without recreating the new one * * 3) after a crash, we must go through any directories with a link count * of zero and redo the rm -rf * * mkdir f1/foo * normal commit * rm -rf f1/foo * fsync(f1) * * The directory f1/foo was fully removed from the FS, but fsync was never * called on f1/foo, only its parent dir. After a crash the rm -rf must * be replayed. This must be able to recurse down the entire * directory tree. The inode link count fixup code takes care of the * ugly details. */ /* * stages for the tree walking. The first * stage (0) is to only pin down the blocks we find, * the second stage (1) is to make sure that all the inodes * we find in the log are created in the subvolume. * * The last stage is to deal with directories and links and extents * and all the other fun semantics */ enum { … }; static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, int inode_only, struct btrfs_log_ctx *ctx); static int link_to_fixup_dir(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid); static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_root *log, struct btrfs_path *path, u64 dirid, int del_all); static void wait_log_commit(struct btrfs_root *root, int transid); /* * tree logging is a special write ahead log used to make sure that * fsyncs and O_SYNCs can happen without doing full tree commits. * * Full tree commits are expensive because they require commonly * modified blocks to be recowed, creating many dirty pages in the * extent tree and a 4x-6x higher write load than ext3. * * Instead of doing a tree commit on every fsync, we use the * key ranges and transaction ids to find items for a given file or directory * that have changed in this transaction. 
Those items are copied into * a special tree (one per subvolume root), that tree is written to disk * and then the fsync is considered complete. * * After a crash, items are copied out of the log-tree back into the * subvolume tree. Any file data extents found are recorded in the extent * allocation tree, and the log-tree freed. * * The log tree is read three times, once to pin down all the extents it is * using in ram, once to create all the inodes logged in the tree * and once to do all the other items. */ static struct inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root) { … } /* * start a sub transaction and setup the log tree * this increments the log tree writer count to make the people * syncing the tree wait for us to finish */ static int start_log_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_log_ctx *ctx) { … } /* * returns 0 if there was a log transaction running and we were able * to join, or returns -ENOENT if there were no transactions * in progress */ static int join_running_log_trans(struct btrfs_root *root) { … } /* * This either makes the current running log transaction wait * until you call btrfs_end_log_trans() or it makes any future * log transactions wait until you call btrfs_end_log_trans() */ void btrfs_pin_log_trans(struct btrfs_root *root) { … } /* * indicate we're done making changes to the log tree * and wake up anyone waiting to do a sync */ void btrfs_end_log_trans(struct btrfs_root *root) { … } /* * the walk control struct is used to pass state down the chain when * processing the log tree. The stage field tells us which part * of the log tree processing we are currently doing. 
The others * are state fields used for that specific part */ struct walk_control { … }; /* * process_func used to pin down extents, write them or wait on them */ static int process_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, struct walk_control *wc, u64 gen, int level) { … } /* * Item overwrite used by replay and tree logging. eb, slot and key all refer * to the src data we are copying out. * * root is the tree we are copying into, and path is a scratch * path for use in this function (it should be released on entry and * will be released on exit). * * If the key is already in the destination tree the existing item is * overwritten. If the existing item isn't big enough, it is extended. * If it is too large, it is truncated. * * If the key isn't in the destination yet, a new item is inserted. */ static int overwrite_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct extent_buffer *eb, int slot, struct btrfs_key *key) { … } static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len, struct fscrypt_str *name) { … } /* * simple helper to read an inode off the disk from a given root * This can only be called for subvolume roots and not for the log */ static noinline struct inode *read_one_inode(struct btrfs_root *root, u64 objectid) { … } /* replays a single extent in 'eb' at 'slot' with 'key' into the * subvolume 'root'. path is released on entry and should be released * on exit. * * extents in the log tree have not been allocated out of the extent * tree yet. So, this completes the allocation, taking a reference * as required if the extent already exists or creating a new extent * if it isn't in the extent allocation tree yet. * * The extent is inserted into the file, dropping any existing extents * from the file that overlap the new one. 
*/ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct extent_buffer *eb, int slot, struct btrfs_key *key) { … } static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, const struct fscrypt_str *name) { … } /* * when cleaning up conflicts between the directory names in the * subvolume, directory names in the log and directory names in the * inode back references, we may have to unlink inodes from directories. * * This is a helper function to do the unlink of a specific directory * item */ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_inode *dir, struct btrfs_dir_item *di) { … } /* * See if a given name and sequence number found in an inode back reference are * already in a directory and correctly point to this inode. * * Returns: < 0 on error, 0 if the directory entry does not exist and 1 if it * exists. */ static noinline int inode_in_dir(struct btrfs_root *root, struct btrfs_path *path, u64 dirid, u64 objectid, u64 index, struct fscrypt_str *name) { … } /* * helper function to check a log tree for a named back reference in * an inode. This is used to decide if a back reference that is * found in the subvolume conflicts with what we find in the log. * * inode backreferences may have multiple refs in a single item, * during replay we process one reference at a time, and we don't * want to delete valid links to a file from the subvolume if that * link is also in the log. 
*/ static noinline int backref_in_log(struct btrfs_root *log, struct btrfs_key *key, u64 ref_objectid, const struct fscrypt_str *name) { … } static inline int __add_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_root *log_root, struct btrfs_inode *dir, struct btrfs_inode *inode, u64 inode_objectid, u64 parent_objectid, u64 ref_index, struct fscrypt_str *name) { … } static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, struct fscrypt_str *name, u64 *index, u64 *parent_objectid) { … } static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, struct fscrypt_str *name, u64 *index) { … } /* * Take an inode reference item from the log tree and iterate all names from the * inode reference item in the subvolume tree with the same key (if it exists). * For any name that is not in the inode reference item from the log tree, do a * proper unlink of that name (that is, remove its entry from the inode * reference item and both dir index keys). */ static int unlink_old_inode_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_inode *inode, struct extent_buffer *log_eb, int log_slot, struct btrfs_key *key) { … } /* * replay one inode back reference item found in the log tree. * eb, slot and key refer to the buffer and key found in the log tree. * root is the destination we are replaying into, and path is for temp * use by this function. (it should be released on return). 
*/ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_root *log, struct btrfs_path *path, struct extent_buffer *eb, int slot, struct btrfs_key *key) { … } static int count_inode_extrefs(struct btrfs_inode *inode, struct btrfs_path *path) { … } static int count_inode_refs(struct btrfs_inode *inode, struct btrfs_path *path) { … } /* * There are a few corners where the link count of the file can't * be properly maintained during replay. So, instead of adding * lots of complexity to the log code, we just scan the backrefs * for any file that has been through replay. * * The scan will update the link count on the inode to reflect the * number of back refs found. If it goes down to zero, the iput * will free the inode. */ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, struct inode *inode) { … } static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path) { … } /* * record a given inode in the fixup dir so we can check its link * count when replay is done. The link count is incremented here * so the inode won't go away until we check it */ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid) { … } /* * when replaying the log for a directory, we only insert names * for inodes that actually exist. 
This means an fsync on a directory * does not implicitly fsync all the new files in it */ static noinline int insert_one_name(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 dirid, u64 index, const struct fscrypt_str *name, struct btrfs_key *location) { … } static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_path *path, struct btrfs_dir_item *dst_di, const struct btrfs_key *log_key, u8 log_flags, bool exists) { … } /* * take a single entry in a log directory item and replay it into * the subvolume. * * if a conflicting item exists in the subdirectory already, * the inode it points to is unlinked and put into the link count * fix up tree. * * If a name from the log points to a file or directory that does * not exist in the FS, it is skipped. fsyncs on directories * do not force down inodes inside that directory, just changes to the * names or unlinks in a directory. * * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a * non-existing inode) and 1 if the name was replayed. */ static noinline int replay_one_name(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct extent_buffer *eb, struct btrfs_dir_item *di, struct btrfs_key *key) { … } /* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct extent_buffer *eb, int slot, struct btrfs_key *key) { … } /* * directory replay has two parts. There are the standard directory * items in the log copied from the subvolume, and range items * created in the log while the subvolume was logged. * * The range items tell us which parts of the key space the log * is authoritative for. 
During replay, if a key in the subvolume * directory is in a logged range item, but not actually in the log * that means it was deleted from the directory before the fsync * and should be removed. */ static noinline int find_dir_range(struct btrfs_root *root, struct btrfs_path *path, u64 dirid, u64 *start_ret, u64 *end_ret) { … } /* * this looks for a given directory item in the log. If the directory * item is not in the log, the item is removed and the inode it points * to is unlinked */ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_path *path, struct btrfs_path *log_path, struct inode *dir, struct btrfs_key *dir_key) { … } static int replay_xattr_deletes(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_root *log, struct btrfs_path *path, const u64 ino) { … } /* * deletion replay happens before we copy any new directory items * out of the log or out of backreferences from inodes. It * scans the log to find ranges of keys that log is authoritative for, * and then scans the directory to find items in those ranges that are * not present in the log. * * Anything we don't find in the log is unlinked and removed from the * directory. */ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_root *log, struct btrfs_path *path, u64 dirid, int del_all) { … } /* * the process_func used to replay items from the log tree. This * gets called in two different stages. The first stage just looks * for inodes and makes sure they are all copied into the subvolume. * * The second stage copies all the other item types from the log into * the subvolume. The two stage approach is slower, but gets rid of * lots of complexity around inodes referencing other inodes that exist * only in the log (references come from either directory items or inode * back refs). 
*/ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, struct walk_control *wc, u64 gen, int level) { … } /* * Correctly adjust the reserved bytes occupied by a log tree extent buffer */ static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) { … } static int clean_log_buffer(struct btrfs_trans_handle *trans, struct extent_buffer *eb) { … } static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int *level, struct walk_control *wc) { … } static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int *level, struct walk_control *wc) { … } /* * drop the reference count on the tree rooted at 'snap'. This traverses * the tree freeing any blocks that have a ref count of zero after being * decremented. */ static int walk_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct walk_control *wc) { … } /* * helper function to update the item for a given subvolumes log root * in the tree of log roots */ static int update_log_root(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_root_item *root_item) { … } static void wait_log_commit(struct btrfs_root *root, int transid) { … } static void wait_for_writer(struct btrfs_root *root) { … } void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, struct btrfs_inode *inode) { … } void btrfs_init_log_ctx_scratch_eb(struct btrfs_log_ctx *ctx) { … } void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx) { … } static inline void btrfs_remove_log_ctx(struct btrfs_root *root, struct btrfs_log_ctx *ctx) { … } /* * Invoked in log mutex context, or be sure there is no other task which * can access the list. */ static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root, int index, int error) { … } /* * Sends a given tree log down to the disk and updates the super blocks to * record it. 
When this call is done, you know that any inodes previously * logged are safely on disk only if it returns 0. * * Any other return value means you need to call btrfs_commit_transaction. * Some of the edge cases for fsyncing directories that have had unlinks * or renames done in the past mean that sometimes the only safe * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, * that has happened. */ int btrfs_sync_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_log_ctx *ctx) { … } static void free_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *log) { … } /* * free all the extents used by the tree log. This should be called * at commit time of the full transaction */ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) { … } int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { … } /* * Check if an inode was logged in the current transaction. This correctly deals * with the case where the inode was logged but has a logged_trans of 0, which * happens if the inode is evicted and loaded again, as logged_trans is an in * memory only field (not persisted). * * Returns 1 if the inode was logged before in the transaction, 0 if it was not, * and < 0 on error. */ static int inode_logged(const struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path_in) { … } /* * Delete a directory entry from the log if it exists. 
* * Returns < 0 on error * 1 if the entry does not exist * 0 if the entry existed and was successfully deleted */ static int del_logged_dentry(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_path *path, u64 dir_ino, const struct fscrypt_str *name, u64 index) { … } /* * If both a file and directory are logged, and unlinks or renames are * mixed in, we have a few interesting corners: * * create file X in dir Y * link file X to X.link in dir Y * fsync file X * unlink file X but leave X.link * fsync dir Y * * After a crash we would expect only X.link to exist. But file X * didn't get fsync'd again so the log has back refs for X and X.link. * * We solve this by removing directory entries and inode backrefs from the * log when a file that was logged in the current transaction is * unlinked. Any later fsync will include the updated log entries, and * we'll be able to reconstruct the proper directory items from backrefs. * * This optimization allows us to avoid relogging the entire inode * or the entire directory. */ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct fscrypt_str *name, struct btrfs_inode *dir, u64 index) { … } /* see comments for btrfs_del_dir_entries_in_log */ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct fscrypt_str *name, struct btrfs_inode *inode, u64 dirid) { … } /* * creates a range item in the log for 'dirid'. first_offset and * last_offset tell us which parts of the key space the log should * be considered authoritative for. 
*/ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_path *path, u64 dirid, u64 first_offset, u64 last_offset) { … } static int flush_dir_items_batch(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct extent_buffer *src, struct btrfs_path *dst_path, int start_slot, int count) { … } static int clone_leaf(struct btrfs_path *path, struct btrfs_log_ctx *ctx) { … } static int process_dir_items_leaf(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, struct btrfs_path *dst_path, struct btrfs_log_ctx *ctx, u64 *last_old_dentry_offset) { … } /* * log all the items included in the current transaction for a given * directory. This also creates the range items in the log tree required * to replay anything deleted before the fsync */ static noinline int log_dir_items(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, struct btrfs_path *dst_path, struct btrfs_log_ctx *ctx, u64 min_offset, u64 *last_offset_ret) { … } /* * If the inode was logged before and it was evicted, then its * last_dir_index_offset is (u64)-1, so we don't know the value of the last index * key offset. If that's the case, search for it and update the inode. This * is to avoid lookups in the log tree every time we try to insert a dir index * key from a leaf changed in the current transaction, and to allow us to always * do batch insertions of dir index keys. */ static int update_last_dir_index_offset(struct btrfs_inode *inode, struct btrfs_path *path, const struct btrfs_log_ctx *ctx) { … } /* * logging directories is very similar to logging inodes. We find all the items * from the current transaction and write them to the log. * * The recovery code scans the directory in the subvolume, and if it finds a * key in the range logged that is not present in the log tree, then it means * that dir entry was unlinked during the transaction. 
* * In order for that scan to work, we must include one key smaller than * the smallest logged by this transaction and one key larger than the largest * key logged by this transaction. */ static noinline int log_directory_changes(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, struct btrfs_path *dst_path, struct btrfs_log_ctx *ctx) { … } /* * a helper function to drop items from the log before we relog an * inode. max_key_type indicates the highest item type to remove. * This cannot be run for file data extents because it does not * free the extents they point to. */ static int drop_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_path *path, struct btrfs_inode *inode, int max_key_type) { … } static int truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *log_root, struct btrfs_inode *inode, u64 new_size, u32 min_type) { … } static void fill_inode_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf, struct btrfs_inode_item *item, struct inode *inode, int log_inode_only, u64 logged_isize) { … } static int log_inode_item(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_path *path, struct btrfs_inode *inode, bool inode_item_dropped) { … } static int log_csums(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_root *log_root, struct btrfs_ordered_sum *sums) { … } static noinline int copy_items(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *dst_path, struct btrfs_path *src_path, int start_slot, int nr, int inode_only, u64 logged_isize, struct btrfs_log_ctx *ctx) { … } static int extent_cmp(void *priv, const struct list_head *a, const struct list_head *b) { … } static int log_extent_csums(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_root *log_root, const struct extent_map *em, struct btrfs_log_ctx *ctx) { … } static int log_one_extent(struct btrfs_trans_handle 
*trans, struct btrfs_inode *inode, const struct extent_map *em, struct btrfs_path *path, struct btrfs_log_ctx *ctx) { … } /* * Log all prealloc extents beyond the inode's i_size to make sure we do not * lose them after doing a full/fast fsync and replaying the log. We scan the * subvolume's root instead of iterating the inode's extent map tree because * otherwise we can log incorrect extent items based on extent map conversion. * That can happen due to the fact that extent maps are merged when they * are not in the extent map tree's list of modified extents. */ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, struct btrfs_log_ctx *ctx) { … } static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, struct btrfs_log_ctx *ctx) { … } static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, struct btrfs_path *path, u64 *size_ret) { … } /* * At the moment we always log all xattrs. This is to figure out at log replay * time which xattrs must have their deletion replayed. If a xattr is missing * in the log tree and exists in the fs/subvol tree, we delete it. This is * because if a xattr is deleted, the inode is fsynced and a power failure * happens, causing the log to be replayed the next time the fs is mounted, * we want the xattr to not exist anymore (same behaviour as other filesystems * with a journal, ext3/4, xfs, f2fs, etc). 
*/ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, struct btrfs_path *dst_path, struct btrfs_log_ctx *ctx) { … } /* * When using the NO_HOLES feature if we punched a hole that causes the * deletion of entire leafs or all the extent items of the first leaf (the one * that contains the inode item and references) we may end up not processing * any extents, because there are no leafs with a generation matching the * current transaction that have extent items for our inode. So we need to find * if any holes exist and then log them. We also need to log holes after any * truncate operation that changes the inode's size. */ static int btrfs_log_holes(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path) { … } /* * When we are logging a new inode X, check if it doesn't have a reference that * matches the reference from some other inode Y created in a past transaction * and that was renamed in the current transaction. If we don't do this, then at * log replay time we can lose inode Y (and all its files if it's a directory): * * mkdir /mnt/x * echo "hello world" > /mnt/x/foobar * sync * mv /mnt/x /mnt/y * mkdir /mnt/x # or touch /mnt/x * xfs_io -c fsync /mnt/x * <power fail> * mount fs, trigger log replay * * After the log replay procedure, we would lose the first directory and all its * files (file foobar). 
* For the case where inode Y is not a directory we simply end up losing it: * * echo "123" > /mnt/foo * sync * mv /mnt/foo /mnt/bar * echo "abc" > /mnt/foo * xfs_io -c fsync /mnt/foo * <power fail> * * We also need this for cases where a snapshot entry is replaced by some other * entry (file or directory) otherwise we end up with an unreplayable log due to * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as * if it were a regular entry: * * mkdir /mnt/x * btrfs subvolume snapshot /mnt /mnt/x/snap * btrfs subvolume delete /mnt/x/snap * rmdir /mnt/x * mkdir /mnt/x * fsync /mnt/x or fsync some new file inside it * <power fail> * * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in * the same transaction. */ static int btrfs_check_ref_name_override(struct extent_buffer *eb, const int slot, const struct btrfs_key *key, struct btrfs_inode *inode, u64 *other_ino, u64 *other_parent) { … } /* * Check if we need to log an inode. This is used in contexts where while * logging an inode we need to log another inode (either that it exists or in * full mode). This is used instead of btrfs_inode_in_log() because the latter * requires the inode to be in the log and have the log transaction committed, * while here we do not care if the log transaction was already committed - our * caller will commit the log later - and we want to avoid logging an inode * multiple times when multiple tasks have joined the same log transaction. */ static bool need_log_inode(const struct btrfs_trans_handle *trans, struct btrfs_inode *inode) { … } struct btrfs_dir_list { … }; /* * Log the inodes of the new dentries of a directory. * See process_dir_items_leaf() for details about why it is needed. * This is a recursive operation - if an existing dentry corresponds to a * directory, that directory's new entries are logged too (same behaviour as * ext3/4, xfs, f2fs, reiserfs, nilfs2). 
Note that when logging the inodes * the dentries point to we do not acquire their VFS lock, otherwise lockdep * complains about the following circular lock dependency / possible deadlock: * * CPU0 CPU1 * ---- ---- * lock(&type->i_mutex_dir_key#3/2); * lock(sb_internal#2); * lock(&type->i_mutex_dir_key#3/2); * lock(&sb->s_type->i_mutex_key#14); * * Where sb_internal is the lock (a counter that works as a lock) acquired by * sb_start_intwrite() in btrfs_start_transaction(). * Not acquiring the VFS lock of the inodes is still safe because: * * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible * that while logging the inode new references (names) are added or removed * from the inode, leaving the logged inode item with a link count that does * not match the number of logged inode reference items. This is fine because * at log replay time we compute the real number of links and correct the * link count in the inode item (see replay_one_buffer() and * link_to_fixup_dir()); * * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that * while logging the inode's items new index items (key type * BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item * has a size that doesn't match the sum of the lengths of all the logged * names - this is ok, not a problem, because at log replay time we set the * directory's i_size to the correct value (see replay_one_name() and * overwrite_item()). 
*/ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, struct btrfs_inode *start_inode, struct btrfs_log_ctx *ctx) { … } struct btrfs_ino_list { … }; static void free_conflicting_inodes(struct btrfs_log_ctx *ctx) { … } static int conflicting_inode_is_dir(struct btrfs_root *root, u64 ino, struct btrfs_path *path) { … } static int add_conflicting_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 ino, u64 parent, struct btrfs_log_ctx *ctx) { … } static int log_conflicting_inodes(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_log_ctx *ctx) { … } static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_key *min_key, const struct btrfs_key *max_key, struct btrfs_path *path, struct btrfs_path *dst_path, const u64 logged_isize, const int inode_only, struct btrfs_log_ctx *ctx, bool *need_log_inode_item) { … } static int insert_delayed_items_batch(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_path *path, const struct btrfs_item_batch *batch, const struct btrfs_delayed_item *first_item) { … } static int log_delayed_insertion_items(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, const struct list_head *delayed_ins_list, struct btrfs_log_ctx *ctx) { … } static int log_delayed_deletions_full(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, const struct list_head *delayed_del_list, struct btrfs_log_ctx *ctx) { … } static int batch_delete_dir_index_items(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, struct btrfs_log_ctx *ctx, const struct list_head *delayed_del_list, const struct btrfs_delayed_item *first, const struct btrfs_delayed_item **last_ret) { … } static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, const struct list_head 
*delayed_del_list, struct btrfs_log_ctx *ctx) { … } static int log_delayed_deletion_items(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, const struct list_head *delayed_del_list, struct btrfs_log_ctx *ctx) { … } /* * Similar logic as for log_new_dir_dentries(), but it iterates over the delayed * items instead of the subvolume tree. */ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, const struct list_head *delayed_ins_list, struct btrfs_log_ctx *ctx) { … } /* log a single inode in the tree log. * At least one parent directory for this inode must exist in the tree * or be logged already. * * Any items from this inode changed by the current transaction are copied * to the log tree. An extra reference is taken on any extents in this * file, allowing us to avoid a whole pile of corner cases around logging * blocks that have been removed from the tree. * * See LOG_INODE_ALL and related defines for a description of what inode_only * does. * * This handles both files and directories. */ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, int inode_only, struct btrfs_log_ctx *ctx) { … } static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_log_ctx *ctx) { … } static int log_new_ancestors(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_log_ctx *ctx) { … } static int log_new_ancestors_fast(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct dentry *parent, struct btrfs_log_ctx *ctx) { … } static int log_all_new_ancestors(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct dentry *parent, struct btrfs_log_ctx *ctx) { … } /* * helper function around btrfs_log_inode to make sure newly created * parent directories also end up in the log. 
A minimal inode and backref * only logging is done of any parent directories that are older than * the last committed transaction */ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct dentry *parent, int inode_only, struct btrfs_log_ctx *ctx) { … } /* * it is not safe to log dentry if the chunk root has added new * chunks. This returns 0 if the dentry was logged, and 1 otherwise. * If this returns 1, you must commit the transaction to safely get your * data on disk. */ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct dentry *dentry, struct btrfs_log_ctx *ctx) { … } /* * should be called during mount to recover and replay any log trees * from the FS */ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) { … } /* * there are some corner cases where we want to force a full * commit instead of allowing a directory to be logged. * * They revolve around files that were unlinked from the directory, and * this function updates the parent directory so that a full commit is * properly done if it is fsync'd later after the unlinks are done. * * Must be called before the unlink operations (updates to the subvolume tree, * inodes, etc) are done. */ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, bool for_rename) { … } /* * Make sure that if someone attempts to fsync the parent directory of a deleted * snapshot, it ends up triggering a transaction commit. This is to guarantee * that after replaying the log tree of the parent directory's root we will not * see the snapshot anymore and at log replay time we will not see any log tree * corresponding to the deleted snapshot's root, which could lead to replaying * it after replaying the log tree of the parent directory (which would replay * the snapshot delete operation).
* * Must be called before the actual snapshot destroy operation (updates to the * parent root and tree of tree roots trees, etc) are done. */ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, struct btrfs_inode *dir) { … } /* * Call this when creating a subvolume in a directory. * Because we don't commit a transaction when creating a subvolume, we can't * allow the directory pointing to the subvolume to be logged with an entry that * points to an unpersisted root if we are still in the transaction used to * create the subvolume, so make any attempt to log the directory result in a * full log sync. * Also we don't need to worry about renames, since btrfs_rename() marks the log * for full commit when renaming a subvolume. */ void btrfs_record_new_subvolume(const struct btrfs_trans_handle *trans, struct btrfs_inode *dir) { … } /* * Update the log after adding a new name for an inode. * * @trans: Transaction handle. * @old_dentry: The dentry associated with the old name and the old * parent directory. * @old_dir: The inode of the previous parent directory for the case * of a rename. For a link operation, it must be NULL. * @old_dir_index: The index number associated with the old name, meaningful * only for rename operations (when @old_dir is not NULL). * Ignored for link operations. * @parent: The dentry associated with the directory under which the * new name is located. * * Call this after adding a new name for an inode, as a result of a link or * rename operation, and it will properly update the log to reflect the new name. */ void btrfs_log_new_name(struct btrfs_trans_handle *trans, struct dentry *old_dentry, struct btrfs_inode *old_dir, u64 old_dir_index, struct dentry *parent) { … }