tree-log.c | Explore in Territory

// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include <linux/iversion.h>
#include "misc.h"
#include "ctree.h"
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "backref.h"
#include "compression.h"
#include "qgroup.h"
#include "block-group.h"
#include "space-info.h"
#include "inode-item.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
#include "dir-item.h"
#include "file-item.h"
#include "file.h"
#include "orphan.h"
#include "tree-checker.h"

#define MAX_CONFLICT_INODES …

/* magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 */
enum { … };

/*
 * directory trouble cases
 *
 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
 * log, we must force a full commit before doing an fsync of the directory
 * where the unlink was done.
 * ---> record transid of last unlink/rename per directory
 *
 * mkdir foo/some_dir
 * normal commit
 * rename foo/some_dir foo2/some_dir
 * mkdir foo/some_dir
 * fsync foo/some_dir/some_file
 *
 * The fsync above will unlink the original some_dir without recording
 * it in its new location (foo2).  After a crash, some_dir will be gone
 * unless the fsync of some_file forces a full commit
 *
 * 2) we must log any new names for any file or dir that is in the fsync
 * log. ---> check inode while renaming/linking.
 *
 * 2a) we must log any new names for any file or dir during rename
 * when the directory they are being removed from was logged.
 * ---> check inode and old parent dir during rename
 *
 *  2a is actually the more important variant.  With the extra logging
 *  a crash might unlink the old name without recreating the new one
 *
 * 3) after a crash, we must go through any directories with a link count
 * of zero and redo the rm -rf
 *
 * mkdir f1/foo
 * normal commit
 * rm -rf f1/foo
 * fsync(f1)
 *
 * The directory f1 was fully removed from the FS, but fsync was never
 * called on f1, only its parent dir.  After a crash the rm -rf must
 * be replayed.  This must be able to recurse down the entire
 * directory tree.  The inode link count fixup code takes care of the
 * ugly details.
 */

/*
 * stages for the tree walking.  The first
 * stage (0) is to only pin down the blocks we find
 * the second stage (1) is to make sure that all the inodes
 * we find in the log are created in the subvolume.
 *
 * The last stage is to deal with directories and links and extents
 * and all the other fun semantics
 */
enum { … };

static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_inode *inode,
			   int inode_only,
			   struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 objectid);
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all);
static void wait_log_commit(struct btrfs_root *root, int transid);

/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree an 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction.  Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree.  Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree freed.
 *
 * The log tree is read three times, once to pin down all the extents it is
 * using in ram and once, once to create all the inodes logged in the tree
 * and once to do all the other items.
 */

static struct inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root)
{ … }

/*
 * start a sub transaction and setup the log tree
 * this increments the log tree writer count to make the people
 * syncing the tree wait for us to finish
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_log_ctx *ctx)
{ … }

/*
 * returns 0 if there was a log transaction running and we were able
 * to join, or returns -ENOENT if there were not transactions
 * in progress
 */
static int join_running_log_trans(struct btrfs_root *root)
{ … }

/*
 * This either makes the current running log transaction wait
 * until you call btrfs_end_log_trans() or it makes any future
 * log transactions wait until you call btrfs_end_log_trans()
 */
void btrfs_pin_log_trans(struct btrfs_root *root)
{ … }

/*
 * indicate we're done making changes to the log tree
 * and wake up anyone waiting to do a sync
 */
void btrfs_end_log_trans(struct btrfs_root *root)
{ … }

/*
 * the walk control struct is used to pass state down the chain when
 * processing the log tree.  The stage field tells us which part
 * of the log tree processing we are currently doing.  The others
 * are state fields used for that specific part
 */
struct walk_control { … };

/*
 * process_func used to pin down extents, write them or wait on them
 */
static int process_one_buffer(struct btrfs_root *log,
			      struct extent_buffer *eb,
			      struct walk_control *wc, u64 gen, int level)
{ … }

/*
 * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 * to the src data we are copying out.
 *
 * root is the tree we are copying into, and path is a scratch
 * path for use in this function (it should be released on entry and
 * will be released on exit).
 *
 * If the key is already in the destination tree the existing item is
 * overwritten.  If the existing item isn't big enough, it is extended.
 * If it is too large, it is truncated.
 *
 * If the key isn't in the destination yet, a new item is inserted.
 */
static int overwrite_item(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root,
			  struct btrfs_path *path,
			  struct extent_buffer *eb, int slot,
			  struct btrfs_key *key)
{ … }

static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len,
			       struct fscrypt_str *name)
{ … }

/*
 * simple helper to read an inode off the disk from a given root
 * This can only be called for subvolume roots and not for the log
 */
static noinline struct inode *read_one_inode(struct btrfs_root *root,
					     u64 objectid)
{ … }

/* replays a single extent in 'eb' at 'slot' with 'key' into the
 * subvolume 'root'.  path is released on entry and should be released
 * on exit.
 *
 * extents in the log tree have not been allocated out of the extent
 * tree yet.  So, this completes the allocation, taking a reference
 * as required if the extent already exists or creating a new extent
 * if it isn't in the extent allocation tree yet.
 *
 * The extent is inserted into the file, dropping any existing extents
 * from the file that overlap the new one.
 */
static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct extent_buffer *eb, int slot,
				      struct btrfs_key *key)
{ … }

static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans,
				       struct btrfs_inode *dir,
				       struct btrfs_inode *inode,
				       const struct fscrypt_str *name)
{ … }

/*
 * when cleaning up conflicts between the directory names in the
 * subvolume, directory names in the log and directory names in the
 * inode back references, we may have to unlink inodes from directories.
 *
 * This is a helper function to do the unlink of a specific directory
 * item
 */
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				      struct btrfs_path *path,
				      struct btrfs_inode *dir,
				      struct btrfs_dir_item *di)
{ … }

/*
 * See if a given name and sequence number found in an inode back reference are
 * already in a directory and correctly point to this inode.
 *
 * Returns: < 0 on error, 0 if the directory entry does not exists and 1 if it
 * exists.
 */
static noinline int inode_in_dir(struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 dirid, u64 objectid, u64 index,
				 struct fscrypt_str *name)
{ … }

/*
 * helper function to check a log tree for a named back reference in
 * an inode.  This is used to decide if a back reference that is
 * found in the subvolume conflicts with what we find in the log.
 *
 * inode backreferences may have multiple refs in a single item,
 * during replay we process one reference at a time, and we don't
 * want to delete valid links to a file from the subvolume if that
 * link is also in the log.
 */
static noinline int backref_in_log(struct btrfs_root *log,
				   struct btrfs_key *key,
				   u64 ref_objectid,
				   const struct fscrypt_str *name)
{ … }

static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  struct btrfs_root *log_root,
				  struct btrfs_inode *dir,
				  struct btrfs_inode *inode,
				  u64 inode_objectid, u64 parent_objectid,
				  u64 ref_index, struct fscrypt_str *name)
{ … }

static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			     struct fscrypt_str *name, u64 *index,
			     u64 *parent_objectid)
{ … }

static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			  struct fscrypt_str *name, u64 *index)
{ … }

/*
 * Take an inode reference item from the log tree and iterate all names from the
 * inode reference item in the subvolume tree with the same key (if it exists).
 * For any name that is not in the inode reference item from the log tree, do a
 * proper unlink of that name (that is, remove its entry from the inode
 * reference item and both dir index keys).
 */
static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_inode *inode,
				 struct extent_buffer *log_eb,
				 int log_slot,
				 struct btrfs_key *key)
{ … }

/*
 * replay one inode back reference item found in the log tree.
 * eb, slot and key refer to the buffer and key found in the log tree.
 * root is the destination we are replaying into, and path is for temp
 * use by this function.  (it should be released on return).
 */
static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_root *log,
				  struct btrfs_path *path,
				  struct extent_buffer *eb, int slot,
				  struct btrfs_key *key)
{ … }

static int count_inode_extrefs(struct btrfs_inode *inode, struct btrfs_path *path)
{ … }

static int count_inode_refs(struct btrfs_inode *inode, struct btrfs_path *path)
{ … }

/*
 * There are a few corners where the link count of the file can't
 * be properly maintained during replay.  So, instead of adding
 * lots of complexity to the log code, we just scan the backrefs
 * for any file that has been through replay.
 *
 * The scan will update the link count on the inode to reflect the
 * number of back refs found.  If it goes down to zero, the iput
 * will free the inode.
 */
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
					   struct inode *inode)
{ … }

static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root,
					    struct btrfs_path *path)
{ … }


/*
 * record a given inode in the fixup dir so we can check its link
 * count when replay is done.  The link count is incremented here
 * so the inode won't go away until we check it
 */
static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      u64 objectid)
{ … }

/*
 * when replaying the log for a directory, we only insert names
 * for inodes that actually exist.  This means an fsync on a directory
 * does not implicitly fsync all the new files in it
 */
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    u64 dirid, u64 index,
				    const struct fscrypt_str *name,
				    struct btrfs_key *location)
{ … }

static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans,
					struct btrfs_inode *dir,
					struct btrfs_path *path,
					struct btrfs_dir_item *dst_di,
					const struct btrfs_key *log_key,
					u8 log_flags,
					bool exists)
{ … }

/*
 * take a single entry in a log directory item and replay it into
 * the subvolume.
 *
 * if a conflicting item exists in the subdirectory already,
 * the inode it points to is unlinked and put into the link count
 * fix up tree.
 *
 * If a name from the log points to a file or directory that does
 * not exist in the FS, it is skipped.  fsyncs on directories
 * do not force down inodes inside that directory, just changes to the
 * names or unlinks in a directory.
 *
 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
 * non-existing inode) and 1 if the name was replayed.
 */
static noinline int replay_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *eb,
				    struct btrfs_dir_item *di,
				    struct btrfs_key *key)
{ … }

/* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */
static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
					struct btrfs_root *root,
					struct btrfs_path *path,
					struct extent_buffer *eb, int slot,
					struct btrfs_key *key)
{ … }

/*
 * directory replay has two parts.  There are the standard directory
 * items in the log copied from the subvolume, and range items
 * created in the log while the subvolume was logged.
 *
 * The range items tell us which parts of the key space the log
 * is authoritative for.  During replay, if a key in the subvolume
 * directory is in a logged range item, but not actually in the log
 * that means it was deleted from the directory before the fsync
 * and should be removed.
 */
static noinline int find_dir_range(struct btrfs_root *root,
				   struct btrfs_path *path,
				   u64 dirid,
				   u64 *start_ret, u64 *end_ret)
{ … }

/*
 * this looks for a given directory item in the log.  If the directory
 * item is not in the log, the item is removed and the inode it points
 * to is unlinked
 */
static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
				      struct btrfs_root *log,
				      struct btrfs_path *path,
				      struct btrfs_path *log_path,
				      struct inode *dir,
				      struct btrfs_key *dir_key)
{ … }

static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct btrfs_root *log,
			      struct btrfs_path *path,
			      const u64 ino)
{ … }


/*
 * deletion replay happens before we copy any new directory items
 * out of the log or out of backreferences from inodes.  It
 * scans the log to find ranges of keys that log is authoritative for,
 * and then scans the directory to find items in those ranges that are
 * not present in the log.
 *
 * Anything we don't find in the log is unlinked and removed from the
 * directory.
 */
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all)
{ … }

/*
 * the process_func used to replay items from the log tree.  This
 * gets called in two different stages.  The first stage just looks
 * for inodes and makes sure they are all copied into the subvolume.
 *
 * The second stage copies all the other item types from the log into
 * the subvolume.  The two stage approach is slower, but gets rid of
 * lots of complexity around inodes referencing other inodes that exist
 * only in the log (references come from either directory items or inode
 * back refs).
 */
static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
			     struct walk_control *wc, u64 gen, int level)
{ … }

/*
 * Correctly adjust the reserved bytes occupied by a log tree extent buffer
 */
static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
{ … }

static int clean_log_buffer(struct btrfs_trans_handle *trans,
			    struct extent_buffer *eb)
{ … }

static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path, int *level,
				   struct walk_control *wc)
{ … }

static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path, int *level,
				 struct walk_control *wc)
{ … }

/*
 * drop the reference count on the tree rooted at 'snap'.  This traverses
 * the tree freeing any blocks that have a ref count of zero after being
 * decremented.
 */
static int walk_log_tree(struct btrfs_trans_handle *trans,
			 struct btrfs_root *log, struct walk_control *wc)
{ … }

/*
 * helper function to update the item for a given subvolumes log root
 * in the tree of log roots
 */
static int update_log_root(struct btrfs_trans_handle *trans,
			   struct btrfs_root *log,
			   struct btrfs_root_item *root_item)
{ … }

static void wait_log_commit(struct btrfs_root *root, int transid)
{ … }

static void wait_for_writer(struct btrfs_root *root)
{ … }

void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, struct btrfs_inode *inode)
{ … }

void btrfs_init_log_ctx_scratch_eb(struct btrfs_log_ctx *ctx)
{ … }

void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx)
{ … }


static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
					struct btrfs_log_ctx *ctx)
{ … }

/* 
 * Invoked in log mutex context, or be sure there is no other task which
 * can access the list.
 */
static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
					     int index, int error)
{ … }

/*
 * Sends a given tree log down to the disk and updates the super blocks to
 * record it.  When this call is done, you know that any inodes previously
 * logged are safely on disk only if it returns 0.
 *
 * Any other return value means you need to call btrfs_commit_transaction.
 * Some of the edge cases for fsyncing directories that have had unlinks
 * or renames done in the past mean that sometimes the only safe
 * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
 * that has happened.
 */
int btrfs_sync_log(struct btrfs_trans_handle *trans,
		   struct btrfs_root *root, struct btrfs_log_ctx *ctx)
{ … }

static void free_log_tree(struct btrfs_trans_handle *trans,
			  struct btrfs_root *log)
{ … }

/*
 * free all the extents used by the tree log.  This should be called
 * at commit time of the full transaction
 */
int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
{ … }

int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
			     struct btrfs_fs_info *fs_info)
{ … }

/*
 * Check if an inode was logged in the current transaction. This correctly deals
 * with the case where the inode was logged but has a logged_trans of 0, which
 * happens if the inode is evicted and loaded again, as logged_trans is an in
 * memory only field (not persisted).
 *
 * Returns 1 if the inode was logged before in the transaction, 0 if it was not,
 * and < 0 on error.
 */
static int inode_logged(const struct btrfs_trans_handle *trans,
			struct btrfs_inode *inode,
			struct btrfs_path *path_in)
{ … }

/*
 * Delete a directory entry from the log if it exists.
 *
 * Returns < 0 on error
 *           1 if the entry does not exists
 *           0 if the entry existed and was successfully deleted
 */
static int del_logged_dentry(struct btrfs_trans_handle *trans,
			     struct btrfs_root *log,
			     struct btrfs_path *path,
			     u64 dir_ino,
			     const struct fscrypt_str *name,
			     u64 index)
{ … }

/*
 * If both a file and directory are logged, and unlinks or renames are
 * mixed in, we have a few interesting corners:
 *
 * create file X in dir Y
 * link file X to X.link in dir Y
 * fsync file X
 * unlink file X but leave X.link
 * fsync dir Y
 *
 * After a crash we would expect only X.link to exist.  But file X
 * didn't get fsync'd again so the log has back refs for X and X.link.
 *
 * We solve this by removing directory entries and inode backrefs from the
 * log when a file that was logged in the current transaction is
 * unlinked.  Any later fsync will include the updated log entries, and
 * we'll be able to reconstruct the proper directory items from backrefs.
 *
 * This optimizations allows us to avoid relogging the entire inode
 * or the entire directory.
 */
void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  const struct fscrypt_str *name,
				  struct btrfs_inode *dir, u64 index)
{ … }

/* see comments for btrfs_del_dir_entries_in_log */
void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				const struct fscrypt_str *name,
				struct btrfs_inode *inode, u64 dirid)
{ … }

/*
 * creates a range item in the log for 'dirid'.  first_offset and
 * last_offset tell us which parts of the key space the log should
 * be considered authoritative for.
 */
static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid,
				       u64 first_offset, u64 last_offset)
{ … }

static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
				 struct btrfs_inode *inode,
				 struct extent_buffer *src,
				 struct btrfs_path *dst_path,
				 int start_slot,
				 int count)
{ … }

static int clone_leaf(struct btrfs_path *path, struct btrfs_log_ctx *ctx)
{ … }

static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
				  struct btrfs_inode *inode,
				  struct btrfs_path *path,
				  struct btrfs_path *dst_path,
				  struct btrfs_log_ctx *ctx,
				  u64 *last_old_dentry_offset)
{ … }

/*
 * log all the items included in the current transaction for a given
 * directory.  This also creates the range items in the log tree required
 * to replay anything deleted before the fsync
 */
static noinline int log_dir_items(struct btrfs_trans_handle *trans,
			  struct btrfs_inode *inode,
			  struct btrfs_path *path,
			  struct btrfs_path *dst_path,
			  struct btrfs_log_ctx *ctx,
			  u64 min_offset, u64 *last_offset_ret)
{ … }

/*
 * If the inode was logged before and it was evicted, then its
 * last_dir_index_offset is (u64)-1, so we don't the value of the last index
 * key offset. If that's the case, search for it and update the inode. This
 * is to avoid lookups in the log tree every time we try to insert a dir index
 * key from a leaf changed in the current transaction, and to allow us to always
 * do batch insertions of dir index keys.
 */
static int update_last_dir_index_offset(struct btrfs_inode *inode,
					struct btrfs_path *path,
					const struct btrfs_log_ctx *ctx)
{ … }

/*
 * logging directories is very similar to logging inodes, We find all the items
 * from the current transaction and write them to the log.
 *
 * The recovery code scans the directory in the subvolume, and if it finds a
 * key in the range logged that is not present in the log tree, then it means
 * that dir entry was unlinked during the transaction.
 *
 * In order for that scan to work, we must include one key smaller than
 * the smallest logged by this transaction and one key larger than the largest
 * key logged by this transaction.
 */
static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
			  struct btrfs_inode *inode,
			  struct btrfs_path *path,
			  struct btrfs_path *dst_path,
			  struct btrfs_log_ctx *ctx)
{ … }

/*
 * a helper function to drop items from the log before we relog an
 * inode.  max_key_type indicates the highest item type to remove.
 * This cannot be run for file data extents because it does not
 * free the extents they point to.
 */
static int drop_inode_items(struct btrfs_trans_handle *trans,
				  struct btrfs_root *log,
				  struct btrfs_path *path,
				  struct btrfs_inode *inode,
				  int max_key_type)
{ … }

static int truncate_inode_items(struct btrfs_trans_handle *trans,
				struct btrfs_root *log_root,
				struct btrfs_inode *inode,
				u64 new_size, u32 min_type)
{ … }

static void fill_inode_item(struct btrfs_trans_handle *trans,
			    struct extent_buffer *leaf,
			    struct btrfs_inode_item *item,
			    struct inode *inode, int log_inode_only,
			    u64 logged_isize)
{ … }

static int log_inode_item(struct btrfs_trans_handle *trans,
			  struct btrfs_root *log, struct btrfs_path *path,
			  struct btrfs_inode *inode, bool inode_item_dropped)
{ … }

static int log_csums(struct btrfs_trans_handle *trans,
		     struct btrfs_inode *inode,
		     struct btrfs_root *log_root,
		     struct btrfs_ordered_sum *sums)
{ … }

static noinline int copy_items(struct btrfs_trans_handle *trans,
			       struct btrfs_inode *inode,
			       struct btrfs_path *dst_path,
			       struct btrfs_path *src_path,
			       int start_slot, int nr, int inode_only,
			       u64 logged_isize, struct btrfs_log_ctx *ctx)
{ … }

static int extent_cmp(void *priv, const struct list_head *a,
		      const struct list_head *b)
{ … }

static int log_extent_csums(struct btrfs_trans_handle *trans,
			    struct btrfs_inode *inode,
			    struct btrfs_root *log_root,
			    const struct extent_map *em,
			    struct btrfs_log_ctx *ctx)
{ … }

static int log_one_extent(struct btrfs_trans_handle *trans,
			  struct btrfs_inode *inode,
			  const struct extent_map *em,
			  struct btrfs_path *path,
			  struct btrfs_log_ctx *ctx)
{ … }

/*
 * Log all prealloc extents beyond the inode's i_size to make sure we do not
 * lose them after doing a full/fast fsync and replaying the log. We scan the
 * subvolume's root instead of iterating the inode's extent map tree because
 * otherwise we can log incorrect extent items based on extent map conversion.
 * That can happen due to the fact that extent maps are merged when they
 * are not in the extent map tree's list of modified extents.
 */
static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
				      struct btrfs_inode *inode,
				      struct btrfs_path *path,
				      struct btrfs_log_ctx *ctx)
{ … }

static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
				     struct btrfs_inode *inode,
				     struct btrfs_path *path,
				     struct btrfs_log_ctx *ctx)
{ … }

static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
			     struct btrfs_path *path, u64 *size_ret)
{ … }

/*
 * At the moment we always log all xattrs. This is to figure out at log replay
 * time which xattrs must have their deletion replayed. If a xattr is missing
 * in the log tree and exists in the fs/subvol tree, we delete it. This is
 * because if a xattr is deleted, the inode is fsynced and a power failure
 * happens, causing the log to be replayed the next time the fs is mounted,
 * we want the xattr to not exist anymore (same behaviour as other filesystems
 * with a journal, ext3/4, xfs, f2fs, etc).
 */
static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
				struct btrfs_inode *inode,
				struct btrfs_path *path,
				struct btrfs_path *dst_path,
				struct btrfs_log_ctx *ctx)
{ … }

/*
 * When using the NO_HOLES feature if we punched a hole that causes the
 * deletion of entire leafs or all the extent items of the first leaf (the one
 * that contains the inode item and references) we may end up not processing
 * any extents, because there are no leafs with a generation matching the
 * current transaction that have extent items for our inode. So we need to find
 * if any holes exist and then log them. We also need to log holes after any
 * truncate operation that changes the inode's size.
 */
static int btrfs_log_holes(struct btrfs_trans_handle *trans,
			   struct btrfs_inode *inode,
			   struct btrfs_path *path)
{ … }

/*
 * When we are logging a new inode X, check if it doesn't have a reference that
 * matches the reference from some other inode Y created in a past transaction
 * and that was renamed in the current transaction. If we don't do this, then at
 * log replay time we can lose inode Y (and all its files if it's a directory):
 *
 * mkdir /mnt/x
 * echo "hello world" > /mnt/x/foobar
 * sync
 * mv /mnt/x /mnt/y
 * mkdir /mnt/x                 # or touch /mnt/x
 * xfs_io -c fsync /mnt/x
 * <power fail>
 * mount fs, trigger log replay
 *
 * After the log replay procedure, we would lose the first directory and all its
 * files (file foobar).
 * For the case where inode Y is not a directory we simply end up losing it:
 *
 * echo "123" > /mnt/foo
 * sync
 * mv /mnt/foo /mnt/bar
 * echo "abc" > /mnt/foo
 * xfs_io -c fsync /mnt/foo
 * <power fail>
 *
 * We also need this for cases where a snapshot entry is replaced by some other
 * entry (file or directory) otherwise we end up with an unreplayable log due to
 * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
 * if it were a regular entry:
 *
 * mkdir /mnt/x
 * btrfs subvolume snapshot /mnt /mnt/x/snap
 * btrfs subvolume delete /mnt/x/snap
 * rmdir /mnt/x
 * mkdir /mnt/x
 * fsync /mnt/x or fsync some new file inside it
 * <power fail>
 *
 * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
 * the same transaction.
 */
static int btrfs_check_ref_name_override(struct extent_buffer *eb,
					 const int slot,
					 const struct btrfs_key *key,
					 struct btrfs_inode *inode,
					 u64 *other_ino, u64 *other_parent)
{ … }

/*
 * Check if we need to log an inode. This is used in contexts where while
 * logging an inode we need to log another inode (either that it exists or in
 * full mode). This is used instead of btrfs_inode_in_log() because the later
 * requires the inode to be in the log and have the log transaction committed,
 * while here we do not care if the log transaction was already committed - our
 * caller will commit the log later - and we want to avoid logging an inode
 * multiple times when multiple tasks have joined the same log transaction.
 */
static bool need_log_inode(const struct btrfs_trans_handle *trans,
			   struct btrfs_inode *inode)
{ … }

struct btrfs_dir_list { … };

/*
 * Log the inodes of the new dentries of a directory.
 * See process_dir_items_leaf() for details about why it is needed.
 * This is a recursive operation - if an existing dentry corresponds to a
 * directory, that directory's new entries are logged too (same behaviour as
 * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
 * the dentries point to we do not acquire their VFS lock, otherwise lockdep
 * complains about the following circular lock dependency / possible deadlock:
 *
 *        CPU0                                        CPU1
 *        ----                                        ----
 * lock(&type->i_mutex_dir_key#3/2);
 *                                            lock(sb_internal#2);
 *                                            lock(&type->i_mutex_dir_key#3/2);
 * lock(&sb->s_type->i_mutex_key#14);
 *
 * Where sb_internal is the lock (a counter that works as a lock) acquired by
 * sb_start_intwrite() in btrfs_start_transaction().
 * Not acquiring the VFS lock of the inodes is still safe because:
 *
 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
 *    that while logging the inode new references (names) are added or removed
 *    from the inode, leaving the logged inode item with a link count that does
 *    not match the number of logged inode reference items. This is fine because
 *    at log replay time we compute the real number of links and correct the
 *    link count in the inode item (see replay_one_buffer() and
 *    link_to_fixup_dir());
 *
 * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
 *    while logging the inode's items new index items (key type
 *    BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item
 *    has a size that doesn't match the sum of the lengths of all the logged
 *    names - this is ok, not a problem, because at log replay time we set the
 *    directory's i_size to the correct value (see replay_one_name() and
 *    overwrite_item()).
 */
static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
				struct btrfs_inode *start_inode,
				struct btrfs_log_ctx *ctx)
{ … }

struct btrfs_ino_list { … };

static void free_conflicting_inodes(struct btrfs_log_ctx *ctx)
{ … }

static int conflicting_inode_is_dir(struct btrfs_root *root, u64 ino,
				    struct btrfs_path *path)
{ … }

static int add_conflicting_inode(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 ino, u64 parent,
				 struct btrfs_log_ctx *ctx)
{ … }

static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_log_ctx *ctx)
{ … }

static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
				   struct btrfs_inode *inode,
				   struct btrfs_key *min_key,
				   const struct btrfs_key *max_key,
				   struct btrfs_path *path,
				   struct btrfs_path *dst_path,
				   const u64 logged_isize,
				   const int inode_only,
				   struct btrfs_log_ctx *ctx,
				   bool *need_log_inode_item)
{ … }

static int insert_delayed_items_batch(struct btrfs_trans_handle *trans,
				      struct btrfs_root *log,
				      struct btrfs_path *path,
				      const struct btrfs_item_batch *batch,
				      const struct btrfs_delayed_item *first_item)
{ … }

static int log_delayed_insertion_items(struct btrfs_trans_handle *trans,
				       struct btrfs_inode *inode,
				       struct btrfs_path *path,
				       const struct list_head *delayed_ins_list,
				       struct btrfs_log_ctx *ctx)
{ … }

static int log_delayed_deletions_full(struct btrfs_trans_handle *trans,
				      struct btrfs_inode *inode,
				      struct btrfs_path *path,
				      const struct list_head *delayed_del_list,
				      struct btrfs_log_ctx *ctx)
{ … }

static int batch_delete_dir_index_items(struct btrfs_trans_handle *trans,
					struct btrfs_inode *inode,
					struct btrfs_path *path,
					struct btrfs_log_ctx *ctx,
					const struct list_head *delayed_del_list,
					const struct btrfs_delayed_item *first,
					const struct btrfs_delayed_item **last_ret)
{ … }

static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans,
					     struct btrfs_inode *inode,
					     struct btrfs_path *path,
					     const struct list_head *delayed_del_list,
					     struct btrfs_log_ctx *ctx)
{ … }

static int log_delayed_deletion_items(struct btrfs_trans_handle *trans,
				      struct btrfs_inode *inode,
				      struct btrfs_path *path,
				      const struct list_head *delayed_del_list,
				      struct btrfs_log_ctx *ctx)
{ … }

/*
 * Similar logic as for log_new_dir_dentries(), but it iterates over the delayed
 * items instead of the subvolume tree.
 */
static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
				    struct btrfs_inode *inode,
				    const struct list_head *delayed_ins_list,
				    struct btrfs_log_ctx *ctx)
{ … }

/* log a single inode in the tree log.
 * At least one parent directory for this inode must exist in the tree
 * or be logged already.
 *
 * Any items from this inode changed by the current transaction are copied
 * to the log tree.  An extra reference is taken on any extents in this
 * file, allowing us to avoid a whole pile of corner cases around logging
 * blocks that have been removed from the tree.
 *
 * See LOG_INODE_ALL and related defines for a description of what inode_only
 * does.
 *
 * This handles both files and directories.
 */
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_inode *inode,
			   int inode_only,
			   struct btrfs_log_ctx *ctx)
{ … }

static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
				 struct btrfs_inode *inode,
				 struct btrfs_log_ctx *ctx)
{ … }

static int log_new_ancestors(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path,
			     struct btrfs_log_ctx *ctx)
{ … }

static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
				  struct btrfs_inode *inode,
				  struct dentry *parent,
				  struct btrfs_log_ctx *ctx)
{ … }

static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
				 struct btrfs_inode *inode,
				 struct dentry *parent,
				 struct btrfs_log_ctx *ctx)
{ … }

/*
 * helper function around btrfs_log_inode to make sure newly created
 * parent directories also end up in the log.  A minimal inode and backref
 * only logging is done of any parent directories that are older than
 * the last committed transaction
 */
static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
				  struct btrfs_inode *inode,
				  struct dentry *parent,
				  int inode_only,
				  struct btrfs_log_ctx *ctx)
{ … }

/*
 * it is not safe to log dentry if the chunk root has added new
 * chunks.  This returns 0 if the dentry was logged, and 1 otherwise.
 * If this returns 1, you must commit the transaction to safely get your
 * data on disk.
 */
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
			  struct dentry *dentry,
			  struct btrfs_log_ctx *ctx)
{ … }

/*
 * should be called during mount to recover any replay any log trees
 * from the FS
 */
int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
{ … }

/*
 * there are some corner cases where we want to force a full
 * commit instead of allowing a directory to be logged.
 *
 * They revolve around files there were unlinked from the directory, and
 * this function updates the parent directory so that a full commit is
 * properly done if it is fsync'd later after the unlinks are done.
 *
 * Must be called before the unlink operations (updates to the subvolume tree,
 * inodes, etc) are done.
 */
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_inode *dir, struct btrfs_inode *inode,
			     bool for_rename)
{ … }

/*
 * Make sure that if someone attempts to fsync the parent directory of a deleted
 * snapshot, it ends up triggering a transaction commit. This is to guarantee
 * that after replaying the log tree of the parent directory's root we will not
 * see the snapshot anymore and at log replay time we will not see any log tree
 * corresponding to the deleted snapshot's root, which could lead to replaying
 * it after replaying the log tree of the parent directory (which would replay
 * the snapshot delete operation).
 *
 * Must be called before the actual snapshot destroy operation (updates to the
 * parent root and tree of tree roots trees, etc) are done.
 */
void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
				   struct btrfs_inode *dir)
{ … }

/*
 * Call this when creating a subvolume in a directory.
 * Because we don't commit a transaction when creating a subvolume, we can't
 * allow the directory pointing to the subvolume to be logged with an entry that
 * points to an unpersisted root if we are still in the transaction used to
 * create the subvolume, so make any attempt to log the directory to result in a
 * full log sync.
 * Also we don't need to worry with renames, since btrfs_rename() marks the log
 * for full commit when renaming a subvolume.
 */
void btrfs_record_new_subvolume(const struct btrfs_trans_handle *trans,
				struct btrfs_inode *dir)
{ … }

/*
 * Update the log after adding a new name for an inode.
 *
 * @trans:              Transaction handle.
 * @old_dentry:         The dentry associated with the old name and the old
 *                      parent directory.
 * @old_dir:            The inode of the previous parent directory for the case
 *                      of a rename. For a link operation, it must be NULL.
 * @old_dir_index:      The index number associated with the old name, meaningful
 *                      only for rename operations (when @old_dir is not NULL).
 *                      Ignored for link operations.
 * @parent:             The dentry associated with the directory under which the
 *                      new name is located.
 *
 * Call this after adding a new name for an inode, as a result of a link or
 * rename operation, and it will properly update the log to reflect the new name.
 */
void btrfs_log_new_name(struct btrfs_trans_handle *trans,
			struct dentry *old_dentry, struct btrfs_inode *old_dir,
			u64 old_dir_index, struct dentry *parent)
{ … }
linux/fs/btrfs/tree-log.c