xfs_log_cil.c | Explore in Territory

// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2010 Red Hat, Inc. All Rights Reserved.
 */

#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_shared.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_extent_busy.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_trace.h"
#include "xfs_discard.h"

/*
 * Allocate a new ticket. Failing to get a new ticket makes it really hard to
 * recover, so we don't allow failure here. Also, we allocate in a context that
 * we don't want to be issuing transactions from, so we need to tell the
 * allocation code this as well.
 *
 * We don't reserve any space for the ticket - we are going to steal whatever
 * space we require from transactions as they commit. To ensure we reserve all
 * the space required, we need to set the current reservation of the ticket to
 * zero so that we know to steal the initial transaction overhead from the
 * first transaction commit.
 */
static struct xlog_ticket *
xlog_cil_ticket_alloc(
	struct xlog	*log)
{ … }

static inline void
xlog_cil_set_iclog_hdr_count(struct xfs_cil *cil)
{ … }

/*
 * Check if the current log item was first committed in this sequence.
 * We can't rely on just the log item being in the CIL, we have to check
 * the recorded commit sequence number.
 *
 * Note: for this to be used in a non-racy manner, it has to be called with
 * CIL flushing locked out. As a result, it should only be used during the
 * transaction commit process when deciding what to format into the item.
 */
static bool
xlog_item_in_current_chkpt(
	struct xfs_cil		*cil,
	struct xfs_log_item	*lip)
{ … }

bool
xfs_log_item_in_current_chkpt(
	struct xfs_log_item *lip)
{ … }

/*
 * Unavoidable forward declaration - xlog_cil_push_work() calls
 * xlog_cil_ctx_alloc() itself.
 */
static void xlog_cil_push_work(struct work_struct *work);

static struct xfs_cil_ctx *
xlog_cil_ctx_alloc(void)
{ … }

/*
 * Aggregate the CIL per cpu structures into global counts, lists, etc and
 * clear the percpu state ready for the next context to use. This is called
 * from the push code with the context lock held exclusively, hence nothing else
 * will be accessing or modifying the per-cpu counters.
 */
static void
xlog_cil_push_pcp_aggregate(
	struct xfs_cil		*cil,
	struct xfs_cil_ctx	*ctx)
{ … }

/*
 * Aggregate the CIL per-cpu space used counters into the global atomic value.
 * This is called when the per-cpu counter aggregation will first pass the soft
 * limit threshold so we can switch to atomic counter aggregation for accurate
 * detection of hard limit traversal.
 */
static void
xlog_cil_insert_pcp_aggregate(
	struct xfs_cil		*cil,
	struct xfs_cil_ctx	*ctx)
{ … }

static void
xlog_cil_ctx_switch(
	struct xfs_cil		*cil,
	struct xfs_cil_ctx	*ctx)
{ … }

/*
 * After the first stage of log recovery is done, we know where the head and
 * tail of the log are. We need this log initialisation done before we can
 * initialise the first CIL checkpoint context.
 *
 * Here we allocate a log ticket to track space usage during a CIL push.  This
 * ticket is passed to xlog_write() directly so that we don't slowly leak log
 * space by failing to account for space used by log headers and additional
 * region headers for split regions.
 */
void
xlog_cil_init_post_recovery(
	struct xlog	*log)
{ … }

static inline int
xlog_cil_iovec_space(
	uint	niovecs)
{ … }

/*
 * Allocate or pin log vector buffers for CIL insertion.
 *
 * The CIL currently uses disposable buffers for copying a snapshot of the
 * modified items into the log during a push. The biggest problem with this is
 * the requirement to allocate the disposable buffer during the commit if:
 *	a) does not exist; or
 *	b) it is too small
 *
 * If we do this allocation within xlog_cil_insert_format_items(), it is done
 * under the xc_ctx_lock, which means that a CIL push cannot occur during
 * the memory allocation. This means that we have a potential deadlock situation
 * under low memory conditions when we have lots of dirty metadata pinned in
 * the CIL and we need a CIL commit to occur to free memory.
 *
 * To avoid this, we need to move the memory allocation outside the
 * xc_ctx_lock, but because the log vector buffers are disposable, that opens
 * up a TOCTOU race condition w.r.t. the CIL committing and removing the log
 * vector buffers between the check and the formatting of the item into the
 * log vector buffer within the xc_ctx_lock.
 *
 * Because the log vector buffer needs to be unchanged during the CIL push
 * process, we cannot share the buffer between the transaction commit (which
 * modifies the buffer) and the CIL push context that is writing the changes
 * into the log. This means skipping preallocation of buffer space is
 * unreliable, but we most definitely do not want to be allocating and freeing
 * buffers unnecessarily during commits when overwrites can be done safely.
 *
 * The simplest solution to this problem is to allocate a shadow buffer when a
 * log item is committed for the second time, and then to only use this buffer
 * if necessary. The buffer can remain attached to the log item until such time
 * it is needed, and this is the buffer that is reallocated to match the size of
 * the incoming modification. Then during the formatting of the item we can swap
 * the active buffer with the new one if we can't reuse the existing buffer. We
 * don't free the old buffer as it may be reused on the next modification if
 * it's size is right, otherwise we'll free and reallocate it at that point.
 *
 * This function builds a vector for the changes in each log item in the
 * transaction. It then works out the length of the buffer needed for each log
 * item, allocates them and attaches the vector to the log item in preparation
 * for the formatting step which occurs under the xc_ctx_lock.
 *
 * While this means the memory footprint goes up, it avoids the repeated
 * alloc/free pattern that repeated modifications of an item would otherwise
 * cause, and hence minimises the CPU overhead of such behaviour.
 */
static void
xlog_cil_alloc_shadow_bufs(
	struct xlog		*log,
	struct xfs_trans	*tp)
{ … }

/*
 * Prepare the log item for insertion into the CIL. Calculate the difference in
 * log space it will consume, and if it is a new item pin it as well.
 */
STATIC void
xfs_cil_prepare_item(
	struct xlog		*log,
	struct xfs_log_vec	*lv,
	struct xfs_log_vec	*old_lv,
	int			*diff_len)
{ … }

/*
 * Format log item into a flat buffers
 *
 * For delayed logging, we need to hold a formatted buffer containing all the
 * changes on the log item. This enables us to relog the item in memory and
 * write it out asynchronously without needing to relock the object that was
 * modified at the time it gets written into the iclog.
 *
 * This function takes the prepared log vectors attached to each log item, and
 * formats the changes into the log vector buffer. The buffer it uses is
 * dependent on the current state of the vector in the CIL - the shadow lv is
 * guaranteed to be large enough for the current modification, but we will only
 * use that if we can't reuse the existing lv. If we can't reuse the existing
 * lv, then simple swap it out for the shadow lv. We don't free it - that is
 * done lazily either by th enext modification or the freeing of the log item.
 *
 * We don't set up region headers during this process; we simply copy the
 * regions into the flat buffer. We can do this because we still have to do a
 * formatting step to write the regions into the iclog buffer.  Writing the
 * ophdrs during the iclog write means that we can support splitting large
 * regions across iclog boundares without needing a change in the format of the
 * item/region encapsulation.
 *
 * Hence what we need to do now is change the rewrite the vector array to point
 * to the copied region inside the buffer we just allocated. This allows us to
 * format the regions into the iclog as though they are being formatted
 * directly out of the objects themselves.
 */
static void
xlog_cil_insert_format_items(
	struct xlog		*log,
	struct xfs_trans	*tp,
	int			*diff_len)
{ … }

/*
 * The use of lockless waitqueue_active() requires that the caller has
 * serialised itself against the wakeup call in xlog_cil_push_work(). That
 * can be done by either holding the push lock or the context lock.
 */
static inline bool
xlog_cil_over_hard_limit(
	struct xlog	*log,
	int32_t		space_used)
{ … }

/*
 * Insert the log items into the CIL and calculate the difference in space
 * consumed by the item. Add the space to the checkpoint ticket and calculate
 * if the change requires additional log metadata. If it does, take that space
 * as well. Remove the amount of space we added to the checkpoint ticket from
 * the current transaction ticket so that the accounting works out correctly.
 */
static void
xlog_cil_insert_items(
	struct xlog		*log,
	struct xfs_trans	*tp,
	uint32_t		released_space)
{ … }

static inline void
xlog_cil_ail_insert_batch(
	struct xfs_ail		*ailp,
	struct xfs_ail_cursor	*cur,
	struct xfs_log_item	**log_items,
	int			nr_items,
	xfs_lsn_t		commit_lsn)
{ … }

/*
 * Take the checkpoint's log vector chain of items and insert the attached log
 * items into the AIL. This uses bulk insertion techniques to minimise AIL lock
 * traffic.
 *
 * The AIL tracks log items via the start record LSN of the checkpoint,
 * not the commit record LSN. This is because we can pipeline multiple
 * checkpoints, and so the start record of checkpoint N+1 can be
 * written before the commit record of checkpoint N. i.e:
 *
 *   start N			commit N
 *	+-------------+------------+----------------+
 *		  start N+1			commit N+1
 *
 * The tail of the log cannot be moved to the LSN of commit N when all
 * the items of that checkpoint are written back, because then the
 * start record for N+1 is no longer in the active portion of the log
 * and recovery will fail/corrupt the filesystem.
 *
 * Hence when all the log items in checkpoint N are written back, the
 * tail of the log most now only move as far forwards as the start LSN
 * of checkpoint N+1.
 *
 * If we are called with the aborted flag set, it is because a log write during
 * a CIL checkpoint commit has failed. In this case, all the items in the
 * checkpoint have already gone through iop_committed and iop_committing, which
 * means that checkpoint commit abort handling is treated exactly the same as an
 * iclog write error even though we haven't started any IO yet. Hence in this
 * case all we need to do is iop_committed processing, followed by an
 * iop_unpin(aborted) call.
 *
 * The AIL cursor is used to optimise the insert process. If commit_lsn is not
 * at the end of the AIL, the insert cursor avoids the need to walk the AIL to
 * find the insertion point on every xfs_log_item_batch_insert() call. This
 * saves a lot of needless list walking and is a net win, even though it
 * slightly increases that amount of AIL lock traffic to set it up and tear it
 * down.
 */
static void
xlog_cil_ail_insert(
	struct xfs_cil_ctx	*ctx,
	bool			aborted)
{ … }

static void
xlog_cil_free_logvec(
	struct list_head	*lv_chain)
{ … }

/*
 * Mark all items committed and clear busy extents. We free the log vector
 * chains in a separate pass so that we unpin the log items as quickly as
 * possible.
 */
static void
xlog_cil_committed(
	struct xfs_cil_ctx	*ctx)
{ … }

void
xlog_cil_process_committed(
	struct list_head	*list)
{ … }

/*
* Record the LSN of the iclog we were just granted space to start writing into.
* If the context doesn't have a start_lsn recorded, then this iclog will
* contain the start record for the checkpoint. Otherwise this write contains
* the commit record for the checkpoint.
*/
void
xlog_cil_set_ctx_write_state(
	struct xfs_cil_ctx	*ctx,
	struct xlog_in_core	*iclog)
{ … }


/*
 * Ensure that the order of log writes follows checkpoint sequence order. This
 * relies on the context LSN being zero until the log write has guaranteed the
 * LSN that the log write will start at via xlog_state_get_iclog_space().
 */
enum _record_type { … };

static int
xlog_cil_order_write(
	struct xfs_cil		*cil,
	xfs_csn_t		sequence,
	enum _record_type	record)
{ … }

/*
 * Write out the log vector change now attached to the CIL context. This will
 * write a start record that needs to be strictly ordered in ascending CIL
 * sequence order so that log recovery will always use in-order start LSNs when
 * replaying checkpoints.
 */
static int
xlog_cil_write_chain(
	struct xfs_cil_ctx	*ctx,
	uint32_t		chain_len)
{ … }

/*
 * Write out the commit record of a checkpoint transaction to close off a
 * running log write. These commit records are strictly ordered in ascending CIL
 * sequence order so that log recovery will always replay the checkpoints in the
 * correct order.
 */
static int
xlog_cil_write_commit_record(
	struct xfs_cil_ctx	*ctx)
{ … }

struct xlog_cil_trans_hdr { … };

/*
 * Build a checkpoint transaction header to begin the journal transaction.  We
 * need to account for the space used by the transaction header here as it is
 * not accounted for in xlog_write().
 *
 * This is the only place we write a transaction header, so we also build the
 * log opheaders that indicate the start of a log transaction and wrap the
 * transaction header. We keep the start record in it's own log vector rather
 * than compacting them into a single region as this ends up making the logic
 * in xlog_write() for handling empty opheaders for start, commit and unmount
 * records much simpler.
 */
static void
xlog_cil_build_trans_hdr(
	struct xfs_cil_ctx	*ctx,
	struct xlog_cil_trans_hdr *hdr,
	struct xfs_log_vec	*lvhdr,
	int			num_iovecs)
{ … }

/*
 * CIL item reordering compare function. We want to order in ascending ID order,
 * but we want to leave items with the same ID in the order they were added to
 * the list. This is important for operations like reflink where we log 4 order
 * dependent intents in a single transaction when we overwrite an existing
 * shared extent with a new shared extent. i.e. BUI(unmap), CUI(drop),
 * CUI (inc), BUI(remap)...
 */
static int
xlog_cil_order_cmp(
	void			*priv,
	const struct list_head	*a,
	const struct list_head	*b)
{ … }

/*
 * Pull all the log vectors off the items in the CIL, and remove the items from
 * the CIL. We don't need the CIL lock here because it's only needed on the
 * transaction commit side which is currently locked out by the flush lock.
 *
 * If a log item is marked with a whiteout, we do not need to write it to the
 * journal and so we just move them to the whiteout list for the caller to
 * dispose of appropriately.
 */
static void
xlog_cil_build_lv_chain(
	struct xfs_cil_ctx	*ctx,
	struct list_head	*whiteouts,
	uint32_t		*num_iovecs,
	uint32_t		*num_bytes)
{ … }

static void
xlog_cil_cleanup_whiteouts(
	struct list_head	*whiteouts)
{ … }

/*
 * Push the Committed Item List to the log.
 *
 * If the current sequence is the same as xc_push_seq we need to do a flush. If
 * xc_push_seq is less than the current sequence, then it has already been
 * flushed and we don't need to do anything - the caller will wait for it to
 * complete if necessary.
 *
 * xc_push_seq is checked unlocked against the sequence number for a match.
 * Hence we can allow log forces to run racily and not issue pushes for the
 * same sequence twice.  If we get a race between multiple pushes for the same
 * sequence they will block on the first one and then abort, hence avoiding
 * needless pushes.
 *
 * This runs from a workqueue so it does not inherent any specific memory
 * allocation context. However, we do not want to block on memory reclaim
 * recursing back into the filesystem because this push may have been triggered
 * by memory reclaim itself. Hence we really need to run under full GFP_NOFS
 * contraints here.
 */
static void
xlog_cil_push_work(
	struct work_struct	*work)
{ … }

/*
 * We need to push CIL every so often so we don't cache more than we can fit in
 * the log. The limit really is that a checkpoint can't be more than half the
 * log (the current checkpoint is not allowed to overwrite the previous
 * checkpoint), but commit latency and memory usage limit this to a smaller
 * size.
 */
static void
xlog_cil_push_background(
	struct xlog	*log)
{ … }

/*
 * xlog_cil_push_now() is used to trigger an immediate CIL push to the sequence
 * number that is passed. When it returns, the work will be queued for
 * @push_seq, but it won't be completed.
 *
 * If the caller is performing a synchronous force, we will flush the workqueue
 * to get previously queued work moving to minimise the wait time they will
 * undergo waiting for all outstanding pushes to complete. The caller is
 * expected to do the required waiting for push_seq to complete.
 *
 * If the caller is performing an async push, we need to ensure that the
 * checkpoint is fully flushed out of the iclogs when we finish the push. If we
 * don't do this, then the commit record may remain sitting in memory in an
 * ACTIVE iclog. This then requires another full log force to push to disk,
 * which defeats the purpose of having an async, non-blocking CIL force
 * mechanism. Hence in this case we need to pass a flag to the push work to
 * indicate it needs to flush the commit record itself.
 */
static void
xlog_cil_push_now(
	struct xlog	*log,
	xfs_lsn_t	push_seq,
	bool		async)
{ … }

bool
xlog_cil_empty(
	struct xlog	*log)
{ … }

/*
 * If there are intent done items in this transaction and the related intent was
 * committed in the current (same) CIL checkpoint, we don't need to write either
 * the intent or intent done item to the journal as the change will be
 * journalled atomically within this checkpoint. As we cannot remove items from
 * the CIL here, mark the related intent with a whiteout so that the CIL push
 * can remove it rather than writing it to the journal. Then remove the intent
 * done item from the current transaction and release it so it doesn't get put
 * into the CIL at all.
 */
static uint32_t
xlog_cil_process_intents(
	struct xfs_cil		*cil,
	struct xfs_trans	*tp)
{ … }

/*
 * Commit a transaction with the given vector to the Committed Item List.
 *
 * To do this, we need to format the item, pin it in memory if required and
 * account for the space used by the transaction. Once we have done that we
 * need to release the unused reservation for the transaction, attach the
 * transaction to the checkpoint context so we carry the busy extents through
 * to checkpoint completion, and then unlock all the items in the transaction.
 *
 * Called with the context lock already held in read mode to lock out
 * background commit, returns without it held once background commits are
 * allowed again.
 */
void
xlog_cil_commit(
	struct xlog		*log,
	struct xfs_trans	*tp,
	xfs_csn_t		*commit_seq,
	bool			regrant)
{ … }

/*
 * Flush the CIL to stable storage but don't wait for it to complete. This
 * requires the CIL push to ensure the commit record for the push hits the disk,
 * but otherwise is no different to a push done from a log force.
 */
void
xlog_cil_flush(
	struct xlog	*log)
{ … }

/*
 * Conditionally push the CIL based on the sequence passed in.
 *
 * We only need to push if we haven't already pushed the sequence number given.
 * Hence the only time we will trigger a push here is if the push sequence is
 * the same as the current context.
 *
 * We return the current commit lsn to allow the callers to determine if a
 * iclog flush is necessary following this call.
 */
xfs_lsn_t
xlog_cil_force_seq(
	struct xlog	*log,
	xfs_csn_t	sequence)
{ … }

/*
 * Perform initial CIL structure initialisation.
 */
int
xlog_cil_init(
	struct xlog		*log)
{ … }

void
xlog_cil_destroy(
	struct xlog	*log)
{ … }
linux/fs/xfs/xfs_log_cil.c