linux/fs/ocfs2/journal.c

// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * journal.c
 *
 * Defines functions of journalling api
 *
 * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
 */

#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/kthread.h>
#include <linux/time.h>
#include <linux/random.h>
#include <linux/delay.h>
#include <linux/writeback.h>

#include <cluster/masklog.h>

#include "ocfs2.h"

#include "alloc.h"
#include "blockcheck.h"
#include "dir.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "heartbeat.h"
#include "inode.h"
#include "journal.h"
#include "localalloc.h"
#include "slot_map.h"
#include "super.h"
#include "sysfile.h"
#include "uptodate.h"
#include "quota.h"
#include "file.h"
#include "namei.h"

#include "buffer_head_io.h"
#include "ocfs2_trace.h"

DEFINE_SPINLOCK(trans_inc_lock);

#define ORPHAN_SCAN_SCHEDULE_TIMEOUT

static int ocfs2_force_read_journal(struct inode *inode);
static int ocfs2_recover_node(struct ocfs2_super *osb,
			      int node_num, int slot_num);
static int __ocfs2_recovery_thread(void *arg);
static int ocfs2_commit_cache(struct ocfs2_super *osb);
static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota);
static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
				      int dirty, int replayed);
static int ocfs2_trylock_journal(struct ocfs2_super *osb,
				 int slot_num);
static int ocfs2_recover_orphans(struct ocfs2_super *osb,
				 int slot,
				 enum ocfs2_orphan_reco_type orphan_reco_type);
static int ocfs2_commit_thread(void *arg);
static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
					    int slot_num,
					    struct ocfs2_dinode *la_dinode,
					    struct ocfs2_dinode *tl_dinode,
					    struct ocfs2_quota_recovery *qrec,
					    enum ocfs2_orphan_reco_type orphan_reco_type);

static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb)
{}

static inline int ocfs2_wait_on_quotas(struct ocfs2_super *osb)
{}

/*
 * This replay_map is used to track online/offline slots, so that we can
 * recover offline slots during recovery and mount.
 */

enum ocfs2_replay_state {};

struct ocfs2_replay_map {};
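
/*
 * Hedged sketch of the shape such a replay map could take.  The type,
 * member and constant names below are illustrative only (hence the
 * _example suffix); the in-tree definitions are elided above.
 */
enum ocfs2_replay_state_example {
	REPLAY_UNNEEDED_EXAMPLE = 0,	/* replay not needed, ignore the map */
	REPLAY_NEEDED_EXAMPLE,		/* replay the slots flagged below */
	REPLAY_DONE_EXAMPLE		/* replay has already been queued */
};

struct ocfs2_replay_map_example {
	unsigned int			rm_slots;	/* slots tracked */
	enum ocfs2_replay_state_example	rm_state;
	unsigned char			rm_replay_slots[];	/* 1 = replay */
};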

static void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state)
{}

int ocfs2_compute_replay_slots(struct ocfs2_super *osb)
{}

static void ocfs2_queue_replay_slots(struct ocfs2_super *osb,
		enum ocfs2_orphan_reco_type orphan_reco_type)
{}

void ocfs2_free_replay_slots(struct ocfs2_super *osb)
{}

int ocfs2_recovery_init(struct ocfs2_super *osb)
{}

/* we can't grab the goofy sem lock from inside wait_event, so we use
 * memory barriers to make sure that we'll see the null task before
 * being woken up */
static int ocfs2_recovery_thread_running(struct ocfs2_super *osb)
{}
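
/*
 * Hedged sketch of the barrier-paired check described above: the
 * recovery thread clears its task pointer before exiting, and the
 * reader issues mb() so that a NULL task is observed before the wakeup.
 * The osb->recovery_thread_task field name is assumed for illustration.
 */
static int __maybe_unused ocfs2_recovery_thread_running_sketch(struct ocfs2_super *osb)
{
	mb();
	return osb->recovery_thread_task != NULL;
}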

void ocfs2_recovery_exit(struct ocfs2_super *osb)
{}

static int __ocfs2_recovery_map_test(struct ocfs2_super *osb,
				     unsigned int node_num)
{}

/* Behaves like test-and-set.  Returns the previous value */
static int ocfs2_recovery_map_set(struct ocfs2_super *osb,
				  unsigned int node_num)
{}

static void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
				     unsigned int node_num)
{}

static int ocfs2_commit_cache(struct ocfs2_super *osb)
{}

handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
{}

int ocfs2_commit_trans(struct ocfs2_super *osb,
		       handle_t *handle)
{}

/*
 * 'nblocks' is what you want to add to the current transaction.
 *
 * This might call jbd2_journal_restart() which will commit dirty buffers
 * and then restart the transaction. Before calling
 * ocfs2_extend_trans(), any changed blocks should have been
 * dirtied. After calling it, all blocks which need to be changed must
 * go through another set of journal_access/journal_dirty calls.
 *
 * WARNING: This will not release any semaphores or disk locks taken
 * during the transaction, so make sure they were taken *before*
 * start_trans or we'll have ordering deadlocks.
 *
 * WARNING2: Note that we do *not* drop j_trans_barrier here. This is
 * good because transaction ids haven't yet been recorded on the
 * cluster locks associated with this handle.
 */
int ocfs2_extend_trans(handle_t *handle, int nblocks)
{}
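
/*
 * Minimal sketch of the extend-or-restart pattern described above, not
 * the in-tree implementation: try to grow the running handle with
 * jbd2_journal_extend() and fall back to jbd2_journal_restart() when the
 * current transaction has no room left.  The third argument to
 * jbd2_journal_extend() (revoke-record credits) assumes a recent jbd2.
 */
static int __maybe_unused ocfs2_extend_trans_sketch(handle_t *handle, int nblocks)
{
	int status;

	if (!nblocks)
		return 0;

	status = jbd2_journal_extend(handle, nblocks, 0);
	if (status < 0)
		return status;
	if (status > 0) {
		/*
		 * The transaction is too full to extend in place: commit it
		 * and chain a fresh handle with the requested credits.  Any
		 * buffer we keep modifying must be re-accessed afterwards.
		 */
		status = jbd2_journal_restart(handle, nblocks);
	}

	return status;
}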

/*
 * Make sure handle has at least 'nblocks' credits available. If it does not
 * have that many credits available, we will try to extend the handle to have
 * enough credits. If that fails, we will restart transaction to have enough
 * credits. Similar notes regarding data consistency and locking implications
 * as for ocfs2_extend_trans() apply here.
 */
int ocfs2_assure_trans_credits(handle_t *handle, int nblocks)
{}
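
/*
 * Hedged sketch of the credit top-up described above: look at how many
 * buffer credits the handle still holds and extend only by the
 * shortfall.  jbd2_handle_buffer_credits() assumes a recent jbd2 API.
 */
static int __maybe_unused ocfs2_assure_trans_credits_sketch(handle_t *handle,
							    int nblocks)
{
	int old_nblks = jbd2_handle_buffer_credits(handle);

	if (old_nblks >= nblocks)
		return 0;
	return ocfs2_extend_trans(handle, nblocks - old_nblks);
}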

/*
 * If we have fewer than thresh credits, extend by OCFS2_MAX_TRANS_DATA.
 * If that fails, restart the transaction & regain write access for the
 * buffer head which is used for metadata modifications.
 * Taken from Ext4: extend_or_restart_transaction()
 */
int ocfs2_allocate_extend_trans(handle_t *handle, int thresh)
{}
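
/*
 * Sketch of the threshold variant described above, under the same jbd2
 * API assumptions as the previous sketches: if fewer than 'thresh'
 * credits remain, ask for OCFS2_MAX_TRANS_DATA more, and restart the
 * transaction if it cannot be extended.  A real caller must then regain
 * write access on any buffer head it keeps modifying.
 */
static int __maybe_unused ocfs2_allocate_extend_trans_sketch(handle_t *handle,
							     int thresh)
{
	int status;

	if (jbd2_handle_buffer_credits(handle) >= thresh)
		return 0;

	status = jbd2_journal_extend(handle, OCFS2_MAX_TRANS_DATA, 0);
	if (status > 0)
		status = jbd2_journal_restart(handle, OCFS2_MAX_TRANS_DATA);
	return status;
}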

static inline struct ocfs2_triggers *to_ocfs2_trigger(struct jbd2_buffer_trigger_type *triggers)
{}

static void ocfs2_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
				 struct buffer_head *bh,
				 void *data, size_t size)
{}

/*
 * Quota blocks have their own trigger because the struct ocfs2_block_check
 * offset depends on the blocksize.
 */
static void ocfs2_dq_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
				 struct buffer_head *bh,
				 void *data, size_t size)
{}

/*
 * Directory blocks also have their own trigger because the
 * struct ocfs2_block_check offset depends on the blocksize.
 */
static void ocfs2_db_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
				 struct buffer_head *bh,
				 void *data, size_t size)
{}

static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers,
				struct buffer_head *bh)
{}

static void ocfs2_setup_csum_triggers(struct super_block *sb,
				      enum ocfs2_journal_trigger_type type,
				      struct ocfs2_triggers *ot)
{}

void ocfs2_initialize_journal_triggers(struct super_block *sb,
				       struct ocfs2_triggers triggers[])
{}

static int __ocfs2_journal_access(handle_t *handle,
				  struct ocfs2_caching_info *ci,
				  struct buffer_head *bh,
				  struct ocfs2_triggers *triggers,
				  int type)
{}

int ocfs2_journal_access_di(handle_t *handle, struct ocfs2_caching_info *ci,
			    struct buffer_head *bh, int type)
{}

int ocfs2_journal_access_eb(handle_t *handle, struct ocfs2_caching_info *ci,
			    struct buffer_head *bh, int type)
{}

int ocfs2_journal_access_rb(handle_t *handle, struct ocfs2_caching_info *ci,
			    struct buffer_head *bh, int type)
{}

int ocfs2_journal_access_gd(handle_t *handle, struct ocfs2_caching_info *ci,
			    struct buffer_head *bh, int type)
{}

int ocfs2_journal_access_db(handle_t *handle, struct ocfs2_caching_info *ci,
			    struct buffer_head *bh, int type)
{}

int ocfs2_journal_access_xb(handle_t *handle, struct ocfs2_caching_info *ci,
			    struct buffer_head *bh, int type)
{}

int ocfs2_journal_access_dq(handle_t *handle, struct ocfs2_caching_info *ci,
			    struct buffer_head *bh, int type)
{}

int ocfs2_journal_access_dr(handle_t *handle, struct ocfs2_caching_info *ci,
			    struct buffer_head *bh, int type)
{}

int ocfs2_journal_access_dl(handle_t *handle, struct ocfs2_caching_info *ci,
			    struct buffer_head *bh, int type)
{}

int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci,
			 struct buffer_head *bh, int type)
{}

void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh)
{}
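
/*
 * Illustrative caller-side pattern for the access/dirty API above.  This
 * is a sketch, not taken from any in-tree caller: declare intent with
 * ocfs2_journal_access_di() *before* touching the buffer, then hand the
 * modified buffer back with ocfs2_journal_dirty().  The i_generation
 * update is just a stand-in for a real metadata change.
 */
static int __maybe_unused ocfs2_touch_dinode_example(struct ocfs2_super *osb,
						     struct inode *inode,
						     struct buffer_head *di_bh)
{
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
	handle_t *handle;
	int status;

	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
					 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0)
		goto out_commit;

	/* Modify the block only after write access has been granted. */
	di->i_generation = cpu_to_le32(inode->i_generation);

	ocfs2_journal_dirty(handle, di_bh);
out_commit:
	ocfs2_commit_trans(osb, handle);
	return status;
}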

#define OCFS2_DEFAULT_COMMIT_INTERVAL

void ocfs2_set_journal_params(struct ocfs2_super *osb)
{}

/*
 * Allocate and initialize the skeleton of the journal structure;
 * ocfs2_journal_init() is what actually gives the fs journalling ability.
 */
int ocfs2_journal_alloc(struct ocfs2_super *osb)
{}

static int ocfs2_journal_submit_inode_data_buffers(struct jbd2_inode *jinode)
{}

int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty)
{}

static void ocfs2_bump_recovery_generation(struct ocfs2_dinode *di)
{}

static u32 ocfs2_get_recovery_generation(struct ocfs2_dinode *di)
{}

static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
				      int dirty, int replayed)
{}

/*
 * If the journal has been kmalloc'd it needs to be freed after this
 * call.
 */
void ocfs2_journal_shutdown(struct ocfs2_super *osb)
{}

static void ocfs2_clear_journal_error(struct super_block *sb,
				      journal_t *journal,
				      int slot)
{}

int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
{}


/* The 'full' flag tells us whether to clear out all blocks or to just
 * mark the journal clean. */
int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)
{}

static int ocfs2_recovery_completed(struct ocfs2_super *osb)
{}

void ocfs2_wait_for_recovery(struct ocfs2_super *osb)
{}

/*
 * JBD might read a cached version of another node's journal file. We
 * don't want this as this file changes often and we get no
 * notification on those changes. The only way to be sure that we've
 * got the most up-to-date version of those blocks then is to force
 * read them off disk. Just searching through the buffer cache won't
 * work as there may be pages backing this file which are still marked
 * up to date. We know things can't change on this file underneath us
 * as we have the lock by now :)
 */
static int ocfs2_force_read_journal(struct inode *inode)
{}

struct ocfs2_la_recovery_item {};

/* Does the second half of the recovery process. By this point, the
 * node is marked clean and can actually be considered recovered,
 * hence it's no longer in the recovery map, but there's still some
 * cleanup we can do which shouldn't happen within the recovery thread
 * as locking in that context becomes very difficult if we are to take
 * recovering nodes into account.
 *
 * NOTE: This function can and will sleep on recovery of other nodes
 * during cluster locking, just like any other ocfs2 process.
 */
void ocfs2_complete_recovery(struct work_struct *work)
{}

/* NOTE: This function always eats your references to la_dinode and
 * tl_dinode, either manually on error, or by passing them to
 * ocfs2_complete_recovery */
static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
					    int slot_num,
					    struct ocfs2_dinode *la_dinode,
					    struct ocfs2_dinode *tl_dinode,
					    struct ocfs2_quota_recovery *qrec,
					    enum ocfs2_orphan_reco_type orphan_reco_type)
{}

/* Called by the mount code to queue the last part of recovery for its
 * own and offline slot(s). */
void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
{}

void ocfs2_complete_quota_recovery(struct ocfs2_super *osb)
{}

static int __ocfs2_recovery_thread(void *arg)
{}

void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
{}

static int ocfs2_read_journal_inode(struct ocfs2_super *osb,
				    int slot_num,
				    struct buffer_head **bh,
				    struct inode **ret_inode)
{}

/* Does the actual journal replay and marks the journal inode as
 * clean. Will only replay if the journal inode is marked dirty. */
static int ocfs2_replay_journal(struct ocfs2_super *osb,
				int node_num,
				int slot_num)
{}

/*
 * Do the most important parts of node recovery:
 *  - Replay its journal
 *  - Stamp a clean local allocator file
 *  - Stamp a clean truncate log
 *  - Mark the node clean
 *
 * If this function completes without error, a node in OCFS2 can be
 * said to have been safely recovered. As a result, failure during the
 * second part of a node's recovery process (local alloc recovery) is
 * far less concerning.
 */
static int ocfs2_recover_node(struct ocfs2_super *osb,
			      int node_num, int slot_num)
{}
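
/*
 * Hedged sketch of the control flow described above, built only from
 * helpers declared in this file.  Loading and handing off the local
 * alloc and truncate log dinode copies is elided (NULL below), so this
 * is an outline rather than the in-tree implementation.
 */
static int __maybe_unused ocfs2_recover_node_sketch(struct ocfs2_super *osb,
						    int node_num, int slot_num)
{
	int status;

	/* Replay the dead node's journal; this also stamps the slot clean. */
	status = ocfs2_replay_journal(osb, node_num, slot_num);
	if (status < 0)
		return status;

	/*
	 * The less critical second half (local alloc, truncate log and
	 * orphan dir cleanup) is deferred to the ocfs2_wq worker.
	 */
	ocfs2_queue_recovery_completion(osb->journal, slot_num,
					NULL, NULL, NULL,
					ORPHAN_NEED_TRUNCATE);
	return 0;
}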

/* Test node liveness by trylocking its journal. If we get the lock,
 * we drop it here. Return 0 if we got the lock, -EAGAIN if node is
 * still alive (we couldn't get the lock) and < 0 on error. */
static int ocfs2_trylock_journal(struct ocfs2_super *osb,
				 int slot_num)
{}

/* Call this underneath ocfs2_super_lock. It also assumes that the
 * slot info struct has been updated from disk. */
int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
{}

/*
 * Scan timer should get fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT. Add some
 * randomness to the timeout to minimize multiple nodes firing the timer at the
 * same time.
 */
static inline unsigned long ocfs2_orphan_scan_timeout(void)
{}
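
/*
 * Sketch of the jitter described above, assuming the (elided) timeout
 * constant is in milliseconds: add up to five seconds of randomness so
 * that all nodes don't fire their timers at once.
 */
static inline unsigned long ocfs2_orphan_scan_timeout_sketch(void)
{
	unsigned long time;

	get_random_bytes(&time, sizeof(time));
	time = ORPHAN_SCAN_SCHEDULE_TIMEOUT + (time % 5000);
	return msecs_to_jiffies(time);
}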

/*
 * ocfs2_queue_orphan_scan calls ocfs2_queue_recovery_completion for
 * every slot, queuing a recovery of the slot on the ocfs2_wq thread. This
 * is done to catch any orphans that are left over in orphan directories.
 *
 * It scans all slots, even ones that are in use. It does so to handle the
 * case described below:
 *
 *   Node 1 has an inode it was using. The dentry went away due to memory
 *   pressure.  Node 1 closes the inode, but it's on the free list. The node
 *   has the open lock.
 *   Node 2 unlinks the inode. It grabs the dentry lock to notify others,
 *   but node 1 has no dentry and doesn't get the message. It trylocks the
 *   open lock, sees that another node has a PR, and does nothing.
 *   Later node 2 runs its orphan dir. It igets the inode, trylocks the
 *   open lock, sees the PR still, and does nothing.
 *   Basically, we have to trigger an orphan iput on node 1. The only way
 *   for this to happen is if node 1 runs node 2's orphan dir.
 *
 * ocfs2_queue_orphan_scan gets called every ORPHAN_SCAN_SCHEDULE_TIMEOUT
 * milliseconds.  It gets an EX lock on os_lockres and checks sequence number
 * stored in LVB. If the sequence number has changed, it means some other
 * node has done the scan.  This node skips the scan and tracks the
 * sequence number.  If the sequence number didn't change, it means a scan
 * hasn't happened.  The node queues a scan and increments the
 * sequence number in the LVB.
 */
static void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
{}
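
/*
 * Hedged sketch of the sequence-number protocol described above.  The
 * ocfs2_orphan_scan_lock()/unlock() helpers are assumed to come from
 * dlmglue with these signatures, and the osb_orphan_scan.os_seqno field
 * name is assumed for illustration.
 */
static void __maybe_unused ocfs2_queue_orphan_scan_sketch(struct ocfs2_super *osb)
{
	struct ocfs2_orphan_scan *os = &osb->osb_orphan_scan;
	unsigned int i;
	u32 seqno = 0;
	int status;

	/* Take the EX lock; the current sequence number comes back via the LVB. */
	status = ocfs2_orphan_scan_lock(osb, &seqno);
	if (status < 0)
		return;

	if (os->os_seqno != seqno) {
		/* Another node scanned since we last looked; just track it. */
		os->os_seqno = seqno;
		goto unlock;
	}

	/* Nobody has scanned: queue a recovery completion for every slot... */
	for (i = 0; i < osb->max_slots; i++)
		ocfs2_queue_recovery_completion(osb->journal, i, NULL, NULL,
						NULL, ORPHAN_NO_NEED_TRUNCATE);
	/* ...and bump the sequence number so peers skip this window. */
	seqno++;
unlock:
	ocfs2_orphan_scan_unlock(osb, seqno);
}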

/* Worker task that gets fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT milliseconds */
static void ocfs2_orphan_scan_work(struct work_struct *work)
{}

void ocfs2_orphan_scan_stop(struct ocfs2_super *osb)
{}

void ocfs2_orphan_scan_init(struct ocfs2_super *osb)
{}

void ocfs2_orphan_scan_start(struct ocfs2_super *osb)
{}

struct ocfs2_orphan_filldir_priv {};

static bool ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
				int name_len, loff_t pos, u64 ino,
				unsigned type)
{}

static int ocfs2_queue_orphans(struct ocfs2_super *osb,
			       int slot,
			       struct inode **head,
			       enum ocfs2_orphan_reco_type orphan_reco_type)
{}

static int ocfs2_orphan_recovery_can_continue(struct ocfs2_super *osb,
					      int slot)
{}

static void ocfs2_mark_recovering_orphan_dir(struct ocfs2_super *osb,
					     int slot)
{}

static void ocfs2_clear_recovering_orphan_dir(struct ocfs2_super *osb,
					      int slot)
{}

/*
 * Orphan recovery. Each mounted node has its own orphan dir which we
 * must run during recovery. Our strategy here is to build a list of
 * the inodes in the orphan dir and iget/iput them. The VFS does
 * (most) of the rest of the work.
 *
 * Orphan recovery can happen at any time, not just at mount, so we have a
 * couple of extra considerations.
 *
 * - We grab as many inodes as we can under the orphan dir lock -
 *   doing iget() outside the orphan dir risks getting a reference on
 *   an invalid inode.
 * - We must be sure not to deadlock with other processes on the
 *   system wanting to run delete_inode(). This can happen when they go
 *   to lock the orphan dir and the orphan recovery process attempts to
 *   iget() inside the orphan dir lock. This can be avoided by
 *   advertising our state to ocfs2_delete_inode().
 */
static int ocfs2_recover_orphans(struct ocfs2_super *osb,
				 int slot,
				 enum ocfs2_orphan_reco_type orphan_reco_type)
{}
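
/*
 * Hedged sketch of the recover-orphans flow described above, using the
 * helpers declared earlier in this file.  The "next orphan" chaining
 * through struct ocfs2_inode_info is assumed for illustration, and the
 * real per-inode cleanup is richer than a bare iput().
 */
static int __maybe_unused ocfs2_recover_orphans_sketch(struct ocfs2_super *osb,
			int slot, enum ocfs2_orphan_reco_type orphan_reco_type)
{
	struct inode *inode = NULL, *iter;
	int status;

	/* Advertise to ocfs2_delete_inode() that this slot is being worked. */
	ocfs2_mark_recovering_orphan_dir(osb, slot);

	/* Grab a reference to every orphan while holding the orphan dir lock. */
	status = ocfs2_queue_orphans(osb, slot, &inode, orphan_reco_type);

	ocfs2_clear_recovering_orphan_dir(osb, slot);

	/* Drop the references; the VFS reclaims the unlinked inodes for us. */
	while (inode) {
		iter = OCFS2_I(inode)->ip_next_orphan;
		iput(inode);
		inode = iter;
	}

	return status;
}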

static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota)
{}

static int ocfs2_commit_thread(void *arg)
{}

/* Reads all the journal inodes without taking any cluster locks. Used
 * for hard readonly access to determine whether any journal requires
 * recovery. Also used to refresh the recovery generation numbers after
 * a journal has been recovered by another node.
 */
int ocfs2_check_journals_nolocks(struct ocfs2_super *osb)
{}