// SPDX-License-Identifier: GPL-2.0
/*
 * Write ahead logging implementation copyright Chris Mason 2000
 *
 * The background commits make this code very interrelated, and
 * overly complex.  I need to rethink things a bit....  The major players:
 *
 * journal_begin -- call with the number of blocks you expect to log.
 *                  If the current transaction is too old, it will block
 *                  until the current transaction is finished, and then
 *                  start a new one.  Usually, your transaction will get
 *                  joined in with previous ones for speed.
 *
 * journal_join -- same as journal_begin, but won't block on the current
 *                 transaction regardless of age.  Don't ever call this.
 *                 Ever.  There are only two places it should be called
 *                 from, and they are both inside this file.
 *
 * journal_mark_dirty -- adds blocks into this transaction.  Clears any flags
 *                       that might make them get sent to disk and then
 *                       marks them BH_JDirty.  Puts the buffer head into
 *                       the current transaction hash.
 *
 * journal_end -- if the current transaction is batchable, it does nothing.
 *                Otherwise, it could do an async/synchronous commit, or a
 *                full flush of all log and real blocks in the transaction.
 *
 * flush_old_commits -- if the current transaction is too old, it is ended
 *                      and commit blocks are sent to disk.  Forces commit
 *                      blocks to disk for all backgrounded commits that
 *                      have been around too long.
 *                      -- Note, if you call this as an immediate flush from
 *                         within kupdate, it will ignore the immediate flag.
 */

#include <linux/time.h>
#include <linux/semaphore.h>
#include <linux/vmalloc.h>
#include "reiserfs.h"
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/workqueue.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/uaccess.h>
#include <linux/slab.h>

/* gets a struct reiserfs_journal_list * from a list head */
#define JOURNAL_LIST_ENTRY(h) …

/* must be correct to keep the desc and commit structs at 4k */
#define JOURNAL_TRANS_HALF …
#define BUFNR …
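/*
 * Illustrative sketch (not part of the original source): how a typical
 * metadata update elsewhere in reiserfs is expected to use the entry points
 * described in the header comment above.  'sb', 'bh', the nblocks value and
 * the error handling are placeholders.
 *
 *	struct reiserfs_transaction_handle th;
 *	int err;
 *
 *	err = journal_begin(&th, sb, 1);
 *	if (err)
 *		return err;
 *	reiserfs_prepare_for_journal(sb, bh, 1);
 *	... modify bh->b_data ...
 *	journal_mark_dirty(&th, bh);
 *	err = journal_end(&th);
 */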
/* cnode stat bits.  Move these into reiserfs_fs.h */

/* this block was freed, and can't be written. */
#define BLOCK_FREED …
/* this block was freed during this transaction, and can't be written */
#define BLOCK_FREED_HOLDER …

/* used in flush_journal_list */
#define BLOCK_NEEDS_FLUSH …
#define BLOCK_DIRTIED …

/* journal list state bits */
#define LIST_TOUCHED …
#define LIST_DIRTY …
#define LIST_COMMIT_PENDING …

/* flags for do_journal_end */
#define FLUSH_ALL …
#define COMMIT_NOW …
#define WAIT …

static int do_journal_end(struct reiserfs_transaction_handle *, int flags);
static int flush_journal_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall);
static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall);
static int can_dirty(struct reiserfs_journal_cnode *cn);
static int journal_join(struct reiserfs_transaction_handle *th, struct super_block *sb);
static void release_journal_dev(struct reiserfs_journal *journal);
static void dirty_one_transaction(struct super_block *s, struct reiserfs_journal_list *jl);
static void flush_async_commits(struct work_struct *work);
static void queue_log_writer(struct super_block *s);

/* values for join in do_journal_begin_r */
enum { … };

static int do_journal_begin_r(struct reiserfs_transaction_handle *th, struct super_block *sb, unsigned long nblocks, int join);

static void init_journal_hash(struct super_block *sb) { … }

/*
 * clears BH_Dirty and sticks the buffer on the clean list.  Called because
 * I can't allow refile_buffer to make schedule happen after I've freed a
 * block.  Look at remove_from_transaction and journal_mark_freed for
 * more details.
 */
static int reiserfs_clean_and_file_buffer(struct buffer_head *bh) { … }

static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block *sb) { … }

static struct reiserfs_bitmap_node *get_bitmap_node(struct super_block *sb) { … }

static inline void free_bitmap_node(struct super_block *sb, struct reiserfs_bitmap_node *bn) { … }

static void allocate_bitmap_nodes(struct super_block *sb) { … }

static int set_bit_in_list_bitmap(struct super_block *sb, b_blocknr_t block, struct reiserfs_list_bitmap *jb) { … }

static void cleanup_bitmap_list(struct super_block *sb, struct reiserfs_list_bitmap *jb) { … }

/*
 * only call this on FS unmount.
 */
static int free_list_bitmaps(struct super_block *sb, struct reiserfs_list_bitmap *jb_array) { … }

static int free_bitmap_nodes(struct super_block *sb) { … }

/*
 * get memory for JOURNAL_NUM_BITMAPS worth of bitmaps.
 * jb_array is the array to be filled in.
 */
int reiserfs_allocate_list_bitmaps(struct super_block *sb, struct reiserfs_list_bitmap *jb_array, unsigned int bmap_nr) { … }

/*
 * find an available list bitmap.  If you can't find one, flush a commit list
 * and try again
 */
static struct reiserfs_list_bitmap *get_list_bitmap(struct super_block *sb, struct reiserfs_journal_list *jl) { … }
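/*
 * Illustrative sketch (not part of the original source): how a mount or
 * resize path might be expected to size the per-journal-list bitmaps with
 * reiserfs_allocate_list_bitmaps() above.  'journal->j_list_bitmap', the
 * reiserfs_bmap_count() helper and the error handling are assumptions made
 * for this sketch.
 *
 *	int err;
 *
 *	err = reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap,
 *					     reiserfs_bmap_count(sb));
 *	if (err)
 *		return err;
 */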
/*
 * allocates a new chunk of X nodes, and links them all together as a list.
 * Uses the cnode->next and cnode->prev pointers
 * returns NULL on failure
 */
static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes) { … }

/* pulls a cnode off the free list, or returns NULL on failure */
static struct reiserfs_journal_cnode *get_cnode(struct super_block *sb) { … }

/*
 * returns a cnode to the free list
 */
static void free_cnode(struct super_block *sb, struct reiserfs_journal_cnode *cn) { … }

static void clear_prepared_bits(struct buffer_head *bh) { … }

/*
 * return a cnode with same dev, block number and size in table,
 * or null if not found
 */
static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct super_block *sb, struct reiserfs_journal_cnode **table, long bl) { … }

/*
 * this actually means 'can this block be reallocated yet?'.  If you set
 * search_all, a block can only be allocated if it is not in the current
 * transaction, was not freed by the current transaction, and has no chance
 * of ever being overwritten by a replay after crashing.
 *
 * If you don't set search_all, a block can only be allocated if it is not
 * in the current transaction.  Since deleting a block removes it from the
 * current transaction, this case should never happen.  If you don't set
 * search_all, make sure you never write the block without logging it.
 *
 * next_zero_bit is a suggestion about the next block to try for find_forward.
 * when bl is rejected because it is set in a journal list bitmap, we search
 * for the next zero bit in the bitmap that rejected bl.  Then, we return
 * that through next_zero_bit for find_forward to try.
 *
 * Just because we return something in next_zero_bit does not mean we won't
 * reject it on the next call to reiserfs_in_journal
 */
int reiserfs_in_journal(struct super_block *sb, unsigned int bmap_nr, int bit_nr, int search_all, b_blocknr_t * next_zero_bit) { … }

/* insert cn into table */
static inline void insert_journal_hash(struct reiserfs_journal_cnode **table, struct reiserfs_journal_cnode *cn) { … }

/* lock the current transaction */
static inline void lock_journal(struct super_block *sb) { … }

/* unlock the current transaction */
static inline void unlock_journal(struct super_block *sb) { … }

static inline void get_journal_list(struct reiserfs_journal_list *jl) { … }

static inline void put_journal_list(struct super_block *s, struct reiserfs_journal_list *jl) { … }

/*
 * this used to be much more involved, and I'm keeping it just in case
 * things get ugly again.  it gets called by flush_commit_list, and
 * cleans up any data stored about blocks freed during a transaction.
 */
static void cleanup_freed_for_journal_list(struct super_block *sb, struct reiserfs_journal_list *jl) { … }

static int journal_list_still_alive(struct super_block *s, unsigned int trans_id) { … }
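/*
 * Illustrative sketch (not part of the original source): how a block
 * allocator might consult reiserfs_in_journal() (above) before handing out
 * bit 'bit_nr' from on-disk bitmap 'bmap_nr'.  The variable names and the
 * 'retry' label, which stands in for the caller's scan loop, are assumptions.
 *
 *	b_blocknr_t next = 0;
 *
 *	if (reiserfs_in_journal(sb, bmap_nr, bit_nr, 1, &next)) {
 *		bit_nr = next;
 *		goto retry;
 *	}
 */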
/*
 * If page->mapping was null, we failed to truncate this page for
 * some reason.  Most likely because it was truncated after being
 * logged via data=journal.
 *
 * This does a check to see if the buffer belongs to one of these
 * lost pages before doing the final put_bh.  If page->mapping was
 * null, it tries to free buffers on the page, which should make the
 * final put_page drop the page from the lru.
 */
static void release_buffer_page(struct buffer_head *bh) { … }

static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate) { … }

static void reiserfs_end_ordered_io(struct buffer_head *bh, int uptodate) { … }

static void submit_logged_buffer(struct buffer_head *bh) { … }

static void submit_ordered_buffer(struct buffer_head *bh) { … }

#define CHUNK_SIZE …
struct buffer_chunk { … };

static void write_chunk(struct buffer_chunk *chunk) { … }

static void write_ordered_chunk(struct buffer_chunk *chunk) { … }

static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh, spinlock_t * lock, void (fn) (struct buffer_chunk *)) { … }

static atomic_t nr_reiserfs_jh = …;

static struct reiserfs_jh *alloc_jh(void) { … }

/*
 * we want to free the jh when the buffer has been written
 * and waited on
 */
void reiserfs_free_jh(struct buffer_head *bh) { … }

static inline int __add_jh(struct reiserfs_journal *j, struct buffer_head *bh, int tail) { … }

int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh) { … }

int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh) { … }

#define JH_ENTRY(l) …

static int write_ordered_buffers(spinlock_t * lock, struct reiserfs_journal *j, struct reiserfs_journal_list *jl, struct list_head *list) { … }

static int flush_older_commits(struct super_block *s, struct reiserfs_journal_list *jl) { … }

static int reiserfs_async_progress_wait(struct super_block *s) { … }

/*
 * if this journal list still has commit blocks unflushed, send them to disk.
 *
 * log areas must be flushed in order (transaction 2 can't commit before
 * transaction 1).  Before the commit block can be written, every other log
 * block must be safely on disk.
 */
static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) { … }
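/*
 * Illustrative sketch (not part of the original source) of the ordering rule
 * flush_commit_list() must respect: every log block of a transaction is on
 * disk before its commit block is written.  The real code batches the
 * submissions and overlaps the waits; 'log_bh[]', 'nr_log_blocks' and
 * 'commit_bh' are placeholders.
 *
 *	int i;
 *
 *	for (i = 0; i < nr_log_blocks; i++)
 *		sync_dirty_buffer(log_bh[i]);
 *	sync_dirty_buffer(commit_bh);
 */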
/*
 * flush_journal_list frequently needs to find a newer transaction for a
 * given block.  This does that, or returns NULL if it can't find anything
 */
static struct reiserfs_journal_list *find_newer_jl_for_cn(struct reiserfs_journal_cnode *cn) { … }

static void remove_journal_hash(struct super_block *, struct reiserfs_journal_cnode **, struct reiserfs_journal_list *, unsigned long, int);

/*
 * once all the real blocks have been flushed, it is safe to remove them
 * from the journal list for this transaction.  Aside from freeing the
 * cnode, this also allows the block to be reallocated for data blocks
 * if it had been deleted.
 */
static void remove_all_from_journal_list(struct super_block *sb, struct reiserfs_journal_list *jl, int debug) { … }

/*
 * if this timestamp is greater than the timestamp we wrote last to the
 * header block, write it to the header block.  once this is done, I can
 * safely say the log area for this transaction won't ever be replayed,
 * and I can start releasing blocks in this transaction for reuse as data
 * blocks.  called by flush_journal_list, before it calls
 * remove_all_from_journal_list
 */
static int _update_journal_header_block(struct super_block *sb, unsigned long offset, unsigned int trans_id) { … }

static int update_journal_header_block(struct super_block *sb, unsigned long offset, unsigned int trans_id) { … }

/*
 * flush any and all journal lists older than you are
 * can only be called from flush_journal_list
 */
static int flush_older_journal_lists(struct super_block *sb, struct reiserfs_journal_list *jl) { … }

static void del_from_work_list(struct super_block *s, struct reiserfs_journal_list *jl) { … }

/*
 * flush a journal list, both commit and real blocks
 *
 * always set flushall to 1, unless you are calling from inside
 * flush_journal_list
 *
 * IMPORTANT.  This can only be called while there are no journal writers,
 * and the journal is locked.  That means it can only be called from
 * do_journal_end, or by journal_release
 */
static int flush_journal_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) { … }

static int write_one_transaction(struct super_block *s, struct reiserfs_journal_list *jl, struct buffer_chunk *chunk) { … }

/* used by flush_commit_list */
static void dirty_one_transaction(struct super_block *s, struct reiserfs_journal_list *jl) { … }

static int kupdate_transactions(struct super_block *s, struct reiserfs_journal_list *jl, struct reiserfs_journal_list **next_jl, unsigned int *next_trans_id, int num_blocks, int num_trans) { … }

/*
 * for o_sync and fsync heavy applications, they tend to use
 * all the journal list slots with tiny transactions.  These
 * trigger lots and lots of calls to update the header block, which
 * adds seeks and slows things down.
 *
 * This function tries to clear out a large chunk of the journal lists
 * at once, which makes everything faster since only the newest journal
 * list updates the header block
 */
static int flush_used_journal_lists(struct super_block *s, struct reiserfs_journal_list *jl) { … }

/*
 * removes any nodes in table with name block and dev as bh.
 * only touches the hnext and hprev pointers.
 */
static void remove_journal_hash(struct super_block *sb, struct reiserfs_journal_cnode **table, struct reiserfs_journal_list *jl, unsigned long block, int remove_freed) { … }

static void free_journal_ram(struct super_block *sb) { … }

/*
 * call on unmount.  Only set error to 1 if you haven't made your way out
 * of read_super() yet.  Any other caller must keep error at 0.
 */
static int do_journal_release(struct reiserfs_transaction_handle *th, struct super_block *sb, int error) { … }

/*
 * call on unmount.  flush all journal trans, release all alloc'd ram
 */
int journal_release(struct reiserfs_transaction_handle *th, struct super_block *sb) { … }

/* only call from an error condition inside reiserfs_read_super! */
int journal_release_error(struct reiserfs_transaction_handle *th, struct super_block *sb) { … }
/*
 * compares description block with commit block.
 * returns 1 if they differ, 0 if they are the same
 */
static int journal_compare_desc_commit(struct super_block *sb, struct reiserfs_journal_desc *desc, struct reiserfs_journal_commit *commit) { … }

/*
 * returns 0 if it did not find a description block
 * returns -1 if it found a corrupt commit block
 * returns 1 if both desc and commit were valid
 * NOTE: only called during fs mount
 */
static int journal_transaction_is_valid(struct super_block *sb, struct buffer_head *d_bh, unsigned int *oldest_invalid_trans_id, unsigned long *newest_mount_id) { … }

static void brelse_array(struct buffer_head **heads, int num) { … }

/*
 * given the start, and values for the oldest acceptable transactions,
 * this either reads in and replays a transaction, or returns because the
 * transaction is invalid, or too old.
 * NOTE: only called during fs mount
 */
static int journal_read_transaction(struct super_block *sb, unsigned long cur_dblock, unsigned long oldest_start, unsigned int oldest_trans_id, unsigned long newest_mount_id) { … }

/*
 * This function reads blocks starting from block and to max_block of bufsize
 * size (but no more than BUFNR blocks at a time).  This proved to improve
 * mounting speed on self-rebuilding raid5 arrays at least.
 * Right now it is only used from journal code.  But later we might use it
 * from other places.
 * Note: Do not use journal_getblk/sb_getblk functions here!
 */
static struct buffer_head *reiserfs_breada(struct block_device *dev, b_blocknr_t block, int bufsize, b_blocknr_t max_block) { … }

/*
 * read and replay the log
 * on a clean unmount, the journal header's next unflushed pointer will be
 * to an invalid transaction.  This tests that before finding all the
 * transactions in the log, which makes normal mount times fast.
 *
 * After a crash, this starts with the next unflushed transaction, and
 * replays until it finds one too old, or invalid.
 *
 * On exit, it sets things up so the first transaction will work correctly.
 * NOTE: only called during fs mount
 */
static int journal_read(struct super_block *sb) { … }

static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s) { … }

static void journal_list_init(struct super_block *sb) { … }

static void release_journal_dev(struct reiserfs_journal *journal) { … }

static int journal_init_dev(struct super_block *super, struct reiserfs_journal *journal, const char *jdev_name) { … }

/*
 * When creating/tuning a file system, the user can assign some
 * journal params within boundaries which depend on the ratio
 * blocksize/standard_blocksize.
 *
 * For blocks >= standard_blocksize, the transaction size should
 * be not less than JOURNAL_TRANS_MIN_DEFAULT, and not more
 * than JOURNAL_TRANS_MAX_DEFAULT.
 *
 * For blocks < standard_blocksize these boundaries should be
 * decreased proportionally.
 */
#define REISERFS_STANDARD_BLKSIZE …

static int check_advise_trans_params(struct super_block *sb, struct reiserfs_journal *journal) { … }

/* must be called once on fs mount.  calls journal_read for you */
int journal_init(struct super_block *sb, const char *j_dev_name, int old_format, unsigned int commit_max_age) { … }
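/*
 * Illustrative sketch (not part of the original source) of the proportional
 * scaling described above check_advise_trans_params(): for block sizes below
 * REISERFS_STANDARD_BLKSIZE the allowed transaction size shrinks by the same
 * factor.  How the bounds are actually derived inside the function is an
 * assumption here; 'journal->j_trans_max' is assumed to hold the
 * user-supplied value, and returning -1 stands in for whatever error
 * convention the real check uses.
 *
 *	int ratio = 1;
 *
 *	if (sb->s_blocksize < REISERFS_STANDARD_BLKSIZE)
 *		ratio = REISERFS_STANDARD_BLKSIZE / sb->s_blocksize;
 *
 *	if (journal->j_trans_max > JOURNAL_TRANS_MAX_DEFAULT / ratio ||
 *	    journal->j_trans_max < JOURNAL_TRANS_MIN_DEFAULT / ratio)
 *		return -1;
 */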
/*
 * test for a polite end of the current transaction.  Used by file_write,
 * and should be used by delete to make sure they don't write more than
 * can fit inside a single transaction
 */
int journal_transaction_should_end(struct reiserfs_transaction_handle *th, int new_alloc) { … }

/* this must be called inside a transaction */
void reiserfs_block_writes(struct reiserfs_transaction_handle *th) { … }

/* this must be called without a transaction started */
void reiserfs_allow_writes(struct super_block *s) { … }

/* this must be called without a transaction started */
void reiserfs_wait_on_write_block(struct super_block *s) { … }

static void queue_log_writer(struct super_block *s) { … }

static void wake_queued_writers(struct super_block *s) { … }

static void let_transaction_grow(struct super_block *sb, unsigned int trans_id) { … }

/*
 * join == true if you must join an existing transaction.
 * join == false if you can deal with waiting for others to finish
 *
 * this will block until the transaction is joinable.  send the number of
 * blocks you expect to use in nblocks.
 */
static int do_journal_begin_r(struct reiserfs_transaction_handle *th, struct super_block *sb, unsigned long nblocks, int join) { … }

struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct super_block *s, int nblocks) { … }

int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th) { … }

static int journal_join(struct reiserfs_transaction_handle *th, struct super_block *sb) { … }

int journal_join_abort(struct reiserfs_transaction_handle *th, struct super_block *sb) { … }

int journal_begin(struct reiserfs_transaction_handle *th, struct super_block *sb, unsigned long nblocks) { … }

/*
 * puts bh into the current transaction.  If it was already there, it
 * reorders the hash: removes the old pointers and puts new ones in (to
 * make sure replay happens in the right order).
 *
 * If it was dirty, cleans and files onto the clean list.  I can't let it
 * be dirty again until the transaction is committed.
 *
 * If j_len is bigger than j_len_alloc, it pushes j_len_alloc to 10 + j_len.
 */
int journal_mark_dirty(struct reiserfs_transaction_handle *th, struct buffer_head *bh) { … }

int journal_end(struct reiserfs_transaction_handle *th) { … }

/*
 * removes from the current transaction, releasing and decrementing any
 * counters.  also files the removed buffer directly onto the clean list
 *
 * called by journal_mark_freed when a block has been deleted
 *
 * returns 1 if it cleaned and released the buffer.  0 otherwise
 */
static int remove_from_transaction(struct super_block *sb, b_blocknr_t blocknr, int already_cleaned) { … }
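/*
 * Illustrative sketch (not part of the original source): the restart pattern
 * journal_transaction_should_end() above is meant for.  A long-running
 * operation checks whether the running transaction has grown too old or too
 * full and, if so, politely ends it and begins a fresh one.  'blocks_needed'
 * and the error handling are placeholders.
 *
 *	if (journal_transaction_should_end(th, blocks_needed)) {
 *		err = journal_end(th);
 *		if (err)
 *			return err;
 *		err = journal_begin(th, sb, blocks_needed);
 *		if (err)
 *			return err;
 *	}
 */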
/*
 * for any cnode in a journal list, it can only be dirtied if all the
 * transactions that include it are committed to disk.
 * this checks through each transaction, and returns 1 if you are allowed
 * to dirty, and 0 if you aren't
 *
 * it is called by dirty_journal_list, which is called after
 * flush_commit_list has gotten all the log blocks for a given
 * transaction on disk
 */
static int can_dirty(struct reiserfs_journal_cnode *cn) { … }

/*
 * syncs the commit blocks, but does not force the real buffers to disk
 * will wait until the current transaction is done/committed before returning
 */
int journal_end_sync(struct reiserfs_transaction_handle *th) { … }

/* writeback the pending async commits to disk */
static void flush_async_commits(struct work_struct *work) { … }

/*
 * flushes any old transactions to disk
 * ends the current transaction if it is too old
 */
void reiserfs_flush_old_commits(struct super_block *sb) { … }

/*
 * returns 0 if do_journal_end should return right away, returns 1 if
 * do_journal_end should finish the commit
 *
 * if the current transaction is too old, but still has writers, this will
 * wait on j_join_wait until all the writers are done.  By the time it
 * wakes up, the transaction it was called for has already ended, so it just
 * flushes the commit list and returns 0.
 *
 * Won't batch when flush or commit_now is set.  Also won't batch when
 * others are waiting on j_join_wait.
 *
 * Note, we can't allow the journal_end to proceed while there are still
 * writers in the log.
 */
static int check_journal_end(struct reiserfs_transaction_handle *th, int flags) { … }

/*
 * Does all the work that makes deleting blocks safe.
 * When deleting a block marked BH_JNew, just remove it from the current
 * transaction, clean its buffer_head and move on.
 *
 * otherwise:
 * set a bit for the block in the journal bitmap.  That will prevent it from
 * being allocated for unformatted nodes before this transaction has finished.
 *
 * mark any cnodes for this block as BLOCK_FREED, and clear their bh pointers.
 * That will prevent any old transactions with this block from trying to flush
 * to the real location.  Since we aren't removing the cnode from the
 * journal_list_hash, the block can't be reallocated yet.
 *
 * Then remove it from the current transaction, decrementing any counters and
 * filing it on the clean list.
 */
int journal_mark_freed(struct reiserfs_transaction_handle *th, struct super_block *sb, b_blocknr_t blocknr) { … }

void reiserfs_update_inode_transaction(struct inode *inode) { … }

/*
 * returns -1 on error, 0 if no commits/barriers were done and 1
 * if a transaction was actually committed and the barrier was done
 */
static int __commit_trans_jl(struct inode *inode, unsigned long id, struct reiserfs_journal_list *jl) { … }

int reiserfs_commit_for_inode(struct inode *inode) { … }

void reiserfs_restore_prepared_buffer(struct super_block *sb, struct buffer_head *bh) { … }

extern struct tree_balance *cur_tb;

/*
 * before we can change a metadata block, we have to make sure it won't
 * be written to disk while we are altering it.  So, we must:
 * clean it
 * wait on it.
 */
int reiserfs_prepare_for_journal(struct super_block *sb, struct buffer_head *bh, int wait) { … }
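/*
 * Illustrative sketch (not part of the original source): the usual pairing
 * of reiserfs_prepare_for_journal() above with either a real modification
 * (logged via journal_mark_dirty()) or a change of mind (handed back with
 * reiserfs_restore_prepared_buffer()).  'do_change' is a placeholder flag.
 *
 *	reiserfs_prepare_for_journal(sb, bh, 1);
 *	if (do_change) {
 *		... modify bh->b_data ...
 *		journal_mark_dirty(th, bh);
 *	} else {
 *		reiserfs_restore_prepared_buffer(sb, bh);
 *	}
 */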
/*
 * long and ugly.  If flush, will not return until all commit
 * blocks and all real buffers in the trans are on disk.
 * If no_async, won't return until all commit blocks are on disk.
 *
 * keep reading, there are comments as you go along
 *
 * If the journal is aborted, we just clean up.  Things like flushing
 * journal lists, etc just won't happen.
 */
static int do_journal_end(struct reiserfs_transaction_handle *th, int flags) { … }

/* Set the file system read only and refuse new transactions */
void reiserfs_abort_journal(struct super_block *sb, int errno) { … }
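/*
 * Illustrative sketch (not part of the original source): how an error path
 * elsewhere in reiserfs might be expected to take the journal down once a
 * fatal inconsistency is found.  After the abort, the fs is read only, new
 * transactions are refused, and commits just clean up (see do_journal_end
 * above).  'fatal_error' and the errno value passed are placeholders.
 *
 *	if (fatal_error) {
 *		reiserfs_abort_journal(sb, -EIO);
 *		return -EIO;
 *	}
 */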