// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * dlmrecovery.c
 *
 * recovery stuff
 *
 * Copyright (C) 2004 Oracle.  All rights reserved.
 */


#include <linux/module.h>
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
#include <linux/blkdev.h>
#include <linux/socket.h>
#include <linux/inet.h>
#include <linux/timer.h>
#include <linux/kthread.h>
#include <linux/delay.h>


#include "../cluster/heartbeat.h"
#include "../cluster/nodemanager.h"
#include "../cluster/tcp.h"

#include "dlmapi.h"
#include "dlmcommon.h"
#include "dlmdomain.h"

#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_RECOVERY)
#include "../cluster/masklog.h"

static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node);

static int dlm_recovery_thread(void *data);
static int dlm_do_recovery(struct dlm_ctxt *dlm);

static int dlm_pick_recovery_master(struct dlm_ctxt *dlm);
static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node);
static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node);
static int dlm_request_all_locks(struct dlm_ctxt *dlm,
				 u8 request_from, u8 dead_node);
static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm);

static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res);
static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
					const char *lockname, int namelen,
					int total_locks, u64 cookie,
					u8 flags, u8 master);
static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
				    struct dlm_migratable_lockres *mres,
				    u8 send_to,
				    struct dlm_lock_resource *res,
				    int total_locks);
static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
				     struct dlm_lock_resource *res,
				     struct dlm_migratable_lockres *mres);
static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm);
static int dlm_send_all_done_msg(struct dlm_ctxt *dlm,
				 u8 dead_node, u8 send_to);
static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node);
static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
					struct list_head *list, u8 dead_node);
static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
					      u8 dead_node, u8 new_master);
static void dlm_reco_ast(void *astdata);
static void dlm_reco_bast(void *astdata, int blocked_type);
static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st);
static void dlm_request_all_locks_worker(struct dlm_work_item *item,
					 void *data);
static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data);
static int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
				      struct dlm_lock_resource *res,
				      u8 *real_master);

static u64 dlm_get_next_mig_cookie(void);

static DEFINE_SPINLOCK(dlm_reco_state_lock);
static DEFINE_SPINLOCK(dlm_mig_cookie_lock);
static u64 dlm_mig_cookie = 1;

static u64 dlm_get_next_mig_cookie(void)
{}

static inline void dlm_set_reco_dead_node(struct dlm_ctxt *dlm,
					  u8 dead_node)
{}

static inline void dlm_set_reco_master(struct dlm_ctxt *dlm,
				       u8 master)
{}

static inline void __dlm_reset_recovery(struct dlm_ctxt *dlm)
{}

/* Worker function used during recovery. */
void dlm_dispatch_work(struct work_struct *work)
{}

/*
 * RECOVERY THREAD
 */

void dlm_kick_recovery_thread(struct dlm_ctxt *dlm)
{}

/* Launch the recovery thread */
int dlm_launch_recovery_thread(struct dlm_ctxt *dlm)
{}

void dlm_complete_recovery_thread(struct dlm_ctxt *dlm)
{}
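
/*
 * A minimal sketch (not the in-tree bodies) of how launching and stopping
 * the recovery thread could look.  It assumes dlm_ctxt keeps a task_struct
 * pointer for this thread; the field name dlm_reco_thread_task and the
 * "_sketch" helper names are assumptions for illustration only.
 */
static int dlm_launch_recovery_thread_sketch(struct dlm_ctxt *dlm)
{
	dlm->dlm_reco_thread_task = kthread_run(dlm_recovery_thread, dlm,
						"dlm_reco-%s", dlm->name);
	if (IS_ERR(dlm->dlm_reco_thread_task)) {
		dlm->dlm_reco_thread_task = NULL;
		return -EINVAL;
	}
	return 0;
}

static void dlm_complete_recovery_thread_sketch(struct dlm_ctxt *dlm)
{
	if (dlm->dlm_reco_thread_task) {
		/* kthread_stop() waits for the thread to exit its loop */
		kthread_stop(dlm->dlm_reco_thread_task);
		dlm->dlm_reco_thread_task = NULL;
	}
}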



/*
 * this is lame, but here's how recovery works...
 * 1) all recovery threads cluster wide will work on recovering
 *    ONE node at a time
 * 2) negotiate who will take over all the locks for the dead node.
 *    that's right... ALL the locks.
 * 3) once a new master is chosen, everyone scans all locks
 *    and moves aside those mastered by the dead node
 * 4) each of these locks should be locked until recovery is done
 * 5) the new master collects up all of the secondary lock queue info
 *    one lock at a time, forcing each node to communicate back
 *    before continuing
 * 6) each secondary lock queue responds with the full known lock info
 * 7) once the new master has run all its locks, it sends an ALLDONE!
 *    message to everyone
 * 8) upon receiving this message, the secondary queue node unlocks
 *    and responds to the ALLDONE
 * 9) once the new master gets responses from everyone, it unlocks
 *    everything and recovery for this dead node is done
 * 10) go back to 2) while there are still dead nodes
 *
 */
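
/*
 * A condensed sketch of the flow described above (the helper name is
 * hypothetical).  It assumes dlm->reco.dead_node and dlm->reco.new_master
 * track the node being recovered and the elected recovery master; locking
 * around dlm->reco.* is elided for brevity.
 */
static int dlm_reco_flow_sketch(struct dlm_ctxt *dlm)
{
	int status = 0;

	/* mark recovery active so dlmlock/dlmunlock callers block on
	 * dlm->reco.event (see dlm_begin_recovery below) */

	/* step 2: elect a recovery master if one is not already known */
	if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM)
		status = dlm_pick_recovery_master(dlm);

	/* steps 3-9: only the elected master remasters the dead node's
	 * locks and broadcasts the finalize/ALLDONE messages */
	if (status >= 0 && dlm->reco.new_master == dlm->node_num)
		status = dlm_remaster_locks(dlm, dlm->reco.dead_node);

	/* clear the active state and wake waiters (dlm_end_recovery);
	 * step 10: the recovery thread loops while dead nodes remain */
	return status;
}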

static void dlm_print_reco_node_status(struct dlm_ctxt *dlm)
{}

#define DLM_RECO_THREAD_TIMEOUT_MS (5 * 1000)

static int dlm_recovery_thread(void *data)
{}

/* returns true when the recovery master has contacted us */
static int dlm_reco_master_ready(struct dlm_ctxt *dlm)
{}

/* returns true if node is no longer in the domain
 * could be dead or just not joined */
int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node)
{}

/* returns true if the node has been recovered
 * (i.e. it is no longer set in the recovery map) */
static int dlm_is_node_recovered(struct dlm_ctxt *dlm, u8 node)
{}
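
/*
 * Sketches of the two predicates above, under the assumption that domain
 * membership and pending recovery are tracked in the dlm->domain_map and
 * dlm->recovery_map bitmaps, both protected by dlm->spinlock:
 */
static int dlm_is_node_dead_sketch(struct dlm_ctxt *dlm, u8 node)
{
	int dead;

	spin_lock(&dlm->spinlock);
	/* gone from the domain: either dead or never joined */
	dead = !test_bit(node, dlm->domain_map);
	spin_unlock(&dlm->spinlock);
	return dead;
}

static int dlm_is_node_recovered_sketch(struct dlm_ctxt *dlm, u8 node)
{
	int recovered;

	spin_lock(&dlm->spinlock);
	/* recovery done once the node is cleared from the recovery map */
	recovered = !test_bit(node, dlm->recovery_map);
	spin_unlock(&dlm->spinlock);
	return recovered;
}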


void dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
{}

void dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout)
{}
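
/*
 * A sketch of the wait above, assuming the recovery thread wakes
 * dlm->dlm_reco_thread_wq whenever node state changes (that waitqueue
 * field name is an assumption).  timeout is in milliseconds; 0 means
 * wait forever.
 */
void dlm_wait_for_node_death_sketch(struct dlm_ctxt *dlm, u8 node, int timeout)
{
	if (timeout)
		wait_event_timeout(dlm->dlm_reco_thread_wq,
				   dlm_is_node_dead(dlm, node),
				   msecs_to_jiffies(timeout));
	else
		wait_event(dlm->dlm_reco_thread_wq,
			   dlm_is_node_dead(dlm, node));
}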

/* callers of the top-level api calls (dlmlock/dlmunlock) should
 * block on the dlm->reco.event when recovery is in progress.
 * the dlm recovery thread will set this state when it begins
 * recovering a dead node (as the new master or not) and clear
 * the state and wake as soon as all affected lock resources have
 * been marked with the RECOVERY flag */
static int dlm_in_recovery(struct dlm_ctxt *dlm)
{}


void dlm_wait_for_recovery(struct dlm_ctxt *dlm)
{}
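
/*
 * Sketches of the gate described above, assuming the active-recovery bit
 * lives in dlm->reco.state as DLM_RECO_STATE_ACTIVE (both names are
 * assumptions here) and that dlm->reco.event is the waitqueue callers
 * block on:
 */
static int dlm_in_recovery_sketch(struct dlm_ctxt *dlm)
{
	int in_recovery;

	spin_lock(&dlm->spinlock);
	in_recovery = !!(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
	spin_unlock(&dlm->spinlock);
	return in_recovery;
}

void dlm_wait_for_recovery_sketch(struct dlm_ctxt *dlm)
{
	/* block dlmlock/dlmunlock callers until recovery clears the bit */
	wait_event(dlm->reco.event, !dlm_in_recovery_sketch(dlm));
}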

static void dlm_begin_recovery(struct dlm_ctxt *dlm)
{}

static void dlm_end_recovery(struct dlm_ctxt *dlm)
{}

static void dlm_print_recovery_master(struct dlm_ctxt *dlm)
{}

static int dlm_do_recovery(struct dlm_ctxt *dlm)
{}

static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
{}

static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
{}

static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm)
{}

static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
				 u8 dead_node)
{}

int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data,
				  void **ret_data)
{}

static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
{}


static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
{}


int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data,
			       void **ret_data)
{}

static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
					struct list_head *list,
					u8 dead_node)
{}

static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res)
{}
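
/*
 * A sketch of the counting helper above; it assumes the caller already
 * holds res->spinlock and that a lockres keeps its locks on the granted,
 * converting and blocked list heads:
 */
static inline int dlm_count_lockres_locks_sketch(struct dlm_lock_resource *res)
{
	struct list_head *iter;
	int num_locks = 0;

	list_for_each(iter, &res->granted)
		num_locks++;
	list_for_each(iter, &res->converting)
		num_locks++;
	list_for_each(iter, &res->blocked)
		num_locks++;

	return num_locks;
}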


static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
				    struct dlm_migratable_lockres *mres,
				    u8 send_to,
				    struct dlm_lock_resource *res,
				    int total_locks)
{}

static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
					const char *lockname, int namelen,
					int total_locks, u64 cookie,
					u8 flags, u8 master)
{}

static void dlm_prepare_lvb_for_migration(struct dlm_lock *lock,
					  struct dlm_migratable_lockres *mres,
					  int queue)
{}

/* returns 1 if this lock fills the network structure,
 * 0 otherwise */
static int dlm_add_lock_to_array(struct dlm_lock *lock,
				 struct dlm_migratable_lockres *mres, int queue)
{}
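
/*
 * A sketch of the fill check above: pack one more migratable lock into
 * mres and report whether the array is now full and must be sent.  The
 * capacity name DLM_MAX_MIGRATABLE_LOCKS and the ml[] wire layout are
 * assumptions about the network structure:
 */
static int dlm_add_lock_to_array_sketch(struct dlm_lock *lock,
					struct dlm_migratable_lockres *mres,
					int queue)
{
	struct dlm_migratable_lock *ml = &mres->ml[mres->num_locks];

	ml->cookie = lock->ml.cookie;
	ml->type = lock->ml.type;
	ml->convert_type = lock->ml.convert_type;
	ml->node = lock->ml.node;
	ml->list = queue;	/* which queue the lock sits on at the sender */

	mres->num_locks++;
	/* returning 1 tells the caller this message must be sent now */
	return (mres->num_locks == DLM_MAX_MIGRATABLE_LOCKS);
}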

static void dlm_add_dummy_lock(struct dlm_ctxt *dlm,
			       struct dlm_migratable_lockres *mres)
{}

static inline int dlm_is_dummy_lock(struct dlm_ctxt *dlm,
				    struct dlm_migratable_lock *ml,
				    u8 *nodenum)
{}

int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
			 struct dlm_migratable_lockres *mres,
			 u8 send_to, u8 flags)
{}



/*
 * this message will contain no more than one page worth of
 * recovery data, and it will work on only one lockres.
 * there may be many locks in this page, and we may need to wait
 * for additional packets to complete all the locks (rare, but
 * possible).
 */
/*
 * NOTE: the allocation error cases here are scary.
 * we really cannot afford to fail an alloc in recovery;
 * do we spin?  returning an error only delays the problem, really.
 */

int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
			    void **ret_data)
{}
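
/*
 * Since this handler runs in o2net's receive path, the heavy lifting is
 * deferred to a worker.  A sketch of that hand-off, assuming a
 * dlm_init_work_item() helper that binds a work function to a private
 * buffer, and that dlm->work_lock, dlm->work_list, dlm->dispatched_work
 * and dlm->dlm_worker exist as named here (all assumptions):
 */
static void dlm_defer_mig_lockres_sketch(struct dlm_ctxt *dlm,
					 struct dlm_work_item *item,
					 void *copied_mres)
{
	/* copied_mres is a private copy of the incoming message, since
	 * the o2net buffer is not ours to keep */
	dlm_init_work_item(dlm, item, dlm_mig_lockres_worker, copied_mres);
	spin_lock(&dlm->work_lock);
	list_add_tail(&item->list, &dlm->work_list);
	spin_unlock(&dlm->work_lock);
	queue_work(dlm->dlm_worker, &dlm->dispatched_work);
}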


static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data)
{}



static int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
				      struct dlm_lock_resource *res,
				      u8 *real_master)
{}


int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
			  u8 nodenum, u8 *real_master)
{}


/* this function cannot error, so unless the sending
 * or receiving of the message failed, the owner can
 * be trusted */
int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
			       void **ret_data)
{}

static inline struct list_head *
dlm_list_num_to_pointer(struct dlm_lock_resource *res, int list_num)
{}
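
/*
 * A sketch of the queue-number-to-list mapping above.  It leans on the
 * assumption that granted, converting and blocked are laid out as
 * consecutive list_heads inside dlm_lock_resource, so the list number
 * (0, 1 or 2) can simply be added to &res->granted:
 */
static inline struct list_head *
dlm_list_num_to_pointer_sketch(struct dlm_lock_resource *res, int list_num)
{
	struct list_head *ret;

	BUG_ON(list_num < 0 || list_num > 2);
	ret = &(res->granted);
	ret += list_num;
	return ret;
}
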
/* TODO: do ast flush business
 * TODO: do MIGRATING and RECOVERING spinning
 */

/*
 * NOTE about in-flight requests during migration:
 *
 * Before attempting the migrate, the master has marked the lockres as
 * MIGRATING and then flushed all of its pending ASTs.  So any in-flight
 * requests either got queued before the MIGRATING flag got set, in which
 * case the lock data will reflect the change and a return message is on
 * the way, or the request failed to get in before MIGRATING got set.  In
 * this case, the caller will be told to spin and wait for the MIGRATING
 * flag to be dropped, then recheck the master.
 * This holds true for the convert, cancel and unlock cases, and since lvb
 * updates are tied to these same messages, it applies to lvb updates as
 * well.  For the lock case, there is no way a lock can be on the master
 * queue and not be on the secondary queue since the lock is always added
 * locally first.  This means that the new target node will never be sent
 * a lock that it doesn't already have on the list.
 * In total, this means that the local lock is correct and should not be
 * updated to match the one sent by the master.  Any messages sent back
 * from the master before the MIGRATING flag will bring the lock properly
 * up-to-date, and the change will be ordered properly for the waiter.
 * We will *not* attempt to modify the lock underneath the waiter.
 */
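
/*
 * An illustrative sketch of the caller-side spin described above (the
 * helper name is hypothetical): drop the lock, give the migration a
 * moment to finish, and recheck before re-querying the master.
 */
static void dlm_wait_on_migrating_sketch(struct dlm_lock_resource *res)
{
	spin_lock(&res->spinlock);
	while (res->state & DLM_LOCK_RES_MIGRATING) {
		spin_unlock(&res->spinlock);
		msleep(100);	/* spin until MIGRATING is dropped */
		spin_lock(&res->spinlock);
	}
	spin_unlock(&res->spinlock);
}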

static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
				     struct dlm_lock_resource *res,
				     struct dlm_migratable_lockres *mres)
{}

void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
				       struct dlm_lock_resource *res)
{}



/* removes all recovered locks from the recovery list.
 * sets the res->owner to the new master.
 * unsets the RECOVERY flag and wakes waiters. */
static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
					      u8 dead_node, u8 new_master)
{}

static inline int dlm_lvb_needs_invalidation(struct dlm_lock *lock, int local)
{}

static void dlm_revalidate_lvb(struct dlm_ctxt *dlm,
			       struct dlm_lock_resource *res, u8 dead_node)
{}

static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
				struct dlm_lock_resource *res, u8 dead_node)
{}

static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
{}

static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
{}

void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data)
{}

void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data)
{}

static void dlm_reco_ast(void *astdata)
{}
static void dlm_reco_bast(void *astdata, int blocked_type)
{}
static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st)
{}

/*
 * dlm_pick_recovery_master will continually attempt to use
 * dlmlock() on the special "$RECOVERY" lockres with the
 * LKM_NOQUEUE flag to get an EX.  Every thread that enters
 * this function on each node racing to become the recovery
 * master will not stop attempting this until either:
 * a) this node gets the EX (and becomes the recovery master),
 * or b) dlm->reco.new_master gets set to some nodenum
 * != O2NM_INVALID_NODE_NUM (another node will do the reco).
 * So each time a recovery master is needed, the entire cluster
 * will sync at this point.  If the new master dies, that will
 * be detected in dlm_do_recovery */
static int dlm_pick_recovery_master(struct dlm_ctxt *dlm)
{}
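
/*
 * A sketch of the race described above, assuming a well-known name such
 * as DLM_RECOVERY_LOCK_NAME for the "$RECOVERY" lockres (that constant
 * name and an LKM_RECOVERY flag are assumptions) and the dlmlock()
 * calling convention declared in dlmapi.h:
 */
static enum dlm_status
dlm_try_become_reco_master_sketch(struct dlm_ctxt *dlm,
				  struct dlm_lockstatus *lksb)
{
	/* LKM_NOQUEUE: fail fast instead of queueing behind the winner */
	return dlmlock(dlm, LKM_EXMODE, lksb, LKM_NOQUEUE|LKM_RECOVERY,
		       DLM_RECOVERY_LOCK_NAME, DLM_RECOVERY_LOCK_NAME_LEN,
		       dlm_reco_ast, dlm, dlm_reco_bast);
}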

static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
{}

int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data,
			   void **ret_data)
{}

#define DLM_FINALIZE_STAGE2  0x01
static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm)
{}

int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data,
			      void **ret_data)
{}