// SPDX-License-Identifier: GPL-2.0-only /****************************************************************************** ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. ** Copyright (C) 2004-2021 Red Hat, Inc. All rights reserved. ** ** ******************************************************************************* ******************************************************************************/ /* * midcomms.c * * This is the appallingly named "mid-level" comms layer. It takes care of * delivering application-layer "reliable" communication on top of the * lowcomms transport layer. * * How it works: * * Each node keeps track of all sent DLM messages in send_queue with a sequence * number. The receiver will send a DLM_ACK message back for every DLM message * received at the other side. If a reconnect happens in lowcomms we will send * all unacknowledged dlm messages again. The receiving side might drop any already * received message by comparing sequence numbers. * * How version detection works: * * Due to the fact that dlm has pre-configured node addresses on every side, * it is in its nature that every side connects when it starts to transmit * dlm messages, which ends in a race. However DLM_RCOM_NAMES, DLM_RCOM_STATUS * and their replies are the first messages which are exchanged. Due to backwards * compatibility these messages are not covered by the midcomms re-transmission * layer. These messages have their own re-transmission handling in the dlm * application layer. The version field of every node will be set on these RCOM * messages as soon as they arrive and the node isn't yet part of the nodes * hash. There also exists logic to detect a version mismatch if something weird * is going on or the first message isn't an expected one. 
* * Termination: * * The midcomms layer does a 4-way handshake for termination on the DLM protocol * level, like TCP supports it with half-closed sockets. SCTP doesn't support * half-closed sockets, so we do it on the DLM layer. Also socket shutdown() can be * interrupted by e.g. a tcp reset itself. Additionally there exists the othercon * paradigm in lowcomms which cannot be easily changed without breaking backwards * compatibility. A node cannot send anything to another node after a DLM_FIN * message was sent. There exists additional logic to print a warning if * DLM wants to do it. There exists state handling like RFC 793 but reduced * to termination only. The event "member removal event" means that the cluster * manager removed the node from internal lists; at this point DLM does not * send any message to the other node. There exist two cases: * * 1. The cluster member was removed and we received a FIN * OR * 2. We received a FIN but the member was not removed yet * * One of these cases will do the CLOSE_WAIT to LAST_ACK change. * * * +---------+ * | CLOSED | * +---------+ * | add member/receive RCOM version * | detection msg * V * +---------+ * | ESTAB | * +---------+ * CLOSE | | rcv FIN * ------- | | ------- * +---------+ snd FIN / \ snd ACK +---------+ * | FIN |<----------------- ------------------>| CLOSE | * | WAIT-1 |------------------ | WAIT | * +---------+ rcv FIN \ +---------+ * | rcv ACK of FIN ------- | CLOSE | member * | -------------- snd ACK | ------- | removal * V x V snd FIN V event * +---------+ +---------+ +---------+ * |FINWAIT-2| | CLOSING | | LAST-ACK| * +---------+ +---------+ +---------+ * | rcv ACK of FIN | rcv ACK of FIN | * | rcv FIN -------------- | -------------- | * | ------- x V x V * \ snd ACK +---------+ +---------+ * ------------------------>| CLOSED | | CLOSED | * +---------+ +---------+ * * NOTE: any state can be interrupted by midcomms_close() and the state will be * switched to CLOSED in case of fencing. 
There also exists some timeout * handling when we receive the version detection RCOM messages, which is * based on observation. * * Future improvements: * * There exist some known issues/improvements of the dlm handling. Some * of them should be done in the next major dlm version bump, which makes * it incompatible with previous versions. * * Unaligned memory access: * * There exist cases where the dlm message buffer length is not aligned * to 8 bytes. However, it seems nobody has detected any problem with it. This * can be fixed in the next major version bump of dlm. * * Version detection: * * The version detection and how it's done is related to backwards * compatibility. There exist better ways to handle it. * However this should be changed in the next major version bump of dlm. * * Tail Size checking: * * There exists a message tail payload in e.g. DLM_MSG, however we don't * yet check it against the message length with regard to the receive buffer * length. That needs to be validated. * * Fencing bad nodes: * * At timeout places or on weird sequence number behaviours we should send * a fencing request to the cluster manager. */ /* Debug switch to enable a 5 second sleep while waiting for a termination. * This can be useful to test fencing while termination is running. * This requires a setup with only gfs2 as dlm user, so that the * last umount will terminate the connection. * * However it became useful to test: while the 5 seconds block in umount, * just press the reset button. In case of a lot of dropping, the termination * process could take several seconds. */ #define DLM_DEBUG_FENCE_TERMINATION … #include <trace/events/dlm.h> #include <net/tcp.h> #include "dlm_internal.h" #include "lowcomms.h" #include "config.h" #include "memory.h" #include "lock.h" #include "util.h" #include "midcomms.h" /* init value for sequence numbers for testing purpose only e.g. 
overflows */ #define DLM_SEQ_INIT … /* 5 seconds wait to sync ending of dlm */ #define DLM_SHUTDOWN_TIMEOUT … #define DLM_VERSION_NOT_SET … #define DLM_SEND_ACK_BACK_MSG_THRESHOLD … #define DLM_RECV_ACK_BACK_MSG_THRESHOLD … struct midcomms_node { … }; struct dlm_mhandle { … }; static struct hlist_head node_hash[CONN_HASH_SIZE]; static DEFINE_SPINLOCK(nodes_lock); DEFINE_STATIC_SRCU(…); /* This mutex prevents midcomms_close() from running at the same time as * stop() or remove(). Invalid memory access behaviours were observed * when DLM_DEBUG_FENCE_TERMINATION is enabled and machines are * reset; it would end in a double deletion in the nodes * data structure. */ static DEFINE_MUTEX(close_lock); struct kmem_cache *dlm_midcomms_cache_create(void) { … } static inline const char *dlm_state_str(int state) { … } const char *dlm_midcomms_state(struct midcomms_node *node) { … } unsigned long dlm_midcomms_flags(struct midcomms_node *node) { … } int dlm_midcomms_send_queue_cnt(struct midcomms_node *node) { … } uint32_t dlm_midcomms_version(struct midcomms_node *node) { … } static struct midcomms_node *__find_node(int nodeid, int r) { … } static void dlm_mhandle_release(struct rcu_head *rcu) { … } static void dlm_mhandle_delete(struct midcomms_node *node, struct dlm_mhandle *mh) { … } static void dlm_send_queue_flush(struct midcomms_node *node) { … } static void midcomms_node_reset(struct midcomms_node *node) { … } static struct midcomms_node *nodeid2node(int nodeid) { … } int dlm_midcomms_addr(int nodeid, struct sockaddr_storage *addr) { … } static int dlm_send_ack(int nodeid, uint32_t seq) { … } static void dlm_send_ack_threshold(struct midcomms_node *node, uint32_t threshold) { … } static int dlm_send_fin(struct midcomms_node *node, void (*ack_rcv)(struct midcomms_node *node)) { … } static void dlm_receive_ack(struct midcomms_node *node, uint32_t seq) { … } static void dlm_pas_fin_ack_rcv(struct midcomms_node *node) { … } static void dlm_receive_buffer_3_2_trace(uint32_t seq, const 
union dlm_packet *p) { … } static void dlm_midcomms_receive_buffer(const union dlm_packet *p, struct midcomms_node *node, uint32_t seq) { … } static int dlm_opts_check_msglen(const union dlm_packet *p, uint16_t msglen, int nodeid) { … } static void dlm_midcomms_receive_buffer_3_2(const union dlm_packet *p, int nodeid) { … } static void dlm_midcomms_receive_buffer_3_1(const union dlm_packet *p, int nodeid) { … } int dlm_validate_incoming_buffer(int nodeid, unsigned char *buf, int len) { … } /* * Called from the low-level comms layer to process a buffer of * commands. */ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) { … } void dlm_midcomms_unack_msg_resend(int nodeid) { … } static void dlm_fill_opts_header(struct dlm_opts *opts, uint16_t inner_len, uint32_t seq) { … } static void midcomms_new_msg_cb(void *data) { … } static struct dlm_msg *dlm_midcomms_get_msg_3_2(struct dlm_mhandle *mh, int nodeid, int len, char **ppc) { … } /* avoid false positive for nodes_srcu, unlock happens in * dlm_midcomms_commit_mhandle, which must be called on success */ #ifndef __CHECKER__ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, char **ppc) { … } #endif static void dlm_midcomms_commit_msg_3_2_trace(const struct dlm_mhandle *mh, const void *name, int namelen) { … } static void dlm_midcomms_commit_msg_3_2(struct dlm_mhandle *mh, const void *name, int namelen) { … } /* avoid false positive for nodes_srcu, the lock was taken in * dlm_midcomms_get_mhandle */ #ifndef __CHECKER__ void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh, const void *name, int namelen) { … } #endif int dlm_midcomms_start(void) { … } void dlm_midcomms_stop(void) { … } void dlm_midcomms_init(void) { … } static void midcomms_node_release(struct rcu_head *rcu) { … } void dlm_midcomms_exit(void) { … } static void dlm_act_fin_ack_rcv(struct midcomms_node *node) { … } void dlm_midcomms_add_member(int nodeid) { … } void dlm_midcomms_remove_member(int nodeid) { … } void 
dlm_midcomms_version_wait(void) { … } static void midcomms_shutdown(struct midcomms_node *node) { … } void dlm_midcomms_shutdown(void) { … } int dlm_midcomms_close(int nodeid) { … } /* debug functionality to send raw dlm msg from user space */ struct dlm_rawmsg_data { … }; static void midcomms_new_rawmsg_cb(void *data) { … } int dlm_midcomms_rawmsg_send(struct midcomms_node *node, void *buf, int buflen) { … }