// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* Authors: Bernard Metzler <[email protected]> */ /* Copyright (c) 2008-2019, IBM Corporation */ #include <linux/errno.h> #include <linux/types.h> #include <linux/net.h> #include <linux/scatterlist.h> #include <linux/highmem.h> #include <net/tcp.h> #include <rdma/iw_cm.h> #include <rdma/ib_verbs.h> #include <rdma/ib_user_verbs.h> #include "siw.h" #include "siw_verbs.h" #include "siw_mem.h" #define MAX_HDR_INLINE … static struct page *siw_get_pblpage(struct siw_mem *mem, u64 addr, int *idx) { … } static struct page *siw_get_page(struct siw_mem *mem, struct siw_sge *sge, unsigned long offset, int *pbl_idx) { … } /* * Copy short payload at provided destination payload address */ static int siw_try_1seg(struct siw_iwarp_tx *c_tx, void *paddr) { … } #define PKT_FRAGMENTED … #define PKT_COMPLETE … /* * siw_qp_prepare_tx() * * Prepare tx state for sending out one fpdu. Builds complete pkt * if no user data or only immediate data are present. * * returns PKT_COMPLETE if complete pkt built, PKT_FRAGMENTED otherwise. */ static int siw_qp_prepare_tx(struct siw_iwarp_tx *c_tx) { … } /* * Send out one complete control type FPDU, or header of FPDU carrying * data. Used for fixed sized packets like Read.Requests or zero length * SENDs, WRITEs, READ.Responses, or header only. */ static int siw_tx_ctrl(struct siw_iwarp_tx *c_tx, struct socket *s, int flags) { … } /* * 0copy TCP transmit interface: Use MSG_SPLICE_PAGES. * * Using sendpage to push page by page appears to be less efficient * than using sendmsg, even if data are copied. * * A general performance limitation might be the extra four bytes * trailer checksum segment to be pushed after user data. */ static int siw_tcp_sendpages(struct socket *s, struct page **page, int offset, size_t size) { … } /* * siw_0copy_tx() * * Pushes list of pages to TCP socket. If pages from multiple * SGE's, all referenced pages of each SGE are pushed in one * shot. 
*/ static int siw_0copy_tx(struct socket *s, struct page **page, struct siw_sge *sge, unsigned int offset, unsigned int size) { … } #define MAX_TRAILER … static void siw_unmap_pages(struct kvec *iov, unsigned long kmap_mask, int len) { … } /* * siw_tx_hdt() tries to push a complete packet to TCP where all * packet fragments are referenced by the elements of one iovec. * For the data portion, each involved page must be referenced by * one extra element. All sge's data can be non-aligned to page * boundaries. Two more elements are referencing iWARP header * and trailer: * MAX_ARRAY = 64KB/PAGE_SIZE + 1 + 2 * (SIW_MAX_SGE - 1) + HDR + TRL */ #define MAX_ARRAY … /* * Write out iov referencing hdr, data and trailer of current FPDU. * Update transmit state dependent on write return status */ static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s) { … } static void siw_update_tcpseg(struct siw_iwarp_tx *c_tx, struct socket *s) { … } /* * siw_prepare_fpdu() * * Prepares transmit context to send out one FPDU if FPDU will contain * user data and user data are not immediate data. * Computes maximum FPDU length to fill up TCP MSS if possible. * * @qp: QP from which to transmit * @wqe: Current WQE causing transmission * * TODO: Take into account real available sendspace on socket * to avoid header misalignment due to send pausing within * fpdu transmission */ static void siw_prepare_fpdu(struct siw_qp *qp, struct siw_wqe *wqe) { … } /* * siw_check_sgl_tx() * * Check permissions for a list of SGE's (SGL). * A successful check will have all memory referenced * for transmission resolved and assigned to the WQE. * * @pd: Protection Domain SGL should belong to * @wqe: WQE to be checked * @perms: requested access permissions * */ static int siw_check_sgl_tx(struct ib_pd *pd, struct siw_wqe *wqe, enum ib_access_flags perms) { … } /* * siw_qp_sq_proc_tx() * * Process one WQE which needs transmission on the wire. 
*/ static int siw_qp_sq_proc_tx(struct siw_qp *qp, struct siw_wqe *wqe) { … } static int siw_fastreg_mr(struct ib_pd *pd, struct siw_sqe *sqe) { … } static int siw_qp_sq_proc_local(struct siw_qp *qp, struct siw_wqe *wqe) { … } /* * siw_qp_sq_process() * * Core TX path routine for RDMAP/DDP/MPA using a TCP kernel socket. * Sends RDMAP payload for the current SQ WR @wqe of @qp in one or more * MPA FPDUs, each containing a DDP segment. * * SQ processing may occur in user context as a result of posting * new WQE's or from siw_tx_thread context. Processing in * user context is limited to non-kernel verbs users. * * SQ processing may get paused anytime, possibly in the middle of a WR * or FPDU, if insufficient send space is available. SQ processing * gets resumed from siw_tx_thread, if send space becomes available again. * * Must be called with the QP state read-locked. * * Note: * An outbound RREQ can be satisfied by the corresponding RRESP * _before_ it gets assigned to the ORQ. This happens regularly * in RDMA READ via loopback case. Since both outbound RREQ and * inbound RRESP can be handled by the same CPU, locking the ORQ * is dead-lock prone and thus not an option. With that, the * RREQ gets assigned to the ORQ _before_ being sent - see * siw_activate_tx() - and pulled back in case of send failure. */ int siw_qp_sq_process(struct siw_qp *qp) { … } static void siw_sq_resume(struct siw_qp *qp) { … } struct tx_task_t { … }; static DEFINE_PER_CPU(struct tx_task_t, siw_tx_task_g); int siw_create_tx_threads(void) { … } void siw_stop_tx_threads(void) { … } int siw_run_sq(void *data) { … } int siw_sq_start(struct siw_qp *qp) { … }