// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
/*
 * Copyright (c) 2014-2020, Oracle and/or its affiliates.
 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the BSD-type
 * license below:
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *      Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *
 *      Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 *      Neither the name of the Network Appliance, Inc. nor the names of
 *      its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written
 *      permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * rpc_rdma.c
 *
 * This file contains the guts of the RPC RDMA protocol, and
 * does marshaling/unmarshaling, etc. It is also where interfacing
 * to the Linux RPC framework lives.
 */

#include <linux/highmem.h>

#include <linux/sunrpc/svc_rdma.h>

#include "xprt_rdma.h"
#include <trace/events/rpcrdma.h>

/* Returns size of largest RPC-over-RDMA header in a Call message
 *
 * The largest Call header contains a full-size Read list and a
 * minimal Reply chunk.
 */
static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs)
{ … }

/* Returns size of largest RPC-over-RDMA header in a Reply message
 *
 * There is only one Write list or one Reply chunk per Reply
 * message. The larger list is the Write list.
 */
static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs)
{ … }

/**
 * rpcrdma_set_max_header_sizes - Initialize inline payload sizes
 * @ep: endpoint to initialize
 *
 * The max_inline fields contain the maximum size of an RPC message
 * so the marshaling code doesn't have to repeat this calculation
 * for every RPC.
 */
void rpcrdma_set_max_header_sizes(struct rpcrdma_ep *ep)
{ … }
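/*
 * Illustrative sketch, not part of this source file: a back-of-the-
 * envelope upper bound on the Call header size, following the RFC 8166
 * XDR layout that rpcrdma_max_call_header_size() accounts for.  The
 * function name and the literal byte counts below are assumptions made
 * for illustration only; the kernel derives the real values from the
 * XDR size constants defined in xprt_rdma.h.
 */
static unsigned int example_max_call_header_size(unsigned int maxsegs)
{
	/* Fixed fields (xid, vers, credits, proc) plus one discriminator
	 * word for each of the three (empty) chunk lists: 7 XDR words.
	 */
	unsigned int size = 7 * 4;

	/* Each Read list entry: "item present" discriminator (4),
	 * position (4), handle (4), length (4), offset (8).
	 */
	size += maxsegs * 24;

	/* A minimal Reply chunk: presence discriminator (4),
	 * segment count (4), and a single HLOO segment (16).
	 */
	size += 4 + 4 + 16;

	return size;
}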
/* The client can send a request inline as long as the RPCRDMA header
 * plus the RPC call fit under the transport's inline limit. If the
 * combined call message size exceeds that limit, the client must use
 * a Read chunk for this operation.
 *
 * A Read chunk is also required if sending the RPC call inline would
 * exceed this device's max_sge limit.
 */
static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt,
				struct rpc_rqst *rqst)
{ … }

/* The client can't know how large the actual reply will be. Thus it
 * plans for the largest possible reply for that particular ULP
 * operation. If the maximum combined reply message size exceeds that
 * limit, the client must provide a write list or a reply chunk for
 * this request.
 */
static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
				   struct rpc_rqst *rqst)
{ … }

/* The client is required to provide a Reply chunk if the maximum
 * size of the non-payload part of the RPC Reply is larger than
 * the inline threshold.
 */
static bool
rpcrdma_nonpayload_inline(const struct rpcrdma_xprt *r_xprt,
			  const struct rpc_rqst *rqst)
{ … }

/* ACL likes to be lazy in allocating pages. For TCP, these
 * pages can be allocated during receive processing. Not true
 * for RDMA, which must always provision receive buffers
 * up front.
 */
static noinline int
rpcrdma_alloc_sparse_pages(struct xdr_buf *buf)
{ … }

/* Convert @vec to a single SGL element.
 *
 * Returns pointer to next available SGE, and bumps the total number
 * of SGEs consumed.
 */
static struct rpcrdma_mr_seg *
rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
		     unsigned int *n)
{ … }

/* Convert @xdrbuf into SGEs no larger than a page each. As they
 * are registered, these SGEs are then coalesced into RDMA segments
 * when the selected memreg mode supports it.
 *
 * Returns positive number of SGEs consumed, or a negative errno.
 */
static int
rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
		     unsigned int pos, enum rpcrdma_chunktype type,
		     struct rpcrdma_mr_seg *seg)
{ … }

static int
encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr)
{ … }

static int
encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr,
		    u32 position)
{ … }

static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt,
						 struct rpcrdma_req *req,
						 struct rpcrdma_mr_seg *seg,
						 int nsegs, bool writing,
						 struct rpcrdma_mr **mr)
{ … }

/* Register and XDR encode the Read list. Supports encoding a list of read
 * segments that belong to a single read chunk.
 *
 * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
 *
 *  Read chunklist (a linked list):
 *   N elements, position P (same P for all chunks of same arg!):
 *    1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0
 *
 * Returns zero on success, or a negative errno if a failure occurred.
 * @xdr is advanced to the next position in the stream.
 *
 * Only a single @pos value is currently supported.
 */
static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
				    struct rpcrdma_req *req,
				    struct rpc_rqst *rqst,
				    enum rpcrdma_chunktype rtype)
{ … }

/* Register and XDR encode the Write list. Supports encoding a list
 * containing one array of plain segments that belong to a single
 * write chunk.
 *
 * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
 *
 *  Write chunklist (a list of (one) counted array):
 *   N elements:
 *    1 - N - HLOO - HLOO - ... - HLOO - 0
 *
 * Returns zero on success, or a negative errno if a failure occurred.
 * @xdr is advanced to the next position in the stream.
 *
 * Only a single Write chunk is currently supported.
 */
static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt,
				     struct rpcrdma_req *req,
				     struct rpc_rqst *rqst,
				     enum rpcrdma_chunktype wtype)
{ … }
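/*
 * Illustrative sketch, not part of this source file: how the Write list
 * layout shown above ("1 - N - HLOO - HLOO - ... - HLOO - 0") appears on
 * the wire.  The struct and helpers here are hypothetical stand-ins for
 * the kernel's xdr_stream machinery, included only to make the XDR
 * layout concrete.
 */
struct example_plain_seg {
	unsigned int handle;		/* H: registered STag/R_key */
	unsigned int length;		/* L: bytes the responder may write */
	unsigned long long offset;	/* OO: 64-bit target offset */
};

static unsigned char *example_put32(unsigned char *p, unsigned int v)
{
	p[0] = v >> 24;
	p[1] = v >> 16;
	p[2] = v >> 8;
	p[3] = v;
	return p + 4;
}

static unsigned char *example_put64(unsigned char *p, unsigned long long v)
{
	p = example_put32(p, (unsigned int)(v >> 32));
	return example_put32(p, (unsigned int)v);
}

/* Encode a Write list holding a single chunk of @n plain segments.
 * A Reply chunk (see the next comment block) differs only in that it
 * is a bare counted array: presence flag, count, segments, and no
 * trailing "no more chunks" word.
 */
static unsigned char *example_encode_write_list(unsigned char *p,
						const struct example_plain_seg *segs,
						unsigned int n)
{
	unsigned int i;

	p = example_put32(p, 1);	/* one Write chunk follows */
	p = example_put32(p, n);	/* segment count */
	for (i = 0; i < n; i++) {
		p = example_put32(p, segs[i].handle);
		p = example_put32(p, segs[i].length);
		p = example_put64(p, segs[i].offset);
	}
	return example_put32(p, 0);	/* no further chunks */
}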
/* Register and XDR encode the Reply chunk. Supports encoding an array
 * of plain segments that belong to a single write (reply) chunk.
 *
 * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
 *
 *  Reply chunk (a counted array):
 *   N elements:
 *    1 - N - HLOO - HLOO - ... - HLOO
 *
 * Returns zero on success, or a negative errno if a failure occurred.
 * @xdr is advanced to the next position in the stream.
 */
static int rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
				      struct rpcrdma_req *req,
				      struct rpc_rqst *rqst,
				      enum rpcrdma_chunktype wtype)
{ … }

static void rpcrdma_sendctx_done(struct kref *kref)
{ … }

/**
 * rpcrdma_sendctx_unmap - DMA-unmap Send buffer
 * @sc: sendctx containing SGEs to unmap
 *
 */
void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc)
{ … }

/* Prepare an SGE for the RPC-over-RDMA transport header.
 */
static void rpcrdma_prepare_hdr_sge(struct rpcrdma_xprt *r_xprt,
				    struct rpcrdma_req *req, u32 len)
{ … }

/* The head iovec is straightforward, as it is usually already
 * DMA-mapped. Sync the content that has changed.
 */
static bool rpcrdma_prepare_head_iov(struct rpcrdma_xprt *r_xprt,
				     struct rpcrdma_req *req,
				     unsigned int len)
{ … }

/* If there is a page list present, DMA map and prepare an
 * SGE for each page to be sent.
 */
static bool rpcrdma_prepare_pagelist(struct rpcrdma_req *req,
				     struct xdr_buf *xdr)
{ … }

/* The tail iovec may include an XDR pad for the page list,
 * as well as additional content, and may not reside in the
 * same page as the head iovec.
 */
static bool rpcrdma_prepare_tail_iov(struct rpcrdma_req *req,
				     struct xdr_buf *xdr,
				     unsigned int page_base, unsigned int len)
{ … }

/* Copy the tail to the end of the head buffer.
 */
static void rpcrdma_pullup_tail_iov(struct rpcrdma_xprt *r_xprt,
				    struct rpcrdma_req *req,
				    struct xdr_buf *xdr)
{ … }

/* Copy pagelist content into the head buffer.
 */
static void rpcrdma_pullup_pagelist(struct rpcrdma_xprt *r_xprt,
				    struct rpcrdma_req *req,
				    struct xdr_buf *xdr)
{ … }

/* Copy the contents of @xdr into @rl_sendbuf and DMA sync it.
 * When the head, pagelist, and tail are small, a pull-up copy
 * is considerably less costly than DMA mapping the components
 * of @xdr.
 *
 * Assumptions:
 *  - the caller has already verified that the total length
 *    of the RPC Call body will fit into @rl_sendbuf.
 */
static bool rpcrdma_prepare_noch_pullup(struct rpcrdma_xprt *r_xprt,
					struct rpcrdma_req *req,
					struct xdr_buf *xdr)
{ … }

static bool rpcrdma_prepare_noch_mapped(struct rpcrdma_xprt *r_xprt,
					struct rpcrdma_req *req,
					struct xdr_buf *xdr)
{ … }

static bool rpcrdma_prepare_readch(struct rpcrdma_xprt *r_xprt,
				   struct rpcrdma_req *req,
				   struct xdr_buf *xdr)
{ … }
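/*
 * Illustrative sketch, not part of this source file: roughly how many
 * Send SGEs the DMA-mapped path above ends up posting for an inline
 * Call, and why a large inline payload can run into the device's
 * max_sge limit (see rpcrdma_args_inline above).  The helper name and
 * the page-size constant are assumptions made for illustration; the
 * kernel tracks the real count as each component is mapped.
 */
#define EXAMPLE_PAGE_SIZE 4096u		/* assumed page size */

/* One SGE for the transport header, one for the head kvec, one per
 * page touched by the page list, and one more if a tail is present.
 */
static unsigned int example_send_sges_needed(unsigned int page_base,
					     unsigned int page_len,
					     unsigned int tail_len)
{
	unsigned int sges = 2;		/* transport header + head iovec */

	if (page_len) {
		unsigned int first = page_base / EXAMPLE_PAGE_SIZE;
		unsigned int last = (page_base + page_len - 1) /
				    EXAMPLE_PAGE_SIZE;

		sges += last - first + 1;
	}
	if (tail_len)
		sges++;
	return sges;
}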
/**
 * rpcrdma_prepare_send_sges - Construct SGEs for a Send WR
 * @r_xprt: controlling transport
 * @req: context of RPC Call being marshalled
 * @hdrlen: size of transport header, in bytes
 * @xdr: xdr_buf containing RPC Call
 * @rtype: chunk type being encoded
 *
 * Returns 0 on success; otherwise a negative errno is returned.
 */
inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
				     struct rpcrdma_req *req, u32 hdrlen,
				     struct xdr_buf *xdr,
				     enum rpcrdma_chunktype rtype)
{ … }

/**
 * rpcrdma_marshal_req - Marshal and send one RPC request
 * @r_xprt: controlling transport
 * @rqst: RPC request to be marshaled
 *
 * For the RPC in "rqst", this function:
 *  - Chooses the transfer mode (e.g., RDMA_MSG or RDMA_NOMSG)
 *  - Registers Read, Write, and Reply chunks
 *  - Constructs the transport header
 *  - Posts a Send WR to send the transport header and request
 *
 * Returns:
 *	%0 if the RPC was sent successfully,
 *	%-ENOTCONN if the connection was lost,
 *	%-EAGAIN if the caller should call again with the same arguments,
 *	%-ENOBUFS if the caller should call again after a delay,
 *	%-EMSGSIZE if the transport header is too small,
 *	%-EIO if a permanent problem occurred while marshaling.
 */
int rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
{ … }

static void __rpcrdma_update_cwnd_locked(struct rpc_xprt *xprt,
					 struct rpcrdma_buffer *buf,
					 u32 grant)
{ … }

static void rpcrdma_update_cwnd(struct rpcrdma_xprt *r_xprt, u32 grant)
{ … }

/**
 * rpcrdma_reset_cwnd - Reset the xprt's congestion window
 * @r_xprt: controlling transport instance
 *
 * Prepare @r_xprt for the next connection by reinitializing
 * its credit grant to one (see RFC 8166, Section 3.3.3).
 */
void rpcrdma_reset_cwnd(struct rpcrdma_xprt *r_xprt)
{ … }

/**
 * rpcrdma_inline_fixup - Scatter inline received data into rqst's iovecs
 * @rqst: controlling RPC request
 * @srcp: points to RPC message payload in receive buffer
 * @copy_len: remaining length of receive buffer content
 * @pad: Write chunk pad bytes needed (zero for pure inline)
 *
 * The upper layer has set the maximum number of bytes it can
 * receive in each component of rq_rcv_buf. These values are set in
 * the head.iov_len, page_len, tail.iov_len, and buflen fields.
 *
 * Unlike the TCP equivalent (xdr_partial_copy_from_skb), in
 * many cases this function simply updates iov_base pointers in
 * rq_rcv_buf to point directly to the received reply data, to
 * avoid copying reply data.
 *
 * Returns the count of bytes which had to be memcopied.
 */
static unsigned long
rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
{ … }

/* By convention, backchannel calls arrive via rdma_msg type
 * messages, and never populate the chunk lists. This makes
 * the RPC/RDMA header small and fixed in size, so it is
 * straightforward to check the RPC header's direction field.
 */
static bool
rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep)
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
{ … }
#else	/* CONFIG_SUNRPC_BACKCHANNEL */
{
	return false;
}
#endif	/* CONFIG_SUNRPC_BACKCHANNEL */

static int decode_rdma_segment(struct xdr_stream *xdr, u32 *length)
{ … }

static int decode_write_chunk(struct xdr_stream *xdr, u32 *length)
{ … }
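/*
 * Illustrative sketch, not part of this source file: the wire layout of
 * one plain segment (HLOO) that decoders such as decode_rdma_segment()
 * walk using the kernel's xdr_stream helpers.  The flat byte-buffer
 * helpers below are hypothetical, used only to make the layout concrete.
 * A read segment (PHLOO) is the same 16 bytes preceded by a 4-byte
 * position word.
 */
static unsigned int example_get32(const unsigned char *p)
{
	return ((unsigned int)p[0] << 24) | ((unsigned int)p[1] << 16) |
	       ((unsigned int)p[2] << 8)  |  (unsigned int)p[3];
}

/* Decode one plain segment: handle (4), length (4), offset (8). */
static const unsigned char *example_decode_segment(const unsigned char *p,
						   unsigned int *handle,
						   unsigned int *length,
						   unsigned long long *offset)
{
	*handle = example_get32(p);
	*length = example_get32(p + 4);
	*offset = ((unsigned long long)example_get32(p + 8) << 32) |
		  example_get32(p + 12);
	return p + 16;
}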
/* In RPC-over-RDMA Version One replies, a Read list is never
 * expected. This decoder is a stub that returns an error if
 * a Read list is present.
 */
static int decode_read_list(struct xdr_stream *xdr)
{ … }

/* Supports only one Write chunk in the Write list
 */
static int decode_write_list(struct xdr_stream *xdr, u32 *length)
{ … }

static int decode_reply_chunk(struct xdr_stream *xdr, u32 *length)
{ … }

static int
rpcrdma_decode_msg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep,
		   struct rpc_rqst *rqst)
{ … }

static noinline int
rpcrdma_decode_nomsg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep)
{ … }

static noinline int
rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep,
		     struct rpc_rqst *rqst)
{ … }

/**
 * rpcrdma_unpin_rqst - Release rqst without completing it
 * @rep: RPC/RDMA Receive context
 *
 * This is done when a connection is lost so that a Reply
 * can be dropped and its matching Call can be subsequently
 * retransmitted on a new connection.
 */
void rpcrdma_unpin_rqst(struct rpcrdma_rep *rep)
{ … }

/**
 * rpcrdma_complete_rqst - Pass completed rqst back to RPC
 * @rep: RPC/RDMA Receive context
 *
 * Reconstruct the RPC reply and complete the transaction
 * while @rqst is still pinned to ensure the rep, rqst, and
 * rq_task pointers remain stable.
 */
void rpcrdma_complete_rqst(struct rpcrdma_rep *rep)
{ … }

static void rpcrdma_reply_done(struct kref *kref)
{ … }

/**
 * rpcrdma_reply_handler - Process received RPC/RDMA messages
 * @rep: Incoming rpcrdma_rep object to process
 *
 * Errors must result in the RPC task either being awakened, or
 * allowed to timeout, to discover the errors at that time.
 */
void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
{ … }
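/*
 * Illustrative sketch, not part of this source file: the four fixed
 * header words carried by every RPC-over-RDMA version 1 message, which
 * the transport parses from a received Reply before matching it to a
 * pending Call by XID and dispatching to rpcrdma_decode_msg(),
 * rpcrdma_decode_nomsg(), or rpcrdma_decode_error() above.  The struct
 * and enum names are hypothetical; the values follow RFC 8166.
 */
struct example_rpcrdma_hdr {
	unsigned int xid;	/* rdma_xid: matches the RPC Call's XID */
	unsigned int vers;	/* rdma_vers: always 1 for this protocol */
	unsigned int credit;	/* rdma_credit: responder's credit grant */
	unsigned int proc;	/* rdma_proc: message type, below */
};

enum {
	EXAMPLE_RDMA_MSG	= 0,	/* inline RPC message follows */
	EXAMPLE_RDMA_NOMSG	= 1,	/* RPC message is in the Reply chunk */
	/* values 2 and 3 are reserved and not used */
	EXAMPLE_RDMA_ERROR	= 4,	/* transport-level error report */
};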