// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
/*
 * Copyright (c) 2014-2020, Oracle and/or its affiliates.
 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the BSD-type
 * license below:
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *      Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *
 *      Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 *      Neither the name of the Network Appliance, Inc. nor the names of
 *      its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written
 *      permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * rpc_rdma.c
 *
 * This file contains the guts of the RPC RDMA protocol, and
 * does marshaling/unmarshaling, etc. It is also where interfacing
 * to the Linux RPC framework lives.
 */

#include <linux/highmem.h>

#include <linux/sunrpc/svc_rdma.h>

#include "xprt_rdma.h"
#include <trace/events/rpcrdma.h>

/* Returns size of largest RPC-over-RDMA header in a Call message
 *
 * The largest Call header contains a full-size Read list and a
 * minimal Reply chunk.
 */
static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs)
{ … }

/* Returns size of largest RPC-over-RDMA header in a Reply message
 *
 * There is only one Write list or one Reply chunk per Reply
 * message. The larger list is the Write list.
 */
static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs)
{ … }

/**
 * rpcrdma_set_max_header_sizes - Initialize inline payload sizes
 * @ep: endpoint to initialize
 *
 * The max_inline fields contain the maximum size of an RPC message
 * so the marshaling code doesn't have to repeat this calculation
 * for every RPC.
 */
void rpcrdma_set_max_header_sizes(struct rpcrdma_ep *ep)
{ … }
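/*
 * Illustrative sketch, not part of this source file: a back-of-the-
 * envelope upper bound on the Call header size, following the RFC 8166
 * XDR layout that rpcrdma_max_call_header_size() accounts for.  The
 * function name and the literal byte counts below are assumptions made
 * for illustration only; the kernel derives the real values from the
 * XDR size constants defined in xprt_rdma.h.
 */
static unsigned int example_max_call_header_size(unsigned int maxsegs)
{
	/* Fixed fields (xid, vers, credits, proc) plus one discriminator
	 * word for each of the three (empty) chunk lists: 7 XDR words.
	 */
	unsigned int size = 7 * 4;

	/* Each Read list entry: "item present" discriminator (4),
	 * position (4), handle (4), length (4), offset (8).
	 */
	size += maxsegs * 24;

	/* A minimal Reply chunk: presence discriminator (4),
	 * segment count (4), and a single HLOO segment (16).
	 */
	size += 4 + 4 + 16;

	return size;
}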
/* The client can send a request inline as long as the RPCRDMA header
 * plus the RPC call fit under the transport's inline limit. If the
 * combined call message size exceeds that limit, the client must use
 * a Read chunk for this operation.
 *
 * A Read chunk is also required if sending the RPC call inline would
 * exceed this device's max_sge limit.
 */
static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt,
				struct rpc_rqst *rqst)
{ … }

/* The client can't know how large the actual reply will be. Thus it
 * plans for the largest possible reply for that particular ULP
 * operation. If the maximum combined reply message size exceeds that
 * limit, the client must provide a write list or a reply chunk for
 * this request.
 */
static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
				   struct rpc_rqst *rqst)
{ … }

/* The client is required to provide a Reply chunk if the maximum
 * size of the non-payload part of the RPC Reply is larger than
 * the inline threshold.
 */
static bool
rpcrdma_nonpayload_inline(const struct rpcrdma_xprt *r_xprt,
			  const struct rpc_rqst *rqst)
{ … }

/* ACL likes to be lazy in allocating pages. For TCP, these
 * pages can be allocated during receive processing. Not true
 * for RDMA, which must always provision receive buffers
 * up front.
 */
static noinline int
rpcrdma_alloc_sparse_pages(struct xdr_buf *buf)
{ … }

/* Convert @vec to a single SGL element.
 *
 * Returns pointer to next available SGE, and bumps the total number
 * of SGEs consumed.
 */
static struct rpcrdma_mr_seg *
rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
		     unsigned int *n)
{ … }

/* Convert @xdrbuf into SGEs no larger than a page each. As they
 * are registered, these SGEs are then coalesced into RDMA segments
 * when the selected memreg mode supports it.
 *
 * Returns positive number of SGEs consumed, or a negative errno.
 */
static int
rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
		     unsigned int pos, enum rpcrdma_chunktype type,
		     struct rpcrdma_mr_seg *seg)
{ … }

static int
encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr)
{ … }

static int
encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr,
		    u32 position)
{ … }

static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt,
						 struct rpcrdma_req *req,
						 struct rpcrdma_mr_seg *seg,
						 int nsegs, bool writing,
						 struct rpcrdma_mr **mr)
{ … }

/* Register and XDR encode the Read list. Supports encoding a list of read
 * segments that belong to a single read chunk.
 *
 * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
 *
 *  Read chunklist (a linked list):
 *   N elements, position P (same P for all chunks of same arg!):
 *    1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0
 *
 * Returns zero on success, or a negative errno if a failure occurred.
 * @xdr is advanced to the next position in the stream.
 *
 * Only a single @pos value is currently supported.
 */
static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
				    struct rpcrdma_req *req,
				    struct rpc_rqst *rqst,
				    enum rpcrdma_chunktype rtype)
{ … }

/* Register and XDR encode the Write list. Supports encoding a list
 * containing one array of plain segments that belong to a single
 * write chunk.
 *
 * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
 *
 *  Write chunklist (a list of (one) counted array):
 *   N elements:
 *    1 - N - HLOO - HLOO - ... - HLOO - 0
 *
 * Returns zero on success, or a negative errno if a failure occurred.
 * @xdr is advanced to the next position in the stream.
 *
 * Only a single Write chunk is currently supported.
 */
static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt,
				     struct rpcrdma_req *req,
				     struct rpc_rqst *rqst,
				     enum rpcrdma_chunktype wtype)
{ … }
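/*
 * Illustrative sketch, not part of this source file: how the Write list
 * layout shown above ("1 - N - HLOO - HLOO - ... - HLOO - 0") appears on
 * the wire.  The struct and helpers here are hypothetical stand-ins for
 * the kernel's xdr_stream machinery, included only to make the XDR
 * layout concrete.
 */
struct example_plain_seg {
	unsigned int handle;		/* H: registered STag/R_key */
	unsigned int length;		/* L: bytes the responder may write */
	unsigned long long offset;	/* OO: 64-bit target offset */
};

static unsigned char *example_put32(unsigned char *p, unsigned int v)
{
	p[0] = v >> 24;
	p[1] = v >> 16;
	p[2] = v >> 8;
	p[3] = v;
	return p + 4;
}

static unsigned char *example_put64(unsigned char *p, unsigned long long v)
{
	p = example_put32(p, (unsigned int)(v >> 32));
	return example_put32(p, (unsigned int)v);
}

/* Encode a Write list holding a single chunk of @n plain segments.
 * A Reply chunk (see the next comment block) differs only in that it
 * is a bare counted array: presence flag, count, segments, and no
 * trailing "no more chunks" word.
 */
static unsigned char *example_encode_write_list(unsigned char *p,
						const struct example_plain_seg *segs,
						unsigned int n)
{
	unsigned int i;

	p = example_put32(p, 1);	/* one Write chunk follows */
	p = example_put32(p, n);	/* segment count */
	for (i = 0; i < n; i++) {
		p = example_put32(p, segs[i].handle);
		p = example_put32(p, segs[i].length);
		p = example_put64(p, segs[i].offset);
	}
	return example_put32(p, 0);	/* no further chunks */
}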
/* Register and XDR encode the Reply chunk. Supports encoding an array
 * of plain segments that belong to a single write (reply) chunk.
 *
 * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
 *
 *  Reply chunk (a counted array):
 *   N elements:
 *    1 - N - HLOO - HLOO - ... - HLOO
 *
 * Returns zero on success, or a negative errno if a failure occurred.
 * @xdr is advanced to the next position in the stream.
 */
static int rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
				      struct rpcrdma_req *req,
				      struct rpc_rqst *rqst,
				      enum rpcrdma_chunktype wtype)
{ … }

static void rpcrdma_sendctx_done(struct kref *kref)
{ … }

/**
 * rpcrdma_sendctx_unmap - DMA-unmap Send buffer
 * @sc: sendctx containing SGEs to unmap
 *
 */
void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc)
{ … }

/* Prepare an SGE for the RPC-over-RDMA transport header.
 */
static void rpcrdma_prepare_hdr_sge(struct rpcrdma_xprt *r_xprt,
				    struct rpcrdma_req *req, u32 len)
{ … }

/* The head iovec is straightforward, as it is usually already
 * DMA-mapped. Sync the content that has changed.
 */
static bool rpcrdma_prepare_head_iov(struct rpcrdma_xprt *r_xprt,
				     struct rpcrdma_req *req,
				     unsigned int len)
{ … }

/* If there is a page list present, DMA map and prepare an
 * SGE for each page to be sent.
 */
static bool rpcrdma_prepare_pagelist(struct rpcrdma_req *req,
				     struct xdr_buf *xdr)
{ … }

/* The tail iovec may include an XDR pad for the page list,
 * as well as additional content, and may not reside in the
 * same page as the head iovec.
 */
static bool rpcrdma_prepare_tail_iov(struct rpcrdma_req *req,
				     struct xdr_buf *xdr,
				     unsigned int page_base, unsigned int len)
{ … }

/* Copy the tail to the end of the head buffer.
 */
static void rpcrdma_pullup_tail_iov(struct rpcrdma_xprt *r_xprt,
				    struct rpcrdma_req *req,
				    struct xdr_buf *xdr)
{ … }

/* Copy pagelist content into the head buffer.
 */
static void rpcrdma_pullup_pagelist(struct rpcrdma_xprt *r_xprt,
				    struct rpcrdma_req *req,
				    struct xdr_buf *xdr)
{ … }

/* Copy the contents of @xdr into @rl_sendbuf and DMA sync it.
 * When the head, pagelist, and tail are small, a pull-up copy
 * is considerably less costly than DMA mapping the components
 * of @xdr.
 *
 * Assumptions:
 *  - the caller has already verified that the total length
 *    of the RPC Call body will fit into @rl_sendbuf.
 */
static bool rpcrdma_prepare_noch_pullup(struct rpcrdma_xprt *r_xprt,
					struct rpcrdma_req *req,
					struct xdr_buf *xdr)
{ … }

static bool rpcrdma_prepare_noch_mapped(struct rpcrdma_xprt *r_xprt,
					struct rpcrdma_req *req,
					struct xdr_buf *xdr)
{ … }

static bool rpcrdma_prepare_readch(struct rpcrdma_xprt *r_xprt,
				   struct rpcrdma_req *req,
				   struct xdr_buf *xdr)
{ … }
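/*
 * Illustrative sketch, not part of this source file: roughly how many
 * Send SGEs the DMA-mapped path above ends up posting for an inline
 * Call, and why a large inline payload can run into the device's
 * max_sge limit (see rpcrdma_args_inline above).  The helper name and
 * the page-size constant are assumptions made for illustration; the
 * kernel tracks the real count as each component is mapped.
 */
#define EXAMPLE_PAGE_SIZE 4096u		/* assumed page size */

/* One SGE for the transport header, one for the head kvec, one per
 * page touched by the page list, and one more if a tail is present.
 */
static unsigned int example_send_sges_needed(unsigned int page_base,
					     unsigned int page_len,
					     unsigned int tail_len)
{
	unsigned int sges = 2;		/* transport header + head iovec */

	if (page_len) {
		unsigned int first = page_base / EXAMPLE_PAGE_SIZE;
		unsigned int last = (page_base + page_len - 1) /
				    EXAMPLE_PAGE_SIZE;

		sges += last - first + 1;
	}
	if (tail_len)
		sges++;
	return sges;
}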
/**
 * rpcrdma_prepare_send_sges - Construct SGEs for a Send WR
 * @r_xprt: controlling transport
 * @req: context of RPC Call being marshalled
 * @hdrlen: size of transport header, in bytes
 * @xdr: xdr_buf containing RPC Call
 * @rtype: chunk type being encoded
 *
 * Returns 0 on success; otherwise a negative errno is returned.
 */
inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
				     struct rpcrdma_req *req, u32 hdrlen,
				     struct xdr_buf *xdr,
				     enum rpcrdma_chunktype rtype)
{ … }

/**
 * rpcrdma_marshal_req - Marshal and send one RPC request
 * @r_xprt: controlling transport
 * @rqst: RPC request to be marshaled
 *
 * For the RPC in "rqst", this function:
 *  - Chooses the transfer mode (e.g., RDMA_MSG or RDMA_NOMSG)
 *  - Registers Read, Write, and Reply chunks
 *  - Constructs the transport header
 *  - Posts a Send WR to send the transport header and request
 *
 * Returns:
 *	%0 if the RPC was sent successfully,
 *	%-ENOTCONN if the connection was lost,
 *	%-EAGAIN if the caller should call again with the same arguments,
 *	%-ENOBUFS if the caller should call again after a delay,
 *	%-EMSGSIZE if the transport header is too small,
 *	%-EIO if a permanent problem occurred while marshaling.
 */
int rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
{ … }

static void __rpcrdma_update_cwnd_locked(struct rpc_xprt *xprt,
					 struct rpcrdma_buffer *buf,
					 u32 grant)
{ … }

static void rpcrdma_update_cwnd(struct rpcrdma_xprt *r_xprt, u32 grant)
{ … }

/**
 * rpcrdma_reset_cwnd - Reset the xprt's congestion window
 * @r_xprt: controlling transport instance
 *
 * Prepare @r_xprt for the next connection by reinitializing
 * its credit grant to one (see RFC 8166, Section 3.3.3).
 */
void rpcrdma_reset_cwnd(struct rpcrdma_xprt *r_xprt)
{ … }

/**
 * rpcrdma_inline_fixup - Scatter inline received data into rqst's iovecs
 * @rqst: controlling RPC request
 * @srcp: points to RPC message payload in receive buffer
 * @copy_len: remaining length of receive buffer content
 * @pad: Write chunk pad bytes needed (zero for pure inline)
 *
 * The upper layer has set the maximum number of bytes it can
 * receive in each component of rq_rcv_buf. These values are set in
 * the head.iov_len, page_len, tail.iov_len, and buflen fields.
 *
 * Unlike the TCP equivalent (xdr_partial_copy_from_skb), in
 * many cases this function simply updates iov_base pointers in
 * rq_rcv_buf to point directly to the received reply data, to
 * avoid copying reply data.
 *
 * Returns the count of bytes which had to be memcopied.
 */
static unsigned long
rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
{ … }

/* By convention, backchannel calls arrive via rdma_msg type
 * messages, and never populate the chunk lists. This makes
 * the RPC/RDMA header small and fixed in size, so it is
 * straightforward to check the RPC header's direction field.
 */
static bool
rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep)
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
{ … }
#else	/* CONFIG_SUNRPC_BACKCHANNEL */
{
	return false;
}
#endif	/* CONFIG_SUNRPC_BACKCHANNEL */

static int decode_rdma_segment(struct xdr_stream *xdr, u32 *length)
{ … }

static int decode_write_chunk(struct xdr_stream *xdr, u32 *length)
{ … }
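/*
 * Illustrative sketch, not part of this source file: the wire layout of
 * one plain segment (HLOO) that decoders such as decode_rdma_segment()
 * walk using the kernel's xdr_stream helpers.  The flat byte-buffer
 * helpers below are hypothetical, used only to make the layout concrete.
 * A read segment (PHLOO) is the same 16 bytes preceded by a 4-byte
 * position word.
 */
static unsigned int example_get32(const unsigned char *p)
{
	return ((unsigned int)p[0] << 24) | ((unsigned int)p[1] << 16) |
	       ((unsigned int)p[2] << 8)  |  (unsigned int)p[3];
}

/* Decode one plain segment: handle (4), length (4), offset (8). */
static const unsigned char *example_decode_segment(const unsigned char *p,
						   unsigned int *handle,
						   unsigned int *length,
						   unsigned long long *offset)
{
	*handle = example_get32(p);
	*length = example_get32(p + 4);
	*offset = ((unsigned long long)example_get32(p + 8) << 32) |
		  example_get32(p + 12);
	return p + 16;
}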
/* In RPC-over-RDMA Version One replies, a Read list is never
 * expected. This decoder is a stub that returns an error if
 * a Read list is present.
 */
static int decode_read_list(struct xdr_stream *xdr)
{ … }

/* Supports only one Write chunk in the Write list
 */
static int decode_write_list(struct xdr_stream *xdr, u32 *length)
{ … }

static int decode_reply_chunk(struct xdr_stream *xdr, u32 *length)
{ … }

static int
rpcrdma_decode_msg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep,
		   struct rpc_rqst *rqst)
{ … }

static noinline int
rpcrdma_decode_nomsg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep)
{ … }

static noinline int
rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep,
		     struct rpc_rqst *rqst)
{ … }

/**
 * rpcrdma_unpin_rqst - Release rqst without completing it
 * @rep: RPC/RDMA Receive context
 *
 * This is done when a connection is lost so that a Reply
 * can be dropped and its matching Call can be subsequently
 * retransmitted on a new connection.
 */
void rpcrdma_unpin_rqst(struct rpcrdma_rep *rep)
{ … }

/**
 * rpcrdma_complete_rqst - Pass completed rqst back to RPC
 * @rep: RPC/RDMA Receive context
 *
 * Reconstruct the RPC reply and complete the transaction
 * while @rqst is still pinned to ensure the rep, rqst, and
 * rq_task pointers remain stable.
 */
void rpcrdma_complete_rqst(struct rpcrdma_rep *rep)
{ … }

static void rpcrdma_reply_done(struct kref *kref)
{ … }

/**
 * rpcrdma_reply_handler - Process received RPC/RDMA messages
 * @rep: Incoming rpcrdma_rep object to process
 *
 * Errors must result in the RPC task either being awakened, or
 * allowed to timeout, to discover the errors at that time.
 */
void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
{ … }
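/*
 * Illustrative sketch, not part of this source file: the four fixed
 * header words carried by every RPC-over-RDMA version 1 message, which
 * the transport parses from a received Reply before matching it to a
 * pending Call by XID and dispatching to rpcrdma_decode_msg(),
 * rpcrdma_decode_nomsg(), or rpcrdma_decode_error() above.  The struct
 * and enum names are hypothetical; the values follow RFC 8166.
 */
struct example_rpcrdma_hdr {
	unsigned int xid;	/* rdma_xid: matches the RPC Call's XID */
	unsigned int vers;	/* rdma_vers: always 1 for this protocol */
	unsigned int credit;	/* rdma_credit: responder's credit grant */
	unsigned int proc;	/* rdma_proc: message type, below */
};

enum {
	EXAMPLE_RDMA_MSG	= 0,	/* inline RPC message follows */
	EXAMPLE_RDMA_NOMSG	= 1,	/* RPC message is in the Reply chunk */
	/* values 2 and 3 are reserved and not used */
	EXAMPLE_RDMA_ERROR	= 4,	/* transport-level error report */
};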