
Commit 9c40c49

chucklever authored and amschuma-ntap committed
xprtrdma: Initialize separate RPC call and reply buffers
RPC-over-RDMA needs to separate its RPC call and reply buffers.

 o When an RPC Call is sent, rq_snd_buf is DMA mapped for an RDMA
   Send operation using DMA_TO_DEVICE

 o If the client expects a large RPC reply, it DMA maps rq_rcv_buf
   as part of a Reply chunk using DMA_FROM_DEVICE

The two mappings are for data movement in opposite directions.

DMA-API.txt suggests that if these mappings share a DMA cacheline,
bad things can happen. This could occur in the final bytes of
rq_snd_buf and the first bytes of rq_rcv_buf if the two buffers
happen to share a DMA cacheline.

On x86_64 the cacheline size is typically 8 bytes, and RPC call
messages are usually much smaller than the send buffer, so this
hasn't been a noticeable problem. But the DMA cacheline size can be
larger on other platforms.

Also, often rq_rcv_buf starts most of the way into a page, thus an
additional RDMA segment is needed to map and register the end of
that buffer. Try to avoid that scenario to reduce the cost of
registering and invalidating Reply chunks.

Instead of carrying a single regbuf that covers both rq_snd_buf and
rq_rcv_buf, each struct rpcrdma_req now carries one regbuf for
rq_snd_buf and one regbuf for rq_rcv_buf.

Some incidental changes worth noting:

- To clear out some spaghetti, refactor xprt_rdma_allocate.
- The value stored in rg_size is the same as the value stored in
  the iov.length field, so eliminate rg_size.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
1 parent 5a6d1db commit 9c40c49
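
To make the cacheline hazard concrete, here is a minimal sketch of the
old single-allocation layout being mapped in both directions. It is
illustrative code, not from this commit: the struct, the buffer sizes,
and the dev pointer are hypothetical; only dma_map_single(),
dma_unmap_single(), dma_mapping_error(), and the direction flags are
the real kernel DMA API that DMA-API.txt documents.

#include <linux/dma-mapping.h>

/* Hypothetical stand-in for the old contiguous call+reply buffer */
struct rpc_both_bufs {
	char call[500];		/* 500 % 8 != 0: reply starts mid-cacheline */
	char reply[1024];
};

static int map_both_ways(struct device *dev, struct rpc_both_bufs *b,
			 dma_addr_t *call_dma, dma_addr_t *reply_dma)
{
	/* Send direction: the CPU writes b->call, the device reads it */
	*call_dma = dma_map_single(dev, b->call, sizeof(b->call),
				   DMA_TO_DEVICE);
	if (dma_mapping_error(dev, *call_dma))
		return -EIO;

	/* Receive direction: the device writes b->reply, the CPU reads
	 * it. Because b->call ends mid-cacheline, the tail of the first
	 * mapping and the head of this one share a DMA cacheline --
	 * exactly what DMA-API.txt warns against.
	 */
	*reply_dma = dma_map_single(dev, b->reply, sizeof(b->reply),
				    DMA_FROM_DEVICE);
	if (dma_mapping_error(dev, *reply_dma)) {
		dma_unmap_single(dev, *call_dma, sizeof(b->call),
				 DMA_TO_DEVICE);
		return -EIO;
	}
	return 0;
}

Giving rq_snd_buf and rq_rcv_buf their own regbuf allocations, as this
commit does, guarantees the two mappings never overlap in a cacheline.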

3 files changed: +99 -59 lines changed

net/sunrpc/xprtrdma/transport.c

Lines changed: 95 additions & 55 deletions
@@ -477,6 +477,86 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
 	}
 }
 
+/* Allocate a fixed-size buffer in which to construct and send the
+ * RPC-over-RDMA header for this request.
+ */
+static bool
+rpcrdma_get_rdmabuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+		    gfp_t flags)
+{
+	size_t size = r_xprt->rx_data.inline_wsize;
+	struct rpcrdma_regbuf *rb;
+
+	if (req->rl_rdmabuf)
+		return true;
+
+	rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, size, flags);
+	if (IS_ERR(rb))
+		return false;
+
+	r_xprt->rx_stats.hardway_register_count += size;
+	req->rl_rdmabuf = rb;
+	return true;
+}
+
+/* RPC/RDMA marshaling may choose to send payload bearing ops inline,
+ * if the resulting Call message is smaller than the inline threshold.
+ * The value of the "rq_callsize" argument accounts for RPC header
+ * requirements, but not for the data payload in these cases.
+ *
+ * See rpcrdma_inline_pullup.
+ */
+static bool
+rpcrdma_get_sendbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+		    size_t size, gfp_t flags)
+{
+	struct rpcrdma_regbuf *rb;
+	size_t min_size;
+
+	if (req->rl_sendbuf && rdmab_length(req->rl_sendbuf) >= size)
+		return true;
+
+	min_size = max_t(size_t, size, r_xprt->rx_data.inline_wsize);
+	rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, min_size, flags);
+	if (IS_ERR(rb))
+		return false;
+
+	rpcrdma_free_regbuf(&r_xprt->rx_ia, req->rl_sendbuf);
+	r_xprt->rx_stats.hardway_register_count += min_size;
+	req->rl_sendbuf = rb;
+	return true;
+}
+
+/* The rq_rcv_buf is used only if a Reply chunk is necessary.
+ * The decision to use a Reply chunk is made later in
+ * rpcrdma_marshal_req. This buffer is registered at that time.
+ *
+ * Otherwise, the associated RPC Reply arrives in a separate
+ * Receive buffer, arbitrarily chosen by the HCA. The buffer
+ * allocated here for the RPC Reply is not utilized in that
+ * case. See rpcrdma_inline_fixup.
+ *
+ * A regbuf is used here to remember the buffer size.
+ */
+static bool
+rpcrdma_get_recvbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+		    size_t size, gfp_t flags)
+{
+	struct rpcrdma_regbuf *rb;
+
+	if (req->rl_recvbuf && rdmab_length(req->rl_recvbuf) >= size)
+		return true;
+
+	rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, size, flags);
+	if (IS_ERR(rb))
+		return false;
+
+	rpcrdma_free_regbuf(&r_xprt->rx_ia, req->rl_recvbuf);
+	r_xprt->rx_stats.hardway_register_count += size;
+	req->rl_recvbuf = rb;
+	return true;
+}
+
 /**
  * xprt_rdma_allocate - allocate transport resources for an RPC
  * @task: RPC task
@@ -487,22 +567,18 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
  * EIO: A permanent error occurred, do not retry
  *
  * The RDMA allocate/free functions need the task structure as a place
- * to hide the struct rpcrdma_req, which is necessary for the actual send/recv
- * sequence.
+ * to hide the struct rpcrdma_req, which is necessary for the actual
+ * send/recv sequence.
  *
- * The RPC layer allocates both send and receive buffers in the same call
- * (rq_send_buf and rq_rcv_buf are both part of a single contiguous buffer).
- * We may register rq_rcv_buf when using reply chunks.
+ * xprt_rdma_allocate provides buffers that are already mapped for
+ * DMA, and a local DMA lkey is provided for each.
  */
 static int
 xprt_rdma_allocate(struct rpc_task *task)
 {
 	struct rpc_rqst *rqst = task->tk_rqstp;
-	size_t size = rqst->rq_callsize + rqst->rq_rcvsize;
 	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
-	struct rpcrdma_regbuf *rb;
 	struct rpcrdma_req *req;
-	size_t min_size;
 	gfp_t flags;
 
 	req = rpcrdma_buffer_get(&r_xprt->rx_buf);
@@ -513,59 +589,23 @@ xprt_rdma_allocate(struct rpc_task *task)
 	if (RPC_IS_SWAPPER(task))
 		flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN;
 
-	if (req->rl_rdmabuf == NULL)
-		goto out_rdmabuf;
-	if (req->rl_sendbuf == NULL)
-		goto out_sendbuf;
-	if (size > req->rl_sendbuf->rg_size)
-		goto out_sendbuf;
+	if (!rpcrdma_get_rdmabuf(r_xprt, req, flags))
+		goto out_fail;
+	if (!rpcrdma_get_sendbuf(r_xprt, req, rqst->rq_callsize, flags))
+		goto out_fail;
+	if (!rpcrdma_get_recvbuf(r_xprt, req, rqst->rq_rcvsize, flags))
+		goto out_fail;
+
+	dprintk("RPC: %5u %s: send size = %zd, recv size = %zd, req = %p\n",
+		task->tk_pid, __func__, rqst->rq_callsize,
+		rqst->rq_rcvsize, req);
 
-out:
-	dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req);
 	req->rl_connect_cookie = 0;	/* our reserved value */
 	rpcrdma_set_xprtdata(rqst, req);
 	rqst->rq_buffer = req->rl_sendbuf->rg_base;
-	rqst->rq_rbuffer = (char *)rqst->rq_buffer + rqst->rq_rcvsize;
+	rqst->rq_rbuffer = req->rl_recvbuf->rg_base;
 	return 0;
 
-out_rdmabuf:
-	min_size = r_xprt->rx_data.inline_wsize;
-	rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, min_size, flags);
-	if (IS_ERR(rb))
-		goto out_fail;
-	req->rl_rdmabuf = rb;
-
-out_sendbuf:
-	/* XDR encoding and RPC/RDMA marshaling of this request has not
-	 * yet occurred. Thus a lower bound is needed to prevent buffer
-	 * overrun during marshaling.
-	 *
-	 * RPC/RDMA marshaling may choose to send payload bearing ops
-	 * inline, if the result is smaller than the inline threshold.
-	 * The value of the "size" argument accounts for header
-	 * requirements but not for the payload in these cases.
-	 *
-	 * Likewise, allocate enough space to receive a reply up to the
-	 * size of the inline threshold.
-	 *
-	 * It's unlikely that both the send header and the received
-	 * reply will be large, but slush is provided here to allow
-	 * flexibility when marshaling.
-	 */
-	min_size = r_xprt->rx_data.inline_rsize;
-	min_size += r_xprt->rx_data.inline_wsize;
-	if (size < min_size)
-		size = min_size;
-
-	rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, size, flags);
-	if (IS_ERR(rb))
-		goto out_fail;
-
-	r_xprt->rx_stats.hardway_register_count += size;
-	rpcrdma_free_regbuf(&r_xprt->rx_ia, req->rl_sendbuf);
-	req->rl_sendbuf = rb;
-	goto out;
-
 out_fail:
 	rpcrdma_buffer_put(req);
 	return -ENOMEM;
net/sunrpc/xprtrdma/verbs.c

Lines changed: 1 addition & 1 deletion
@@ -975,6 +975,7 @@ rpcrdma_destroy_rep(struct rpcrdma_ia *ia, struct rpcrdma_rep *rep)
 void
 rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
 {
+	rpcrdma_free_regbuf(ia, req->rl_recvbuf);
 	rpcrdma_free_regbuf(ia, req->rl_sendbuf);
 	rpcrdma_free_regbuf(ia, req->rl_rdmabuf);
 	kfree(req);
@@ -1209,7 +1210,6 @@ rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags)
 
 	iov->length = size;
 	iov->lkey = ia->ri_pd->local_dma_lkey;
-	rb->rg_size = size;
 	return rb;
 
 out_free:
net/sunrpc/xprtrdma/xprt_rdma.h

Lines changed: 3 additions & 3 deletions
@@ -112,7 +112,6 @@ struct rpcrdma_ep {
  */
 
 struct rpcrdma_regbuf {
-	size_t			rg_size;
 	struct ib_sge		rg_iov;
 	__be32			rg_base[0] __attribute__ ((aligned(256)));
 };
@@ -285,8 +284,9 @@ struct rpcrdma_req {
 	struct rpcrdma_buffer	*rl_buffer;
 	struct rpcrdma_rep	*rl_reply;	/* holder for reply buffer */
 	struct ib_sge		rl_send_iov[RPCRDMA_MAX_IOVS];
-	struct rpcrdma_regbuf	*rl_rdmabuf;
-	struct rpcrdma_regbuf	*rl_sendbuf;
+	struct rpcrdma_regbuf	*rl_rdmabuf;	/* xprt header */
+	struct rpcrdma_regbuf	*rl_sendbuf;	/* rq_snd_buf */
+	struct rpcrdma_regbuf	*rl_recvbuf;	/* rq_rcv_buf */
 
 	struct ib_cqe		rl_cqe;
 	struct list_head	rl_all;
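
With rg_size gone, callers such as the new rpcrdma_get_sendbuf() and
rpcrdma_get_recvbuf() read the buffer length through the
rdmab_length() accessor that lives alongside the other rdmab_* helpers
in xprt_rdma.h. A sketch of that helper; the exact body is
reconstructed, not quoted from this diff:

static inline u32
rdmab_length(struct rpcrdma_regbuf *rb)
{
	/* the same value the removed rg_size field used to duplicate */
	return rb->rg_iov.length;
}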
