From bcf3ffd405df6998914b248d2f22625544a4dd56 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 7 May 2018 15:26:55 -0400 Subject: svcrdma: Add proper SPDX tags for NetApp-contributed source Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 7337e1221590..88da0c9bd7b1 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ /* * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. * -- cgit v1.2.3 From ecf85b2384ea5f7cb0577bf6143bc46d9ecfe4d3 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 7 May 2018 15:27:21 -0400 Subject: svcrdma: Introduce svc_rdma_recv_ctxt svc_rdma_op_ctxt's are pre-allocated and maintained on a per-xprt free list. This eliminates the overhead of calling kmalloc / kfree, both of which grab a globally shared lock that disables interrupts. To reduce contention further, separate the use of these objects in the Receive and Send paths in svcrdma. Subsequent patches will take advantage of this separation by allocating real resources which are then cached in these objects. The allocations are freed when the transport is torn down. I've renamed the structure so that static type checking can be used to ensure that uses of op_ctxt and recv_ctxt are not confused. As an additional clean up, structure fields are renamed to conform with kernel coding conventions. As a final clean up, helpers related to recv_ctxt are moved closer to the functions that use them. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 24 ++- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 318 ++++++++++++++++++++++++++----- net/sunrpc/xprtrdma/svc_rdma_rw.c | 84 ++++---- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 2 +- net/sunrpc/xprtrdma/svc_rdma_transport.c | 142 +------------- 5 files changed, 349 insertions(+), 221 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 88da0c9bd7b1..37f759d65348 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -128,6 +128,9 @@ struct svcxprt_rdma { unsigned long sc_flags; struct list_head sc_read_complete_q; struct work_struct sc_work; + + spinlock_t sc_recv_lock; + struct list_head sc_recv_ctxts; }; /* sc_flags */ #define RDMAXPRT_CONN_PENDING 3 @@ -142,6 +145,19 @@ struct svcxprt_rdma { #define RPCSVC_MAXPAYLOAD_RDMA RPCSVC_MAXPAYLOAD +struct svc_rdma_recv_ctxt { + struct list_head rc_list; + struct ib_recv_wr rc_recv_wr; + struct ib_cqe rc_cqe; + struct xdr_buf rc_arg; + u32 rc_byte_len; + unsigned int rc_page_count; + unsigned int rc_hdr_count; + struct ib_sge rc_sges[1 + + RPCRDMA_MAX_INLINE_THRESH / PAGE_SIZE]; + struct page *rc_pages[RPCSVC_MAXPAGES]; +}; + /* Track DMA maps for this transport and context */ static inline void svc_rdma_count_mappings(struct svcxprt_rdma *rdma, struct svc_rdma_op_ctxt *ctxt) @@ -155,13 +171,19 @@ extern int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct xdr_buf *rcvbuf); /* svc_rdma_recvfrom.c */ +extern void svc_rdma_recv_ctxts_destroy(struct svcxprt_rdma *rdma); +extern bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma); +extern void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma, + struct svc_rdma_recv_ctxt *ctxt, + int free_pages); +extern void 
svc_rdma_flush_recv_queues(struct svcxprt_rdma *rdma); extern int svc_rdma_recvfrom(struct svc_rqst *); /* svc_rdma_rw.c */ extern void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma); extern int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp, - struct svc_rdma_op_ctxt *head, __be32 *p); + struct svc_rdma_recv_ctxt *head, __be32 *p); extern int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, __be32 *wr_ch, struct xdr_buf *xdr); extern int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 330d542fd96e..b7d9c55ee896 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* - * Copyright (c) 2016, 2017 Oracle. All rights reserved. + * Copyright (c) 2016-2018 Oracle. All rights reserved. * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved. * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. * @@ -61,7 +61,7 @@ * svc_rdma_recvfrom must post RDMA Reads to pull the RPC Call's * data payload from the client. svc_rdma_recvfrom sets up the * RDMA Reads using pages in svc_rqst::rq_pages, which are - * transferred to an svc_rdma_op_ctxt for the duration of the + * transferred to an svc_rdma_recv_ctxt for the duration of the * I/O. svc_rdma_recvfrom then returns zero, since the RPC message * is still not yet ready. * @@ -70,18 +70,18 @@ * svc_rdma_recvfrom again. This second call may use a different * svc_rqst than the first one, thus any information that needs * to be preserved across these two calls is kept in an - * svc_rdma_op_ctxt. + * svc_rdma_recv_ctxt. * * The second call to svc_rdma_recvfrom performs final assembly * of the RPC Call message, using the RDMA Read sink pages kept in - * the svc_rdma_op_ctxt. The xdr_buf is copied from the - * svc_rdma_op_ctxt to the second svc_rqst. The second call returns + * the svc_rdma_recv_ctxt. The xdr_buf is copied from the + * svc_rdma_recv_ctxt to the second svc_rqst. The second call returns * the length of the completed RPC Call message. * * Page Management * * Pages under I/O must be transferred from the first svc_rqst to an - * svc_rdma_op_ctxt before the first svc_rdma_recvfrom call returns. + * svc_rdma_recv_ctxt before the first svc_rdma_recvfrom call returns. * * The first svc_rqst supplies pages for RDMA Reads. These are moved * from rqstp::rq_pages into ctxt::pages. The consumed elements of @@ -89,7 +89,7 @@ * svc_rdma_recvfrom call returns. * * During the second svc_rdma_recvfrom call, RDMA Read sink pages - * are transferred from the svc_rdma_op_ctxt to the second svc_rqst + * are transferred from the svc_rdma_recv_ctxt to the second svc_rqst * (see rdma_read_complete() below). 
*/ @@ -108,13 +108,247 @@ #define RPCDBG_FACILITY RPCDBG_SVCXPRT +static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc); + +static inline struct svc_rdma_recv_ctxt * +svc_rdma_next_recv_ctxt(struct list_head *list) +{ + return list_first_entry_or_null(list, struct svc_rdma_recv_ctxt, + rc_list); +} + +/** + * svc_rdma_recv_ctxts_destroy - Release all recv_ctxt's for an xprt + * @rdma: svcxprt_rdma being torn down + * + */ +void svc_rdma_recv_ctxts_destroy(struct svcxprt_rdma *rdma) +{ + struct svc_rdma_recv_ctxt *ctxt; + + while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_recv_ctxts))) { + list_del(&ctxt->rc_list); + kfree(ctxt); + } +} + +static struct svc_rdma_recv_ctxt * +svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma) +{ + struct svc_rdma_recv_ctxt *ctxt; + + spin_lock(&rdma->sc_recv_lock); + ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_recv_ctxts); + if (!ctxt) + goto out_empty; + list_del(&ctxt->rc_list); + spin_unlock(&rdma->sc_recv_lock); + +out: + ctxt->rc_recv_wr.num_sge = 0; + ctxt->rc_page_count = 0; + return ctxt; + +out_empty: + spin_unlock(&rdma->sc_recv_lock); + + ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL); + if (!ctxt) + return NULL; + goto out; +} + +static void svc_rdma_recv_ctxt_unmap(struct svcxprt_rdma *rdma, + struct svc_rdma_recv_ctxt *ctxt) +{ + struct ib_device *device = rdma->sc_cm_id->device; + int i; + + for (i = 0; i < ctxt->rc_recv_wr.num_sge; i++) + ib_dma_unmap_page(device, + ctxt->rc_sges[i].addr, + ctxt->rc_sges[i].length, + DMA_FROM_DEVICE); +} + +/** + * svc_rdma_recv_ctxt_put - Return recv_ctxt to free list + * @rdma: controlling svcxprt_rdma + * @ctxt: object to return to the free list + * @free_pages: Non-zero if rc_pages should be freed + * + */ +void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma, + struct svc_rdma_recv_ctxt *ctxt, + int free_pages) +{ + unsigned int i; + + if (free_pages) + for (i = 0; i < ctxt->rc_page_count; i++) + put_page(ctxt->rc_pages[i]); + spin_lock(&rdma->sc_recv_lock); + list_add(&ctxt->rc_list, &rdma->sc_recv_ctxts); + spin_unlock(&rdma->sc_recv_lock); +} + +static int svc_rdma_post_recv(struct svcxprt_rdma *rdma) +{ + struct ib_device *device = rdma->sc_cm_id->device; + struct svc_rdma_recv_ctxt *ctxt; + struct ib_recv_wr *bad_recv_wr; + int sge_no, buflen, ret; + struct page *page; + dma_addr_t pa; + + ctxt = svc_rdma_recv_ctxt_get(rdma); + if (!ctxt) + return -ENOMEM; + + buflen = 0; + ctxt->rc_cqe.done = svc_rdma_wc_receive; + for (sge_no = 0; buflen < rdma->sc_max_req_size; sge_no++) { + if (sge_no >= rdma->sc_max_sge) { + pr_err("svcrdma: Too many sges (%d)\n", sge_no); + goto err_put_ctxt; + } + + page = alloc_page(GFP_KERNEL); + if (!page) + goto err_put_ctxt; + ctxt->rc_pages[sge_no] = page; + ctxt->rc_page_count++; + + pa = ib_dma_map_page(device, ctxt->rc_pages[sge_no], + 0, PAGE_SIZE, DMA_FROM_DEVICE); + if (ib_dma_mapping_error(device, pa)) + goto err_put_ctxt; + ctxt->rc_sges[sge_no].addr = pa; + ctxt->rc_sges[sge_no].length = PAGE_SIZE; + ctxt->rc_sges[sge_no].lkey = rdma->sc_pd->local_dma_lkey; + ctxt->rc_recv_wr.num_sge++; + + buflen += PAGE_SIZE; + } + ctxt->rc_recv_wr.next = NULL; + ctxt->rc_recv_wr.sg_list = &ctxt->rc_sges[0]; + ctxt->rc_recv_wr.wr_cqe = &ctxt->rc_cqe; + + svc_xprt_get(&rdma->sc_xprt); + ret = ib_post_recv(rdma->sc_qp, &ctxt->rc_recv_wr, &bad_recv_wr); + trace_svcrdma_post_recv(&ctxt->rc_recv_wr, ret); + if (ret) + goto err_post; + return 0; + +err_put_ctxt: + svc_rdma_recv_ctxt_unmap(rdma, ctxt); + svc_rdma_recv_ctxt_put(rdma, ctxt, 1); + return -ENOMEM; +err_post: + 
svc_rdma_recv_ctxt_unmap(rdma, ctxt); + svc_rdma_recv_ctxt_put(rdma, ctxt, 1); + svc_xprt_put(&rdma->sc_xprt); + return ret; +} + +/** + * svc_rdma_post_recvs - Post initial set of Recv WRs + * @rdma: fresh svcxprt_rdma + * + * Returns true if successful, otherwise false. + */ +bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma) +{ + unsigned int i; + int ret; + + for (i = 0; i < rdma->sc_max_requests; i++) { + ret = svc_rdma_post_recv(rdma); + if (ret) { + pr_err("svcrdma: failure posting recv buffers: %d\n", + ret); + return false; + } + } + return true; +} + +/** + * svc_rdma_wc_receive - Invoked by RDMA provider for each polled Receive WC + * @cq: Completion Queue context + * @wc: Work Completion object + * + * NB: The svc_xprt/svcxprt_rdma is pinned whenever it's possible that + * the Receive completion handler could be running. + */ +static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) +{ + struct svcxprt_rdma *rdma = cq->cq_context; + struct ib_cqe *cqe = wc->wr_cqe; + struct svc_rdma_recv_ctxt *ctxt; + + trace_svcrdma_wc_receive(wc); + + /* WARNING: Only wc->wr_cqe and wc->status are reliable */ + ctxt = container_of(cqe, struct svc_rdma_recv_ctxt, rc_cqe); + svc_rdma_recv_ctxt_unmap(rdma, ctxt); + + if (wc->status != IB_WC_SUCCESS) + goto flushed; + + if (svc_rdma_post_recv(rdma)) + goto post_err; + + /* All wc fields are now known to be valid */ + ctxt->rc_byte_len = wc->byte_len; + spin_lock(&rdma->sc_rq_dto_lock); + list_add_tail(&ctxt->rc_list, &rdma->sc_rq_dto_q); + spin_unlock(&rdma->sc_rq_dto_lock); + set_bit(XPT_DATA, &rdma->sc_xprt.xpt_flags); + if (!test_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags)) + svc_xprt_enqueue(&rdma->sc_xprt); + goto out; + +flushed: + if (wc->status != IB_WC_WR_FLUSH_ERR) + pr_err("svcrdma: Recv: %s (%u/0x%x)\n", + ib_wc_status_msg(wc->status), + wc->status, wc->vendor_err); +post_err: + svc_rdma_recv_ctxt_put(rdma, ctxt, 1); + set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); + svc_xprt_enqueue(&rdma->sc_xprt); +out: + svc_xprt_put(&rdma->sc_xprt); +} + +/** + * svc_rdma_flush_recv_queues - Drain pending Receive work + * @rdma: svcxprt_rdma being shut down + * + */ +void svc_rdma_flush_recv_queues(struct svcxprt_rdma *rdma) +{ + struct svc_rdma_recv_ctxt *ctxt; + + while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_read_complete_q))) { + list_del(&ctxt->rc_list); + svc_rdma_recv_ctxt_put(rdma, ctxt, 1); + } + while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_rq_dto_q))) { + list_del(&ctxt->rc_list); + svc_rdma_recv_ctxt_put(rdma, ctxt, 1); + } +} + /* * Replace the pages in the rq_argpages array with the pages from the SGE in * the RDMA_RECV completion. The SGL should contain full pages up until the * last one. */ static void svc_rdma_build_arg_xdr(struct svc_rqst *rqstp, - struct svc_rdma_op_ctxt *ctxt) + struct svc_rdma_recv_ctxt *ctxt) { struct page *page; int sge_no; @@ -123,30 +357,30 @@ static void svc_rdma_build_arg_xdr(struct svc_rqst *rqstp, /* The reply path assumes the Call's transport header resides * in rqstp->rq_pages[0]. 
*/ - page = ctxt->pages[0]; + page = ctxt->rc_pages[0]; put_page(rqstp->rq_pages[0]); rqstp->rq_pages[0] = page; /* Set up the XDR head */ rqstp->rq_arg.head[0].iov_base = page_address(page); rqstp->rq_arg.head[0].iov_len = - min_t(size_t, ctxt->byte_len, ctxt->sge[0].length); - rqstp->rq_arg.len = ctxt->byte_len; - rqstp->rq_arg.buflen = ctxt->byte_len; + min_t(size_t, ctxt->rc_byte_len, ctxt->rc_sges[0].length); + rqstp->rq_arg.len = ctxt->rc_byte_len; + rqstp->rq_arg.buflen = ctxt->rc_byte_len; /* Compute bytes past head in the SGL */ - len = ctxt->byte_len - rqstp->rq_arg.head[0].iov_len; + len = ctxt->rc_byte_len - rqstp->rq_arg.head[0].iov_len; /* If data remains, store it in the pagelist */ rqstp->rq_arg.page_len = len; rqstp->rq_arg.page_base = 0; sge_no = 1; - while (len && sge_no < ctxt->count) { - page = ctxt->pages[sge_no]; + while (len && sge_no < ctxt->rc_recv_wr.num_sge) { + page = ctxt->rc_pages[sge_no]; put_page(rqstp->rq_pages[sge_no]); rqstp->rq_pages[sge_no] = page; - len -= min_t(u32, len, ctxt->sge[sge_no].length); + len -= min_t(u32, len, ctxt->rc_sges[sge_no].length); sge_no++; } rqstp->rq_respages = &rqstp->rq_pages[sge_no]; @@ -154,11 +388,11 @@ static void svc_rdma_build_arg_xdr(struct svc_rqst *rqstp, /* If not all pages were used from the SGL, free the remaining ones */ len = sge_no; - while (sge_no < ctxt->count) { - page = ctxt->pages[sge_no++]; + while (sge_no < ctxt->rc_recv_wr.num_sge) { + page = ctxt->rc_pages[sge_no++]; put_page(page); } - ctxt->count = len; + ctxt->rc_page_count = len; /* Set up tail */ rqstp->rq_arg.tail[0].iov_base = NULL; @@ -364,29 +598,29 @@ out_inval: } static void rdma_read_complete(struct svc_rqst *rqstp, - struct svc_rdma_op_ctxt *head) + struct svc_rdma_recv_ctxt *head) { int page_no; /* Copy RPC pages */ - for (page_no = 0; page_no < head->count; page_no++) { + for (page_no = 0; page_no < head->rc_page_count; page_no++) { put_page(rqstp->rq_pages[page_no]); - rqstp->rq_pages[page_no] = head->pages[page_no]; + rqstp->rq_pages[page_no] = head->rc_pages[page_no]; } /* Point rq_arg.pages past header */ - rqstp->rq_arg.pages = &rqstp->rq_pages[head->hdr_count]; - rqstp->rq_arg.page_len = head->arg.page_len; + rqstp->rq_arg.pages = &rqstp->rq_pages[head->rc_hdr_count]; + rqstp->rq_arg.page_len = head->rc_arg.page_len; /* rq_respages starts after the last arg page */ rqstp->rq_respages = &rqstp->rq_pages[page_no]; rqstp->rq_next_page = rqstp->rq_respages + 1; /* Rebuild rq_arg head and tail. 
*/ - rqstp->rq_arg.head[0] = head->arg.head[0]; - rqstp->rq_arg.tail[0] = head->arg.tail[0]; - rqstp->rq_arg.len = head->arg.len; - rqstp->rq_arg.buflen = head->arg.buflen; + rqstp->rq_arg.head[0] = head->rc_arg.head[0]; + rqstp->rq_arg.tail[0] = head->rc_arg.tail[0]; + rqstp->rq_arg.len = head->rc_arg.len; + rqstp->rq_arg.buflen = head->rc_arg.buflen; } static void svc_rdma_send_error(struct svcxprt_rdma *xprt, @@ -506,28 +740,26 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) struct svc_xprt *xprt = rqstp->rq_xprt; struct svcxprt_rdma *rdma_xprt = container_of(xprt, struct svcxprt_rdma, sc_xprt); - struct svc_rdma_op_ctxt *ctxt; + struct svc_rdma_recv_ctxt *ctxt; __be32 *p; int ret; spin_lock(&rdma_xprt->sc_rq_dto_lock); - if (!list_empty(&rdma_xprt->sc_read_complete_q)) { - ctxt = list_first_entry(&rdma_xprt->sc_read_complete_q, - struct svc_rdma_op_ctxt, list); - list_del(&ctxt->list); + ctxt = svc_rdma_next_recv_ctxt(&rdma_xprt->sc_read_complete_q); + if (ctxt) { + list_del(&ctxt->rc_list); spin_unlock(&rdma_xprt->sc_rq_dto_lock); rdma_read_complete(rqstp, ctxt); goto complete; - } else if (!list_empty(&rdma_xprt->sc_rq_dto_q)) { - ctxt = list_first_entry(&rdma_xprt->sc_rq_dto_q, - struct svc_rdma_op_ctxt, list); - list_del(&ctxt->list); - } else { + } + ctxt = svc_rdma_next_recv_ctxt(&rdma_xprt->sc_rq_dto_q); + if (!ctxt) { /* No new incoming requests, terminate the loop */ clear_bit(XPT_DATA, &xprt->xpt_flags); spin_unlock(&rdma_xprt->sc_rq_dto_lock); return 0; } + list_del(&ctxt->rc_list); spin_unlock(&rdma_xprt->sc_rq_dto_lock); atomic_inc(&rdma_stat_recv); @@ -545,7 +777,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) if (svc_rdma_is_backchannel_reply(xprt, p)) { ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt, p, &rqstp->rq_arg); - svc_rdma_put_context(ctxt, 0); + svc_rdma_recv_ctxt_put(rdma_xprt, ctxt, 0); return ret; } @@ -554,7 +786,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) goto out_readchunk; complete: - svc_rdma_put_context(ctxt, 0); + svc_rdma_recv_ctxt_put(rdma_xprt, ctxt, 0); rqstp->rq_prot = IPPROTO_MAX; svc_xprt_copy_addrs(rqstp, xprt); return rqstp->rq_arg.len; @@ -567,16 +799,16 @@ out_readchunk: out_err: svc_rdma_send_error(rdma_xprt, p, ret); - svc_rdma_put_context(ctxt, 0); + svc_rdma_recv_ctxt_put(rdma_xprt, ctxt, 0); return 0; out_postfail: if (ret == -EINVAL) svc_rdma_send_error(rdma_xprt, p, ret); - svc_rdma_put_context(ctxt, 1); + svc_rdma_recv_ctxt_put(rdma_xprt, ctxt, 1); return ret; out_drop: - svc_rdma_put_context(ctxt, 1); + svc_rdma_recv_ctxt_put(rdma_xprt, ctxt, 1); return 0; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 887ceef125b2..c080ce20ff40 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright (c) 2016 Oracle. All rights reserved. + * Copyright (c) 2016-2018 Oracle. All rights reserved. * * Use the core R/W API to move RPC-over-RDMA Read and Write chunks. */ @@ -227,7 +227,7 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) /* State for pulling a Read chunk. 
*/ struct svc_rdma_read_info { - struct svc_rdma_op_ctxt *ri_readctxt; + struct svc_rdma_recv_ctxt *ri_readctxt; unsigned int ri_position; unsigned int ri_pageno; unsigned int ri_pageoff; @@ -282,10 +282,10 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc) pr_err("svcrdma: read ctx: %s (%u/0x%x)\n", ib_wc_status_msg(wc->status), wc->status, wc->vendor_err); - svc_rdma_put_context(info->ri_readctxt, 1); + svc_rdma_recv_ctxt_put(rdma, info->ri_readctxt, 1); } else { spin_lock(&rdma->sc_rq_dto_lock); - list_add_tail(&info->ri_readctxt->list, + list_add_tail(&info->ri_readctxt->rc_list, &rdma->sc_read_complete_q); spin_unlock(&rdma->sc_rq_dto_lock); @@ -607,7 +607,7 @@ static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info, struct svc_rqst *rqstp, u32 rkey, u32 len, u64 offset) { - struct svc_rdma_op_ctxt *head = info->ri_readctxt; + struct svc_rdma_recv_ctxt *head = info->ri_readctxt; struct svc_rdma_chunk_ctxt *cc = &info->ri_cc; struct svc_rdma_rw_ctxt *ctxt; unsigned int sge_no, seg_len; @@ -625,10 +625,10 @@ static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info, seg_len = min_t(unsigned int, len, PAGE_SIZE - info->ri_pageoff); - head->arg.pages[info->ri_pageno] = + head->rc_arg.pages[info->ri_pageno] = rqstp->rq_pages[info->ri_pageno]; if (!info->ri_pageoff) - head->count++; + head->rc_page_count++; sg_set_page(sg, rqstp->rq_pages[info->ri_pageno], seg_len, info->ri_pageoff); @@ -705,9 +705,9 @@ static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp, } /* Construct RDMA Reads to pull over a normal Read chunk. The chunk - * data lands in the page list of head->arg.pages. + * data lands in the page list of head->rc_arg.pages. * - * Currently NFSD does not look at the head->arg.tail[0] iovec. + * Currently NFSD does not look at the head->rc_arg.tail[0] iovec. * Therefore, XDR round-up of the Read chunk and trailing * inline content must both be added at the end of the pagelist. */ @@ -715,10 +715,10 @@ static int svc_rdma_build_normal_read_chunk(struct svc_rqst *rqstp, struct svc_rdma_read_info *info, __be32 *p) { - struct svc_rdma_op_ctxt *head = info->ri_readctxt; + struct svc_rdma_recv_ctxt *head = info->ri_readctxt; int ret; - info->ri_pageno = head->hdr_count; + info->ri_pageno = head->rc_hdr_count; info->ri_pageoff = 0; ret = svc_rdma_build_read_chunk(rqstp, info, p); @@ -732,11 +732,11 @@ static int svc_rdma_build_normal_read_chunk(struct svc_rqst *rqstp, * chunk is not included in either the pagelist or in * the tail. */ - head->arg.tail[0].iov_base = - head->arg.head[0].iov_base + info->ri_position; - head->arg.tail[0].iov_len = - head->arg.head[0].iov_len - info->ri_position; - head->arg.head[0].iov_len = info->ri_position; + head->rc_arg.tail[0].iov_base = + head->rc_arg.head[0].iov_base + info->ri_position; + head->rc_arg.tail[0].iov_len = + head->rc_arg.head[0].iov_len - info->ri_position; + head->rc_arg.head[0].iov_len = info->ri_position; /* Read chunk may need XDR roundup (see RFC 8166, s. 3.4.5.2). * @@ -749,9 +749,9 @@ static int svc_rdma_build_normal_read_chunk(struct svc_rqst *rqstp, */ info->ri_chunklen = XDR_QUADLEN(info->ri_chunklen) << 2; - head->arg.page_len = info->ri_chunklen; - head->arg.len += info->ri_chunklen; - head->arg.buflen += info->ri_chunklen; + head->rc_arg.page_len = info->ri_chunklen; + head->rc_arg.len += info->ri_chunklen; + head->rc_arg.buflen += info->ri_chunklen; out: return ret; @@ -760,7 +760,7 @@ out: /* Construct RDMA Reads to pull over a Position Zero Read chunk. 
* The start of the data lands in the first page just after * the Transport header, and the rest lands in the page list of - * head->arg.pages. + * head->rc_arg.pages. * * Assumptions: * - A PZRC has an XDR-aligned length (no implicit round-up). @@ -772,11 +772,11 @@ static int svc_rdma_build_pz_read_chunk(struct svc_rqst *rqstp, struct svc_rdma_read_info *info, __be32 *p) { - struct svc_rdma_op_ctxt *head = info->ri_readctxt; + struct svc_rdma_recv_ctxt *head = info->ri_readctxt; int ret; - info->ri_pageno = head->hdr_count - 1; - info->ri_pageoff = offset_in_page(head->byte_len); + info->ri_pageno = head->rc_hdr_count - 1; + info->ri_pageoff = offset_in_page(head->rc_byte_len); ret = svc_rdma_build_read_chunk(rqstp, info, p); if (ret < 0) @@ -784,22 +784,22 @@ static int svc_rdma_build_pz_read_chunk(struct svc_rqst *rqstp, trace_svcrdma_encode_pzr(info->ri_chunklen); - head->arg.len += info->ri_chunklen; - head->arg.buflen += info->ri_chunklen; + head->rc_arg.len += info->ri_chunklen; + head->rc_arg.buflen += info->ri_chunklen; - if (head->arg.buflen <= head->sge[0].length) { + if (head->rc_arg.buflen <= head->rc_sges[0].length) { /* Transport header and RPC message fit entirely * in page where head iovec resides. */ - head->arg.head[0].iov_len = info->ri_chunklen; + head->rc_arg.head[0].iov_len = info->ri_chunklen; } else { /* Transport header and part of RPC message reside * in the head iovec's page. */ - head->arg.head[0].iov_len = - head->sge[0].length - head->byte_len; - head->arg.page_len = - info->ri_chunklen - head->arg.head[0].iov_len; + head->rc_arg.head[0].iov_len = + head->rc_sges[0].length - head->rc_byte_len; + head->rc_arg.page_len = + info->ri_chunklen - head->rc_arg.head[0].iov_len; } out: @@ -824,24 +824,24 @@ out: * - All Read segments in @p have the same Position value. */ int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp, - struct svc_rdma_op_ctxt *head, __be32 *p) + struct svc_rdma_recv_ctxt *head, __be32 *p) { struct svc_rdma_read_info *info; struct page **page; int ret; /* The request (with page list) is constructed in - * head->arg. Pages involved with RDMA Read I/O are + * head->rc_arg. Pages involved with RDMA Read I/O are * transferred there. */ - head->hdr_count = head->count; - head->arg.head[0] = rqstp->rq_arg.head[0]; - head->arg.tail[0] = rqstp->rq_arg.tail[0]; - head->arg.pages = head->pages; - head->arg.page_base = 0; - head->arg.page_len = 0; - head->arg.len = rqstp->rq_arg.len; - head->arg.buflen = rqstp->rq_arg.buflen; + head->rc_hdr_count = head->rc_page_count; + head->rc_arg.head[0] = rqstp->rq_arg.head[0]; + head->rc_arg.tail[0] = rqstp->rq_arg.tail[0]; + head->rc_arg.pages = head->rc_pages; + head->rc_arg.page_base = 0; + head->rc_arg.page_len = 0; + head->rc_arg.len = rqstp->rq_arg.len; + head->rc_arg.buflen = rqstp->rq_arg.buflen; info = svc_rdma_read_info_alloc(rdma); if (!info) @@ -867,7 +867,7 @@ int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp, out: /* Read sink pages have been moved from rqstp->rq_pages to - * head->arg.pages. Force svc_recv to refill those slots + * head->rc_arg.pages. Force svc_recv to refill those slots * in rq_pages. 
*/ for (page = rqstp->rq_pages; page < rqstp->rq_respages; page++) diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index fed28de78d37..a397d9a3d80e 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* - * Copyright (c) 2016 Oracle. All rights reserved. + * Copyright (c) 2016-2018 Oracle. All rights reserved. * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved. * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. * diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 05edb18f8ca3..05544f2f50d4 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -63,7 +63,6 @@ #define RPCDBG_FACILITY RPCDBG_SVCXPRT -static int svc_rdma_post_recv(struct svcxprt_rdma *xprt); static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv, struct net *net); static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, @@ -175,11 +174,7 @@ static bool svc_rdma_prealloc_ctxts(struct svcxprt_rdma *xprt) { unsigned int i; - /* Each RPC/RDMA credit can consume one Receive and - * one Send WQE at the same time. - */ - i = xprt->sc_sq_depth + xprt->sc_rq_depth; - + i = xprt->sc_sq_depth; while (i--) { struct svc_rdma_op_ctxt *ctxt; @@ -297,54 +292,6 @@ static void qp_event_handler(struct ib_event *event, void *context) } } -/** - * svc_rdma_wc_receive - Invoked by RDMA provider for each polled Receive WC - * @cq: completion queue - * @wc: completed WR - * - */ -static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) -{ - struct svcxprt_rdma *xprt = cq->cq_context; - struct ib_cqe *cqe = wc->wr_cqe; - struct svc_rdma_op_ctxt *ctxt; - - trace_svcrdma_wc_receive(wc); - - /* WARNING: Only wc->wr_cqe and wc->status are reliable */ - ctxt = container_of(cqe, struct svc_rdma_op_ctxt, cqe); - svc_rdma_unmap_dma(ctxt); - - if (wc->status != IB_WC_SUCCESS) - goto flushed; - - /* All wc fields are now known to be valid */ - ctxt->byte_len = wc->byte_len; - spin_lock(&xprt->sc_rq_dto_lock); - list_add_tail(&ctxt->list, &xprt->sc_rq_dto_q); - spin_unlock(&xprt->sc_rq_dto_lock); - - svc_rdma_post_recv(xprt); - - set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); - if (test_bit(RDMAXPRT_CONN_PENDING, &xprt->sc_flags)) - goto out; - goto out_enqueue; - -flushed: - if (wc->status != IB_WC_WR_FLUSH_ERR) - pr_err("svcrdma: Recv: %s (%u/0x%x)\n", - ib_wc_status_msg(wc->status), - wc->status, wc->vendor_err); - set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); - svc_rdma_put_context(ctxt, 1); - -out_enqueue: - svc_xprt_enqueue(&xprt->sc_xprt); -out: - svc_xprt_put(&xprt->sc_xprt); -} - /** * svc_rdma_wc_send - Invoked by RDMA provider for each polled Send WC * @cq: completion queue @@ -392,12 +339,14 @@ static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv, INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); INIT_LIST_HEAD(&cma_xprt->sc_ctxts); + INIT_LIST_HEAD(&cma_xprt->sc_recv_ctxts); INIT_LIST_HEAD(&cma_xprt->sc_rw_ctxts); init_waitqueue_head(&cma_xprt->sc_send_wait); spin_lock_init(&cma_xprt->sc_lock); spin_lock_init(&cma_xprt->sc_rq_dto_lock); spin_lock_init(&cma_xprt->sc_ctxt_lock); + spin_lock_init(&cma_xprt->sc_recv_lock); spin_lock_init(&cma_xprt->sc_rw_ctxt_lock); /* @@ -411,63 +360,6 @@ static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv, return cma_xprt; } 
-static int -svc_rdma_post_recv(struct svcxprt_rdma *xprt) -{ - struct ib_recv_wr recv_wr, *bad_recv_wr; - struct svc_rdma_op_ctxt *ctxt; - struct page *page; - dma_addr_t pa; - int sge_no; - int buflen; - int ret; - - ctxt = svc_rdma_get_context(xprt); - buflen = 0; - ctxt->direction = DMA_FROM_DEVICE; - ctxt->cqe.done = svc_rdma_wc_receive; - for (sge_no = 0; buflen < xprt->sc_max_req_size; sge_no++) { - if (sge_no >= xprt->sc_max_sge) { - pr_err("svcrdma: Too many sges (%d)\n", sge_no); - goto err_put_ctxt; - } - page = alloc_page(GFP_KERNEL); - if (!page) - goto err_put_ctxt; - ctxt->pages[sge_no] = page; - pa = ib_dma_map_page(xprt->sc_cm_id->device, - page, 0, PAGE_SIZE, - DMA_FROM_DEVICE); - if (ib_dma_mapping_error(xprt->sc_cm_id->device, pa)) - goto err_put_ctxt; - svc_rdma_count_mappings(xprt, ctxt); - ctxt->sge[sge_no].addr = pa; - ctxt->sge[sge_no].length = PAGE_SIZE; - ctxt->sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey; - ctxt->count = sge_no + 1; - buflen += PAGE_SIZE; - } - recv_wr.next = NULL; - recv_wr.sg_list = &ctxt->sge[0]; - recv_wr.num_sge = ctxt->count; - recv_wr.wr_cqe = &ctxt->cqe; - - svc_xprt_get(&xprt->sc_xprt); - ret = ib_post_recv(xprt->sc_qp, &recv_wr, &bad_recv_wr); - trace_svcrdma_post_recv(&recv_wr, ret); - if (ret) { - svc_rdma_unmap_dma(ctxt); - svc_rdma_put_context(ctxt, 1); - svc_xprt_put(&xprt->sc_xprt); - } - return ret; - - err_put_ctxt: - svc_rdma_unmap_dma(ctxt); - svc_rdma_put_context(ctxt, 1); - return -ENOMEM; -} - static void svc_rdma_parse_connect_private(struct svcxprt_rdma *newxprt, struct rdma_conn_param *param) @@ -698,7 +590,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) struct ib_qp_init_attr qp_attr; struct ib_device *dev; struct sockaddr *sap; - unsigned int i, ctxts; + unsigned int ctxts; int ret = 0; listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt); @@ -803,14 +695,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) !rdma_ib_or_roce(dev, newxprt->sc_port_num)) goto errout; - /* Post receive buffers */ - for (i = 0; i < newxprt->sc_max_requests; i++) { - ret = svc_rdma_post_recv(newxprt); - if (ret) { - dprintk("svcrdma: failure posting receive buffers\n"); - goto errout; - } - } + if (!svc_rdma_post_recvs(newxprt)) + goto errout; /* Swap out the handler */ newxprt->sc_cm_id->event_handler = rdma_cma_handler; @@ -907,20 +793,7 @@ static void __svc_rdma_free(struct work_struct *work) pr_err("svcrdma: sc_xprt still in use? 
(%d)\n", kref_read(&xprt->xpt_ref)); - while (!list_empty(&rdma->sc_read_complete_q)) { - struct svc_rdma_op_ctxt *ctxt; - ctxt = list_first_entry(&rdma->sc_read_complete_q, - struct svc_rdma_op_ctxt, list); - list_del(&ctxt->list); - svc_rdma_put_context(ctxt, 1); - } - while (!list_empty(&rdma->sc_rq_dto_q)) { - struct svc_rdma_op_ctxt *ctxt; - ctxt = list_first_entry(&rdma->sc_rq_dto_q, - struct svc_rdma_op_ctxt, list); - list_del(&ctxt->list); - svc_rdma_put_context(ctxt, 1); - } + svc_rdma_flush_recv_queues(rdma); /* Warn if we leaked a resource or under-referenced */ if (rdma->sc_ctxt_used != 0) @@ -935,6 +808,7 @@ static void __svc_rdma_free(struct work_struct *work) svc_rdma_destroy_rw_ctxts(rdma); svc_rdma_destroy_ctxts(rdma); + svc_rdma_recv_ctxts_destroy(rdma); /* Destroy the QP if present (not a listener) */ if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) -- cgit v1.2.3 From 2c577bfea85e421bfa91df16ccf5156361aa8d4b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 7 May 2018 15:27:27 -0400 Subject: svcrdma: Remove sc_rq_depth Clean up: No need to retain rq_depth in struct svcrdma_xprt, it is used only in svc_rdma_accept(). Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 1 - net/sunrpc/xprtrdma/svc_rdma_transport.c | 17 ++++++++--------- 2 files changed, 8 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 37f759d65348..3cb66319a814 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -101,7 +101,6 @@ struct svcxprt_rdma { atomic_t sc_sq_avail; /* SQEs ready to be consumed */ unsigned int sc_sq_depth; /* Depth of SQ */ - unsigned int sc_rq_depth; /* Depth of RQ */ __be32 sc_fc_credits; /* Forward credits */ u32 sc_max_requests; /* Max requests */ u32 sc_max_bc_requests;/* Backward credits */ diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 05544f2f50d4..ef32c46a234c 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -588,9 +588,9 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) struct rdma_conn_param conn_param; struct rpcrdma_connect_private pmsg; struct ib_qp_init_attr qp_attr; + unsigned int ctxts, rq_depth; struct ib_device *dev; struct sockaddr *sap; - unsigned int ctxts; int ret = 0; listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt); @@ -621,19 +621,18 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) newxprt->sc_max_req_size = svcrdma_max_req_size; newxprt->sc_max_requests = svcrdma_max_requests; newxprt->sc_max_bc_requests = svcrdma_max_bc_requests; - newxprt->sc_rq_depth = newxprt->sc_max_requests + - newxprt->sc_max_bc_requests; - if (newxprt->sc_rq_depth > dev->attrs.max_qp_wr) { + rq_depth = newxprt->sc_max_requests + newxprt->sc_max_bc_requests; + if (rq_depth > dev->attrs.max_qp_wr) { pr_warn("svcrdma: reducing receive depth to %d\n", dev->attrs.max_qp_wr); - newxprt->sc_rq_depth = dev->attrs.max_qp_wr; - newxprt->sc_max_requests = newxprt->sc_rq_depth - 2; + rq_depth = dev->attrs.max_qp_wr; + newxprt->sc_max_requests = rq_depth - 2; newxprt->sc_max_bc_requests = 2; } newxprt->sc_fc_credits = cpu_to_be32(newxprt->sc_max_requests); ctxts = rdma_rw_mr_factor(dev, newxprt->sc_port_num, RPCSVC_MAXPAGES); ctxts *= newxprt->sc_max_requests; - newxprt->sc_sq_depth = newxprt->sc_rq_depth + ctxts; + newxprt->sc_sq_depth = rq_depth + ctxts; if 
(newxprt->sc_sq_depth > dev->attrs.max_qp_wr) { pr_warn("svcrdma: reducing send depth to %d\n", dev->attrs.max_qp_wr); @@ -655,7 +654,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) dprintk("svcrdma: error creating SQ CQ for connect request\n"); goto errout; } - newxprt->sc_rq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_rq_depth, + newxprt->sc_rq_cq = ib_alloc_cq(dev, newxprt, rq_depth, 0, IB_POLL_WORKQUEUE); if (IS_ERR(newxprt->sc_rq_cq)) { dprintk("svcrdma: error creating RQ CQ for connect request\n"); @@ -668,7 +667,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) qp_attr.port_num = newxprt->sc_port_num; qp_attr.cap.max_rdma_ctxs = ctxts; qp_attr.cap.max_send_wr = newxprt->sc_sq_depth - ctxts; - qp_attr.cap.max_recv_wr = newxprt->sc_rq_depth; + qp_attr.cap.max_recv_wr = rq_depth; qp_attr.cap.max_send_sge = newxprt->sc_max_sge; qp_attr.cap.max_recv_sge = newxprt->sc_max_sge; qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; -- cgit v1.2.3 From 1e5f4160745690a0476929d128a336cae95c1df9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 7 May 2018 15:27:32 -0400 Subject: svcrdma: Simplify svc_rdma_recv_ctxt_put Currently svc_rdma_recv_ctxt_put's callers have to know whether they want to free the ctxt's pages or not. This means the human developers have to know when and why to set that free_pages argument. Instead, the ctxt should carry that information with it so that svc_rdma_recv_ctxt_put does the right thing no matter who is calling. We want to keep track of the number of pages in the Receive buffer separately from the number of pages pulled over by RDMA Read. This is so that the correct number of pages can be freed properly and that number is well-documented. So now, rc_hdr_count is the number of pages consumed by head[0] (ie., the page index where the Read chunk should start); and rc_page_count is always the number of pages that need to be released when the ctxt is put. The @free_pages argument is no longer needed. Signed-off-by: Chuck Lever Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 3 +-- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 41 ++++++++++++++++++--------------- net/sunrpc/xprtrdma/svc_rdma_rw.c | 4 ++-- 3 files changed, 25 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 3cb66319a814..f0bd0b6d8931 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -173,8 +173,7 @@ extern int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, extern void svc_rdma_recv_ctxts_destroy(struct svcxprt_rdma *rdma); extern bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma); extern void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma, - struct svc_rdma_recv_ctxt *ctxt, - int free_pages); + struct svc_rdma_recv_ctxt *ctxt); extern void svc_rdma_flush_recv_queues(struct svcxprt_rdma *rdma); extern int svc_rdma_recvfrom(struct svc_rqst *); diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index b7d9c55ee896..ecfe7c90a268 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -175,18 +175,15 @@ static void svc_rdma_recv_ctxt_unmap(struct svcxprt_rdma *rdma, * svc_rdma_recv_ctxt_put - Return recv_ctxt to free list * @rdma: controlling svcxprt_rdma * @ctxt: object to return to the free list - * @free_pages: Non-zero if rc_pages should be freed * */ void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma, - struct svc_rdma_recv_ctxt *ctxt, - int free_pages) + struct svc_rdma_recv_ctxt *ctxt) { unsigned int i; - if (free_pages) - for (i = 0; i < ctxt->rc_page_count; i++) - put_page(ctxt->rc_pages[i]); + for (i = 0; i < ctxt->rc_page_count; i++) + put_page(ctxt->rc_pages[i]); spin_lock(&rdma->sc_recv_lock); list_add(&ctxt->rc_list, &rdma->sc_recv_ctxts); spin_unlock(&rdma->sc_recv_lock); @@ -243,11 +240,11 @@ static int svc_rdma_post_recv(struct svcxprt_rdma *rdma) err_put_ctxt: svc_rdma_recv_ctxt_unmap(rdma, ctxt); - svc_rdma_recv_ctxt_put(rdma, ctxt, 1); + svc_rdma_recv_ctxt_put(rdma, ctxt); return -ENOMEM; err_post: svc_rdma_recv_ctxt_unmap(rdma, ctxt); - svc_rdma_recv_ctxt_put(rdma, ctxt, 1); + svc_rdma_recv_ctxt_put(rdma, ctxt); svc_xprt_put(&rdma->sc_xprt); return ret; } @@ -316,7 +313,7 @@ flushed: ib_wc_status_msg(wc->status), wc->status, wc->vendor_err); post_err: - svc_rdma_recv_ctxt_put(rdma, ctxt, 1); + svc_rdma_recv_ctxt_put(rdma, ctxt); set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); svc_xprt_enqueue(&rdma->sc_xprt); out: @@ -334,11 +331,11 @@ void svc_rdma_flush_recv_queues(struct svcxprt_rdma *rdma) while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_read_complete_q))) { list_del(&ctxt->rc_list); - svc_rdma_recv_ctxt_put(rdma, ctxt, 1); + svc_rdma_recv_ctxt_put(rdma, ctxt); } while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_rq_dto_q))) { list_del(&ctxt->rc_list); - svc_rdma_recv_ctxt_put(rdma, ctxt, 1); + svc_rdma_recv_ctxt_put(rdma, ctxt); } } @@ -383,16 +380,19 @@ static void svc_rdma_build_arg_xdr(struct svc_rqst *rqstp, len -= min_t(u32, len, ctxt->rc_sges[sge_no].length); sge_no++; } + ctxt->rc_hdr_count = sge_no; rqstp->rq_respages = &rqstp->rq_pages[sge_no]; rqstp->rq_next_page = rqstp->rq_respages + 1; /* If not all pages were used from the SGL, free the remaining ones */ - len = sge_no; while (sge_no < ctxt->rc_recv_wr.num_sge) { page = ctxt->rc_pages[sge_no++]; put_page(page); } - ctxt->rc_page_count = len; + + /* @ctxt's pages have all been released or moved to @rqstp->rq_pages. 
+ */ + ctxt->rc_page_count = 0; /* Set up tail */ rqstp->rq_arg.tail[0].iov_base = NULL; @@ -602,11 +602,14 @@ static void rdma_read_complete(struct svc_rqst *rqstp, { int page_no; - /* Copy RPC pages */ + /* Move Read chunk pages to rqstp so that they will be released + * when svc_process is done with them. + */ for (page_no = 0; page_no < head->rc_page_count; page_no++) { put_page(rqstp->rq_pages[page_no]); rqstp->rq_pages[page_no] = head->rc_pages[page_no]; } + head->rc_page_count = 0; /* Point rq_arg.pages past header */ rqstp->rq_arg.pages = &rqstp->rq_pages[head->rc_hdr_count]; @@ -777,7 +780,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) if (svc_rdma_is_backchannel_reply(xprt, p)) { ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt, p, &rqstp->rq_arg); - svc_rdma_recv_ctxt_put(rdma_xprt, ctxt, 0); + svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); return ret; } @@ -786,7 +789,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) goto out_readchunk; complete: - svc_rdma_recv_ctxt_put(rdma_xprt, ctxt, 0); + svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); rqstp->rq_prot = IPPROTO_MAX; svc_xprt_copy_addrs(rqstp, xprt); return rqstp->rq_arg.len; @@ -799,16 +802,16 @@ out_readchunk: out_err: svc_rdma_send_error(rdma_xprt, p, ret); - svc_rdma_recv_ctxt_put(rdma_xprt, ctxt, 0); + svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); return 0; out_postfail: if (ret == -EINVAL) svc_rdma_send_error(rdma_xprt, p, ret); - svc_rdma_recv_ctxt_put(rdma_xprt, ctxt, 1); + svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); return ret; out_drop: - svc_rdma_recv_ctxt_put(rdma_xprt, ctxt, 1); + svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); return 0; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index c080ce20ff40..8242aa318ac1 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -282,7 +282,7 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc) pr_err("svcrdma: read ctx: %s (%u/0x%x)\n", ib_wc_status_msg(wc->status), wc->status, wc->vendor_err); - svc_rdma_recv_ctxt_put(rdma, info->ri_readctxt, 1); + svc_rdma_recv_ctxt_put(rdma, info->ri_readctxt); } else { spin_lock(&rdma->sc_rq_dto_lock); list_add_tail(&info->ri_readctxt->rc_list, @@ -834,7 +834,7 @@ int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp, * head->rc_arg. Pages involved with RDMA Read I/O are * transferred there. */ - head->rc_hdr_count = head->rc_page_count; + head->rc_page_count = head->rc_hdr_count; head->rc_arg.head[0] = rqstp->rq_arg.head[0]; head->rc_arg.tail[0] = rqstp->rq_arg.tail[0]; head->rc_arg.pages = head->rc_pages; -- cgit v1.2.3 From 3316f0631139c87631f2652c118da1a0354bd40d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 7 May 2018 15:27:43 -0400 Subject: svcrdma: Persistently allocate and DMA-map Receive buffers The current Receive path uses an array of pages which are allocated and DMA mapped when each Receive WR is posted, and then handed off to the upper layer in rqstp::rq_arg. The page flip releases unused pages in the rq_pages pagelist. This mechanism introduces a significant amount of overhead. So instead, kmalloc the Receive buffer, and leave it DMA-mapped while the transport remains connected. This confers a number of benefits: * Each Receive WR requires only one receive SGE, no matter how large the inline threshold is. This helps the server-side NFS/RDMA transport operate on less capable RDMA devices. * The Receive buffer is left allocated and mapped all the time. 
This relieves svc_rdma_post_recv from the overhead of allocating and DMA-mapping a fresh buffer. * svc_rdma_wc_receive no longer has to DMA unmap the Receive buffer. It has to DMA sync only the number of bytes that were received. * svc_rdma_build_arg_xdr no longer has to free a page in rq_pages for each page in the Receive buffer, making it a constant-time function. * The Receive buffer is now plugged directly into the rq_arg's head[0].iov_vec, and can be larger than a page without spilling over into rq_arg's page list. This enables simplification of the RDMA Read path in subsequent patches. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 4 +- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 168 +++++++++++-------------------- net/sunrpc/xprtrdma/svc_rdma_rw.c | 32 ++---- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 5 +- net/sunrpc/xprtrdma/svc_rdma_transport.c | 2 +- 5 files changed, 75 insertions(+), 136 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index f0bd0b6d8931..01baabfb863b 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -148,12 +148,12 @@ struct svc_rdma_recv_ctxt { struct list_head rc_list; struct ib_recv_wr rc_recv_wr; struct ib_cqe rc_cqe; + struct ib_sge rc_recv_sge; + void *rc_recv_buf; struct xdr_buf rc_arg; u32 rc_byte_len; unsigned int rc_page_count; unsigned int rc_hdr_count; - struct ib_sge rc_sges[1 + - RPCRDMA_MAX_INLINE_THRESH / PAGE_SIZE]; struct page *rc_pages[RPCSVC_MAXPAGES]; }; diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index d9fef5211116..d4ccd1c0142c 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -117,6 +117,43 @@ svc_rdma_next_recv_ctxt(struct list_head *list) rc_list); } +static struct svc_rdma_recv_ctxt * +svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma) +{ + struct svc_rdma_recv_ctxt *ctxt; + dma_addr_t addr; + void *buffer; + + ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL); + if (!ctxt) + goto fail0; + buffer = kmalloc(rdma->sc_max_req_size, GFP_KERNEL); + if (!buffer) + goto fail1; + addr = ib_dma_map_single(rdma->sc_pd->device, buffer, + rdma->sc_max_req_size, DMA_FROM_DEVICE); + if (ib_dma_mapping_error(rdma->sc_pd->device, addr)) + goto fail2; + + ctxt->rc_recv_wr.next = NULL; + ctxt->rc_recv_wr.wr_cqe = &ctxt->rc_cqe; + ctxt->rc_recv_wr.sg_list = &ctxt->rc_recv_sge; + ctxt->rc_recv_wr.num_sge = 1; + ctxt->rc_cqe.done = svc_rdma_wc_receive; + ctxt->rc_recv_sge.addr = addr; + ctxt->rc_recv_sge.length = rdma->sc_max_req_size; + ctxt->rc_recv_sge.lkey = rdma->sc_pd->local_dma_lkey; + ctxt->rc_recv_buf = buffer; + return ctxt; + +fail2: + kfree(buffer); +fail1: + kfree(ctxt); +fail0: + return NULL; +} + /** * svc_rdma_recv_ctxts_destroy - Release all recv_ctxt's for an xprt * @rdma: svcxprt_rdma being torn down @@ -128,6 +165,11 @@ void svc_rdma_recv_ctxts_destroy(struct svcxprt_rdma *rdma) while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_recv_ctxts))) { list_del(&ctxt->rc_list); + ib_dma_unmap_single(rdma->sc_pd->device, + ctxt->rc_recv_sge.addr, + ctxt->rc_recv_sge.length, + DMA_FROM_DEVICE); + kfree(ctxt->rc_recv_buf); kfree(ctxt); } } @@ -145,32 +187,18 @@ svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma) spin_unlock(&rdma->sc_recv_lock); out: - ctxt->rc_recv_wr.num_sge = 0; ctxt->rc_page_count = 0; return ctxt; out_empty: spin_unlock(&rdma->sc_recv_lock); - ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL); 
+ ctxt = svc_rdma_recv_ctxt_alloc(rdma); if (!ctxt) return NULL; goto out; } -static void svc_rdma_recv_ctxt_unmap(struct svcxprt_rdma *rdma, - struct svc_rdma_recv_ctxt *ctxt) -{ - struct ib_device *device = rdma->sc_cm_id->device; - int i; - - for (i = 0; i < ctxt->rc_recv_wr.num_sge; i++) - ib_dma_unmap_page(device, - ctxt->rc_sges[i].addr, - ctxt->rc_sges[i].length, - DMA_FROM_DEVICE); -} - /** * svc_rdma_recv_ctxt_put - Return recv_ctxt to free list * @rdma: controlling svcxprt_rdma @@ -191,46 +219,14 @@ void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma, static int svc_rdma_post_recv(struct svcxprt_rdma *rdma) { - struct ib_device *device = rdma->sc_cm_id->device; struct svc_rdma_recv_ctxt *ctxt; struct ib_recv_wr *bad_recv_wr; - int sge_no, buflen, ret; - struct page *page; - dma_addr_t pa; + int ret; ctxt = svc_rdma_recv_ctxt_get(rdma); if (!ctxt) return -ENOMEM; - buflen = 0; - ctxt->rc_cqe.done = svc_rdma_wc_receive; - for (sge_no = 0; buflen < rdma->sc_max_req_size; sge_no++) { - if (sge_no >= rdma->sc_max_sge) { - pr_err("svcrdma: Too many sges (%d)\n", sge_no); - goto err_put_ctxt; - } - - page = alloc_page(GFP_KERNEL); - if (!page) - goto err_put_ctxt; - ctxt->rc_pages[sge_no] = page; - ctxt->rc_page_count++; - - pa = ib_dma_map_page(device, ctxt->rc_pages[sge_no], - 0, PAGE_SIZE, DMA_FROM_DEVICE); - if (ib_dma_mapping_error(device, pa)) - goto err_put_ctxt; - ctxt->rc_sges[sge_no].addr = pa; - ctxt->rc_sges[sge_no].length = PAGE_SIZE; - ctxt->rc_sges[sge_no].lkey = rdma->sc_pd->local_dma_lkey; - ctxt->rc_recv_wr.num_sge++; - - buflen += PAGE_SIZE; - } - ctxt->rc_recv_wr.next = NULL; - ctxt->rc_recv_wr.sg_list = &ctxt->rc_sges[0]; - ctxt->rc_recv_wr.wr_cqe = &ctxt->rc_cqe; - svc_xprt_get(&rdma->sc_xprt); ret = ib_post_recv(rdma->sc_qp, &ctxt->rc_recv_wr, &bad_recv_wr); trace_svcrdma_post_recv(&ctxt->rc_recv_wr, ret); @@ -238,12 +234,7 @@ static int svc_rdma_post_recv(struct svcxprt_rdma *rdma) goto err_post; return 0; -err_put_ctxt: - svc_rdma_recv_ctxt_unmap(rdma, ctxt); - svc_rdma_recv_ctxt_put(rdma, ctxt); - return -ENOMEM; err_post: - svc_rdma_recv_ctxt_unmap(rdma, ctxt); svc_rdma_recv_ctxt_put(rdma, ctxt); svc_xprt_put(&rdma->sc_xprt); return ret; @@ -289,7 +280,6 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) /* WARNING: Only wc->wr_cqe and wc->status are reliable */ ctxt = container_of(cqe, struct svc_rdma_recv_ctxt, rc_cqe); - svc_rdma_recv_ctxt_unmap(rdma, ctxt); if (wc->status != IB_WC_SUCCESS) goto flushed; @@ -299,6 +289,10 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) /* All wc fields are now known to be valid */ ctxt->rc_byte_len = wc->byte_len; + ib_dma_sync_single_for_cpu(rdma->sc_pd->device, + ctxt->rc_recv_sge.addr, + wc->byte_len, DMA_FROM_DEVICE); + spin_lock(&rdma->sc_rq_dto_lock); list_add_tail(&ctxt->rc_list, &rdma->sc_rq_dto_q); spin_unlock(&rdma->sc_rq_dto_lock); @@ -339,64 +333,22 @@ void svc_rdma_flush_recv_queues(struct svcxprt_rdma *rdma) } } -/* - * Replace the pages in the rq_argpages array with the pages from the SGE in - * the RDMA_RECV completion. The SGL should contain full pages up until the - * last one. - */ static void svc_rdma_build_arg_xdr(struct svc_rqst *rqstp, struct svc_rdma_recv_ctxt *ctxt) { - struct page *page; - int sge_no; - u32 len; - - /* The reply path assumes the Call's transport header resides - * in rqstp->rq_pages[0]. 
- */ - page = ctxt->rc_pages[0]; - put_page(rqstp->rq_pages[0]); - rqstp->rq_pages[0] = page; - - /* Set up the XDR head */ - rqstp->rq_arg.head[0].iov_base = page_address(page); - rqstp->rq_arg.head[0].iov_len = - min_t(size_t, ctxt->rc_byte_len, ctxt->rc_sges[0].length); - rqstp->rq_arg.len = ctxt->rc_byte_len; - rqstp->rq_arg.buflen = ctxt->rc_byte_len; - - /* Compute bytes past head in the SGL */ - len = ctxt->rc_byte_len - rqstp->rq_arg.head[0].iov_len; - - /* If data remains, store it in the pagelist */ - rqstp->rq_arg.page_len = len; - rqstp->rq_arg.page_base = 0; - - sge_no = 1; - while (len && sge_no < ctxt->rc_recv_wr.num_sge) { - page = ctxt->rc_pages[sge_no]; - put_page(rqstp->rq_pages[sge_no]); - rqstp->rq_pages[sge_no] = page; - len -= min_t(u32, len, ctxt->rc_sges[sge_no].length); - sge_no++; - } - ctxt->rc_hdr_count = sge_no; - rqstp->rq_respages = &rqstp->rq_pages[sge_no]; + struct xdr_buf *arg = &rqstp->rq_arg; + + arg->head[0].iov_base = ctxt->rc_recv_buf; + arg->head[0].iov_len = ctxt->rc_byte_len; + arg->tail[0].iov_base = NULL; + arg->tail[0].iov_len = 0; + arg->page_len = 0; + arg->page_base = 0; + arg->buflen = ctxt->rc_byte_len; + arg->len = ctxt->rc_byte_len; + + rqstp->rq_respages = &rqstp->rq_pages[0]; rqstp->rq_next_page = rqstp->rq_respages + 1; - - /* If not all pages were used from the SGL, free the remaining ones */ - while (sge_no < ctxt->rc_recv_wr.num_sge) { - page = ctxt->rc_pages[sge_no++]; - put_page(page); - } - - /* @ctxt's pages have all been released or moved to @rqstp->rq_pages. - */ - ctxt->rc_page_count = 0; - - /* Set up tail */ - rqstp->rq_arg.tail[0].iov_base = NULL; - rqstp->rq_arg.tail[0].iov_len = 0; } /* This accommodates the largest possible Write chunk, diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 8242aa318ac1..ce3ea8419704 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -718,15 +718,14 @@ static int svc_rdma_build_normal_read_chunk(struct svc_rqst *rqstp, struct svc_rdma_recv_ctxt *head = info->ri_readctxt; int ret; - info->ri_pageno = head->rc_hdr_count; - info->ri_pageoff = 0; - ret = svc_rdma_build_read_chunk(rqstp, info, p); if (ret < 0) goto out; trace_svcrdma_encode_read(info->ri_chunklen, info->ri_position); + head->rc_hdr_count = 0; + /* Split the Receive buffer between the head and tail * buffers at Read chunk's position. XDR roundup of the * chunk is not included in either the pagelist or in @@ -775,9 +774,6 @@ static int svc_rdma_build_pz_read_chunk(struct svc_rqst *rqstp, struct svc_rdma_recv_ctxt *head = info->ri_readctxt; int ret; - info->ri_pageno = head->rc_hdr_count - 1; - info->ri_pageoff = offset_in_page(head->rc_byte_len); - ret = svc_rdma_build_read_chunk(rqstp, info, p); if (ret < 0) goto out; @@ -787,20 +783,13 @@ static int svc_rdma_build_pz_read_chunk(struct svc_rqst *rqstp, head->rc_arg.len += info->ri_chunklen; head->rc_arg.buflen += info->ri_chunklen; - if (head->rc_arg.buflen <= head->rc_sges[0].length) { - /* Transport header and RPC message fit entirely - * in page where head iovec resides. - */ - head->rc_arg.head[0].iov_len = info->ri_chunklen; - } else { - /* Transport header and part of RPC message reside - * in the head iovec's page. 
- */ - head->rc_arg.head[0].iov_len = - head->rc_sges[0].length - head->rc_byte_len; - head->rc_arg.page_len = - info->ri_chunklen - head->rc_arg.head[0].iov_len; - } + head->rc_hdr_count = 1; + head->rc_arg.head[0].iov_base = page_address(head->rc_pages[0]); + head->rc_arg.head[0].iov_len = min_t(size_t, PAGE_SIZE, + info->ri_chunklen); + + head->rc_arg.page_len = info->ri_chunklen - + head->rc_arg.head[0].iov_len; out: return ret; @@ -834,7 +823,6 @@ int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp, * head->rc_arg. Pages involved with RDMA Read I/O are * transferred there. */ - head->rc_page_count = head->rc_hdr_count; head->rc_arg.head[0] = rqstp->rq_arg.head[0]; head->rc_arg.tail[0] = rqstp->rq_arg.tail[0]; head->rc_arg.pages = head->rc_pages; @@ -847,6 +835,8 @@ int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp, if (!info) return -ENOMEM; info->ri_readctxt = head; + info->ri_pageno = 0; + info->ri_pageoff = 0; info->ri_position = be32_to_cpup(p + 1); if (info->ri_position) diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index cbbde70eeec5..b27b597d94da 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -629,10 +629,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) struct page *res_page; int ret; - /* Find the call's chunk lists to decide how to send the reply. - * Receive places the Call's xprt header at the start of page 0. - */ - rdma_argp = page_address(rqstp->rq_pages[0]); + rdma_argp = rctxt->rc_recv_buf; svc_rdma_get_write_arrays(rdma_argp, &wr_lst, &rp_ch); /* Create the RDMA response header. xprt->xpt_mutex, diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index ef32c46a234c..baeecbb2f763 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -669,7 +669,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) qp_attr.cap.max_send_wr = newxprt->sc_sq_depth - ctxts; qp_attr.cap.max_recv_wr = rq_depth; qp_attr.cap.max_send_sge = newxprt->sc_max_sge; - qp_attr.cap.max_recv_sge = newxprt->sc_max_sge; + qp_attr.cap.max_recv_sge = 1; qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; qp_attr.qp_type = IB_QPT_RC; qp_attr.send_cq = newxprt->sc_sq_cq; -- cgit v1.2.3 From eb5d7a622e0bbe3fd316b2325d3840a0e030a3c4 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 7 May 2018 15:27:48 -0400 Subject: svcrdma: Allocate recv_ctxt's on CPU handling Receives There is a significant latency penalty when processing an ingress Receive if the Receive buffer resides in memory that is not on the same NUMA node as the CPU handling completions for a CQ. The system administrator and the device driver determine which CPU handles completions. This CPU does not change during the life of the CQ. Further, the Upper Layer does not have any visibility of which CPU it is. Allocating Receive buffers in the Receive completion handler guarantees that Receive buffers are allocated on the preferred NUMA node for that CQ. Signed-off-by: Chuck Lever Signed-off-by: J.
Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 1 + net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 52 +++++++++++++++++++++++---------- 2 files changed, 37 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 01baabfb863b..27cf59c7085f 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -151,6 +151,7 @@ struct svc_rdma_recv_ctxt { struct ib_sge rc_recv_sge; void *rc_recv_buf; struct xdr_buf rc_arg; + bool rc_temp; u32 rc_byte_len; unsigned int rc_page_count; unsigned int rc_hdr_count; diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index d4ccd1c0142c..0445e75d76a2 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -144,6 +144,7 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma) ctxt->rc_recv_sge.length = rdma->sc_max_req_size; ctxt->rc_recv_sge.lkey = rdma->sc_pd->local_dma_lkey; ctxt->rc_recv_buf = buffer; + ctxt->rc_temp = false; return ctxt; fail2: @@ -154,6 +155,15 @@ fail0: return NULL; } +static void svc_rdma_recv_ctxt_destroy(struct svcxprt_rdma *rdma, + struct svc_rdma_recv_ctxt *ctxt) +{ + ib_dma_unmap_single(rdma->sc_pd->device, ctxt->rc_recv_sge.addr, + ctxt->rc_recv_sge.length, DMA_FROM_DEVICE); + kfree(ctxt->rc_recv_buf); + kfree(ctxt); +} + /** * svc_rdma_recv_ctxts_destroy - Release all recv_ctxt's for an xprt * @rdma: svcxprt_rdma being torn down @@ -165,12 +175,7 @@ void svc_rdma_recv_ctxts_destroy(struct svcxprt_rdma *rdma) while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_recv_ctxts))) { list_del(&ctxt->rc_list); - ib_dma_unmap_single(rdma->sc_pd->device, - ctxt->rc_recv_sge.addr, - ctxt->rc_recv_sge.length, - DMA_FROM_DEVICE); - kfree(ctxt->rc_recv_buf); - kfree(ctxt); + svc_rdma_recv_ctxt_destroy(rdma, ctxt); } } @@ -212,21 +217,21 @@ void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma, for (i = 0; i < ctxt->rc_page_count; i++) put_page(ctxt->rc_pages[i]); - spin_lock(&rdma->sc_recv_lock); - list_add(&ctxt->rc_list, &rdma->sc_recv_ctxts); - spin_unlock(&rdma->sc_recv_lock); + + if (!ctxt->rc_temp) { + spin_lock(&rdma->sc_recv_lock); + list_add(&ctxt->rc_list, &rdma->sc_recv_ctxts); + spin_unlock(&rdma->sc_recv_lock); + } else + svc_rdma_recv_ctxt_destroy(rdma, ctxt); } -static int svc_rdma_post_recv(struct svcxprt_rdma *rdma) +static int __svc_rdma_post_recv(struct svcxprt_rdma *rdma, + struct svc_rdma_recv_ctxt *ctxt) { - struct svc_rdma_recv_ctxt *ctxt; struct ib_recv_wr *bad_recv_wr; int ret; - ctxt = svc_rdma_recv_ctxt_get(rdma); - if (!ctxt) - return -ENOMEM; - svc_xprt_get(&rdma->sc_xprt); ret = ib_post_recv(rdma->sc_qp, &ctxt->rc_recv_wr, &bad_recv_wr); trace_svcrdma_post_recv(&ctxt->rc_recv_wr, ret); @@ -240,6 +245,16 @@ err_post: return ret; } +static int svc_rdma_post_recv(struct svcxprt_rdma *rdma) +{ + struct svc_rdma_recv_ctxt *ctxt; + + ctxt = svc_rdma_recv_ctxt_get(rdma); + if (!ctxt) + return -ENOMEM; + return __svc_rdma_post_recv(rdma, ctxt); +} + /** * svc_rdma_post_recvs - Post initial set of Recv WRs * @rdma: fresh svcxprt_rdma @@ -248,11 +263,16 @@ err_post: */ bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma) { + struct svc_rdma_recv_ctxt *ctxt; unsigned int i; int ret; for (i = 0; i < rdma->sc_max_requests; i++) { - ret = svc_rdma_post_recv(rdma); + ctxt = svc_rdma_recv_ctxt_get(rdma); + if (!ctxt) + return -ENOMEM; + ctxt->rc_temp = true; + ret = __svc_rdma_post_recv(rdma, ctxt); if (ret) { pr_err("svcrdma: failure 
posting recv buffers: %d\n", ret); -- cgit v1.2.3 From f016f305f98159a9131ce200ed3b4ed92133012c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 7 May 2018 15:27:53 -0400 Subject: svcrdma: Refactor svc_rdma_dma_map_buf Clean up: svc_rdma_dma_map_buf does mostly the same thing as svc_rdma_dma_map_page, so let's fold these together. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 7 ----- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 50 ++++++++++++----------------------- 2 files changed, 17 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 27cf59c7085f..95530bc7bfab 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -158,13 +158,6 @@ struct svc_rdma_recv_ctxt { struct page *rc_pages[RPCSVC_MAXPAGES]; }; -/* Track DMA maps for this transport and context */ -static inline void svc_rdma_count_mappings(struct svcxprt_rdma *rdma, - struct svc_rdma_op_ctxt *ctxt) -{ - ctxt->mapped_sges++; -} - /* svc_rdma_backchannel.c */ extern int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp, diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index b27b597d94da..ee9ba0736ceb 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -302,41 +302,11 @@ static u32 svc_rdma_get_inv_rkey(__be32 *rdma_argp, return be32_to_cpup(p); } -/* ib_dma_map_page() is used here because svc_rdma_dma_unmap() - * is used during completion to DMA-unmap this memory, and - * it uses ib_dma_unmap_page() exclusively. - */ -static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma, - struct svc_rdma_op_ctxt *ctxt, - unsigned int sge_no, - unsigned char *base, - unsigned int len) -{ - unsigned long offset = (unsigned long)base & ~PAGE_MASK; - struct ib_device *dev = rdma->sc_cm_id->device; - dma_addr_t dma_addr; - - dma_addr = ib_dma_map_page(dev, virt_to_page(base), - offset, len, DMA_TO_DEVICE); - if (ib_dma_mapping_error(dev, dma_addr)) - goto out_maperr; - - ctxt->sge[sge_no].addr = dma_addr; - ctxt->sge[sge_no].length = len; - ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey; - svc_rdma_count_mappings(rdma, ctxt); - return 0; - -out_maperr: - pr_err("svcrdma: failed to map buffer\n"); - return -EIO; -} - static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma, struct svc_rdma_op_ctxt *ctxt, unsigned int sge_no, struct page *page, - unsigned int offset, + unsigned long offset, unsigned int len) { struct ib_device *dev = rdma->sc_cm_id->device; @@ -349,7 +319,7 @@ static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma, ctxt->sge[sge_no].addr = dma_addr; ctxt->sge[sge_no].length = len; ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey; - svc_rdma_count_mappings(rdma, ctxt); + ctxt->mapped_sges++; return 0; out_maperr: @@ -357,6 +327,19 @@ out_maperr: return -EIO; } +/* ib_dma_map_page() is used here because svc_rdma_dma_unmap() + * handles DMA-unmap and it uses ib_dma_unmap_page() exclusively. 
+ */ +static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma, + struct svc_rdma_op_ctxt *ctxt, + unsigned int sge_no, + unsigned char *base, + unsigned int len) +{ + return svc_rdma_dma_map_page(rdma, ctxt, sge_no, virt_to_page(base), + offset_in_page(base), len); +} + /** * svc_rdma_map_reply_hdr - DMA map the transport header buffer * @rdma: controlling transport @@ -389,7 +372,8 @@ static int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, struct svc_rdma_op_ctxt *ctxt, struct xdr_buf *xdr, __be32 *wr_lst) { - unsigned int len, sge_no, remaining, page_off; + unsigned int len, sge_no, remaining; + unsigned long page_off; struct page **ppages; unsigned char *base; u32 xdr_pad; -- cgit v1.2.3 From 232627905f12a05df75853c62451ce0886803cee Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 7 May 2018 15:27:59 -0400 Subject: svcrdma: Clean up Send SGE accounting Clean up: Since there's already a svc_rdma_op_ctxt being passed around with the running count of mapped SGEs, drop unneeded parameters to svc_rdma_post_send_wr(). Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 2 +- net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 2 +- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 2 +- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 17 ++++++++--------- 4 files changed, 11 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 95530bc7bfab..8827b4e36c3c 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -188,7 +188,7 @@ extern int svc_rdma_map_reply_hdr(struct svcxprt_rdma *rdma, __be32 *rdma_resp, unsigned int len); extern int svc_rdma_post_send_wr(struct svcxprt_rdma *rdma, struct svc_rdma_op_ctxt *ctxt, - int num_sge, u32 inv_rkey); + u32 inv_rkey); extern int svc_rdma_sendto(struct svc_rqst *); /* svc_rdma_transport.c */ diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index d50152163190..0b9ba9f50a76 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -135,7 +135,7 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, * the rq_buffer before all retransmits are complete. */ get_page(virt_to_page(rqst->rq_buffer)); - ret = svc_rdma_post_send_wr(rdma, ctxt, 1, 0); + ret = svc_rdma_post_send_wr(rdma, ctxt, 0); if (ret) goto out_unmap; diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 0445e75d76a2..af6d2f3b3242 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -639,7 +639,7 @@ static void svc_rdma_send_error(struct svcxprt_rdma *xprt, return; } - ret = svc_rdma_post_send_wr(xprt, ctxt, 1, 0); + ret = svc_rdma_post_send_wr(xprt, ctxt, 0); if (ret) { svc_rdma_unmap_dma(ctxt); svc_rdma_put_context(ctxt, 1); diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index ee9ba0736ceb..4591017adc1e 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -365,8 +365,7 @@ int svc_rdma_map_reply_hdr(struct svcxprt_rdma *rdma, /* Load the xdr_buf into the ctxt's sge array, and DMA map each * element as it is added. * - * Returns the number of sge elements loaded on success, or - * a negative errno on failure. + * Returns zero on success, or a negative errno on failure. 
*/ static int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, struct svc_rdma_op_ctxt *ctxt, @@ -429,7 +428,7 @@ tail: return ret; } - return sge_no - 1; + return 0; } /* The svc_rqst and all resources it owns are released as soon as @@ -453,7 +452,6 @@ static void svc_rdma_save_io_pages(struct svc_rqst *rqstp, * svc_rdma_post_send_wr - Set up and post one Send Work Request * @rdma: controlling transport * @ctxt: op_ctxt for transmitting the Send WR - * @num_sge: number of SGEs to send * @inv_rkey: R_key argument to Send With Invalidate, or zero * * Returns: @@ -463,18 +461,19 @@ static void svc_rdma_save_io_pages(struct svc_rqst *rqstp, * %-ENOMEM if ib_post_send failed. */ int svc_rdma_post_send_wr(struct svcxprt_rdma *rdma, - struct svc_rdma_op_ctxt *ctxt, int num_sge, + struct svc_rdma_op_ctxt *ctxt, u32 inv_rkey) { struct ib_send_wr *send_wr = &ctxt->send_wr; - dprintk("svcrdma: posting Send WR with %u sge(s)\n", num_sge); + dprintk("svcrdma: posting Send WR with %u sge(s)\n", + ctxt->mapped_sges); send_wr->next = NULL; ctxt->cqe.done = svc_rdma_wc_send; send_wr->wr_cqe = &ctxt->cqe; send_wr->sg_list = ctxt->sge; - send_wr->num_sge = num_sge; + send_wr->num_sge = ctxt->mapped_sges; send_wr->send_flags = IB_SEND_SIGNALED; if (inv_rkey) { send_wr->opcode = IB_WR_SEND_WITH_INV; @@ -532,7 +531,7 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, inv_rkey = 0; if (rdma->sc_snd_w_inv) inv_rkey = svc_rdma_get_inv_rkey(rdma_argp, wr_lst, rp_ch); - ret = svc_rdma_post_send_wr(rdma, ctxt, 1 + ret, inv_rkey); + ret = svc_rdma_post_send_wr(rdma, ctxt, inv_rkey); if (ret) goto err; @@ -574,7 +573,7 @@ static int svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, svc_rdma_save_io_pages(rqstp, ctxt); - ret = svc_rdma_post_send_wr(rdma, ctxt, 1 + ret, 0); + ret = svc_rdma_post_send_wr(rdma, ctxt, 0); if (ret) goto err; -- cgit v1.2.3 From 4201c7464753827803366b40e82eb050c04ebdef Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 7 May 2018 15:28:04 -0400 Subject: svcrdma: Introduce svc_rdma_send_ctxt svc_rdma_op_ctxt's are pre-allocated and maintained on a per-xprt free list. This eliminates the overhead of calling kmalloc / kfree, both of which grab a globally shared lock that disables interrupts. Introduce a replacement to svc_rdma_op_ctxt's that is built especially for the svcrdma Send path. Subsequent patches will take advantage of this new structure by allocating real resources which are then cached in these objects. The allocations are freed when the transport is torn down. I've renamed the structure so that static type checking can be used to ensure that uses of op_ctxt and send_ctxt are not confused. As an additional clean up, structure fields are renamed to conform with kernel coding conventions. Additional clean ups: - Handle svc_rdma_send_ctxt_get allocation failure at each call site, rather than pre-allocating and hoping we guessed correctly - All send_ctxt_put call-sites request page freeing, so remove the @free_pages argument - All send_ctxt_put call-sites unmap SGEs, so fold that into svc_rdma_send_ctxt_put Signed-off-by: Chuck Lever Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 35 ++-- net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 13 +- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 13 +- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 254 ++++++++++++++++++++++++----- net/sunrpc/xprtrdma/svc_rdma_transport.c | 205 +---------------------- 5 files changed, 254 insertions(+), 266 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 8827b4e36c3c..d3e2bb331264 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -109,8 +109,8 @@ struct svcxprt_rdma { struct ib_pd *sc_pd; - spinlock_t sc_ctxt_lock; - struct list_head sc_ctxts; + spinlock_t sc_send_lock; + struct list_head sc_send_ctxts; int sc_ctxt_used; spinlock_t sc_rw_ctxt_lock; struct list_head sc_rw_ctxts; @@ -158,6 +158,19 @@ struct svc_rdma_recv_ctxt { struct page *rc_pages[RPCSVC_MAXPAGES]; }; +enum { + RPCRDMA_MAX_SGES = 1 + (RPCRDMA_MAX_INLINE_THRESH / PAGE_SIZE), +}; + +struct svc_rdma_send_ctxt { + struct list_head sc_list; + struct ib_send_wr sc_send_wr; + struct ib_cqe sc_cqe; + int sc_page_count; + struct page *sc_pages[RPCSVC_MAXPAGES]; + struct ib_sge sc_sges[RPCRDMA_MAX_SGES]; +}; + /* svc_rdma_backchannel.c */ extern int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp, @@ -183,24 +196,22 @@ extern int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, struct xdr_buf *xdr); /* svc_rdma_sendto.c */ +extern void svc_rdma_send_ctxts_destroy(struct svcxprt_rdma *rdma); +extern struct svc_rdma_send_ctxt * + svc_rdma_send_ctxt_get(struct svcxprt_rdma *rdma); +extern void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt); +extern int svc_rdma_send(struct svcxprt_rdma *rdma, struct ib_send_wr *wr); extern int svc_rdma_map_reply_hdr(struct svcxprt_rdma *rdma, - struct svc_rdma_op_ctxt *ctxt, + struct svc_rdma_send_ctxt *ctxt, __be32 *rdma_resp, unsigned int len); extern int svc_rdma_post_send_wr(struct svcxprt_rdma *rdma, - struct svc_rdma_op_ctxt *ctxt, + struct svc_rdma_send_ctxt *ctxt, u32 inv_rkey); extern int svc_rdma_sendto(struct svc_rqst *); /* svc_rdma_transport.c */ -extern void svc_rdma_wc_send(struct ib_cq *, struct ib_wc *); -extern void svc_rdma_wc_reg(struct ib_cq *, struct ib_wc *); -extern void svc_rdma_wc_read(struct ib_cq *, struct ib_wc *); -extern void svc_rdma_wc_inv(struct ib_cq *, struct ib_wc *); -extern int svc_rdma_send(struct svcxprt_rdma *, struct ib_send_wr *); extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *); -extern struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *); -extern void svc_rdma_put_context(struct svc_rdma_op_ctxt *, int); -extern void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt); extern void svc_sq_reap(struct svcxprt_rdma *); extern void svc_rq_reap(struct svcxprt_rdma *); extern void svc_rdma_prep_reply_hdr(struct svc_rqst *); diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index 0b9ba9f50a76..95e33511cc6f 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright (c) 2015 Oracle. All rights reserved. + * Copyright (c) 2015-2018 Oracle. All rights reserved. * * Support for backward direction RPCs on RPC/RDMA (server-side). 
*/ @@ -117,10 +117,14 @@ out_notfound: static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst) { - struct svc_rdma_op_ctxt *ctxt; + struct svc_rdma_send_ctxt *ctxt; int ret; - ctxt = svc_rdma_get_context(rdma); + ctxt = svc_rdma_send_ctxt_get(rdma); + if (!ctxt) { + ret = -ENOMEM; + goto out_err; + } /* rpcrdma_bc_send_request builds the transport header and * the backchannel RPC message in the same buffer. Thus only @@ -144,8 +148,7 @@ out_err: return ret; out_unmap: - svc_rdma_unmap_dma(ctxt); - svc_rdma_put_context(ctxt, 1); + svc_rdma_send_ctxt_put(rdma, ctxt); ret = -EIO; goto out_err; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index af6d2f3b3242..2d1e0db4c869 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -601,7 +601,7 @@ static void rdma_read_complete(struct svc_rqst *rqstp, static void svc_rdma_send_error(struct svcxprt_rdma *xprt, __be32 *rdma_argp, int status) { - struct svc_rdma_op_ctxt *ctxt; + struct svc_rdma_send_ctxt *ctxt; __be32 *p, *err_msgp; unsigned int length; struct page *page; @@ -631,7 +631,10 @@ static void svc_rdma_send_error(struct svcxprt_rdma *xprt, length = (unsigned long)p - (unsigned long)err_msgp; /* Map transport header; no RPC message payload */ - ctxt = svc_rdma_get_context(xprt); + ctxt = svc_rdma_send_ctxt_get(xprt); + if (!ctxt) + return; + ret = svc_rdma_map_reply_hdr(xprt, ctxt, err_msgp, length); if (ret) { dprintk("svcrdma: Error %d mapping send for protocol error\n", @@ -640,10 +643,8 @@ static void svc_rdma_send_error(struct svcxprt_rdma *xprt, } ret = svc_rdma_post_send_wr(xprt, ctxt, 0); - if (ret) { - svc_rdma_unmap_dma(ctxt); - svc_rdma_put_context(ctxt, 1); - } + if (ret) + svc_rdma_send_ctxt_put(xprt, ctxt); } /* By convention, backchannel calls arrive via rdma_msg type diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 4591017adc1e..b286d6a6e429 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -75,11 +75,11 @@ * DMA-unmap the pages under I/O for that Write segment. The Write * completion handler does not release any pages. * - * When the Send WR is constructed, it also gets its own svc_rdma_op_ctxt. + * When the Send WR is constructed, it also gets its own svc_rdma_send_ctxt. * The ownership of all of the Reply's pages are transferred into that * ctxt, the Send WR is posted, and sendto returns. * - * The svc_rdma_op_ctxt is presented when the Send WR completes. The + * The svc_rdma_send_ctxt is presented when the Send WR completes. The * Send completion handler finally releases the Reply's pages. 
* * This mechanism also assumes that completions on the transport's Send @@ -114,6 +114,184 @@ #define RPCDBG_FACILITY RPCDBG_SVCXPRT +static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc); + +static inline struct svc_rdma_send_ctxt * +svc_rdma_next_send_ctxt(struct list_head *list) +{ + return list_first_entry_or_null(list, struct svc_rdma_send_ctxt, + sc_list); +} + +static struct svc_rdma_send_ctxt * +svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) +{ + struct svc_rdma_send_ctxt *ctxt; + int i; + + ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL); + if (!ctxt) + return NULL; + + ctxt->sc_cqe.done = svc_rdma_wc_send; + ctxt->sc_send_wr.next = NULL; + ctxt->sc_send_wr.wr_cqe = &ctxt->sc_cqe; + ctxt->sc_send_wr.sg_list = ctxt->sc_sges; + ctxt->sc_send_wr.send_flags = IB_SEND_SIGNALED; + for (i = 0; i < ARRAY_SIZE(ctxt->sc_sges); i++) + ctxt->sc_sges[i].lkey = rdma->sc_pd->local_dma_lkey; + return ctxt; +} + +/** + * svc_rdma_send_ctxts_destroy - Release all send_ctxt's for an xprt + * @rdma: svcxprt_rdma being torn down + * + */ +void svc_rdma_send_ctxts_destroy(struct svcxprt_rdma *rdma) +{ + struct svc_rdma_send_ctxt *ctxt; + + while ((ctxt = svc_rdma_next_send_ctxt(&rdma->sc_send_ctxts))) { + list_del(&ctxt->sc_list); + kfree(ctxt); + } +} + +/** + * svc_rdma_send_ctxt_get - Get a free send_ctxt + * @rdma: controlling svcxprt_rdma + * + * Returns a ready-to-use send_ctxt, or NULL if none are + * available and a fresh one cannot be allocated. + */ +struct svc_rdma_send_ctxt *svc_rdma_send_ctxt_get(struct svcxprt_rdma *rdma) +{ + struct svc_rdma_send_ctxt *ctxt; + + spin_lock(&rdma->sc_send_lock); + ctxt = svc_rdma_next_send_ctxt(&rdma->sc_send_ctxts); + if (!ctxt) + goto out_empty; + list_del(&ctxt->sc_list); + spin_unlock(&rdma->sc_send_lock); + +out: + ctxt->sc_send_wr.num_sge = 0; + ctxt->sc_page_count = 0; + return ctxt; + +out_empty: + spin_unlock(&rdma->sc_send_lock); + ctxt = svc_rdma_send_ctxt_alloc(rdma); + if (!ctxt) + return NULL; + goto out; +} + +/** + * svc_rdma_send_ctxt_put - Return send_ctxt to free list + * @rdma: controlling svcxprt_rdma + * @ctxt: object to return to the free list + * + * Pages left in sc_pages are DMA unmapped and released. + */ +void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt) +{ + struct ib_device *device = rdma->sc_cm_id->device; + unsigned int i; + + for (i = 0; i < ctxt->sc_send_wr.num_sge; i++) + ib_dma_unmap_page(device, + ctxt->sc_sges[i].addr, + ctxt->sc_sges[i].length, + DMA_TO_DEVICE); + + for (i = 0; i < ctxt->sc_page_count; ++i) + put_page(ctxt->sc_pages[i]); + + spin_lock(&rdma->sc_send_lock); + list_add(&ctxt->sc_list, &rdma->sc_send_ctxts); + spin_unlock(&rdma->sc_send_lock); +} + +/** + * svc_rdma_wc_send - Invoked by RDMA provider for each polled Send WC + * @cq: Completion Queue context + * @wc: Work Completion object + * + * NB: The svc_xprt/svcxprt_rdma is pinned whenever it's possible that + * the Send completion handler could be running. 
+ */ +static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) +{ + struct svcxprt_rdma *rdma = cq->cq_context; + struct ib_cqe *cqe = wc->wr_cqe; + struct svc_rdma_send_ctxt *ctxt; + + trace_svcrdma_wc_send(wc); + + atomic_inc(&rdma->sc_sq_avail); + wake_up(&rdma->sc_send_wait); + + ctxt = container_of(cqe, struct svc_rdma_send_ctxt, sc_cqe); + svc_rdma_send_ctxt_put(rdma, ctxt); + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); + svc_xprt_enqueue(&rdma->sc_xprt); + if (wc->status != IB_WC_WR_FLUSH_ERR) + pr_err("svcrdma: Send: %s (%u/0x%x)\n", + ib_wc_status_msg(wc->status), + wc->status, wc->vendor_err); + } + + svc_xprt_put(&rdma->sc_xprt); +} + +int svc_rdma_send(struct svcxprt_rdma *rdma, struct ib_send_wr *wr) +{ + struct ib_send_wr *bad_wr, *n_wr; + int wr_count; + int i; + int ret; + + wr_count = 1; + for (n_wr = wr->next; n_wr; n_wr = n_wr->next) + wr_count++; + + /* If the SQ is full, wait until an SQ entry is available */ + while (1) { + if ((atomic_sub_return(wr_count, &rdma->sc_sq_avail) < 0)) { + atomic_inc(&rdma_stat_sq_starve); + trace_svcrdma_sq_full(rdma); + atomic_add(wr_count, &rdma->sc_sq_avail); + wait_event(rdma->sc_send_wait, + atomic_read(&rdma->sc_sq_avail) > wr_count); + if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) + return -ENOTCONN; + trace_svcrdma_sq_retry(rdma); + continue; + } + /* Take a transport ref for each WR posted */ + for (i = 0; i < wr_count; i++) + svc_xprt_get(&rdma->sc_xprt); + + /* Bump used SQ WR count and post */ + ret = ib_post_send(rdma->sc_qp, wr, &bad_wr); + trace_svcrdma_post_send(wr, ret); + if (ret) { + set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); + for (i = 0; i < wr_count; i++) + svc_xprt_put(&rdma->sc_xprt); + wake_up(&rdma->sc_send_wait); + } + break; + } + return ret; +} + static u32 xdr_padsize(u32 len) { return (len & 3) ? (4 - (len & 3)) : 0; @@ -303,7 +481,7 @@ static u32 svc_rdma_get_inv_rkey(__be32 *rdma_argp, } static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma, - struct svc_rdma_op_ctxt *ctxt, + struct svc_rdma_send_ctxt *ctxt, unsigned int sge_no, struct page *page, unsigned long offset, @@ -316,10 +494,9 @@ static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma, if (ib_dma_mapping_error(dev, dma_addr)) goto out_maperr; - ctxt->sge[sge_no].addr = dma_addr; - ctxt->sge[sge_no].length = len; - ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey; - ctxt->mapped_sges++; + ctxt->sc_sges[sge_no].addr = dma_addr; + ctxt->sc_sges[sge_no].length = len; + ctxt->sc_send_wr.num_sge++; return 0; out_maperr: @@ -331,7 +508,7 @@ out_maperr: * handles DMA-unmap and it uses ib_dma_unmap_page() exclusively. */ static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma, - struct svc_rdma_op_ctxt *ctxt, + struct svc_rdma_send_ctxt *ctxt, unsigned int sge_no, unsigned char *base, unsigned int len) @@ -352,14 +529,13 @@ static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma, * %-EIO if DMA mapping failed. 
*/ int svc_rdma_map_reply_hdr(struct svcxprt_rdma *rdma, - struct svc_rdma_op_ctxt *ctxt, + struct svc_rdma_send_ctxt *ctxt, __be32 *rdma_resp, unsigned int len) { - ctxt->direction = DMA_TO_DEVICE; - ctxt->pages[0] = virt_to_page(rdma_resp); - ctxt->count = 1; - return svc_rdma_dma_map_page(rdma, ctxt, 0, ctxt->pages[0], 0, len); + ctxt->sc_pages[0] = virt_to_page(rdma_resp); + ctxt->sc_page_count++; + return svc_rdma_dma_map_page(rdma, ctxt, 0, ctxt->sc_pages[0], 0, len); } /* Load the xdr_buf into the ctxt's sge array, and DMA map each @@ -368,7 +544,7 @@ int svc_rdma_map_reply_hdr(struct svcxprt_rdma *rdma, * Returns zero on success, or a negative errno on failure. */ static int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, - struct svc_rdma_op_ctxt *ctxt, + struct svc_rdma_send_ctxt *ctxt, struct xdr_buf *xdr, __be32 *wr_lst) { unsigned int len, sge_no, remaining; @@ -436,13 +612,13 @@ tail: * so they are released by the Send completion handler. */ static void svc_rdma_save_io_pages(struct svc_rqst *rqstp, - struct svc_rdma_op_ctxt *ctxt) + struct svc_rdma_send_ctxt *ctxt) { int i, pages = rqstp->rq_next_page - rqstp->rq_respages; - ctxt->count += pages; + ctxt->sc_page_count += pages; for (i = 0; i < pages; i++) { - ctxt->pages[i + 1] = rqstp->rq_respages[i]; + ctxt->sc_pages[i + 1] = rqstp->rq_respages[i]; rqstp->rq_respages[i] = NULL; } rqstp->rq_next_page = rqstp->rq_respages + 1; @@ -461,37 +637,29 @@ static void svc_rdma_save_io_pages(struct svc_rqst *rqstp, * %-ENOMEM if ib_post_send failed. */ int svc_rdma_post_send_wr(struct svcxprt_rdma *rdma, - struct svc_rdma_op_ctxt *ctxt, + struct svc_rdma_send_ctxt *ctxt, u32 inv_rkey) { - struct ib_send_wr *send_wr = &ctxt->send_wr; - dprintk("svcrdma: posting Send WR with %u sge(s)\n", - ctxt->mapped_sges); - - send_wr->next = NULL; - ctxt->cqe.done = svc_rdma_wc_send; - send_wr->wr_cqe = &ctxt->cqe; - send_wr->sg_list = ctxt->sge; - send_wr->num_sge = ctxt->mapped_sges; - send_wr->send_flags = IB_SEND_SIGNALED; + ctxt->sc_send_wr.num_sge); + if (inv_rkey) { - send_wr->opcode = IB_WR_SEND_WITH_INV; - send_wr->ex.invalidate_rkey = inv_rkey; + ctxt->sc_send_wr.opcode = IB_WR_SEND_WITH_INV; + ctxt->sc_send_wr.ex.invalidate_rkey = inv_rkey; } else { - send_wr->opcode = IB_WR_SEND; + ctxt->sc_send_wr.opcode = IB_WR_SEND; } - return svc_rdma_send(rdma, send_wr); + return svc_rdma_send(rdma, &ctxt->sc_send_wr); } /* Prepare the portion of the RPC Reply that will be transmitted * via RDMA Send. The RPC-over-RDMA transport header is prepared - * in sge[0], and the RPC xdr_buf is prepared in following sges. + * in sc_sges[0], and the RPC xdr_buf is prepared in following sges. * * Depending on whether a Write list or Reply chunk is present, * the server may send all, a portion of, or none of the xdr_buf. - * In the latter case, only the transport header (sge[0]) is + * In the latter case, only the transport header (sc_sges[0]) is * transmitted. * * RDMA Send is the last step of transmitting an RPC reply. 
Pages @@ -508,11 +676,13 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp, __be32 *wr_lst, __be32 *rp_ch) { - struct svc_rdma_op_ctxt *ctxt; + struct svc_rdma_send_ctxt *ctxt; u32 inv_rkey; int ret; - ctxt = svc_rdma_get_context(rdma); + ctxt = svc_rdma_send_ctxt_get(rdma); + if (!ctxt) + return -ENOMEM; ret = svc_rdma_map_reply_hdr(rdma, ctxt, rdma_resp, svc_rdma_reply_hdr_len(rdma_resp)); @@ -538,8 +708,7 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, return 0; err: - svc_rdma_unmap_dma(ctxt); - svc_rdma_put_context(ctxt, 1); + svc_rdma_send_ctxt_put(rdma, ctxt); return ret; } @@ -553,11 +722,13 @@ err: static int svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, __be32 *rdma_resp, struct svc_rqst *rqstp) { - struct svc_rdma_op_ctxt *ctxt; + struct svc_rdma_send_ctxt *ctxt; __be32 *p; int ret; - ctxt = svc_rdma_get_context(rdma); + ctxt = svc_rdma_send_ctxt_get(rdma); + if (!ctxt) + return -ENOMEM; /* Replace the original transport header with an * RDMA_ERROR response. XID etc are preserved. @@ -580,8 +751,7 @@ static int svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, return 0; err: - svc_rdma_unmap_dma(ctxt); - svc_rdma_put_context(ctxt, 1); + svc_rdma_send_ctxt_put(rdma, ctxt); return ret; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index baeecbb2f763..3de81735a6cc 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* + * Copyright (c) 2015-2018 Oracle. All rights reserved. * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved. * Copyright (c) 2005-2007 Network Appliance, Inc. All rights reserved. * @@ -157,114 +158,6 @@ static void svc_rdma_bc_free(struct svc_xprt *xprt) } #endif /* CONFIG_SUNRPC_BACKCHANNEL */ -static struct svc_rdma_op_ctxt *alloc_ctxt(struct svcxprt_rdma *xprt, - gfp_t flags) -{ - struct svc_rdma_op_ctxt *ctxt; - - ctxt = kmalloc(sizeof(*ctxt), flags); - if (ctxt) { - ctxt->xprt = xprt; - INIT_LIST_HEAD(&ctxt->list); - } - return ctxt; -} - -static bool svc_rdma_prealloc_ctxts(struct svcxprt_rdma *xprt) -{ - unsigned int i; - - i = xprt->sc_sq_depth; - while (i--) { - struct svc_rdma_op_ctxt *ctxt; - - ctxt = alloc_ctxt(xprt, GFP_KERNEL); - if (!ctxt) { - dprintk("svcrdma: No memory for RDMA ctxt\n"); - return false; - } - list_add(&ctxt->list, &xprt->sc_ctxts); - } - return true; -} - -struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) -{ - struct svc_rdma_op_ctxt *ctxt = NULL; - - spin_lock(&xprt->sc_ctxt_lock); - xprt->sc_ctxt_used++; - if (list_empty(&xprt->sc_ctxts)) - goto out_empty; - - ctxt = list_first_entry(&xprt->sc_ctxts, - struct svc_rdma_op_ctxt, list); - list_del(&ctxt->list); - spin_unlock(&xprt->sc_ctxt_lock); - -out: - ctxt->count = 0; - ctxt->mapped_sges = 0; - return ctxt; - -out_empty: - /* Either pre-allocation missed the mark, or send - * queue accounting is broken. 
- */ - spin_unlock(&xprt->sc_ctxt_lock); - - ctxt = alloc_ctxt(xprt, GFP_NOIO); - if (ctxt) - goto out; - - spin_lock(&xprt->sc_ctxt_lock); - xprt->sc_ctxt_used--; - spin_unlock(&xprt->sc_ctxt_lock); - WARN_ONCE(1, "svcrdma: empty RDMA ctxt list?\n"); - return NULL; -} - -void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) -{ - struct svcxprt_rdma *xprt = ctxt->xprt; - struct ib_device *device = xprt->sc_cm_id->device; - unsigned int i; - - for (i = 0; i < ctxt->mapped_sges; i++) - ib_dma_unmap_page(device, - ctxt->sge[i].addr, - ctxt->sge[i].length, - ctxt->direction); - ctxt->mapped_sges = 0; -} - -void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) -{ - struct svcxprt_rdma *xprt = ctxt->xprt; - int i; - - if (free_pages) - for (i = 0; i < ctxt->count; i++) - put_page(ctxt->pages[i]); - - spin_lock(&xprt->sc_ctxt_lock); - xprt->sc_ctxt_used--; - list_add(&ctxt->list, &xprt->sc_ctxts); - spin_unlock(&xprt->sc_ctxt_lock); -} - -static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt) -{ - while (!list_empty(&xprt->sc_ctxts)) { - struct svc_rdma_op_ctxt *ctxt; - - ctxt = list_first_entry(&xprt->sc_ctxts, - struct svc_rdma_op_ctxt, list); - list_del(&ctxt->list); - kfree(ctxt); - } -} - /* QP event handler */ static void qp_event_handler(struct ib_event *event, void *context) { @@ -292,39 +185,6 @@ static void qp_event_handler(struct ib_event *event, void *context) } } -/** - * svc_rdma_wc_send - Invoked by RDMA provider for each polled Send WC - * @cq: completion queue - * @wc: completed WR - * - */ -void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) -{ - struct svcxprt_rdma *xprt = cq->cq_context; - struct ib_cqe *cqe = wc->wr_cqe; - struct svc_rdma_op_ctxt *ctxt; - - trace_svcrdma_wc_send(wc); - - atomic_inc(&xprt->sc_sq_avail); - wake_up(&xprt->sc_send_wait); - - ctxt = container_of(cqe, struct svc_rdma_op_ctxt, cqe); - svc_rdma_unmap_dma(ctxt); - svc_rdma_put_context(ctxt, 1); - - if (unlikely(wc->status != IB_WC_SUCCESS)) { - set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); - svc_xprt_enqueue(&xprt->sc_xprt); - if (wc->status != IB_WC_WR_FLUSH_ERR) - pr_err("svcrdma: Send: %s (%u/0x%x)\n", - ib_wc_status_msg(wc->status), - wc->status, wc->vendor_err); - } - - svc_xprt_put(&xprt->sc_xprt); -} - static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv, struct net *net) { @@ -338,14 +198,14 @@ static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv, INIT_LIST_HEAD(&cma_xprt->sc_accept_q); INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); - INIT_LIST_HEAD(&cma_xprt->sc_ctxts); + INIT_LIST_HEAD(&cma_xprt->sc_send_ctxts); INIT_LIST_HEAD(&cma_xprt->sc_recv_ctxts); INIT_LIST_HEAD(&cma_xprt->sc_rw_ctxts); init_waitqueue_head(&cma_xprt->sc_send_wait); spin_lock_init(&cma_xprt->sc_lock); spin_lock_init(&cma_xprt->sc_rq_dto_lock); - spin_lock_init(&cma_xprt->sc_ctxt_lock); + spin_lock_init(&cma_xprt->sc_send_lock); spin_lock_init(&cma_xprt->sc_recv_lock); spin_lock_init(&cma_xprt->sc_rw_ctxt_lock); @@ -640,9 +500,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) } atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth); - if (!svc_rdma_prealloc_ctxts(newxprt)) - goto errout; - newxprt->sc_pd = ib_alloc_pd(dev, 0); if (IS_ERR(newxprt->sc_pd)) { dprintk("svcrdma: error creating PD for connect request\n"); @@ -794,11 +651,6 @@ static void __svc_rdma_free(struct work_struct *work) svc_rdma_flush_recv_queues(rdma); - /* Warn if we leaked a resource or under-referenced */ - if 
(rdma->sc_ctxt_used != 0) - pr_err("svcrdma: ctxt still in use? (%d)\n", - rdma->sc_ctxt_used); - /* Final put of backchannel client transport */ if (xprt->xpt_bc_xprt) { xprt_put(xprt->xpt_bc_xprt); @@ -806,7 +658,7 @@ static void __svc_rdma_free(struct work_struct *work) } svc_rdma_destroy_rw_ctxts(rdma); - svc_rdma_destroy_ctxts(rdma); + svc_rdma_send_ctxts_destroy(rdma); svc_rdma_recv_ctxts_destroy(rdma); /* Destroy the QP if present (not a listener) */ @@ -860,52 +712,3 @@ static void svc_rdma_secure_port(struct svc_rqst *rqstp) static void svc_rdma_kill_temp_xprt(struct svc_xprt *xprt) { } - -int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) -{ - struct ib_send_wr *bad_wr, *n_wr; - int wr_count; - int i; - int ret; - - if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags)) - return -ENOTCONN; - - wr_count = 1; - for (n_wr = wr->next; n_wr; n_wr = n_wr->next) - wr_count++; - - /* If the SQ is full, wait until an SQ entry is available */ - while (1) { - if ((atomic_sub_return(wr_count, &xprt->sc_sq_avail) < 0)) { - atomic_inc(&rdma_stat_sq_starve); - trace_svcrdma_sq_full(xprt); - atomic_add(wr_count, &xprt->sc_sq_avail); - wait_event(xprt->sc_send_wait, - atomic_read(&xprt->sc_sq_avail) > wr_count); - if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags)) - return -ENOTCONN; - trace_svcrdma_sq_retry(xprt); - continue; - } - /* Take a transport ref for each WR posted */ - for (i = 0; i < wr_count; i++) - svc_xprt_get(&xprt->sc_xprt); - - /* Bump used SQ WR count and post */ - ret = ib_post_send(xprt->sc_qp, wr, &bad_wr); - trace_svcrdma_post_send(wr, ret); - if (ret) { - set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); - for (i = 0; i < wr_count; i ++) - svc_xprt_put(&xprt->sc_xprt); - dprintk("svcrdma: failed to post SQ WR rc=%d\n", ret); - dprintk(" sc_sq_avail=%d, sc_sq_depth=%d\n", - atomic_read(&xprt->sc_sq_avail), - xprt->sc_sq_depth); - wake_up(&xprt->sc_send_wait); - } - break; - } - return ret; -} -- cgit v1.2.3 From 25fd86eca11c26bad2aede6dd4709ff58f89c7cb Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 7 May 2018 15:28:09 -0400 Subject: svcrdma: Don't overrun the SGE array in svc_rdma_send_ctxt Receive buffers are always the same size, but each Send WR has a variable number of SGEs, based on the contents of the xdr_buf being sent. While assembling a Send WR, keep track of the number of SGEs so that we don't exceed the device's maximum, or walk off the end of the Send SGE array. For now the Send path just fails if it exceeds the maximum. The current logic in svc_rdma_accept bases the maximum number of Send SGEs on the largest NFS request that can be sent or received. In the transport layer, the limit is actually based on the capabilities of the underlying device, not on properties of the Upper Layer Protocol. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 9 +++----- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 36 +++++++++++++++++++------------- net/sunrpc/xprtrdma/svc_rdma_transport.c | 13 ++++++++---- 3 files changed, 33 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index d3e2bb331264..bfb8824e31e1 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -96,7 +96,7 @@ struct svcxprt_rdma { struct rdma_cm_id *sc_cm_id; /* RDMA connection id */ struct list_head sc_accept_q; /* Conn. 
waiting accept */ int sc_ord; /* RDMA read limit */ - int sc_max_sge; + int sc_max_send_sges; bool sc_snd_w_inv; /* OK to use Send With Invalidate */ atomic_t sc_sq_avail; /* SQEs ready to be consumed */ @@ -158,17 +158,14 @@ struct svc_rdma_recv_ctxt { struct page *rc_pages[RPCSVC_MAXPAGES]; }; -enum { - RPCRDMA_MAX_SGES = 1 + (RPCRDMA_MAX_INLINE_THRESH / PAGE_SIZE), -}; - struct svc_rdma_send_ctxt { struct list_head sc_list; struct ib_send_wr sc_send_wr; struct ib_cqe sc_cqe; int sc_page_count; + int sc_cur_sge_no; struct page *sc_pages[RPCSVC_MAXPAGES]; - struct ib_sge sc_sges[RPCRDMA_MAX_SGES]; + struct ib_sge sc_sges[]; }; /* svc_rdma_backchannel.c */ diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index b286d6a6e429..53d8db6bfaf2 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -127,9 +127,12 @@ static struct svc_rdma_send_ctxt * svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) { struct svc_rdma_send_ctxt *ctxt; + size_t size; int i; - ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL); + size = sizeof(*ctxt); + size += rdma->sc_max_send_sges * sizeof(struct ib_sge); + ctxt = kmalloc(size, GFP_KERNEL); if (!ctxt) return NULL; @@ -138,7 +141,7 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) ctxt->sc_send_wr.wr_cqe = &ctxt->sc_cqe; ctxt->sc_send_wr.sg_list = ctxt->sc_sges; ctxt->sc_send_wr.send_flags = IB_SEND_SIGNALED; - for (i = 0; i < ARRAY_SIZE(ctxt->sc_sges); i++) + for (i = 0; i < rdma->sc_max_send_sges; i++) ctxt->sc_sges[i].lkey = rdma->sc_pd->local_dma_lkey; return ctxt; } @@ -482,7 +485,6 @@ static u32 svc_rdma_get_inv_rkey(__be32 *rdma_argp, static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt, - unsigned int sge_no, struct page *page, unsigned long offset, unsigned int len) @@ -494,8 +496,8 @@ static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma, if (ib_dma_mapping_error(dev, dma_addr)) goto out_maperr; - ctxt->sc_sges[sge_no].addr = dma_addr; - ctxt->sc_sges[sge_no].length = len; + ctxt->sc_sges[ctxt->sc_cur_sge_no].addr = dma_addr; + ctxt->sc_sges[ctxt->sc_cur_sge_no].length = len; ctxt->sc_send_wr.num_sge++; return 0; @@ -509,11 +511,10 @@ out_maperr: */ static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt, - unsigned int sge_no, unsigned char *base, unsigned int len) { - return svc_rdma_dma_map_page(rdma, ctxt, sge_no, virt_to_page(base), + return svc_rdma_dma_map_page(rdma, ctxt, virt_to_page(base), offset_in_page(base), len); } @@ -535,7 +536,8 @@ int svc_rdma_map_reply_hdr(struct svcxprt_rdma *rdma, { ctxt->sc_pages[0] = virt_to_page(rdma_resp); ctxt->sc_page_count++; - return svc_rdma_dma_map_page(rdma, ctxt, 0, ctxt->sc_pages[0], 0, len); + ctxt->sc_cur_sge_no = 0; + return svc_rdma_dma_map_page(rdma, ctxt, ctxt->sc_pages[0], 0, len); } /* Load the xdr_buf into the ctxt's sge array, and DMA map each @@ -547,16 +549,16 @@ static int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt, struct xdr_buf *xdr, __be32 *wr_lst) { - unsigned int len, sge_no, remaining; + unsigned int len, remaining; unsigned long page_off; struct page **ppages; unsigned char *base; u32 xdr_pad; int ret; - sge_no = 1; - - ret = svc_rdma_dma_map_buf(rdma, ctxt, sge_no++, + if (++ctxt->sc_cur_sge_no >= rdma->sc_max_send_sges) + return -EIO; + ret = svc_rdma_dma_map_buf(rdma, ctxt, xdr->head[0].iov_base, xdr->head[0].iov_len); if (ret < 0) @@ -586,8 +588,10 @@ static int 
svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, while (remaining) { len = min_t(u32, PAGE_SIZE - page_off, remaining); - ret = svc_rdma_dma_map_page(rdma, ctxt, sge_no++, - *ppages++, page_off, len); + if (++ctxt->sc_cur_sge_no >= rdma->sc_max_send_sges) + return -EIO; + ret = svc_rdma_dma_map_page(rdma, ctxt, *ppages++, + page_off, len); if (ret < 0) return ret; @@ -599,7 +603,9 @@ static int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, len = xdr->tail[0].iov_len; tail: if (len) { - ret = svc_rdma_dma_map_buf(rdma, ctxt, sge_no++, base, len); + if (++ctxt->sc_cur_sge_no >= rdma->sc_max_send_sges) + return -EIO; + ret = svc_rdma_dma_map_buf(rdma, ctxt, base, len); if (ret < 0) return ret; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 3de81735a6cc..e9535a66bab0 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -476,8 +476,13 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) /* Qualify the transport resource defaults with the * capabilities of this particular device */ - newxprt->sc_max_sge = min((size_t)dev->attrs.max_sge, - (size_t)RPCSVC_MAXPAGES); + newxprt->sc_max_send_sges = dev->attrs.max_sge; + /* transport hdr, head iovec, one page list entry, tail iovec */ + if (newxprt->sc_max_send_sges < 4) { + pr_err("svcrdma: too few Send SGEs available (%d)\n", + newxprt->sc_max_send_sges); + goto errout; + } newxprt->sc_max_req_size = svcrdma_max_req_size; newxprt->sc_max_requests = svcrdma_max_requests; newxprt->sc_max_bc_requests = svcrdma_max_bc_requests; @@ -525,7 +530,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) qp_attr.cap.max_rdma_ctxs = ctxts; qp_attr.cap.max_send_wr = newxprt->sc_sq_depth - ctxts; qp_attr.cap.max_recv_wr = rq_depth; - qp_attr.cap.max_send_sge = newxprt->sc_max_sge; + qp_attr.cap.max_send_sge = newxprt->sc_max_send_sges; qp_attr.cap.max_recv_sge = 1; qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; qp_attr.qp_type = IB_QPT_RC; @@ -586,7 +591,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) dprintk(" local address : %pIS:%u\n", sap, rpc_get_port(sap)); sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr; dprintk(" remote address : %pIS:%u\n", sap, rpc_get_port(sap)); - dprintk(" max_sge : %d\n", newxprt->sc_max_sge); + dprintk(" max_sge : %d\n", newxprt->sc_max_send_sges); dprintk(" sq_depth : %d\n", newxprt->sc_sq_depth); dprintk(" rdma_rw_ctxs : %d\n", ctxts); dprintk(" max_requests : %d\n", newxprt->sc_max_requests); -- cgit v1.2.3 From 986b78894b268f605e9ea055b99959bdce0e5945 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 7 May 2018 15:28:15 -0400 Subject: svcrdma: Remove post_send_wr Clean up: Now that the send_wr is part of the svc_rdma_send_ctxt, svc_rdma_post_send_wr is nearly empty. Signed-off-by: Chuck Lever Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 3 -- net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 3 +- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 3 +- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 47 ++++++++---------------------- 4 files changed, 16 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index bfb8824e31e1..a8bfc214614b 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -202,9 +202,6 @@ extern int svc_rdma_send(struct svcxprt_rdma *rdma, struct ib_send_wr *wr); extern int svc_rdma_map_reply_hdr(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt, __be32 *rdma_resp, unsigned int len); -extern int svc_rdma_post_send_wr(struct svcxprt_rdma *rdma, - struct svc_rdma_send_ctxt *ctxt, - u32 inv_rkey); extern int svc_rdma_sendto(struct svc_rqst *); /* svc_rdma_transport.c */ diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index 95e33511cc6f..40f5e4afbcc8 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -139,7 +139,8 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, * the rq_buffer before all retransmits are complete. */ get_page(virt_to_page(rqst->rq_buffer)); - ret = svc_rdma_post_send_wr(rdma, ctxt, 0); + ctxt->sc_send_wr.opcode = IB_WR_SEND; + ret = svc_rdma_send(rdma, &ctxt->sc_send_wr); if (ret) goto out_unmap; diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 2d1e0db4c869..68648e6c5be2 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -642,7 +642,8 @@ static void svc_rdma_send_error(struct svcxprt_rdma *xprt, return; } - ret = svc_rdma_post_send_wr(xprt, ctxt, 0); + ctxt->sc_send_wr.opcode = IB_WR_SEND; + ret = svc_rdma_send(xprt, &ctxt->sc_send_wr); if (ret) svc_rdma_send_ctxt_put(xprt, ctxt); } diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 53d8db6bfaf2..0ebdc0c76483 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -630,35 +630,6 @@ static void svc_rdma_save_io_pages(struct svc_rqst *rqstp, rqstp->rq_next_page = rqstp->rq_respages + 1; } -/** - * svc_rdma_post_send_wr - Set up and post one Send Work Request - * @rdma: controlling transport - * @ctxt: op_ctxt for transmitting the Send WR - * @inv_rkey: R_key argument to Send With Invalidate, or zero - * - * Returns: - * %0 if the Send* was posted successfully, - * %-ENOTCONN if the connection was lost or dropped, - * %-EINVAL if there was a problem with the Send we built, - * %-ENOMEM if ib_post_send failed. - */ -int svc_rdma_post_send_wr(struct svcxprt_rdma *rdma, - struct svc_rdma_send_ctxt *ctxt, - u32 inv_rkey) -{ - dprintk("svcrdma: posting Send WR with %u sge(s)\n", - ctxt->sc_send_wr.num_sge); - - if (inv_rkey) { - ctxt->sc_send_wr.opcode = IB_WR_SEND_WITH_INV; - ctxt->sc_send_wr.ex.invalidate_rkey = inv_rkey; - } else { - ctxt->sc_send_wr.opcode = IB_WR_SEND; - } - - return svc_rdma_send(rdma, &ctxt->sc_send_wr); -} - /* Prepare the portion of the RPC Reply that will be transmitted * via RDMA Send. The RPC-over-RDMA transport header is prepared * in sc_sges[0], and the RPC xdr_buf is prepared in following sges. 
@@ -683,7 +654,6 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, __be32 *wr_lst, __be32 *rp_ch) { struct svc_rdma_send_ctxt *ctxt; - u32 inv_rkey; int ret; ctxt = svc_rdma_send_ctxt_get(rdma); @@ -704,10 +674,16 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, svc_rdma_save_io_pages(rqstp, ctxt); - inv_rkey = 0; - if (rdma->sc_snd_w_inv) - inv_rkey = svc_rdma_get_inv_rkey(rdma_argp, wr_lst, rp_ch); - ret = svc_rdma_post_send_wr(rdma, ctxt, inv_rkey); + ctxt->sc_send_wr.opcode = IB_WR_SEND; + if (rdma->sc_snd_w_inv) { + ctxt->sc_send_wr.ex.invalidate_rkey = + svc_rdma_get_inv_rkey(rdma_argp, wr_lst, rp_ch); + if (ctxt->sc_send_wr.ex.invalidate_rkey) + ctxt->sc_send_wr.opcode = IB_WR_SEND_WITH_INV; + } + dprintk("svcrdma: posting Send WR with %u sge(s)\n", + ctxt->sc_send_wr.num_sge); + ret = svc_rdma_send(rdma, &ctxt->sc_send_wr); if (ret) goto err; @@ -750,7 +726,8 @@ static int svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, svc_rdma_save_io_pages(rqstp, ctxt); - ret = svc_rdma_post_send_wr(rdma, ctxt, 0); + ctxt->sc_send_wr.opcode = IB_WR_SEND; + ret = svc_rdma_send(rdma, &ctxt->sc_send_wr); if (ret) goto err; -- cgit v1.2.3 From 99722fe4d5a634707ced8d8f42b883b87a86b3c5 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 7 May 2018 15:28:25 -0400 Subject: svcrdma: Persistently allocate and DMA-map Send buffers While sending each RPC Reply, svc_rdma_sendto allocates and DMA- maps a separate buffer where the RPC/RDMA transport header is constructed. The buffer is unmapped and released in the Send completion handler. This is significant per-RPC overhead, especially for small RPCs. Instead, allocate and DMA-map a buffer, and cache it in each svc_rdma_send_ctxt. This buffer and its mapping can be re-used for each RPC, saving the cost of memory allocation and DMA mapping. Signed-off-by: Chuck Lever Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 8 +- net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 51 ++++------ net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 25 ++--- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 149 +++++++++++++++-------------- 4 files changed, 105 insertions(+), 128 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index a8bfc214614b..96b14a72d359 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -162,6 +162,7 @@ struct svc_rdma_send_ctxt { struct list_head sc_list; struct ib_send_wr sc_send_wr; struct ib_cqe sc_cqe; + void *sc_xprt_buf; int sc_page_count; int sc_cur_sge_no; struct page *sc_pages[RPCSVC_MAXPAGES]; @@ -199,9 +200,12 @@ extern struct svc_rdma_send_ctxt * extern void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt); extern int svc_rdma_send(struct svcxprt_rdma *rdma, struct ib_send_wr *wr); -extern int svc_rdma_map_reply_hdr(struct svcxprt_rdma *rdma, +extern void svc_rdma_sync_reply_hdr(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt, + unsigned int len); +extern int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt, - __be32 *rdma_resp, unsigned int len); + struct xdr_buf *xdr, __be32 *wr_lst); extern int svc_rdma_sendto(struct svc_rqst *); /* svc_rdma_transport.c */ diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index 40f5e4afbcc8..343e7add672c 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -115,43 +115,21 @@ out_notfound: * the adapter has a small maximum SQ depth. */ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, - struct rpc_rqst *rqst) + struct rpc_rqst *rqst, + struct svc_rdma_send_ctxt *ctxt) { - struct svc_rdma_send_ctxt *ctxt; int ret; - ctxt = svc_rdma_send_ctxt_get(rdma); - if (!ctxt) { - ret = -ENOMEM; - goto out_err; - } - - /* rpcrdma_bc_send_request builds the transport header and - * the backchannel RPC message in the same buffer. Thus only - * one SGE is needed to send both. - */ - ret = svc_rdma_map_reply_hdr(rdma, ctxt, rqst->rq_buffer, - rqst->rq_snd_buf.len); + ret = svc_rdma_map_reply_msg(rdma, ctxt, &rqst->rq_snd_buf, NULL); if (ret < 0) - goto out_err; + return -EIO; /* Bump page refcnt so Send completion doesn't release * the rq_buffer before all retransmits are complete. */ get_page(virt_to_page(rqst->rq_buffer)); ctxt->sc_send_wr.opcode = IB_WR_SEND; - ret = svc_rdma_send(rdma, &ctxt->sc_send_wr); - if (ret) - goto out_unmap; - -out_err: - dprintk("svcrdma: %s returns %d\n", __func__, ret); - return ret; - -out_unmap: - svc_rdma_send_ctxt_put(rdma, ctxt); - ret = -EIO; - goto out_err; + return svc_rdma_send(rdma, &ctxt->sc_send_wr); } /* Server-side transport endpoint wants a whole page for its send @@ -198,13 +176,15 @@ rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst) { struct rpc_xprt *xprt = rqst->rq_xprt; struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct svc_rdma_send_ctxt *ctxt; __be32 *p; int rc; - /* Space in the send buffer for an RPC/RDMA header is reserved - * via xprt->tsh_size. 
- */ - p = rqst->rq_buffer; + ctxt = svc_rdma_send_ctxt_get(rdma); + if (!ctxt) + goto drop_connection; + + p = ctxt->sc_xprt_buf; *p++ = rqst->rq_xid; *p++ = rpcrdma_version; *p++ = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests); @@ -212,14 +192,17 @@ rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst) *p++ = xdr_zero; *p++ = xdr_zero; *p = xdr_zero; + svc_rdma_sync_reply_hdr(rdma, ctxt, RPCRDMA_HDRLEN_MIN); #ifdef SVCRDMA_BACKCHANNEL_DEBUG pr_info("%s: %*ph\n", __func__, 64, rqst->rq_buffer); #endif - rc = svc_rdma_bc_sendto(rdma, rqst); - if (rc) + rc = svc_rdma_bc_sendto(rdma, rqst, ctxt); + if (rc) { + svc_rdma_send_ctxt_put(rdma, ctxt); goto drop_connection; + } return rc; drop_connection: @@ -327,7 +310,7 @@ xprt_setup_rdma_bc(struct xprt_create *args) xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO; xprt->prot = XPRT_TRANSPORT_BC_RDMA; - xprt->tsh_size = RPCRDMA_HDRLEN_MIN / sizeof(__be32); + xprt->tsh_size = 0; xprt->ops = &xprt_rdma_bc_procs; memcpy(&xprt->addr, args->dstaddr, args->addrlen); diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 68648e6c5be2..09ce09b3ac6e 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -602,17 +602,15 @@ static void svc_rdma_send_error(struct svcxprt_rdma *xprt, __be32 *rdma_argp, int status) { struct svc_rdma_send_ctxt *ctxt; - __be32 *p, *err_msgp; unsigned int length; - struct page *page; + __be32 *p; int ret; - page = alloc_page(GFP_KERNEL); - if (!page) + ctxt = svc_rdma_send_ctxt_get(xprt); + if (!ctxt) return; - err_msgp = page_address(page); - p = err_msgp; + p = ctxt->sc_xprt_buf; *p++ = *rdma_argp; *p++ = *(rdma_argp + 1); *p++ = xprt->sc_fc_credits; @@ -628,19 +626,8 @@ static void svc_rdma_send_error(struct svcxprt_rdma *xprt, *p++ = err_chunk; trace_svcrdma_err_chunk(*rdma_argp); } - length = (unsigned long)p - (unsigned long)err_msgp; - - /* Map transport header; no RPC message payload */ - ctxt = svc_rdma_send_ctxt_get(xprt); - if (!ctxt) - return; - - ret = svc_rdma_map_reply_hdr(xprt, ctxt, err_msgp, length); - if (ret) { - dprintk("svcrdma: Error %d mapping send for protocol error\n", - ret); - return; - } + length = (unsigned long)p - (unsigned long)ctxt->sc_xprt_buf; + svc_rdma_sync_reply_hdr(xprt, ctxt, length); ctxt->sc_send_wr.opcode = IB_WR_SEND; ret = svc_rdma_send(xprt, &ctxt->sc_send_wr); diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index edfeca45ac1c..4a3efaea277c 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -127,6 +127,8 @@ static struct svc_rdma_send_ctxt * svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) { struct svc_rdma_send_ctxt *ctxt; + dma_addr_t addr; + void *buffer; size_t size; int i; @@ -134,16 +136,33 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) size += rdma->sc_max_send_sges * sizeof(struct ib_sge); ctxt = kmalloc(size, GFP_KERNEL); if (!ctxt) - return NULL; + goto fail0; + buffer = kmalloc(rdma->sc_max_req_size, GFP_KERNEL); + if (!buffer) + goto fail1; + addr = ib_dma_map_single(rdma->sc_pd->device, buffer, + rdma->sc_max_req_size, DMA_TO_DEVICE); + if (ib_dma_mapping_error(rdma->sc_pd->device, addr)) + goto fail2; - ctxt->sc_cqe.done = svc_rdma_wc_send; ctxt->sc_send_wr.next = NULL; ctxt->sc_send_wr.wr_cqe = &ctxt->sc_cqe; ctxt->sc_send_wr.sg_list = ctxt->sc_sges; ctxt->sc_send_wr.send_flags = IB_SEND_SIGNALED; + ctxt->sc_cqe.done = svc_rdma_wc_send; + ctxt->sc_xprt_buf = 
buffer; + ctxt->sc_sges[0].addr = addr; + for (i = 0; i < rdma->sc_max_send_sges; i++) ctxt->sc_sges[i].lkey = rdma->sc_pd->local_dma_lkey; return ctxt; + +fail2: + kfree(buffer); +fail1: + kfree(ctxt); +fail0: + return NULL; } /** @@ -157,6 +176,11 @@ void svc_rdma_send_ctxts_destroy(struct svcxprt_rdma *rdma) while ((ctxt = svc_rdma_next_send_ctxt(&rdma->sc_send_ctxts))) { list_del(&ctxt->sc_list); + ib_dma_unmap_single(rdma->sc_pd->device, + ctxt->sc_sges[0].addr, + rdma->sc_max_req_size, + DMA_TO_DEVICE); + kfree(ctxt->sc_xprt_buf); kfree(ctxt); } } @@ -181,6 +205,7 @@ struct svc_rdma_send_ctxt *svc_rdma_send_ctxt_get(struct svcxprt_rdma *rdma) out: ctxt->sc_send_wr.num_sge = 0; + ctxt->sc_cur_sge_no = 0; ctxt->sc_page_count = 0; return ctxt; @@ -205,7 +230,10 @@ void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma, struct ib_device *device = rdma->sc_cm_id->device; unsigned int i; - for (i = 0; i < ctxt->sc_send_wr.num_sge; i++) + /* The first SGE contains the transport header, which + * remains mapped until @ctxt is destroyed. + */ + for (i = 1; i < ctxt->sc_send_wr.num_sge; i++) ib_dma_unmap_page(device, ctxt->sc_sges[i].addr, ctxt->sc_sges[i].length, @@ -519,35 +547,37 @@ static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma, } /** - * svc_rdma_map_reply_hdr - DMA map the transport header buffer + * svc_rdma_sync_reply_hdr - DMA sync the transport header buffer * @rdma: controlling transport - * @ctxt: op_ctxt for the Send WR - * @rdma_resp: buffer containing transport header + * @ctxt: send_ctxt for the Send WR * @len: length of transport header * - * Returns: - * %0 if the header is DMA mapped, - * %-EIO if DMA mapping failed. */ -int svc_rdma_map_reply_hdr(struct svcxprt_rdma *rdma, - struct svc_rdma_send_ctxt *ctxt, - __be32 *rdma_resp, - unsigned int len) +void svc_rdma_sync_reply_hdr(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt, + unsigned int len) { - ctxt->sc_pages[0] = virt_to_page(rdma_resp); - ctxt->sc_page_count++; - ctxt->sc_cur_sge_no = 0; - return svc_rdma_dma_map_page(rdma, ctxt, ctxt->sc_pages[0], 0, len); + ctxt->sc_sges[0].length = len; + ctxt->sc_send_wr.num_sge++; + ib_dma_sync_single_for_device(rdma->sc_pd->device, + ctxt->sc_sges[0].addr, len, + DMA_TO_DEVICE); } -/* Load the xdr_buf into the ctxt's sge array, and DMA map each +/* svc_rdma_map_reply_msg - Map the buffer holding RPC message + * @rdma: controlling transport + * @ctxt: send_ctxt for the Send WR + * @xdr: prepared xdr_buf containing RPC message + * @wr_lst: pointer to Call header's Write list, or NULL + * + * Load the xdr_buf into the ctxt's sge array, and DMA map each * element as it is added. * * Returns zero on success, or a negative errno on failure. */ -static int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, - struct svc_rdma_send_ctxt *ctxt, - struct xdr_buf *xdr, __be32 *wr_lst) +int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt, + struct xdr_buf *xdr, __be32 *wr_lst) { unsigned int len, remaining; unsigned long page_off; @@ -624,7 +654,7 @@ static void svc_rdma_save_io_pages(struct svc_rqst *rqstp, ctxt->sc_page_count += pages; for (i = 0; i < pages; i++) { - ctxt->sc_pages[i + 1] = rqstp->rq_respages[i]; + ctxt->sc_pages[i] = rqstp->rq_respages[i]; rqstp->rq_respages[i] = NULL; } rqstp->rq_next_page = rqstp->rq_respages + 1; @@ -649,27 +679,18 @@ static void svc_rdma_save_io_pages(struct svc_rqst *rqstp, * - The Reply's transport header will never be larger than a page. 
*/ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, - __be32 *rdma_argp, __be32 *rdma_resp, + struct svc_rdma_send_ctxt *ctxt, + __be32 *rdma_argp, struct svc_rqst *rqstp, __be32 *wr_lst, __be32 *rp_ch) { - struct svc_rdma_send_ctxt *ctxt; int ret; - ctxt = svc_rdma_send_ctxt_get(rdma); - if (!ctxt) - return -ENOMEM; - - ret = svc_rdma_map_reply_hdr(rdma, ctxt, rdma_resp, - svc_rdma_reply_hdr_len(rdma_resp)); - if (ret < 0) - goto err; - if (!rp_ch) { ret = svc_rdma_map_reply_msg(rdma, ctxt, &rqstp->rq_res, wr_lst); if (ret < 0) - goto err; + return ret; } svc_rdma_save_io_pages(rqstp, ctxt); @@ -683,15 +704,7 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, } dprintk("svcrdma: posting Send WR with %u sge(s)\n", ctxt->sc_send_wr.num_sge); - ret = svc_rdma_send(rdma, &ctxt->sc_send_wr); - if (ret) - goto err; - - return 0; - -err: - svc_rdma_send_ctxt_put(rdma, ctxt); - return ret; + return svc_rdma_send(rdma, &ctxt->sc_send_wr); } /* Given the client-provided Write and Reply chunks, the server was not @@ -702,40 +715,29 @@ err: * Remote Invalidation is skipped for simplicity. */ static int svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, - __be32 *rdma_resp, struct svc_rqst *rqstp) + struct svc_rdma_send_ctxt *ctxt, + struct svc_rqst *rqstp) { - struct svc_rdma_send_ctxt *ctxt; __be32 *p; int ret; - ctxt = svc_rdma_send_ctxt_get(rdma); - if (!ctxt) - return -ENOMEM; - - /* Replace the original transport header with an - * RDMA_ERROR response. XID etc are preserved. - */ - trace_svcrdma_err_chunk(*rdma_resp); - p = rdma_resp + 3; + p = ctxt->sc_xprt_buf; + trace_svcrdma_err_chunk(*p); + p += 3; *p++ = rdma_error; *p = err_chunk; - - ret = svc_rdma_map_reply_hdr(rdma, ctxt, rdma_resp, 20); - if (ret < 0) - goto err; + svc_rdma_sync_reply_hdr(rdma, ctxt, RPCRDMA_HDRLEN_ERR); svc_rdma_save_io_pages(rqstp, ctxt); ctxt->sc_send_wr.opcode = IB_WR_SEND; ret = svc_rdma_send(rdma, &ctxt->sc_send_wr); - if (ret) - goto err; + if (ret) { + svc_rdma_send_ctxt_put(rdma, ctxt); + return ret; + } return 0; - -err: - svc_rdma_send_ctxt_put(rdma, ctxt); - return ret; } void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp) @@ -762,7 +764,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt; __be32 *p, *rdma_argp, *rdma_resp, *wr_lst, *rp_ch; struct xdr_buf *xdr = &rqstp->rq_res; - struct page *res_page; + struct svc_rdma_send_ctxt *sctxt; int ret; rdma_argp = rctxt->rc_recv_buf; @@ -775,10 +777,10 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) * critical section. 
*/ ret = -ENOMEM; - res_page = alloc_page(GFP_KERNEL); - if (!res_page) + sctxt = svc_rdma_send_ctxt_get(rdma); + if (!sctxt) goto err0; - rdma_resp = page_address(res_page); + rdma_resp = sctxt->sc_xprt_buf; p = rdma_resp; *p++ = *rdma_argp; @@ -805,10 +807,11 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) svc_rdma_xdr_encode_reply_chunk(rdma_resp, rp_ch, ret); } - ret = svc_rdma_send_reply_msg(rdma, rdma_argp, rdma_resp, rqstp, + svc_rdma_sync_reply_hdr(rdma, sctxt, svc_rdma_reply_hdr_len(rdma_resp)); + ret = svc_rdma_send_reply_msg(rdma, sctxt, rdma_argp, rqstp, wr_lst, rp_ch); if (ret < 0) - goto err0; + goto err1; ret = 0; out: @@ -820,14 +823,14 @@ out: if (ret != -E2BIG && ret != -EINVAL) goto err1; - ret = svc_rdma_send_error_msg(rdma, rdma_resp, rqstp); + ret = svc_rdma_send_error_msg(rdma, sctxt, rqstp); if (ret < 0) - goto err0; + goto err1; ret = 0; goto out; err1: - put_page(res_page); + svc_rdma_send_ctxt_put(rdma, sctxt); err0: trace_svcrdma_send_failed(rqstp, ret); set_bit(XPT_CLOSE, &xprt->xpt_flags); -- cgit v1.2.3 From 51cc257a1186f69dbb6faec1bd48cc7e1fc31079 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 7 May 2018 15:28:31 -0400 Subject: svcrdma: Remove unused svc_rdma_op_ctxt Clean up: Eliminate a structure that is no longer used. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 21 --------------------- 1 file changed, 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 96b14a72d359..fd78f78df5c6 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -71,26 +71,6 @@ extern atomic_t rdma_stat_rq_prod; extern atomic_t rdma_stat_sq_poll; extern atomic_t rdma_stat_sq_prod; -/* - * Contexts are built when an RDMA request is created and are a - * record of the resources that can be recovered when the request - * completes. - */ -struct svc_rdma_op_ctxt { - struct list_head list; - struct xdr_buf arg; - struct ib_cqe cqe; - u32 byte_len; - struct svcxprt_rdma *xprt; - enum dma_data_direction direction; - int count; - unsigned int mapped_sges; - int hdr_count; - struct ib_send_wr send_wr; - struct ib_sge sge[1 + RPCRDMA_MAX_INLINE_THRESH / PAGE_SIZE]; - struct page *pages[RPCSVC_MAXPAGES]; -}; - struct svcxprt_rdma { struct svc_xprt sc_xprt; /* SVC transport structure */ struct rdma_cm_id *sc_cm_id; /* RDMA connection id */ @@ -111,7 +91,6 @@ struct svcxprt_rdma { spinlock_t sc_send_lock; struct list_head sc_send_ctxts; - int sc_ctxt_used; spinlock_t sc_rw_ctxt_lock; struct list_head sc_rw_ctxts; -- cgit v1.2.3
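
Editorial note: taken together, the hunks above establish one pattern for posting a Send that carries only a transport header: acquire a pre-allocated, persistently DMA-mapped svc_rdma_send_ctxt, build the header directly in ctxt->sc_xprt_buf, sync it with svc_rdma_sync_reply_hdr(), post the Send WR, and return the ctxt to the free list if posting fails. The sketch below is illustrative only and is not part of the patch series; it assumes the declarations from include/linux/sunrpc/svc_rdma.h as modified above, and the wrapper name svc_rdma_send_hdr_only() is invented for illustration.

	/* Illustrative sketch -- not part of the patches above.
	 * Demonstrates the get / build-in-sc_xprt_buf / sync / send /
	 * put-on-error sequence introduced by this series.  The wrapper
	 * name is hypothetical; the calls it makes are the ones added
	 * or reworked in the hunks above.
	 */
	static int svc_rdma_send_hdr_only(struct svcxprt_rdma *rdma,
					  __be32 *hdr, unsigned int hdrlen)
	{
		struct svc_rdma_send_ctxt *ctxt;
		int ret;

		ctxt = svc_rdma_send_ctxt_get(rdma);
		if (!ctxt)
			return -ENOTCONN;

		/* The header buffer was kmalloc'd and DMA-mapped once, in
		 * svc_rdma_send_ctxt_alloc(); just fill it in here.
		 */
		memcpy(ctxt->sc_xprt_buf, hdr, hdrlen);
		svc_rdma_sync_reply_hdr(rdma, ctxt, hdrlen);

		ctxt->sc_send_wr.opcode = IB_WR_SEND;
		ret = svc_rdma_send(rdma, &ctxt->sc_send_wr);
		if (ret)
			svc_rdma_send_ctxt_put(rdma, ctxt);
		return ret;
	}

For comparison, svc_rdma_send_error() and rpcrdma_bc_send_request() in the hunks above follow exactly this get / sync / send / put-on-error sequence, while svc_rdma_sendto() additionally maps the RPC message payload with svc_rdma_map_reply_msg() before posting.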