summaryrefslogtreecommitdiff
path: root/include/linux
diff options
context:
space:
mode:
authorChuck Lever <chuck.lever@oracle.com>2026-05-06 18:26:51 +0300
committerChuck Lever <cel@kernel.org>2026-06-09 23:32:59 +0300
commit58202c29de9360a2b255458892e08a252d4406be (patch)
tree8084840073202911766dc7c71c7a6b392d5cfeb3 /include/linux
parent9545262f7e58d67de413d5a47ea2a3f2e59ba9f6 (diff)
downloadlinux-58202c29de9360a2b255458892e08a252d4406be.tar.xz
svcrdma: Defer send context release to xpo_release_ctxt
Send completion currently queues a work item to an unbound workqueue for each completed send context. Under load, the Send Completion handlers contend for the shared workqueue pool lock. Replace the workqueue with a per-transport lock-free list (llist). The Send completion handler appends the send_ctxt to sc_send_release_list and does no further teardown. The nfsd thread drains the list in xpo_release_ctxt between RPCs, performing DMA unmapping, chunk I/O resource release, and page release in a batch. This eliminates both the workqueue pool lock and the DMA unmap cost from the Send completion path. DMA unmapping can be expensive when an IOMMU is present in strict mode, as each unmap triggers a synchronous hardware IOTLB invalidation. Moving it to the nfsd thread, where that latency is harmless, avoids penalizing completion handler throughput. The nfsd threads absorb the release cost at a point where the client is no longer waiting on a reply, and natural batching amortizes the overhead when completions arrive faster than RPCs complete. A self-enqueue backstops drain on a quiescing transport. When svc_rdma_send_ctxt_put() observes that its llist_add() transitions sc_send_release_list from empty to non-empty, it sets XPT_DATA and calls svc_xprt_enqueue() so that svc_xprt_ready() schedules an nfsd thread. The thread enters svc_rdma_recvfrom(), finds no pending receive, clears XPT_DATA, and returns 0; svc_xprt_release() then runs xpo_release_ctxt and drains the list. Under steady load the foreground drain keeps the list non-empty between adds and no enqueue fires; only the trailing edge of a burst pays for a wakeup. Without this path, a Send completion arriving after the last xpo_release_ctxt on an idle connection would leave the send_ctxt's DMA mappings and reply pages pinned until the next RPC, send-context exhaustion, or transport close. Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Diffstat (limited to 'include/linux')
-rw-r--r--include/linux/sunrpc/svc_rdma.h5
1 files changed, 3 insertions, 2 deletions
diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 14eb9d52742e..4ba39f07371d 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -66,7 +66,6 @@ extern unsigned int svcrdma_ord;
extern unsigned int svcrdma_max_requests;
extern unsigned int svcrdma_max_bc_requests;
extern unsigned int svcrdma_max_req_size;
-extern struct workqueue_struct *svcrdma_wq;
extern struct percpu_counter svcrdma_stat_read;
extern struct percpu_counter svcrdma_stat_recv;
@@ -117,6 +116,8 @@ struct svcxprt_rdma {
struct llist_head sc_recv_ctxts;
+ struct llist_head sc_send_release_list;
+
atomic_t sc_completion_ids;
};
/* sc_flags */
@@ -235,7 +236,6 @@ struct svc_rdma_write_info {
struct svc_rdma_send_ctxt {
struct llist_node sc_node;
struct rpc_rdma_cid sc_cid;
- struct work_struct sc_work;
struct svcxprt_rdma *sc_rdma;
struct ib_send_wr sc_send_wr;
@@ -299,6 +299,7 @@ extern int svc_rdma_process_read_list(struct svcxprt_rdma *rdma,
/* svc_rdma_sendto.c */
extern void svc_rdma_send_ctxts_destroy(struct svcxprt_rdma *rdma);
+extern void svc_rdma_send_ctxts_drain(struct svcxprt_rdma *rdma);
extern struct svc_rdma_send_ctxt *
svc_rdma_send_ctxt_get(struct svcxprt_rdma *rdma);
extern void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma,