diff options
| author | Chuck Lever <chuck.lever@oracle.com> | 2026-05-06 18:26:51 +0300 |
|---|---|---|
| committer | Chuck Lever <cel@kernel.org> | 2026-06-09 23:32:59 +0300 |
| commit | 58202c29de9360a2b255458892e08a252d4406be (patch) | |
| tree | 8084840073202911766dc7c71c7a6b392d5cfeb3 /include/linux | |
| parent | 9545262f7e58d67de413d5a47ea2a3f2e59ba9f6 (diff) | |
| download | linux-58202c29de9360a2b255458892e08a252d4406be.tar.xz | |
svcrdma: Defer send context release to xpo_release_ctxt
Send completion currently queues a work item to an unbound
workqueue for each completed send context. Under load, the
Send completion handlers contend for the shared workqueue
pool lock.
Replace the workqueue with a per-transport lock-free list
(llist). The Send completion handler appends the send_ctxt
to sc_send_release_list and does no further teardown. The
nfsd thread drains the list in xpo_release_ctxt between
RPCs, performing DMA unmapping, chunk I/O resource release,
and page release in a batch.
This eliminates both the workqueue pool lock and the DMA
unmap cost from the Send completion path. DMA unmapping can
be expensive when an IOMMU is present in strict mode, as
each unmap triggers a synchronous hardware IOTLB
invalidation. Moving it to the nfsd thread, where that
latency is harmless, avoids penalizing completion handler
throughput.
The nfsd threads absorb the release cost at a point where
the client is no longer waiting on a reply, and natural
batching amortizes the overhead when completions arrive
faster than RPCs complete.
A self-enqueue backstops the drain on a quiescing transport.
When svc_rdma_send_ctxt_put() observes that its llist_add()
transitions sc_send_release_list from empty to non-empty,
it sets XPT_DATA and calls svc_xprt_enqueue() so that
svc_xprt_ready() schedules an nfsd thread. The thread
enters svc_rdma_recvfrom(), finds no pending receive,
clears XPT_DATA, and returns 0; svc_xprt_release() then
runs xpo_release_ctxt and drains the list. Under steady
load the foreground drain keeps the list non-empty between
adds and no enqueue fires; only the trailing edge of a
burst pays for a wakeup. Without this path, a Send
completion arriving after the last xpo_release_ctxt on an
idle connection would leave the send_ctxt's DMA mappings
and reply pages pinned until the next RPC, send-context
exhaustion, or transport close.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Diffstat (limited to 'include/linux')
| -rw-r--r-- | include/linux/sunrpc/svc_rdma.h | 5 |
1 file changed, 3 insertions, 2 deletions
diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 14eb9d52742e..4ba39f07371d 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -66,7 +66,6 @@ extern unsigned int svcrdma_ord;
 extern unsigned int svcrdma_max_requests;
 extern unsigned int svcrdma_max_bc_requests;
 extern unsigned int svcrdma_max_req_size;
-extern struct workqueue_struct *svcrdma_wq;
 
 extern struct percpu_counter svcrdma_stat_read;
 extern struct percpu_counter svcrdma_stat_recv;
@@ -117,6 +116,8 @@ struct svcxprt_rdma {
 
 	struct llist_head sc_recv_ctxts;
 
+	struct llist_head sc_send_release_list;
+
 	atomic_t sc_completion_ids;
 };
 /* sc_flags */
@@ -235,7 +236,6 @@ struct svc_rdma_write_info {
 struct svc_rdma_send_ctxt {
 	struct llist_node sc_node;
 	struct rpc_rdma_cid sc_cid;
-	struct work_struct sc_work;
 
 	struct svcxprt_rdma *sc_rdma;
 	struct ib_send_wr sc_send_wr;
@@ -299,6 +299,7 @@ extern int svc_rdma_process_read_list(struct svcxprt_rdma *rdma,
 
 /* svc_rdma_sendto.c */
 extern void svc_rdma_send_ctxts_destroy(struct svcxprt_rdma *rdma);
+extern void svc_rdma_send_ctxts_drain(struct svcxprt_rdma *rdma);
 extern struct svc_rdma_send_ctxt *
 svc_rdma_send_ctxt_get(struct svcxprt_rdma *rdma);
 extern void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma,
