From 9c332d7f63401c3ff1765c9998531b3784f3f9a4 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 24 Mar 2026 13:32:12 -0400 Subject: nfs: update inode ctime after removexattr operation xfstest generic/728 fails with delegated timestamps. The client does a removexattr and then a stat to test the ctime, which doesn't change. The stat() doesn't trigger a GETATTR because of the delegated timestamps, so it relies on the cached ctime, which is wrong. The setxattr compound has a trailing GETATTR, which ensures that its ctime gets updated. Follow the same strategy with removexattr. Fixes: 3e1f02123fba ("NFSv4.2: add client side XDR handling for extended attributes") Reported-by: Olga Kornievskaia Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- include/linux/nfs_xdr.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index ff1f12aa73d2..fcbd21b5685f 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1611,12 +1611,15 @@ struct nfs42_listxattrsres { struct nfs42_removexattrargs { struct nfs4_sequence_args seq_args; struct nfs_fh *fh; + const u32 *bitmask; const char *xattr_name; }; struct nfs42_removexattrres { struct nfs4_sequence_res seq_res; struct nfs4_change_info cinfo; + struct nfs_fattr *fattr; + const struct nfs_server *server; }; #endif /* CONFIG_NFS_V4_2 */ -- cgit v1.2.3 From 765bde47fe7f197dabeb12da76831f40d0b20377 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 6 Mar 2026 16:56:24 -0500 Subject: xprtrdma: Close lost-wakeup race in xprt_rdma_alloc_slot xprt_rdma_alloc_slot() and xprt_rdma_free_slot() lack serialization between the buffer pool and the backlog queue. A buffer freed after rpcrdma_buffer_get() has found the pool empty, but before rpc_sleep_on() has placed the task on the backlog, is returned to the pool with no waiter to wake, leaving the task stuck on the backlog indefinitely. After joining the backlog, re-check the pool and route any recovered buffer through xprt_wake_up_backlog(), whose queue lock serializes with concurrent wakeups and avoids double-assignment of slots. Because xprt_rdma_free_slot() does not hold reserve_lock, the XPRT_CONGESTED double-check in xprt_throttle_congested() is ineffective: a task can join the backlog through that path after free_slot has already found it empty and cleared the bit. Avoid this by using xprt_add_backlog_noncongested(), which queues the task without setting XPRT_CONGESTED, so every allocation reaches xprt_rdma_alloc_slot() and its post-sleep re-check. 
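The window is easiest to see as an interleaving. A minimal sketch, in comment form, using the function names from the patch below (the free_slot side is abbreviated; it ultimately returns the buffer to the pool):

    /* Task A: xprt_rdma_alloc_slot()      Task B: xprt_rdma_free_slot()
     *
     * req = rpcrdma_buffer_get(buf);
     *   -> NULL, the pool is empty
     *                                     buffer returned to the pool;
     *                                     the backlog is still empty,
     *                                     so there is no waiter to wake
     * rpc_sleep_on(&xprt->backlog, ...);
     *   -> the task sleeps indefinitely,
     *      even though a free slot now
     *      sits in the pool
     */
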
Fixes: edb41e61a54e ("xprtrdma: Make rpc_rqst part of rpcrdma_req") Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 2 ++ net/sunrpc/xprt.c | 16 ++++++++++++++++ net/sunrpc/xprtrdma/transport.c | 15 ++++++++++++++- 3 files changed, 32 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index f46d1fb8f71a..a82045804d34 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -404,6 +404,8 @@ struct rpc_xprt * xprt_alloc(struct net *net, size_t size, unsigned int max_req); void xprt_free(struct rpc_xprt *); void xprt_add_backlog(struct rpc_xprt *xprt, struct rpc_task *task); +void xprt_add_backlog_noncongested(struct rpc_xprt *xprt, + struct rpc_task *task); bool xprt_wake_up_backlog(struct rpc_xprt *xprt, struct rpc_rqst *req); void xprt_cleanup_ids(void); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 4fbb57a29704..48a3618cbb29 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1663,6 +1663,22 @@ void xprt_add_backlog(struct rpc_xprt *xprt, struct rpc_task *task) } EXPORT_SYMBOL_GPL(xprt_add_backlog); +/** + * xprt_add_backlog_noncongested - queue task on backlog + * @xprt: transport whose backlog queue receives the task + * @task: task to queue + * + * Like xprt_add_backlog, but does not set XPRT_CONGESTED. + * For transports whose free_slot path does not synchronize + * with xprt_throttle_congested via reserve_lock. + */ +void xprt_add_backlog_noncongested(struct rpc_xprt *xprt, + struct rpc_task *task) +{ + rpc_sleep_on(&xprt->backlog, task, xprt_complete_request_init); +} +EXPORT_SYMBOL_GPL(xprt_add_backlog_noncongested); + static bool __xprt_set_rq(struct rpc_task *task, void *data) { struct rpc_rqst *req = data; diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index ca079439f9cc..61706df5e485 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -511,7 +511,20 @@ xprt_rdma_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task) out_sleep: task->tk_status = -EAGAIN; - xprt_add_backlog(xprt, task); + xprt_add_backlog_noncongested(xprt, task); + /* A buffer freed between buffer_get and rpc_sleep_on + * goes back to the pool with no waiter to wake. + * Re-check after joining the backlog to close that gap. + */ + req = rpcrdma_buffer_get(&r_xprt->rx_buf); + if (req) { + struct rpc_rqst *rqst = &req->rl_slot; + + if (!xprt_wake_up_backlog(xprt, rqst)) { + memset(rqst, 0, sizeof(*rqst)); + rpcrdma_buffer_put(&r_xprt->rx_buf, req); + } + } } /** -- cgit v1.2.3 From 7a079ab57c4eeff241d9abfc1ec6477cb90a6206 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 6 Mar 2026 16:56:26 -0500 Subject: xprtrdma: Replace rpcrdma_mr_seg with xdr_buf cursor The FRWR registration path converts data through three representations: xdr_buf -> rpcrdma_mr_seg[] -> scatterlist[] -> ib_map_mr_sg(). The rpcrdma_mr_seg intermediate is a relic of when multiple registration strategies existed (FMR, physical, FRWR). Only FRWR remains, so this indirection and the 6240-byte rl_segments[260] array embedded in each rpcrdma_req serve no purpose. Introduce struct rpcrdma_xdr_cursor to track position within an xdr_buf during iterative MR registration. Rewrite frwr_map to populate scatterlist entries directly from the xdr_buf regions (head kvec, page list, tail kvec). 
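In caller terms, the rewrite is an init-then-loop pattern: the cursor replaces the seg/nsegs bookkeeping, and each frwr_map() call consumes as much of the remaining xdr_buf as one MR can cover. A condensed sketch of the read-list encoder from the diff below (variables as in rpcrdma_encode_read_list; tracepoints and stats elided):

    struct rpcrdma_xdr_cursor cur;
    struct rpcrdma_mr *mr;
    int ret;

    rpcrdma_xdr_cursor_init(&cur, &rqst->rq_snd_buf, pos, rtype);
    do {
            /* Registers one MR and advances the cursor past
             * the bytes that MR covers. */
            ret = rpcrdma_mr_prepare(r_xprt, req, &cur, false, &mr);
            if (ret)
                    return ret;
            if (encode_read_segment(xdr, mr, pos) < 0)
                    return -EMSGSIZE;
    } while (!rpcrdma_xdr_cursor_done(&cur));
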
The boundary logic for non-SG_GAPS devices is simpler because the xdr_buf structure guarantees that page-region entries after the first start at offset 0, and that head/tail kvecs are separate regions that naturally break at MR boundaries. Fix a pre-existing bug in rpcrdma_encode_write_list where the write-pad statistics accumulator added mr->mr_length from the last data MR rather than the write-pad MR. The refactored code uses ep->re_write_pad_mr->mr_length. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/trace/events/rpcrdma.h | 28 +++---- net/sunrpc/xprtrdma/frwr_ops.c | 119 +++++++++++++++++++++++------ net/sunrpc/xprtrdma/rpc_rdma.c | 163 ++++++++++++++-------------------------- net/sunrpc/xprtrdma/xprt_rdma.h | 42 +++++++---- 4 files changed, 194 insertions(+), 158 deletions(-) (limited to 'include') diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index e6a72646c507..b79913048e1a 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -392,10 +392,10 @@ DECLARE_EVENT_CLASS(xprtrdma_rdch_event, const struct rpc_task *task, unsigned int pos, struct rpcrdma_mr *mr, - int nsegs + bool is_last ), - TP_ARGS(task, pos, mr, nsegs), + TP_ARGS(task, pos, mr, is_last), TP_STRUCT__entry( __field(unsigned int, task_id) @@ -405,7 +405,7 @@ DECLARE_EVENT_CLASS(xprtrdma_rdch_event, __field(u32, handle) __field(u32, length) __field(u64, offset) - __field(int, nsegs) + __field(bool, is_last) ), TP_fast_assign( @@ -416,7 +416,7 @@ DECLARE_EVENT_CLASS(xprtrdma_rdch_event, __entry->handle = mr->mr_handle; __entry->length = mr->mr_length; __entry->offset = mr->mr_offset; - __entry->nsegs = nsegs; + __entry->is_last = is_last; ), TP_printk(SUNRPC_TRACE_TASK_SPECIFIER @@ -424,7 +424,7 @@ DECLARE_EVENT_CLASS(xprtrdma_rdch_event, __entry->task_id, __entry->client_id, __entry->pos, __entry->length, (unsigned long long)__entry->offset, __entry->handle, - __entry->nents < __entry->nsegs ? "more" : "last" + __entry->is_last ? "last" : "more" ) ); @@ -434,18 +434,18 @@ DECLARE_EVENT_CLASS(xprtrdma_rdch_event, const struct rpc_task *task, \ unsigned int pos, \ struct rpcrdma_mr *mr, \ - int nsegs \ + bool is_last \ ), \ - TP_ARGS(task, pos, mr, nsegs)) + TP_ARGS(task, pos, mr, is_last)) DECLARE_EVENT_CLASS(xprtrdma_wrch_event, TP_PROTO( const struct rpc_task *task, struct rpcrdma_mr *mr, - int nsegs + bool is_last ), - TP_ARGS(task, mr, nsegs), + TP_ARGS(task, mr, is_last), TP_STRUCT__entry( __field(unsigned int, task_id) @@ -454,7 +454,7 @@ DECLARE_EVENT_CLASS(xprtrdma_wrch_event, __field(u32, handle) __field(u32, length) __field(u64, offset) - __field(int, nsegs) + __field(bool, is_last) ), TP_fast_assign( @@ -464,7 +464,7 @@ DECLARE_EVENT_CLASS(xprtrdma_wrch_event, __entry->handle = mr->mr_handle; __entry->length = mr->mr_length; __entry->offset = mr->mr_offset; - __entry->nsegs = nsegs; + __entry->is_last = is_last; ), TP_printk(SUNRPC_TRACE_TASK_SPECIFIER @@ -472,7 +472,7 @@ DECLARE_EVENT_CLASS(xprtrdma_wrch_event, __entry->task_id, __entry->client_id, __entry->length, (unsigned long long)__entry->offset, __entry->handle, - __entry->nents < __entry->nsegs ? "more" : "last" + __entry->is_last ? 
"last" : "more" ) ); @@ -481,9 +481,9 @@ DECLARE_EVENT_CLASS(xprtrdma_wrch_event, TP_PROTO( \ const struct rpc_task *task, \ struct rpcrdma_mr *mr, \ - int nsegs \ + bool is_last \ ), \ - TP_ARGS(task, mr, nsegs)) + TP_ARGS(task, mr, is_last)) TRACE_DEFINE_ENUM(DMA_BIDIRECTIONAL); TRACE_DEFINE_ENUM(DMA_TO_DEVICE); diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 4331b0b65f4c..229057d35fb8 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -268,10 +268,9 @@ int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device) } /** - * frwr_map - Register a memory region + * frwr_map - Register a memory region from an xdr_buf cursor * @r_xprt: controlling transport - * @seg: memory region co-ordinates - * @nsegs: number of segments remaining + * @cur: cursor tracking position within the xdr_buf * @writing: true when RDMA Write will be used * @xid: XID of RPC using the registered memory * @mr: MR to fill in @@ -279,34 +278,104 @@ int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device) * Prepare a REG_MR Work Request to register a memory region * for remote access via RDMA READ or RDMA WRITE. * - * Returns the next segment or a negative errno pointer. - * On success, @mr is filled in. + * Returns 0 on success (cursor advanced past consumed data, + * @mr populated) or a negative errno on failure. */ -struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, - struct rpcrdma_mr_seg *seg, - int nsegs, bool writing, __be32 xid, - struct rpcrdma_mr *mr) +int frwr_map(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_xdr_cursor *cur, + bool writing, __be32 xid, + struct rpcrdma_mr *mr) { struct rpcrdma_ep *ep = r_xprt->rx_ep; + const struct xdr_buf *xdrbuf = cur->xc_buf; + bool sg_gaps = ep->re_mrtype == IB_MR_TYPE_SG_GAPS; + unsigned int max_depth = ep->re_max_fr_depth; struct ib_reg_wr *reg_wr; int i, n, dma_nents; struct ib_mr *ibmr; u8 key; - if (nsegs > ep->re_max_fr_depth) - nsegs = ep->re_max_fr_depth; - for (i = 0; i < nsegs;) { - sg_set_page(&mr->mr_sg[i], seg->mr_page, - seg->mr_len, seg->mr_offset); - - ++seg; - ++i; - if (ep->re_mrtype == IB_MR_TYPE_SG_GAPS) - continue; - if ((i < nsegs && seg->mr_offset) || - offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) - break; + i = 0; + + /* Head kvec */ + if (!(cur->xc_flags & XC_HEAD_DONE)) { + const struct kvec *head = &xdrbuf->head[0]; + + sg_set_page(&mr->mr_sg[i], + virt_to_page(head->iov_base), + head->iov_len, + offset_in_page(head->iov_base)); + cur->xc_flags |= XC_HEAD_DONE; + i++; + /* Without sg-gap support, each non-contiguous region + * must be registered as a separate MR. Returning + * here after the head kvec causes the caller to + * invoke frwr_map() again for the page list and + * tail. 
+ */ + if (!sg_gaps) + goto finish; } + + /* Page list */ + if (!(cur->xc_flags & XC_PAGES_DONE) && xdrbuf->page_len) { + unsigned int page_base, remaining; + struct page **ppages; + + remaining = xdrbuf->page_len - cur->xc_page_offset; + page_base = offset_in_page(xdrbuf->page_base + + cur->xc_page_offset); + ppages = xdrbuf->pages + + ((xdrbuf->page_base + cur->xc_page_offset) + >> PAGE_SHIFT); + + while (remaining > 0 && i < max_depth) { + unsigned int len; + + len = min_t(unsigned int, + PAGE_SIZE - page_base, remaining); + sg_set_page(&mr->mr_sg[i], *ppages, + len, page_base); + cur->xc_page_offset += len; + i++; + ppages++; + remaining -= len; + + if (!sg_gaps && remaining > 0 && + offset_in_page(page_base + len)) + goto finish; + page_base = 0; + } + if (remaining == 0) + cur->xc_flags |= XC_PAGES_DONE; + } else if (!(cur->xc_flags & XC_PAGES_DONE)) { + cur->xc_flags |= XC_PAGES_DONE; + } + + /* Tail kvec */ + if (!(cur->xc_flags & XC_TAIL_DONE) && xdrbuf->tail[0].iov_len && + i < max_depth) { + const struct kvec *tail = &xdrbuf->tail[0]; + + if (!sg_gaps && i > 0) { + struct scatterlist *prev = &mr->mr_sg[i - 1]; + + if (offset_in_page(prev->offset + prev->length) || + offset_in_page(tail->iov_base)) + goto finish; + } + sg_set_page(&mr->mr_sg[i], + virt_to_page(tail->iov_base), + tail->iov_len, + offset_in_page(tail->iov_base)); + cur->xc_flags |= XC_TAIL_DONE; + i++; + } else if (!(cur->xc_flags & XC_TAIL_DONE) && + !xdrbuf->tail[0].iov_len) { + cur->xc_flags |= XC_TAIL_DONE; + } + +finish: mr->mr_dir = rpcrdma_data_dir(writing); mr->mr_nents = i; @@ -338,15 +407,15 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, mr->mr_offset = ibmr->iova; trace_xprtrdma_mr_map(mr); - return seg; + return 0; out_dmamap_err: trace_xprtrdma_frwr_sgerr(mr, i); - return ERR_PTR(-EIO); + return -EIO; out_mapmr_err: trace_xprtrdma_frwr_maperr(mr, n); - return ERR_PTR(-EIO); + return -EIO; } /** diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 3aac1456e23e..a77e7e48aab2 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -200,67 +200,30 @@ rpcrdma_alloc_sparse_pages(struct xdr_buf *buf) return 0; } -/* Convert @vec to a single SGL element. - * - * Returns pointer to next available SGE, and bumps the total number - * of SGEs consumed. - */ -static struct rpcrdma_mr_seg * -rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, - unsigned int *n) +static void +rpcrdma_xdr_cursor_init(struct rpcrdma_xdr_cursor *cur, + const struct xdr_buf *xdrbuf, + unsigned int pos, enum rpcrdma_chunktype type) { - seg->mr_page = virt_to_page(vec->iov_base); - seg->mr_offset = offset_in_page(vec->iov_base); - seg->mr_len = vec->iov_len; - ++seg; - ++(*n); - return seg; + cur->xc_buf = xdrbuf; + cur->xc_page_offset = 0; + cur->xc_flags = 0; + + if (pos != 0) + cur->xc_flags |= XC_HEAD_DONE; + if (!xdrbuf->page_len) + cur->xc_flags |= XC_PAGES_DONE; + if (type == rpcrdma_readch || type == rpcrdma_writech || + !xdrbuf->tail[0].iov_len) + cur->xc_flags |= XC_TAIL_DONE; } -/* Convert @xdrbuf into SGEs no larger than a page each. As they - * are registered, these SGEs are then coalesced into RDMA segments - * when the selected memreg mode supports it. - * - * Returns positive number of SGEs consumed, or a negative errno. 
- */ - -static int -rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, - unsigned int pos, enum rpcrdma_chunktype type, - struct rpcrdma_mr_seg *seg) +static bool +rpcrdma_xdr_cursor_done(const struct rpcrdma_xdr_cursor *cur) { - unsigned long page_base; - unsigned int len, n; - struct page **ppages; - - n = 0; - if (pos == 0) - seg = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, &n); - - len = xdrbuf->page_len; - ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT); - page_base = offset_in_page(xdrbuf->page_base); - while (len) { - seg->mr_page = *ppages; - seg->mr_offset = page_base; - seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len); - len -= seg->mr_len; - ++ppages; - ++seg; - ++n; - page_base = 0; - } - - if (type == rpcrdma_readch || type == rpcrdma_writech) - goto out; - - if (xdrbuf->tail[0].iov_len) - rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, &n); - -out: - if (unlikely(n > RPCRDMA_MAX_SEGS)) - return -EIO; - return n; + return (cur->xc_flags & (XC_HEAD_DONE | XC_PAGES_DONE | + XC_TAIL_DONE)) == + (XC_HEAD_DONE | XC_PAGES_DONE | XC_TAIL_DONE); } static int @@ -292,11 +255,10 @@ encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr, return 0; } -static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt, - struct rpcrdma_req *req, - struct rpcrdma_mr_seg *seg, - int nsegs, bool writing, - struct rpcrdma_mr **mr) +static int rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, + struct rpcrdma_xdr_cursor *cur, + bool writing, struct rpcrdma_mr **mr) { *mr = rpcrdma_mr_pop(&req->rl_free_mrs); if (!*mr) { @@ -307,13 +269,13 @@ static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt, } rpcrdma_mr_push(*mr, &req->rl_registered); - return frwr_map(r_xprt, seg, nsegs, writing, req->rl_slot.rq_xid, *mr); + return frwr_map(r_xprt, cur, writing, req->rl_slot.rq_xid, *mr); out_getmr_err: trace_xprtrdma_nomrs_err(r_xprt, req); xprt_wait_for_buffer_space(&r_xprt->rx_xprt); rpcrdma_mrs_refresh(r_xprt); - return ERR_PTR(-EAGAIN); + return -EAGAIN; } /* Register and XDR encode the Read list. 
Supports encoding a list of read @@ -336,10 +298,10 @@ static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, enum rpcrdma_chunktype rtype) { struct xdr_stream *xdr = &req->rl_stream; - struct rpcrdma_mr_seg *seg; + struct rpcrdma_xdr_cursor cur; struct rpcrdma_mr *mr; unsigned int pos; - int nsegs; + int ret; if (rtype == rpcrdma_noch_pullup || rtype == rpcrdma_noch_mapped) goto done; @@ -347,24 +309,20 @@ static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, pos = rqst->rq_snd_buf.head[0].iov_len; if (rtype == rpcrdma_areadch) pos = 0; - seg = req->rl_segments; - nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_snd_buf, pos, - rtype, seg); - if (nsegs < 0) - return nsegs; + rpcrdma_xdr_cursor_init(&cur, &rqst->rq_snd_buf, pos, rtype); do { - seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, false, &mr); - if (IS_ERR(seg)) - return PTR_ERR(seg); + ret = rpcrdma_mr_prepare(r_xprt, req, &cur, false, &mr); + if (ret) + return ret; if (encode_read_segment(xdr, mr, pos) < 0) return -EMSGSIZE; - trace_xprtrdma_chunk_read(rqst->rq_task, pos, mr, nsegs); + trace_xprtrdma_chunk_read(rqst->rq_task, pos, mr, + rpcrdma_xdr_cursor_done(&cur)); r_xprt->rx_stats.read_chunk_count++; - nsegs -= mr->mr_nents; - } while (nsegs); + } while (!rpcrdma_xdr_cursor_done(&cur)); done: if (xdr_stream_encode_item_absent(xdr) < 0) @@ -394,20 +352,16 @@ static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, { struct xdr_stream *xdr = &req->rl_stream; struct rpcrdma_ep *ep = r_xprt->rx_ep; - struct rpcrdma_mr_seg *seg; + struct rpcrdma_xdr_cursor cur; struct rpcrdma_mr *mr; - int nsegs, nchunks; + int nchunks, ret; __be32 *segcount; if (wtype != rpcrdma_writech) goto done; - seg = req->rl_segments; - nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, - rqst->rq_rcv_buf.head[0].iov_len, - wtype, seg); - if (nsegs < 0) - return nsegs; + rpcrdma_xdr_cursor_init(&cur, &rqst->rq_rcv_buf, + rqst->rq_rcv_buf.head[0].iov_len, wtype); if (xdr_stream_encode_item_present(xdr) < 0) return -EMSGSIZE; @@ -418,30 +372,30 @@ static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, nchunks = 0; do { - seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr); - if (IS_ERR(seg)) - return PTR_ERR(seg); + ret = rpcrdma_mr_prepare(r_xprt, req, &cur, true, &mr); + if (ret) + return ret; if (encode_rdma_segment(xdr, mr) < 0) return -EMSGSIZE; - trace_xprtrdma_chunk_write(rqst->rq_task, mr, nsegs); + trace_xprtrdma_chunk_write(rqst->rq_task, mr, + rpcrdma_xdr_cursor_done(&cur)); r_xprt->rx_stats.write_chunk_count++; r_xprt->rx_stats.total_rdma_request += mr->mr_length; nchunks++; - nsegs -= mr->mr_nents; - } while (nsegs); + } while (!rpcrdma_xdr_cursor_done(&cur)); if (xdr_pad_size(rqst->rq_rcv_buf.page_len)) { if (encode_rdma_segment(xdr, ep->re_write_pad_mr) < 0) return -EMSGSIZE; trace_xprtrdma_chunk_wp(rqst->rq_task, ep->re_write_pad_mr, - nsegs); + true); r_xprt->rx_stats.write_chunk_count++; - r_xprt->rx_stats.total_rdma_request += mr->mr_length; + r_xprt->rx_stats.total_rdma_request += + ep->re_write_pad_mr->mr_length; nchunks++; - nsegs -= mr->mr_nents; } /* Update count of segments in this Write chunk */ @@ -471,9 +425,9 @@ static int rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, enum rpcrdma_chunktype wtype) { struct xdr_stream *xdr = &req->rl_stream; - struct rpcrdma_mr_seg *seg; + struct rpcrdma_xdr_cursor cur; struct rpcrdma_mr *mr; - int nsegs, nchunks; + int nchunks, ret; __be32 *segcount; if (wtype != rpcrdma_replych) { @@ -482,10 +436,7 @@ static int 
rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, return 0; } - seg = req->rl_segments; - nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, wtype, seg); - if (nsegs < 0) - return nsegs; + rpcrdma_xdr_cursor_init(&cur, &rqst->rq_rcv_buf, 0, wtype); if (xdr_stream_encode_item_present(xdr) < 0) return -EMSGSIZE; @@ -496,19 +447,19 @@ static int rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, nchunks = 0; do { - seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr); - if (IS_ERR(seg)) - return PTR_ERR(seg); + ret = rpcrdma_mr_prepare(r_xprt, req, &cur, true, &mr); + if (ret) + return ret; if (encode_rdma_segment(xdr, mr) < 0) return -EMSGSIZE; - trace_xprtrdma_chunk_reply(rqst->rq_task, mr, nsegs); + trace_xprtrdma_chunk_reply(rqst->rq_task, mr, + rpcrdma_xdr_cursor_done(&cur)); r_xprt->rx_stats.reply_chunk_count++; r_xprt->rx_stats.total_rdma_request += mr->mr_length; nchunks++; - nsegs -= mr->mr_nents; - } while (nsegs); + } while (!rpcrdma_xdr_cursor_done(&cur)); /* Update count of segments in the Reply chunk */ *segcount = cpu_to_be32(nchunks); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 8147d2b41494..37bba72065e8 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -283,19 +283,36 @@ struct rpcrdma_mr { * registered or invalidated. Must handle a Reply chunk: */ enum { - RPCRDMA_MAX_IOV_SEGS = 3, + RPCRDMA_MAX_IOV_SEGS = 3, /* head, page-boundary, tail */ RPCRDMA_MAX_DATA_SEGS = ((1 * 1024 * 1024) / PAGE_SIZE) + 1, RPCRDMA_MAX_SEGS = RPCRDMA_MAX_DATA_SEGS + RPCRDMA_MAX_IOV_SEGS, }; -/* Arguments for DMA mapping and registration */ -struct rpcrdma_mr_seg { - u32 mr_len; /* length of segment */ - struct page *mr_page; /* underlying struct page */ - u64 mr_offset; /* IN: page offset, OUT: iova */ +/** + * struct rpcrdma_xdr_cursor - tracks position within an xdr_buf + * for iterative MR registration + * @xc_buf: the xdr_buf being iterated + * @xc_page_offset: byte offset into the page region consumed so far + * @xc_flags: combination of XC_* bits + * + * Each XC_*_DONE flag indicates that this region has no + * remaining MR registration work. That condition holds both when the region + * has already been registered by a prior frwr_map() call and + * when the region is excluded from this chunk type (pre-set + * at init time by rpcrdma_xdr_cursor_init()). frwr_map() + * treats the two cases identically: skip the region. + */ +struct rpcrdma_xdr_cursor { + const struct xdr_buf *xc_buf; + unsigned int xc_page_offset; + unsigned int xc_flags; }; +#define XC_HEAD_DONE BIT(0) +#define XC_PAGES_DONE BIT(1) +#define XC_TAIL_DONE BIT(2) + /* The Send SGE array is provisioned to send a maximum size * inline request: * - RPC-over-RDMA header @@ -330,7 +347,6 @@ struct rpcrdma_req { struct list_head rl_free_mrs; struct list_head rl_registered; - struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; }; static inline struct rpcrdma_req * @@ -450,8 +466,8 @@ rpcrdma_portstr(const struct rpcrdma_xprt *r_xprt) } /* Setting this to 0 ensures interoperability with early servers. - * Setting this to 1 enhances certain unaligned read/write performance. - * Default is 0, see sysctl entry and rpc_rdma.c rpcrdma_convert_iovs() */ + * Setting this to 1 enhances unaligned read/write performance. 
+ * Default is 0, see sysctl entry and rpc_rdma.c */ extern int xprt_rdma_pad_optimize; /* This setting controls the hunt for a supported memory @@ -535,10 +551,10 @@ void frwr_reset(struct rpcrdma_req *req); int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device); int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr); void frwr_mr_release(struct rpcrdma_mr *mr); -struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, - struct rpcrdma_mr_seg *seg, - int nsegs, bool writing, __be32 xid, - struct rpcrdma_mr *mr); +int frwr_map(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_xdr_cursor *cur, + bool writing, __be32 xid, + struct rpcrdma_mr *mr); int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req); void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs); void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req); -- cgit v1.2.3 From 5d3869a41f3608101c00ff9c9c7c2364c555fa65 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Mon, 13 Apr 2026 18:24:23 -0400 Subject: NFS: fix writeback in presence of errors After running xfstest generic/751, under certain conditions, writeback IO can get stuck in one of two patterns. Pattern #1: writeback IO experiences ENOSPC on an offset smaller than the file size. Example: write offset=0 len=4096 how=unstable OK write offset=8192 len=4096 how=unstable OK write offset=12288 len=4096 how=unstable ENOSPC write offset=4096 len=4096 how=unstable ENOSPC The client sends a commit and receives a verifier that differs from the one returned by the last successful write. It marks the pages dirty and writeback retries. But it again sends the writes as unstable and falls into the same pattern, running into the ENOSPC error and sending a commit because the writes were sent unstable. Pattern #2: an unstable write followed by a short write and ENOSPC. write offset=0 len=4096 how=unstable OK write offset=4096 len=4096 how=unstable returns OK but count=100 write offset=4196 len=3996 how=stable returns ENOSPC The client sends a commit and receives a verifier that differs from the one returned by the last unstable write. The same behaviour repeats in a loop. This patch instead identifies those conditions and marks the requests to be done synchronously. A previous solution tried to mark this in the nfs_page; however, that is not persistent, so mark it in the nfs_open_context instead. Furthermore, the same problem occurs in the localio code path, so recognize that the IO needs to be done synchronously in that case as well. 
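The mechanism reduces to one bit on the open context. A condensed sketch of the bit's lifecycle, pulled from the hunks below:

    /* Set on a short write, on a commit verifier mismatch, or on a
     * short localio write: */
    set_bit(NFS_CONTEXT_WRITE_SYNC, &ctx->flags);

    /* While set, request coalescing forces stable writes ... */
    if (test_bit(NFS_CONTEXT_WRITE_SYNC,
                 &req->wb_lock_context->open_context->flags))
            desc->pg_ioflags |= FLUSH_STABLE;

    /* ... and the bit is cleared once an unstable write succeeds and
     * its pages can be queued for commit: */
    clear_bit(NFS_CONTEXT_WRITE_SYNC, &ctx->flags);
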
Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- fs/nfs/localio.c | 15 ++++++++++++++- fs/nfs/pagelist.c | 3 +++ fs/nfs/write.c | 9 +++++++++ include/linux/nfs_fs.h | 1 + 4 files changed, 27 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 4c7d16a99ed6..e55c5977fcc3 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -865,6 +865,8 @@ static void nfs_local_call_write(struct work_struct *work) file_start_write(filp); n_iters = atomic_read(&iocb->n_iters); for (int i = 0; i < n_iters ; i++) { + size_t icount; + if (iocb->iter_is_dio_aligned[i]) { iocb->kiocb.ki_flags |= IOCB_DIRECT; /* Only use AIO completion if DIO-aligned segment is last */ @@ -881,8 +883,16 @@ static void nfs_local_call_write(struct work_struct *work) if (status == -EIOCBQUEUED) continue; /* Break on completion, errors, or short writes */ + icount = iov_iter_count(&iocb->iters[i]); if (nfs_local_pgio_done(iocb, status) || status < 0 || - (size_t)status < iov_iter_count(&iocb->iters[i])) { + (size_t)status < icount) { + if ((size_t)status < icount) { + struct nfs_lock_context *ctx = + iocb->hdr->req->wb_lock_context; + + set_bit(NFS_CONTEXT_WRITE_SYNC, + &ctx->open_context->flags); + } nfs_local_write_iocb_done(iocb); break; } @@ -901,6 +911,9 @@ static void nfs_local_do_write(struct nfs_local_kiocb *iocb, __func__, hdr->args.count, hdr->args.offset, (hdr->args.stable == NFS_UNSTABLE) ? "unstable" : "stable"); + if (test_bit(NFS_CONTEXT_WRITE_SYNC, + &hdr->req->wb_lock_context->open_context->flags)) + hdr->args.stable = NFS_FILE_SYNC; switch (hdr->args.stable) { default: break; diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index a9373de891c9..4a87b2fdb2e6 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -1186,6 +1186,9 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, nfs_page_group_lock(req); + if (test_bit(NFS_CONTEXT_WRITE_SYNC, + &req->wb_lock_context->open_context->flags)) + desc->pg_ioflags |= FLUSH_STABLE; subreq = req; subreq_size = subreq->wb_bytes; for(;;) { diff --git a/fs/nfs/write.c b/fs/nfs/write.c index f1f62787dd74..f224b73fa30e 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -927,9 +927,13 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr) goto remove_req; } if (nfs_write_need_commit(hdr)) { + struct nfs_open_context *ctx = + hdr->req->wb_lock_context->open_context; + /* Reset wb_nio, since the write was successful. */ req->wb_nio = 0; memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf)); + clear_bit(NFS_CONTEXT_WRITE_SYNC, &ctx->flags); nfs_mark_request_commit(req, hdr->lseg, &cinfo, hdr->ds_commit_idx); goto next; @@ -1553,7 +1557,10 @@ static void nfs_writeback_result(struct rpc_task *task, if (resp->count < argp->count && !list_empty(&hdr->pages)) { static unsigned long complain; + struct nfs_open_context *ctx = + hdr->req->wb_lock_context->open_context; + set_bit(NFS_CONTEXT_WRITE_SYNC, &ctx->flags); /* This a short write! */ nfs_inc_stats(hdr->inode, NFSIOS_SHORTWRITE); @@ -1837,6 +1844,8 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) /* We have a mismatch. 
Write the page again */ dprintk(" mismatch\n"); nfs_mark_request_dirty(req); + set_bit(NFS_CONTEXT_WRITE_SYNC, + &req->wb_lock_context->open_context->flags); atomic_long_inc(&NFS_I(data->inode)->redirtied_pages); next: nfs_unlock_and_release_request(req); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 8dd79a3f3d66..4623262da3c0 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -109,6 +109,7 @@ struct nfs_open_context { #define NFS_CONTEXT_BAD (2) #define NFS_CONTEXT_UNLOCK (3) #define NFS_CONTEXT_FILE_OPEN (4) +#define NFS_CONTEXT_WRITE_SYNC (5) struct nfs4_threshold *mdsthreshold; struct list_head list; -- cgit v1.2.3