From 55b6dd54c3bcb6edf7ad630a4510759f4b0cf1cd Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 13 Jan 2026 13:37:39 -0500 Subject: nfsd/sunrpc: add svc_rqst->rq_private pointer and remove rq_lease_breaker rq_lease_breaker has always been a NFSv4 specific layering violation in svc_rqst. The reason it's there though is that we need a place that is thread-local, and accessible from the svc_rqst pointer. Add a new rq_private pointer to struct svc_rqst. This is intended for use by the threads that are handling the service. sunrpc code doesn't touch it. In nfsd, define a new struct nfsd_thread_local_info. nfsd declares one of these on the stack and puts a pointer to it in rq_private. Add a new ntli_lease_breaker field to the new struct and convert all of the places that access rq_lease_breaker to use the new field instead. Signed-off-by: Jeff Layton Reviewed-by: Benjamin Coddington Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 4dc14c7a711b..ab8237ba9596 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -175,6 +175,9 @@ static inline unsigned long svc_serv_maxpages(const struct svc_serv *serv) /* * The context of a single thread, including the request currently being * processed. + * + * RPC programs are free to use rq_private to stash thread-local information. + * The sunrpc layer will not access it. 
*/ struct svc_rqst { struct list_head rq_all; /* all threads list */ @@ -251,7 +254,7 @@ struct svc_rqst { unsigned long bc_to_initval; unsigned int bc_to_retries; unsigned int rq_status_counter; /* RPC processing counter */ - void **rq_lease_breaker; /* The v4 client breaking a lease */ + void *rq_private; /* For use by the service thread */ }; /* bits for rq_flags */ -- cgit v1.2.3 From 322ecd01bf8ad7e0da21e174679aff1759e68b2c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 13 Jan 2026 13:37:40 -0500 Subject: nfsd/sunrpc: move rq_cachetype into struct nfsd_thread_local_info The svc_rqst->rq_cachetype field is only accessed by nfsd. Move it into the nfsd_thread_local_info instead. Signed-off-by: Jeff Layton Reviewed-by: Benjamin Coddington Signed-off-by: Chuck Lever --- fs/nfsd/nfs4xdr.c | 3 ++- fs/nfsd/nfscache.c | 3 ++- fs/nfsd/nfsd.h | 1 + fs/nfsd/nfssvc.c | 5 +++-- include/linux/sunrpc/svc.h | 1 - 5 files changed, 8 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 9d234913100b..690f7a3122ec 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2598,6 +2598,7 @@ nfsd4_opnum_in_range(struct nfsd4_compoundargs *argp, struct nfsd4_op *op) static bool nfsd4_decode_compound(struct nfsd4_compoundargs *argp) { + struct nfsd_thread_local_info *ntli = argp->rqstp->rq_private; struct nfsd4_op *op; bool cachethis = false; int auth_slack= argp->rqstp->rq_auth_slack; @@ -2690,7 +2691,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) if (argp->minorversion) cachethis = false; svc_reserve_auth(argp->rqstp, max_reply + readbytes); - argp->rqstp->rq_cachetype = cachethis ? RC_REPLBUFF : RC_NOCACHE; + ntli->ntli_cachetype = cachethis ? 
RC_REPLBUFF : RC_NOCACHE; argp->splice_ok = nfsd_read_splice_ok(argp->rqstp); if (readcount > 1 || max_reply > PAGE_SIZE - auth_slack) diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index ab13ee9c7fd8..154468ceccdc 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -467,10 +467,11 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, unsigned int len, struct nfsd_cacherep **cacherep) { struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + struct nfsd_thread_local_info *ntli = rqstp->rq_private; struct nfsd_cacherep *rp, *found; __wsum csum; struct nfsd_drc_bucket *b; - int type = rqstp->rq_cachetype; + int type = ntli->ntli_cachetype; LIST_HEAD(dispose); int rtn = RC_DOIT; diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 938906c6d10c..a2e35a4fa105 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -84,6 +84,7 @@ extern const struct seq_operations nfs_exports_op; struct nfsd_thread_local_info { struct nfs4_client **ntli_lease_breaker; + int ntli_cachetype; }; /* diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index fd979e5392a1..4f1ab3222a4d 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -972,6 +972,7 @@ nfsd(void *vrqstp) */ int nfsd_dispatch(struct svc_rqst *rqstp) { + struct nfsd_thread_local_info *ntli = rqstp->rq_private; const struct svc_procedure *proc = rqstp->rq_procinfo; __be32 *statp = rqstp->rq_accept_statp; struct nfsd_cacherep *rp; @@ -982,7 +983,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp) * Give the xdr decoder a chance to change this if it wants * (necessary in the NFSv4.0 compound case) */ - rqstp->rq_cachetype = proc->pc_cachetype; + ntli->ntli_cachetype = proc->pc_cachetype; /* * ->pc_decode advances the argument stream past the NFS @@ -1027,7 +1028,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp) */ smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter + 1); - nfsd_cache_update(rqstp, rp, rqstp->rq_cachetype, nfs_reply); + nfsd_cache_update(rqstp, rp, ntli->ntli_cachetype, 
nfs_reply); out_cached_reply: return 1; diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index ab8237ba9596..62152e4f3bcc 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -218,7 +218,6 @@ struct svc_rqst { u32 rq_vers; /* program version */ u32 rq_proc; /* procedure number */ u32 rq_prot; /* IP protocol */ - int rq_cachetype; /* catering to nfsd */ unsigned long rq_flags; /* flags field */ ktime_t rq_qtime; /* enqueue time */ -- cgit v1.2.3 From 153b9e025308417d167332c93e1bcc11174178de Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:23 -0500 Subject: lockd: Relocate and rename nlm_drop_reply The nlm_drop_reply status code is internal to the kernel's lockd implementation and must never appear on the wire. Its previous location in xdr.h grouped it with legitimate NLM protocol status codes, obscuring this critical distinction. Relocate the definition to lockd.h with a comment block for internal status codes, and rename to nlm__int__drop_reply to make its internal-only nature explicit. This prepares for adding additional internal status codes in subsequent patches. Signed-off-by: Chuck Lever --- fs/lockd/svc4proc.c | 22 ++++++++++++++-------- fs/lockd/svclock.c | 4 ++-- fs/lockd/svcproc.c | 24 +++++++++++++++--------- fs/nfsd/lockd.c | 2 +- include/linux/lockd/lockd.h | 6 ++++++ include/linux/lockd/xdr.h | 2 -- 6 files changed, 38 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 4b6f18d97734..9c756d07223a 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -104,12 +104,13 @@ __nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp) /* Obtain client and file */ if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) - return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + return resp->status == nlm__int__drop_reply ? 
+ rpc_drop_reply : rpc_success; /* Now check for conflicting locks */ resp->status = nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock); - if (resp->status == nlm_drop_reply) + if (resp->status == nlm__int__drop_reply) rc = rpc_drop_reply; else dprintk("lockd: TEST4 status %d\n", ntohl(resp->status)); @@ -140,13 +141,14 @@ __nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_res *resp) /* Obtain client and file */ if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) - return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + return resp->status == nlm__int__drop_reply ? + rpc_drop_reply : rpc_success; /* Now try to lock the file */ resp->status = nlmsvc_lock(rqstp, file, host, &argp->lock, argp->block, &argp->cookie, argp->reclaim); - if (resp->status == nlm_drop_reply) + if (resp->status == nlm__int__drop_reply) rc = rpc_drop_reply; else dprintk("lockd: LOCK status %d\n", ntohl(resp->status)); @@ -182,7 +184,8 @@ __nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_res *resp) /* Obtain client and file */ if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) - return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + return resp->status == nlm__int__drop_reply ? + rpc_drop_reply : rpc_success; /* Try to cancel request. */ resp->status = nlmsvc_cancel_blocked(SVC_NET(rqstp), file, &argp->lock); @@ -222,7 +225,8 @@ __nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_res *resp) /* Obtain client and file */ if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) - return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + return resp->status == nlm__int__drop_reply ? 
+ rpc_drop_reply : rpc_success; /* Now try to remove the lock */ resp->status = nlmsvc_unlock(SVC_NET(rqstp), file, &argp->lock); @@ -369,7 +373,8 @@ nlm4svc_proc_share(struct svc_rqst *rqstp) /* Obtain client and file */ if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) - return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + return resp->status == nlm__int__drop_reply ? + rpc_drop_reply : rpc_success; /* Now try to create the share */ resp->status = nlmsvc_share_file(host, file, argp); @@ -404,7 +409,8 @@ nlm4svc_proc_unshare(struct svc_rqst *rqstp) /* Obtain client and file */ if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) - return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + return resp->status == nlm__int__drop_reply ? + rpc_drop_reply : rpc_success; /* Now try to lock the file */ resp->status = nlmsvc_unshare_file(host, file, argp); diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 255a847ca0b6..d86b02153c7c 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -463,7 +463,7 @@ nlmsvc_defer_lock_rqst(struct svc_rqst *rqstp, struct nlm_block *block) block->b_deferred_req = rqstp->rq_chandle.defer(block->b_cache_req); if (block->b_deferred_req != NULL) - status = nlm_drop_reply; + status = nlm__int__drop_reply; } dprintk("lockd: nlmsvc_defer_lock_rqst block %p flags %d status %d\n", block, block->b_flags, ntohl(status)); @@ -531,7 +531,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, ret = nlm_lck_denied; goto out; } - ret = nlm_drop_reply; + ret = nlm__int__drop_reply; goto out; } diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 95c6bf7ab757..2a2e48a9bd12 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -25,7 +25,7 @@ static inline __be32 cast_status(__be32 status) case nlm_lck_denied_nolocks: case nlm_lck_blocked: case nlm_lck_denied_grace_period: - case nlm_drop_reply: + case nlm__int__drop_reply: break; case nlm4_deadlock: 
status = nlm_lck_denied; @@ -122,12 +122,13 @@ __nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp) /* Obtain client and file */ if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) - return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + return resp->status == nlm__int__drop_reply ? + rpc_drop_reply : rpc_success; /* Now check for conflicting locks */ resp->status = cast_status(nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock)); - if (resp->status == nlm_drop_reply) + if (resp->status == nlm__int__drop_reply) rc = rpc_drop_reply; else dprintk("lockd: TEST status %d vers %d\n", @@ -159,13 +160,14 @@ __nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_res *resp) /* Obtain client and file */ if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) - return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + return resp->status == nlm__int__drop_reply ? + rpc_drop_reply : rpc_success; /* Now try to lock the file */ resp->status = cast_status(nlmsvc_lock(rqstp, file, host, &argp->lock, argp->block, &argp->cookie, argp->reclaim)); - if (resp->status == nlm_drop_reply) + if (resp->status == nlm__int__drop_reply) rc = rpc_drop_reply; else dprintk("lockd: LOCK status %d\n", ntohl(resp->status)); @@ -202,7 +204,8 @@ __nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_res *resp) /* Obtain client and file */ if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) - return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + return resp->status == nlm__int__drop_reply ? + rpc_drop_reply : rpc_success; /* Try to cancel request. */ resp->status = cast_status(nlmsvc_cancel_blocked(net, file, &argp->lock)); @@ -243,7 +246,8 @@ __nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_res *resp) /* Obtain client and file */ if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) - return resp->status == nlm_drop_reply ? 
rpc_drop_reply :rpc_success; + return resp->status == nlm__int__drop_reply ? + rpc_drop_reply : rpc_success; /* Now try to remove the lock */ resp->status = cast_status(nlmsvc_unlock(net, file, &argp->lock)); @@ -400,7 +404,8 @@ nlmsvc_proc_share(struct svc_rqst *rqstp) /* Obtain client and file */ if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) - return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + return resp->status == nlm__int__drop_reply ? + rpc_drop_reply : rpc_success; /* Now try to create the share */ resp->status = cast_status(nlmsvc_share_file(host, file, argp)); @@ -435,7 +440,8 @@ nlmsvc_proc_unshare(struct svc_rqst *rqstp) /* Obtain client and file */ if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) - return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + return resp->status == nlm__int__drop_reply ? + rpc_drop_reply : rpc_success; /* Now try to unshare the file */ resp->status = cast_status(nlmsvc_unshare_file(host, file, argp)); diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c index c774ce9aa296..8c230ccd6645 100644 --- a/fs/nfsd/lockd.c +++ b/fs/nfsd/lockd.c @@ -71,7 +71,7 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp, * to callback when the delegation is returned but might * not have a proper lock request to block on. */ - return nlm_drop_reply; + return nlm__int__drop_reply; case nfserr_stale: return nlm_stale_fh; default: diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 330e38776bb2..fdefec39553f 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -38,6 +38,12 @@ */ #define LOCKD_DFLT_TIMEO 10 +/* + * Internal-use status codes, not to be placed on the wire. + * Version handlers translate these to appropriate wire values. + */ +#define nlm__int__drop_reply cpu_to_be32(30000) + /* * Lockd host handle (used both by the client and server personality). 
*/ diff --git a/include/linux/lockd/xdr.h b/include/linux/lockd/xdr.h index 17d53165d9f2..292e4e38d17d 100644 --- a/include/linux/lockd/xdr.h +++ b/include/linux/lockd/xdr.h @@ -33,8 +33,6 @@ struct svc_rqst; #define nlm_lck_blocked cpu_to_be32(NLM_LCK_BLOCKED) #define nlm_lck_denied_grace_period cpu_to_be32(NLM_LCK_DENIED_GRACE_PERIOD) -#define nlm_drop_reply cpu_to_be32(30000) - /* Lock info passed via NLM */ struct nlm_lock { char * caller; -- cgit v1.2.3 From 9e0d0c61940796893e0c2200cdc7be0684218238 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:24 -0500 Subject: lockd: Introduce nlm__int__deadlock The use of CONFIG_LOCKD_V4 in combination with a later cast_status() in the NLMv3 code is difficult to reason about. Instead, replace the use of nlm_deadlock with an implementation-defined status value that version-specific code translates appropriately. The new approach establishes a translation boundary: generic lockd code returns nlm__int__deadlock when posix_lock_file() yields -EDEADLK. Version-specific handlers (svc4proc.c for NLMv4, svcproc.c for NLMv3) translate this internal status to the appropriate wire protocol value. NLMv4 maps to nlm4_deadlock; NLMv3 maps to nlm_lck_denied (since NLMv3 lacks a deadlock-specific status code). Later this modification will also remove the need to include NLMv4 headers in NLMv3 and generic code. 
Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/svc4proc.c | 10 ++++++++-- fs/lockd/svclock.c | 8 +------- fs/lockd/svcproc.c | 4 +++- include/linux/lockd/lockd.h | 1 + 4 files changed, 13 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 9c756d07223a..55b6dcc56db1 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -148,10 +148,16 @@ __nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_res *resp) resp->status = nlmsvc_lock(rqstp, file, host, &argp->lock, argp->block, &argp->cookie, argp->reclaim); - if (resp->status == nlm__int__drop_reply) + switch (resp->status) { + case nlm__int__drop_reply: rc = rpc_drop_reply; - else + break; + case nlm__int__deadlock: + resp->status = nlm4_deadlock; + fallthrough; + default: dprintk("lockd: LOCK status %d\n", ntohl(resp->status)); + } nlmsvc_release_lockowner(&argp->lock); nlmsvc_release_host(host); diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index d86b02153c7c..5edf00751a1e 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -33,12 +33,6 @@ #define NLMDBG_FACILITY NLMDBG_SVCLOCK -#ifdef CONFIG_LOCKD_V4 -#define nlm_deadlock nlm4_deadlock -#else -#define nlm_deadlock nlm_lck_denied -#endif - static void nlmsvc_release_block(struct nlm_block *block); static void nlmsvc_insert_block(struct nlm_block *block, unsigned long); static void nlmsvc_remove_block(struct nlm_block *block); @@ -589,7 +583,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, goto out; case -EDEADLK: nlmsvc_remove_block(block); - ret = nlm_deadlock; + ret = nlm__int__deadlock; goto out; default: /* includes ENOLCK */ nlmsvc_remove_block(block); diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 2a2e48a9bd12..27ed71935e45 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -27,7 +27,7 @@ static inline __be32 cast_status(__be32 status) case nlm_lck_denied_grace_period: case nlm__int__drop_reply: break; - case 
nlm4_deadlock: + case nlm__int__deadlock: status = nlm_lck_denied; break; default: @@ -39,6 +39,8 @@ static inline __be32 cast_status(__be32 status) #else static inline __be32 cast_status(__be32 status) { + if (status == nlm__int__deadlock) + status = nlm_lck_denied; return status; } #endif diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index fdefec39553f..793691912137 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -43,6 +43,7 @@ * Version handlers translate these to appropriate wire values. */ #define nlm__int__drop_reply cpu_to_be32(30000) +#define nlm__int__deadlock cpu_to_be32(30001) /* * Lockd host handle (used both by the client and server personality). -- cgit v1.2.3 From 7db001e03d7a668ca6c3789fee42a24236ca90f6 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:25 -0500 Subject: lockd: Have nlm_fopen() return errno values The nlm_fopen() function is part of the API between nfsd and lockd. Currently its return value is an on-the-wire NLM status code. But that forces NFSD to include NLM wire protocol definitions despite having no other dependency on the NLM wire protocol. In addition, a CONFIG_LOCKD_V4 Kconfig symbol appears in the middle of NFSD source code. Refactor: Let's not use on-the-wire values as part of a high-level API between two Linux kernel modules. That's what we have errno for, right? And, instead of simply moving the CONFIG_LOCKD_V4 check, we can get rid of it entirely and let the decision of what actual NLM status code goes on the wire to be left up to NLM version-specific code. 
Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/svc4proc.c | 18 +++++++++++++--- fs/lockd/svcproc.c | 14 ++++++++++++- fs/lockd/svcsubs.c | 27 ++++++++++++++++++------ fs/nfsd/lockd.c | 50 +++++++++++++++++++++++++-------------------- include/linux/lockd/bind.h | 8 +++----- include/linux/lockd/lockd.h | 2 ++ 6 files changed, 82 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 55b6dcc56db1..4ceb27cc72e4 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -73,9 +73,21 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, no_locks: nlmsvc_release_host(host); - if (error) - return error; - return nlm_lck_denied_nolocks; + switch (error) { + case nlm_granted: + return nlm_lck_denied_nolocks; + case nlm__int__stale_fh: + return nlm4_stale_fh; + case nlm__int__failed: + return nlm4_failed; + default: + if (be32_to_cpu(error) >= 30000) { + pr_warn_once("lockd: unhandled internal status %u\n", + be32_to_cpu(error)); + return nlm4_failed; + } + return error; + } } /* diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 27ed71935e45..272c8f36ed2a 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -39,8 +39,20 @@ static inline __be32 cast_status(__be32 status) #else static inline __be32 cast_status(__be32 status) { - if (status == nlm__int__deadlock) + switch (status) { + case nlm__int__deadlock: status = nlm_lck_denied; + break; + case nlm__int__stale_fh: + case nlm__int__failed: + status = nlm_lck_denied_nolocks; + break; + default: + if (be32_to_cpu(status) >= 30000) + pr_warn_once("lockd: unhandled internal status %u\n", + be32_to_cpu(status)); + break; + } return status; } #endif diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index dd0214dcb695..967739d2aa90 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -87,14 +87,29 @@ static __be32 nlm_do_fopen(struct svc_rqst *rqstp, struct nlm_file *file, int mode) { struct file **fp = 
&file->f_file[mode]; - __be32 nfserr; + __be32 nlmerr = nlm_granted; + int error; if (*fp) - return 0; - nfserr = nlmsvc_ops->fopen(rqstp, &file->f_handle, fp, mode); - if (nfserr) - dprintk("lockd: open failed (error %d)\n", nfserr); - return nfserr; + return nlmerr; + + error = nlmsvc_ops->fopen(rqstp, &file->f_handle, fp, mode); + if (error) { + dprintk("lockd: open failed (errno %d)\n", error); + switch (error) { + case -EWOULDBLOCK: + nlmerr = nlm__int__drop_reply; + break; + case -ESTALE: + nlmerr = nlm__int__stale_fh; + break; + default: + nlmerr = nlm__int__failed; + break; + } + } + + return nlmerr; } /* diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c index 8c230ccd6645..6fe1325815e0 100644 --- a/fs/nfsd/lockd.c +++ b/fs/nfsd/lockd.c @@ -14,19 +14,20 @@ #define NFSDDBG_FACILITY NFSDDBG_LOCKD -#ifdef CONFIG_LOCKD_V4 -#define nlm_stale_fh nlm4_stale_fh -#define nlm_failed nlm4_failed -#else -#define nlm_stale_fh nlm_lck_denied_nolocks -#define nlm_failed nlm_lck_denied_nolocks -#endif -/* - * Note: we hold the dentry use count while the file is open. +/** + * nlm_fopen - Open an NFSD file + * @rqstp: NLM RPC procedure execution context + * @f: NFS file handle to be opened + * @filp: OUT: an opened struct file + * @flags: the POSIX open flags to use + * + * nlm_fopen() holds the dentry reference until nlm_fclose() releases it. + * + * Returns zero on success or a negative errno value if the file + * cannot be opened. */ -static __be32 -nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp, - int mode) +static int nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, + struct file **filp, int flags) { __be32 nfserr; int access; @@ -47,18 +48,17 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp, * if NFSEXP_NOAUTHNLM is set. Some older clients use AUTH_NULL * for NLM requests. */ - access = (mode == O_WRONLY) ? NFSD_MAY_WRITE : NFSD_MAY_READ; + access = (flags == O_WRONLY) ? 
NFSD_MAY_WRITE : NFSD_MAY_READ; access |= NFSD_MAY_NLM | NFSD_MAY_OWNER_OVERRIDE | NFSD_MAY_BYPASS_GSS; nfserr = nfsd_open(rqstp, &fh, S_IFREG, access, filp); fh_put(&fh); - /* We return nlm error codes as nlm doesn't know - * about nfsd, but nfsd does know about nlm.. - */ + switch (nfserr) { case nfs_ok: - return 0; + break; case nfserr_jukebox: - /* this error can indicate a presence of a conflicting + /* + * This error can indicate a presence of a conflicting * delegation to an NLM lock request. Options are: * (1) For now, drop this request and make the client * retry. When delegation is returned, client's lock retry @@ -66,19 +66,25 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp, * (2) NLM4_DENIED as per "spec" signals to the client * that the lock is unavailable now but client can retry. * Linux client implementation does not. It treats - * NLM4_DENIED same as NLM4_FAILED and errors the request. + * NLM4_DENIED same as NLM4_FAILED and fails the request. * (3) For the future, treat this as blocked lock and try * to callback when the delegation is returned but might * not have a proper lock request to block on. 
*/ - return nlm__int__drop_reply; + return -EWOULDBLOCK; case nfserr_stale: - return nlm_stale_fh; + return -ESTALE; default: - return nlm_failed; + return -ENOLCK; } + + return 0; } +/** + * nlm_fclose - Close an NFSD file + * @filp: a struct file that was opened by nlm_fopen() + */ static void nlm_fclose(struct file *filp) { diff --git a/include/linux/lockd/bind.h b/include/linux/lockd/bind.h index c53c81242e72..2f5dd9e943ee 100644 --- a/include/linux/lockd/bind.h +++ b/include/linux/lockd/bind.h @@ -26,11 +26,9 @@ struct rpc_clnt; * This is the set of functions for lockd->nfsd communication */ struct nlmsvc_binding { - __be32 (*fopen)(struct svc_rqst *, - struct nfs_fh *, - struct file **, - int mode); - void (*fclose)(struct file *); + int (*fopen)(struct svc_rqst *rqstp, struct nfs_fh *f, + struct file **filp, int flags); + void (*fclose)(struct file *filp); }; extern const struct nlmsvc_binding *nlmsvc_ops; diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 793691912137..195e6ce28f6e 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -44,6 +44,8 @@ */ #define nlm__int__drop_reply cpu_to_be32(30000) #define nlm__int__deadlock cpu_to_be32(30001) +#define nlm__int__stale_fh cpu_to_be32(30002) +#define nlm__int__failed cpu_to_be32(30003) /* * Lockd host handle (used both by the client and server personality). -- cgit v1.2.3 From efb5b15e3b78f5644dd2d4ddec8880e0c9aa5b5f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:26 -0500 Subject: lockd: Relocate nlmsvc_unlock API declarations The nlmsvc_unlock_all_by_sb() and nlmsvc_unlock_all_by_ip() functions are part of lockd's external API, consumed by other kernel subsystems. Their declarations currently reside in linux/lockd/lockd.h alongside internal implementation details, which blurs the boundary between lockd's public interface and its private internals. 
Moving these declarations to linux/lockd/bind.h groups them with other external API functions and makes the separation explicit. This clarifies which functions are intended for external use and reduces the risk of internal implementation details leaking into the public API surface. Build-tested with allyesconfig; no functional changes. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfsctl.c | 2 +- include/linux/lockd/bind.h | 7 +++++++ include/linux/lockd/lockd.h | 6 ------ 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 71aabdaa1d15..0bf01ae411c5 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -11,7 +11,7 @@ #include #include -#include +#include #include #include #include diff --git a/include/linux/lockd/bind.h b/include/linux/lockd/bind.h index 2f5dd9e943ee..82eca0a13ccc 100644 --- a/include/linux/lockd/bind.h +++ b/include/linux/lockd/bind.h @@ -21,6 +21,7 @@ struct svc_rqst; struct rpc_task; struct rpc_clnt; +struct super_block; /* * This is the set of functions for lockd->nfsd communication @@ -80,4 +81,10 @@ extern int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl, vo extern int lockd_up(struct net *net, const struct cred *cred); extern void lockd_down(struct net *net); +/* + * Cluster failover support + */ +int nlmsvc_unlock_all_by_sb(struct super_block *sb); +int nlmsvc_unlock_all_by_ip(struct sockaddr *server_addr); + #endif /* LINUX_LOCKD_BIND_H */ diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 195e6ce28f6e..0d883f48ec21 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -311,12 +311,6 @@ void nlmsvc_mark_resources(struct net *); void nlmsvc_free_host_resources(struct nlm_host *); void nlmsvc_invalidate_all(void); -/* - * Cluster failover support - */ -int nlmsvc_unlock_all_by_sb(struct super_block *sb); -int nlmsvc_unlock_all_by_ip(struct sockaddr *server_addr); - static 
inline struct file *nlmsvc_file_file(const struct nlm_file *file) { return file->f_file[O_RDONLY] ? -- cgit v1.2.3 From 840621fd2ff23ada8b9262d90477e75232566e6b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:27 -0500 Subject: NFS: Use nlmclnt_shutdown_rpc_clnt() to safely shut down NLM A race condition exists in shutdown_store() when writing to the sysfs "shutdown" file concurrently with nlm_shutdown_hosts_net(). Without synchronization, the following sequence can occur: 1. shutdown_store() reads server->nlm_host (non-NULL) 2. nlm_shutdown_hosts_net() acquires nlm_host_mutex, calls rpc_shutdown_client(), sets h_rpcclnt to NULL, and potentially frees the host via nlm_gc_hosts() 3. shutdown_store() dereferences the now-stale or freed host Introduce nlmclnt_shutdown_rpc_clnt(), which acquires nlm_host_mutex before accessing h_rpcclnt. This synchronizes with nlm_shutdown_hosts_net() and ensures the rpc_clnt pointer remains valid during the shutdown operation. This change also improves API layering: NFS client code no longer needs to include the internal lockd header to access nlm_host fields. The new helper resides in bind.h alongside other public lockd interfaces. 
Reported-by: Jeff Layton Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/host.c | 29 +++++++++++++++++++++++++++++ fs/nfs/sysfs.c | 4 ++-- include/linux/lockd/bind.h | 1 + 3 files changed, 32 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 1a9582a10a86..015900d2d4c2 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -306,6 +306,35 @@ void nlmclnt_release_host(struct nlm_host *host) } } +/* Callback for rpc_cancel_tasks() - matches all tasks for cancellation */ +static bool nlmclnt_match_all(const struct rpc_task *task, const void *data) +{ + return true; +} + +/** + * nlmclnt_shutdown_rpc_clnt - safely shut down NLM client RPC operations + * @host: nlm_host to shut down + * + * Cancels outstanding RPC tasks and marks the client as shut down. + * Synchronizes with nlmclnt_release_host() via nlm_host_mutex to prevent + * races between shutdown and host destruction. Safe to call if h_rpcclnt + * is NULL or already shut down. 
+ */ +void nlmclnt_shutdown_rpc_clnt(struct nlm_host *host) +{ + struct rpc_clnt *clnt; + + mutex_lock(&nlm_host_mutex); + clnt = host->h_rpcclnt; + if (clnt) { + clnt->cl_shutdown = 1; + rpc_cancel_tasks(clnt, -EIO, nlmclnt_match_all, NULL); + } + mutex_unlock(&nlm_host_mutex); +} +EXPORT_SYMBOL_GPL(nlmclnt_shutdown_rpc_clnt); + /** * nlmsvc_lookup_host - Find an NLM host handle matching a remote client * @rqstp: incoming NLM request diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c index 7d8921f524a6..051da37770d8 100644 --- a/fs/nfs/sysfs.c +++ b/fs/nfs/sysfs.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include "internal.h" #include "nfs4_fs.h" @@ -285,7 +285,7 @@ shutdown_store(struct kobject *kobj, struct kobj_attribute *attr, shutdown_client(server->client_acl); if (server->nlm_host) - shutdown_client(server->nlm_host->h_rpcclnt); + nlmclnt_shutdown_rpc_clnt(server->nlm_host); out: shutdown_nfs_client(server->nfs_client); return count; diff --git a/include/linux/lockd/bind.h b/include/linux/lockd/bind.h index 82eca0a13ccc..39c124dcb19c 100644 --- a/include/linux/lockd/bind.h +++ b/include/linux/lockd/bind.h @@ -57,6 +57,7 @@ struct nlmclnt_initdata { extern struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init); extern void nlmclnt_done(struct nlm_host *host); extern struct rpc_clnt *nlmclnt_rpc_clnt(struct nlm_host *host); +extern void nlmclnt_shutdown_rpc_clnt(struct nlm_host *host); /* * NLM client operations provide a means to modify RPC processing of NLM -- cgit v1.2.3 From f4d5f8caadd858f11b21e8a9e5c85290fc21a568 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:28 -0500 Subject: lockd: Move xdr4.h from include/linux/lockd/ to fs/lockd/ The xdr4.h header declares NLMv4-specific XDR encoder/decoder functions and error codes that are used exclusively within the lockd subsystem. 
Moving it from include/linux/lockd/ to fs/lockd/ clarifies the intended scope of these declarations and prevents external code from depending on lockd-internal interfaces. This change reduces the public API surface of the lockd module and makes it easier to refactor NLMv4 internals without risk of breaking out-of-tree consumers. The header's contents are implementation details of the NLMv4 wire protocol handling, not a contract with other kernel subsystems. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/clnt4xdr.c | 2 ++ fs/lockd/svc4proc.c | 2 ++ fs/lockd/xdr4.c | 1 + fs/lockd/xdr4.h | 34 ++++++++++++++++++++++++++++++++++ include/linux/lockd/bind.h | 3 --- include/linux/lockd/lockd.h | 7 ++++--- include/linux/lockd/xdr4.h | 43 ------------------------------------------- 7 files changed, 43 insertions(+), 49 deletions(-) create mode 100644 fs/lockd/xdr4.h delete mode 100644 include/linux/lockd/xdr4.h (limited to 'include') diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c index 527458db4525..23896073c7e5 100644 --- a/fs/lockd/clnt4xdr.c +++ b/fs/lockd/clnt4xdr.c @@ -17,6 +17,8 @@ #include +#include "xdr4.h" + #define NLMDBG_FACILITY NLMDBG_XDR #if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ) diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 4ceb27cc72e4..51d072a83a49 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -14,6 +14,8 @@ #include #include +#include "xdr4.h" + #define NLMDBG_FACILITY NLMDBG_CLIENT /* diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c index e343c820301f..5b1e15977697 100644 --- a/fs/lockd/xdr4.c +++ b/fs/lockd/xdr4.c @@ -19,6 +19,7 @@ #include #include "svcxdr.h" +#include "xdr4.h" static inline s64 loff_t_to_s64(loff_t offset) diff --git a/fs/lockd/xdr4.h b/fs/lockd/xdr4.h new file mode 100644 index 000000000000..7be318c0512b --- /dev/null +++ b/fs/lockd/xdr4.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * XDR types for the NLM protocol + * + * Copyright (C) 1996 Olaf Kirch + */ + +#ifndef 
_LOCKD_XDR4_H +#define _LOCKD_XDR4_H + +/* error codes new to NLMv4 */ +#define nlm4_deadlock cpu_to_be32(NLM_DEADLCK) +#define nlm4_rofs cpu_to_be32(NLM_ROFS) +#define nlm4_stale_fh cpu_to_be32(NLM_STALE_FH) +#define nlm4_fbig cpu_to_be32(NLM_FBIG) +#define nlm4_failed cpu_to_be32(NLM_FAILED) + +void nlm4svc_set_file_lock_range(struct file_lock *fl, u64 off, u64 len); +bool nlm4svc_decode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4svc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4svc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4svc_decode_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4svc_decode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4svc_decode_reboot(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4svc_decode_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4svc_decode_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr); + +bool nlm4svc_encode_testres(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4svc_encode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4svc_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4svc_encode_shareres(struct svc_rqst *rqstp, struct xdr_stream *xdr); + +#endif /* _LOCKD_XDR4_H */ diff --git a/include/linux/lockd/bind.h b/include/linux/lockd/bind.h index 39c124dcb19c..077da0696f12 100644 --- a/include/linux/lockd/bind.h +++ b/include/linux/lockd/bind.h @@ -13,9 +13,6 @@ #include /* need xdr-encoded error codes too, so... 
*/ #include -#ifdef CONFIG_LOCKD_V4 -#include -#endif /* Dummy declarations */ struct svc_rqst; diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 0d883f48ec21..46f244141645 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -22,9 +22,6 @@ #include #include #include -#ifdef CONFIG_LOCKD_V4 -#include -#endif #include #include @@ -235,6 +232,10 @@ int nlmclnt_reclaim(struct nlm_host *, struct file_lock *, struct nlm_rqst *); void nlmclnt_next_cookie(struct nlm_cookie *); +#ifdef CONFIG_LOCKD_V4 +extern const struct rpc_version nlm_version4; +#endif + /* * Host cache */ diff --git a/include/linux/lockd/xdr4.h b/include/linux/lockd/xdr4.h deleted file mode 100644 index 72831e35dca3..000000000000 --- a/include/linux/lockd/xdr4.h +++ /dev/null @@ -1,43 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * linux/include/linux/lockd/xdr4.h - * - * XDR types for the NLM protocol - * - * Copyright (C) 1996 Olaf Kirch - */ - -#ifndef LOCKD_XDR4_H -#define LOCKD_XDR4_H - -#include -#include -#include -#include - -/* error codes new to NLMv4 */ -#define nlm4_deadlock cpu_to_be32(NLM_DEADLCK) -#define nlm4_rofs cpu_to_be32(NLM_ROFS) -#define nlm4_stale_fh cpu_to_be32(NLM_STALE_FH) -#define nlm4_fbig cpu_to_be32(NLM_FBIG) -#define nlm4_failed cpu_to_be32(NLM_FAILED) - -void nlm4svc_set_file_lock_range(struct file_lock *fl, u64 off, u64 len); -bool nlm4svc_decode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_decode_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_decode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_decode_reboot(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_decode_shareargs(struct svc_rqst 
*rqstp, struct xdr_stream *xdr); -bool nlm4svc_decode_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr); - -bool nlm4svc_encode_testres(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_encode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_encode_shareres(struct svc_rqst *rqstp, struct xdr_stream *xdr); - -extern const struct rpc_version nlm_version4; - -#endif /* LOCKD_XDR4_H */ -- cgit v1.2.3 From 4db2f8a016dc9f9b357bfbf5c507c2582bb36730 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:29 -0500 Subject: lockd: Move share.h from include/linux/lockd/ to fs/lockd/ The share.h header defines struct nlm_share and declares the DOS share management functions used by the NLM server to implement NLM_SHARE and NLM_UNSHARE operations. These interfaces are used exclusively within the lockd subsystem. A git grep search confirms no external code references them. Relocating this header from include/linux/lockd/ to fs/lockd/ narrows the public API surface of the lockd module. Out-of-tree code cannot depend on these internal interfaces after this change. Future refactoring of the share management implementation thus requires no consideration of external consumers. 
Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/share.h | 30 ++++++++++++++++++++++++++++++ fs/lockd/svc4proc.c | 2 +- fs/lockd/svcproc.c | 3 ++- fs/lockd/svcshare.c | 3 ++- fs/lockd/svcsubs.c | 3 ++- include/linux/lockd/lockd.h | 2 ++ include/linux/lockd/share.h | 32 -------------------------------- 7 files changed, 39 insertions(+), 36 deletions(-) create mode 100644 fs/lockd/share.h delete mode 100644 include/linux/lockd/share.h (limited to 'include') diff --git a/fs/lockd/share.h b/fs/lockd/share.h new file mode 100644 index 000000000000..d8f4ebd9c278 --- /dev/null +++ b/fs/lockd/share.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * DOS share management for lockd. + * + * Copyright (C) 1996, Olaf Kirch + */ + +#ifndef _LOCKD_SHARE_H +#define _LOCKD_SHARE_H + +/* + * DOS share for a specific file + */ +struct nlm_share { + struct nlm_share * s_next; /* linked list */ + struct nlm_host * s_host; /* client host */ + struct nlm_file * s_file; /* shared file */ + struct xdr_netobj s_owner; /* owner handle */ + u32 s_access; /* access mode */ + u32 s_mode; /* deny mode */ +}; + +__be32 nlmsvc_share_file(struct nlm_host *, struct nlm_file *, + struct nlm_args *); +__be32 nlmsvc_unshare_file(struct nlm_host *, struct nlm_file *, + struct nlm_args *); +void nlmsvc_traverse_shares(struct nlm_host *, struct nlm_file *, + nlm_host_match_fn_t); + +#endif /* _LOCKD_SHARE_H */ diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 51d072a83a49..da88b638d90d 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -11,9 +11,9 @@ #include #include #include -#include #include +#include "share.h" #include "xdr4.h" #define NLMDBG_FACILITY NLMDBG_CLIENT diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 272c8f36ed2a..8441fabd019f 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -11,9 +11,10 @@ #include #include #include -#include #include +#include "share.h" + #define NLMDBG_FACILITY NLMDBG_CLIENT #ifdef 
CONFIG_LOCKD_V4 diff --git a/fs/lockd/svcshare.c b/fs/lockd/svcshare.c index 88c81ce1148d..8e06840834c6 100644 --- a/fs/lockd/svcshare.c +++ b/fs/lockd/svcshare.c @@ -15,7 +15,8 @@ #include #include #include -#include + +#include "share.h" static inline int nlm_cmp_owner(struct nlm_share *share, struct xdr_netobj *oh) diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 967739d2aa90..ce596a17112c 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -16,11 +16,12 @@ #include #include #include -#include #include #include #include +#include "share.h" + #define NLMDBG_FACILITY NLMDBG_SVCSUBS diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 46f244141645..eebcecd12fae 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -155,6 +155,8 @@ struct nlm_rqst { void * a_callback_data; /* sent to nlmclnt_operations callbacks */ }; +struct nlm_share; + /* * This struct describes a file held open by lockd on behalf of * an NFS client. diff --git a/include/linux/lockd/share.h b/include/linux/lockd/share.h deleted file mode 100644 index 1f18a9faf645..000000000000 --- a/include/linux/lockd/share.h +++ /dev/null @@ -1,32 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * linux/include/linux/lockd/share.h - * - * DOS share management for lockd. 
- * - * Copyright (C) 1996, Olaf Kirch - */ - -#ifndef LINUX_LOCKD_SHARE_H -#define LINUX_LOCKD_SHARE_H - -/* - * DOS share for a specific file - */ -struct nlm_share { - struct nlm_share * s_next; /* linked list */ - struct nlm_host * s_host; /* client host */ - struct nlm_file * s_file; /* shared file */ - struct xdr_netobj s_owner; /* owner handle */ - u32 s_access; /* access mode */ - u32 s_mode; /* deny mode */ -}; - -__be32 nlmsvc_share_file(struct nlm_host *, struct nlm_file *, - struct nlm_args *); -__be32 nlmsvc_unshare_file(struct nlm_host *, struct nlm_file *, - struct nlm_args *); -void nlmsvc_traverse_shares(struct nlm_host *, struct nlm_file *, - nlm_host_match_fn_t); - -#endif /* LINUX_LOCKD_SHARE_H */ -- cgit v1.2.3 From 2c562c6e6715619ce34bb37d8a0a5e40fdcc7a44 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:30 -0500 Subject: lockd: Relocate include/linux/lockd/lockd.h Headers placed in include/linux/ form part of the kernel's internal API and signal to subsystem maintainers that other parts of the kernel may depend on them. By moving lockd.h into fs/lockd/, lockd becomes a more self-contained module whose internal interfaces are clearly distinguished from its public contract with the rest of the kernel. This relocation addresses a long-standing XXX comment in the header itself that acknowledged the file's misplacement. Future changes to lockd internals can now proceed with confidence that external consumers are not inadvertently coupled to implementation details. 
Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/clnt4xdr.c | 3 +- fs/lockd/clntlock.c | 2 +- fs/lockd/clntproc.c | 2 +- fs/lockd/clntxdr.c | 3 +- fs/lockd/host.c | 2 +- fs/lockd/lockd.h | 395 +++++++++++++++++++++++++++++++++++++++++++ fs/lockd/mon.c | 2 +- fs/lockd/svc.c | 2 +- fs/lockd/svc4proc.c | 2 +- fs/lockd/svclock.c | 3 +- fs/lockd/svcproc.c | 2 +- fs/lockd/svcshare.c | 2 +- fs/lockd/svcsubs.c | 2 +- fs/lockd/trace.h | 3 +- fs/lockd/xdr.c | 3 +- fs/lockd/xdr4.c | 2 +- include/linux/lockd/lockd.h | 401 -------------------------------------------- 17 files changed, 414 insertions(+), 417 deletions(-) create mode 100644 fs/lockd/lockd.h delete mode 100644 include/linux/lockd/lockd.h (limited to 'include') diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c index 23896073c7e5..61ee5fa6dfa4 100644 --- a/fs/lockd/clnt4xdr.c +++ b/fs/lockd/clnt4xdr.c @@ -13,7 +13,8 @@ #include #include #include -#include + +#include "lockd.h" #include diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 85bc0f3e91df..8fa30c42c92a 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -15,9 +15,9 @@ #include #include #include -#include #include +#include "lockd.h" #include "trace.h" #define NLMDBG_FACILITY NLMDBG_CLIENT diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index fb4d0752c9bb..7f211008a5d2 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -18,8 +18,8 @@ #include #include #include -#include +#include "lockd.h" #include "trace.h" #define NLMDBG_FACILITY NLMDBG_CLIENT diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c index 6ea3448d2d31..65555f5224b1 100644 --- a/fs/lockd/clntxdr.c +++ b/fs/lockd/clntxdr.c @@ -15,7 +15,8 @@ #include #include #include -#include + +#include "lockd.h" #include diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 015900d2d4c2..ea8a8e166f7e 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -16,13 +16,13 @@ #include #include #include -#include #include #include #include +#include 
"lockd.h" #include "netns.h" #define NLMDBG_FACILITY NLMDBG_HOSTCACHE diff --git a/fs/lockd/lockd.h b/fs/lockd/lockd.h new file mode 100644 index 000000000000..9bcf89765a69 --- /dev/null +++ b/fs/lockd/lockd.h @@ -0,0 +1,395 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 1996 Olaf Kirch + */ + +#ifndef _LOCKD_LOCKD_H +#define _LOCKD_LOCKD_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Version string + */ +#define LOCKD_VERSION "0.5" + +/* + * Default timeout for RPC calls (seconds) + */ +#define LOCKD_DFLT_TIMEO 10 + +/* + * Internal-use status codes, not to be placed on the wire. + * Version handlers translate these to appropriate wire values. + */ +#define nlm__int__drop_reply cpu_to_be32(30000) +#define nlm__int__deadlock cpu_to_be32(30001) +#define nlm__int__stale_fh cpu_to_be32(30002) +#define nlm__int__failed cpu_to_be32(30003) + +/* + * Lockd host handle (used both by the client and server personality). 
+ */ +struct nlm_host { + struct hlist_node h_hash; /* doubly linked list */ + struct sockaddr_storage h_addr; /* peer address */ + size_t h_addrlen; + struct sockaddr_storage h_srcaddr; /* our address (optional) */ + size_t h_srcaddrlen; + struct rpc_clnt *h_rpcclnt; /* RPC client to talk to peer */ + char *h_name; /* remote hostname */ + u32 h_version; /* interface version */ + unsigned short h_proto; /* transport proto */ + unsigned short h_reclaiming : 1, + h_server : 1, /* server side, not client side */ + h_noresvport : 1, + h_inuse : 1; + wait_queue_head_t h_gracewait; /* wait while reclaiming */ + struct rw_semaphore h_rwsem; /* Reboot recovery lock */ + u32 h_state; /* pseudo-state counter */ + u32 h_nsmstate; /* true remote NSM state */ + u32 h_pidcount; /* Pseudopids */ + refcount_t h_count; /* reference count */ + struct mutex h_mutex; /* mutex for pmap binding */ + unsigned long h_nextrebind; /* next portmap call */ + unsigned long h_expires; /* eligible for GC */ + struct list_head h_lockowners; /* Lockowners for the client */ + spinlock_t h_lock; + struct list_head h_granted; /* Locks in GRANTED state */ + struct list_head h_reclaim; /* Locks in RECLAIM state */ + struct nsm_handle *h_nsmhandle; /* NSM status handle */ + char *h_addrbuf; /* address eyecatcher */ + struct net *net; /* host net */ + const struct cred *h_cred; + char nodename[UNX_MAXNODENAME + 1]; + const struct nlmclnt_operations *h_nlmclnt_ops; /* Callback ops for NLM users */ +}; + +/* + * The largest string sm_addrbuf should hold is a full-size IPv6 address + * (no "::" anywhere) with a scope ID. The buffer size is computed to + * hold eight groups of colon-separated four-hex-digit numbers, a + * percent sign, a scope id (at most 32 bits, in decimal), and NUL. 
+ */ +#define NSM_ADDRBUF ((8 * 4 + 7) + (1 + 10) + 1) + +struct nsm_handle { + struct list_head sm_link; + refcount_t sm_count; + char *sm_mon_name; + char *sm_name; + struct sockaddr_storage sm_addr; + size_t sm_addrlen; + unsigned int sm_monitored : 1, + sm_sticky : 1; /* don't unmonitor */ + struct nsm_private sm_priv; + char sm_addrbuf[NSM_ADDRBUF]; +}; + +/* + * Rigorous type checking on sockaddr type conversions + */ +static inline struct sockaddr *nlm_addr(const struct nlm_host *host) +{ + return (struct sockaddr *)&host->h_addr; +} + +static inline struct sockaddr *nlm_srcaddr(const struct nlm_host *host) +{ + return (struct sockaddr *)&host->h_srcaddr; +} + +/* + * Map an fl_owner_t into a unique 32-bit "pid" + */ +struct nlm_lockowner { + struct list_head list; + refcount_t count; + + struct nlm_host *host; + fl_owner_t owner; + uint32_t pid; +}; + +/* + * This is the representation of a blocked client lock. + */ +struct nlm_wait { + struct list_head b_list; /* linked list */ + wait_queue_head_t b_wait; /* where to wait on */ + struct nlm_host *b_host; + struct file_lock *b_lock; /* local file lock */ + __be32 b_status; /* grant callback status */ +}; + +/* + * Memory chunk for NLM client RPC request. + */ +#define NLMCLNT_OHSIZE ((__NEW_UTS_LEN) + 10u) +struct nlm_rqst { + refcount_t a_count; + unsigned int a_flags; /* initial RPC task flags */ + struct nlm_host * a_host; /* host handle */ + struct nlm_args a_args; /* arguments */ + struct nlm_res a_res; /* result */ + struct nlm_block * a_block; + unsigned int a_retries; /* Retry count */ + u8 a_owner[NLMCLNT_OHSIZE]; + void * a_callback_data; /* sent to nlmclnt_operations callbacks */ +}; + +struct nlm_share; + +/* + * This struct describes a file held open by lockd on behalf of + * an NFS client. 
+ */ +struct nlm_file { + struct hlist_node f_list; /* linked list */ + struct nfs_fh f_handle; /* NFS file handle */ + struct file * f_file[2]; /* VFS file pointers, + indexed by O_ flags */ + struct nlm_share * f_shares; /* DOS shares */ + struct list_head f_blocks; /* blocked locks */ + unsigned int f_locks; /* guesstimate # of locks */ + unsigned int f_count; /* reference count */ + struct mutex f_mutex; /* avoid concurrent access */ +}; + +/* + * This is a server block (i.e. a lock requested by some client which + * couldn't be granted because of a conflicting lock). + */ +#define NLM_NEVER (~(unsigned long) 0) +/* timeout on non-blocking call: */ +#define NLM_TIMEOUT (7 * HZ) + +struct nlm_block { + struct kref b_count; /* Reference count */ + struct list_head b_list; /* linked list of all blocks */ + struct list_head b_flist; /* linked list (per file) */ + struct nlm_rqst * b_call; /* RPC args & callback info */ + struct svc_serv * b_daemon; /* NLM service */ + struct nlm_host * b_host; /* host handle for RPC clnt */ + unsigned long b_when; /* next re-xmit */ + unsigned int b_id; /* block id */ + unsigned char b_granted; /* VFS granted lock */ + struct nlm_file * b_file; /* file in question */ + struct cache_req * b_cache_req; /* deferred request handling */ + struct cache_deferred_req * b_deferred_req; + unsigned int b_flags; /* block flags */ +#define B_QUEUED 1 /* lock queued */ +#define B_GOT_CALLBACK 2 /* got lock or conflicting lock */ +#define B_TIMED_OUT 4 /* filesystem too slow to respond */ +}; + +/* + * Global variables + */ +extern const struct rpc_program nlm_program; +extern const struct svc_procedure nlmsvc_procedures[24]; +#ifdef CONFIG_LOCKD_V4 +extern const struct svc_procedure nlmsvc_procedures4[24]; +#endif +extern int nlmsvc_grace_period; +extern unsigned long nlm_timeout; +extern bool nsm_use_hostnames; +extern u32 nsm_local_state; + +extern struct timer_list nlmsvc_retry; + +/* + * Lockd client functions + */ +struct nlm_rqst * 
nlm_alloc_call(struct nlm_host *host); +int nlm_async_call(struct nlm_rqst *, u32, const struct rpc_call_ops *); +int nlm_async_reply(struct nlm_rqst *, u32, const struct rpc_call_ops *); +void nlmclnt_release_call(struct nlm_rqst *); +void nlmclnt_prepare_block(struct nlm_wait *block, struct nlm_host *host, + struct file_lock *fl); +void nlmclnt_queue_block(struct nlm_wait *block); +__be32 nlmclnt_dequeue_block(struct nlm_wait *block); +int nlmclnt_wait(struct nlm_wait *block, struct nlm_rqst *req, long timeout); +__be32 nlmclnt_grant(const struct sockaddr *addr, + const struct nlm_lock *lock); +void nlmclnt_recovery(struct nlm_host *); +int nlmclnt_reclaim(struct nlm_host *, struct file_lock *, + struct nlm_rqst *); +void nlmclnt_next_cookie(struct nlm_cookie *); + +#ifdef CONFIG_LOCKD_V4 +extern const struct rpc_version nlm_version4; +#endif + +/* + * Host cache + */ +struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, + const size_t salen, + const unsigned short protocol, + const u32 version, + const char *hostname, + int noresvport, + struct net *net, + const struct cred *cred); +void nlmclnt_release_host(struct nlm_host *); +struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, + const char *hostname, + const size_t hostname_len); +void nlmsvc_release_host(struct nlm_host *); +struct rpc_clnt * nlm_bind_host(struct nlm_host *); +void nlm_rebind_host(struct nlm_host *); +struct nlm_host * nlm_get_host(struct nlm_host *); +void nlm_shutdown_hosts(void); +void nlm_shutdown_hosts_net(struct net *net); +void nlm_host_rebooted(const struct net *net, + const struct nlm_reboot *); + +/* + * Host monitoring + */ +int nsm_monitor(const struct nlm_host *host); +void nsm_unmonitor(const struct nlm_host *host); + +struct nsm_handle *nsm_get_handle(const struct net *net, + const struct sockaddr *sap, + const size_t salen, + const char *hostname, + const size_t hostname_len); +struct nsm_handle *nsm_reboot_lookup(const struct net *net, + const 
struct nlm_reboot *info); +void nsm_release(struct nsm_handle *nsm); + +/* + * This is used in garbage collection and resource reclaim + * A return value != 0 means destroy the lock/block/share + */ +typedef int (*nlm_host_match_fn_t)(void *cur, struct nlm_host *ref); + +/* + * Server-side lock handling + */ +int lock_to_openmode(struct file_lock *); +__be32 nlmsvc_lock(struct svc_rqst *, struct nlm_file *, + struct nlm_host *, struct nlm_lock *, int, + struct nlm_cookie *, int); +__be32 nlmsvc_unlock(struct net *net, struct nlm_file *, struct nlm_lock *); +__be32 nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, + struct nlm_host *host, struct nlm_lock *lock, + struct nlm_lock *conflock); +__be32 nlmsvc_cancel_blocked(struct net *net, struct nlm_file *, struct nlm_lock *); +void nlmsvc_retry_blocked(struct svc_rqst *rqstp); +void nlmsvc_traverse_blocks(struct nlm_host *, struct nlm_file *, + nlm_host_match_fn_t match); +void nlmsvc_grant_reply(struct nlm_cookie *, __be32); +void nlmsvc_release_call(struct nlm_rqst *); +void nlmsvc_locks_init_private(struct file_lock *, struct nlm_host *, pid_t); + +/* + * File handling for the server personality + */ +__be32 nlm_lookup_file(struct svc_rqst *, struct nlm_file **, + struct nlm_lock *); +void nlm_release_file(struct nlm_file *); +void nlmsvc_put_lockowner(struct nlm_lockowner *); +void nlmsvc_release_lockowner(struct nlm_lock *); +void nlmsvc_mark_resources(struct net *); +void nlmsvc_free_host_resources(struct nlm_host *); +void nlmsvc_invalidate_all(void); + +static inline struct file *nlmsvc_file_file(const struct nlm_file *file) +{ + return file->f_file[O_RDONLY] ? 
+ file->f_file[O_RDONLY] : file->f_file[O_WRONLY]; +} + +static inline struct inode *nlmsvc_file_inode(struct nlm_file *file) +{ + return file_inode(nlmsvc_file_file(file)); +} + +static inline bool +nlmsvc_file_cannot_lock(const struct nlm_file *file) +{ + return exportfs_cannot_lock(nlmsvc_file_file(file)->f_path.dentry->d_sb->s_export_op); +} + +static inline int __nlm_privileged_request4(const struct sockaddr *sap) +{ + const struct sockaddr_in *sin = (struct sockaddr_in *)sap; + + if (ntohs(sin->sin_port) > 1023) + return 0; + + return ipv4_is_loopback(sin->sin_addr.s_addr); +} + +#if IS_ENABLED(CONFIG_IPV6) +static inline int __nlm_privileged_request6(const struct sockaddr *sap) +{ + const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; + + if (ntohs(sin6->sin6_port) > 1023) + return 0; + + if (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_MAPPED) + return ipv4_is_loopback(sin6->sin6_addr.s6_addr32[3]); + + return ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LOOPBACK; +} +#else /* IS_ENABLED(CONFIG_IPV6) */ +static inline int __nlm_privileged_request6(const struct sockaddr *sap) +{ + return 0; +} +#endif /* IS_ENABLED(CONFIG_IPV6) */ + +/* + * Ensure incoming requests are from local privileged callers. + * + * Return TRUE if sender is local and is connecting via a privileged port; + * otherwise return FALSE. + */ +static inline int nlm_privileged_requester(const struct svc_rqst *rqstp) +{ + const struct sockaddr *sap = svc_addr(rqstp); + + switch (sap->sa_family) { + case AF_INET: + return __nlm_privileged_request4(sap); + case AF_INET6: + return __nlm_privileged_request6(sap); + default: + return 0; + } +} + +/* + * Compare two NLM locks. + * When the second lock is of type F_UNLCK, this acts like a wildcard. 
+ */ +static inline int nlm_compare_locks(const struct file_lock *fl1, + const struct file_lock *fl2) +{ + return file_inode(fl1->c.flc_file) == file_inode(fl2->c.flc_file) + && fl1->c.flc_pid == fl2->c.flc_pid + && fl1->c.flc_owner == fl2->c.flc_owner + && fl1->fl_start == fl2->fl_start + && fl1->fl_end == fl2->fl_end + &&(fl1->c.flc_type == fl2->c.flc_type || fl2->c.flc_type == F_UNLCK); +} + +extern const struct lock_manager_operations nlmsvc_lock_operations; + +#endif /* _LOCKD_LOCKD_H */ diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index b8fc732e1c67..3d3ee88ca4dc 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -16,10 +16,10 @@ #include #include #include -#include #include +#include "lockd.h" #include "netns.h" #define NLMDBG_FACILITY NLMDBG_MONITOR diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index dcd80c4e74c9..9dd7f8e11544 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -36,9 +36,9 @@ #include #include #include -#include #include +#include "lockd.h" #include "netns.h" #include "procfs.h" #include "netlink.h" diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index da88b638d90d..86dfeb6ce68d 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -10,9 +10,9 @@ #include #include -#include #include +#include "lockd.h" #include "share.h" #include "xdr4.h" diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 5edf00751a1e..1c800fffe69c 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -29,7 +29,8 @@ #include #include #include -#include + +#include "lockd.h" #define NLMDBG_FACILITY NLMDBG_SVCLOCK diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 8441fabd019f..e9a6bcc3bf2e 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -10,9 +10,9 @@ #include #include -#include #include +#include "lockd.h" #include "share.h" #define NLMDBG_FACILITY NLMDBG_CLIENT diff --git a/fs/lockd/svcshare.c b/fs/lockd/svcshare.c index 8e06840834c6..8675ac80ab16 100644 --- a/fs/lockd/svcshare.c +++ b/fs/lockd/svcshare.c @@ -14,8 +14,8 @@ 
#include #include -#include +#include "lockd.h" #include "share.h" static inline int diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index ce596a17112c..71eaec5ed8d7 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -15,11 +15,11 @@ #include #include #include -#include #include #include #include +#include "lockd.h" #include "share.h" #define NLMDBG_FACILITY NLMDBG_SVCSUBS diff --git a/fs/lockd/trace.h b/fs/lockd/trace.h index 7461b13b6e74..7214d7e96a42 100644 --- a/fs/lockd/trace.h +++ b/fs/lockd/trace.h @@ -8,7 +8,8 @@ #include #include #include -#include + +#include "lockd.h" #ifdef CONFIG_LOCKD_V4 #define NLM_STATUS_LIST \ diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c index adfcce2bf11b..5aac49d1875a 100644 --- a/fs/lockd/xdr.c +++ b/fs/lockd/xdr.c @@ -15,13 +15,12 @@ #include #include #include -#include #include +#include "lockd.h" #include "svcxdr.h" - static inline loff_t s32_to_loff_t(__s32 offset) { diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c index 5b1e15977697..f57d4881d5f1 100644 --- a/fs/lockd/xdr4.c +++ b/fs/lockd/xdr4.c @@ -16,8 +16,8 @@ #include #include #include -#include +#include "lockd.h" #include "svcxdr.h" #include "xdr4.h" diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h deleted file mode 100644 index eebcecd12fae..000000000000 --- a/include/linux/lockd/lockd.h +++ /dev/null @@ -1,401 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * linux/include/linux/lockd/lockd.h - * - * General-purpose lockd include file. - * - * Copyright (C) 1996 Olaf Kirch - */ - -#ifndef LINUX_LOCKD_LOCKD_H -#define LINUX_LOCKD_LOCKD_H - -/* XXX: a lot of this should really be under fs/lockd. */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Version string - */ -#define LOCKD_VERSION "0.5" - -/* - * Default timeout for RPC calls (seconds) - */ -#define LOCKD_DFLT_TIMEO 10 - -/* - * Internal-use status codes, not to be placed on the wire. 
- * Version handlers translate these to appropriate wire values. - */ -#define nlm__int__drop_reply cpu_to_be32(30000) -#define nlm__int__deadlock cpu_to_be32(30001) -#define nlm__int__stale_fh cpu_to_be32(30002) -#define nlm__int__failed cpu_to_be32(30003) - -/* - * Lockd host handle (used both by the client and server personality). - */ -struct nlm_host { - struct hlist_node h_hash; /* doubly linked list */ - struct sockaddr_storage h_addr; /* peer address */ - size_t h_addrlen; - struct sockaddr_storage h_srcaddr; /* our address (optional) */ - size_t h_srcaddrlen; - struct rpc_clnt *h_rpcclnt; /* RPC client to talk to peer */ - char *h_name; /* remote hostname */ - u32 h_version; /* interface version */ - unsigned short h_proto; /* transport proto */ - unsigned short h_reclaiming : 1, - h_server : 1, /* server side, not client side */ - h_noresvport : 1, - h_inuse : 1; - wait_queue_head_t h_gracewait; /* wait while reclaiming */ - struct rw_semaphore h_rwsem; /* Reboot recovery lock */ - u32 h_state; /* pseudo-state counter */ - u32 h_nsmstate; /* true remote NSM state */ - u32 h_pidcount; /* Pseudopids */ - refcount_t h_count; /* reference count */ - struct mutex h_mutex; /* mutex for pmap binding */ - unsigned long h_nextrebind; /* next portmap call */ - unsigned long h_expires; /* eligible for GC */ - struct list_head h_lockowners; /* Lockowners for the client */ - spinlock_t h_lock; - struct list_head h_granted; /* Locks in GRANTED state */ - struct list_head h_reclaim; /* Locks in RECLAIM state */ - struct nsm_handle *h_nsmhandle; /* NSM status handle */ - char *h_addrbuf; /* address eyecatcher */ - struct net *net; /* host net */ - const struct cred *h_cred; - char nodename[UNX_MAXNODENAME + 1]; - const struct nlmclnt_operations *h_nlmclnt_ops; /* Callback ops for NLM users */ -}; - -/* - * The largest string sm_addrbuf should hold is a full-size IPv6 address - * (no "::" anywhere) with a scope ID. 
The buffer size is computed to - * hold eight groups of colon-separated four-hex-digit numbers, a - * percent sign, a scope id (at most 32 bits, in decimal), and NUL. - */ -#define NSM_ADDRBUF ((8 * 4 + 7) + (1 + 10) + 1) - -struct nsm_handle { - struct list_head sm_link; - refcount_t sm_count; - char *sm_mon_name; - char *sm_name; - struct sockaddr_storage sm_addr; - size_t sm_addrlen; - unsigned int sm_monitored : 1, - sm_sticky : 1; /* don't unmonitor */ - struct nsm_private sm_priv; - char sm_addrbuf[NSM_ADDRBUF]; -}; - -/* - * Rigorous type checking on sockaddr type conversions - */ -static inline struct sockaddr *nlm_addr(const struct nlm_host *host) -{ - return (struct sockaddr *)&host->h_addr; -} - -static inline struct sockaddr *nlm_srcaddr(const struct nlm_host *host) -{ - return (struct sockaddr *)&host->h_srcaddr; -} - -/* - * Map an fl_owner_t into a unique 32-bit "pid" - */ -struct nlm_lockowner { - struct list_head list; - refcount_t count; - - struct nlm_host *host; - fl_owner_t owner; - uint32_t pid; -}; - -/* - * This is the representation of a blocked client lock. - */ -struct nlm_wait { - struct list_head b_list; /* linked list */ - wait_queue_head_t b_wait; /* where to wait on */ - struct nlm_host *b_host; - struct file_lock *b_lock; /* local file lock */ - __be32 b_status; /* grant callback status */ -}; - -/* - * Memory chunk for NLM client RPC request. - */ -#define NLMCLNT_OHSIZE ((__NEW_UTS_LEN) + 10u) -struct nlm_rqst { - refcount_t a_count; - unsigned int a_flags; /* initial RPC task flags */ - struct nlm_host * a_host; /* host handle */ - struct nlm_args a_args; /* arguments */ - struct nlm_res a_res; /* result */ - struct nlm_block * a_block; - unsigned int a_retries; /* Retry count */ - u8 a_owner[NLMCLNT_OHSIZE]; - void * a_callback_data; /* sent to nlmclnt_operations callbacks */ -}; - -struct nlm_share; - -/* - * This struct describes a file held open by lockd on behalf of - * an NFS client. 
- */ -struct nlm_file { - struct hlist_node f_list; /* linked list */ - struct nfs_fh f_handle; /* NFS file handle */ - struct file * f_file[2]; /* VFS file pointers, - indexed by O_ flags */ - struct nlm_share * f_shares; /* DOS shares */ - struct list_head f_blocks; /* blocked locks */ - unsigned int f_locks; /* guesstimate # of locks */ - unsigned int f_count; /* reference count */ - struct mutex f_mutex; /* avoid concurrent access */ -}; - -/* - * This is a server block (i.e. a lock requested by some client which - * couldn't be granted because of a conflicting lock). - */ -#define NLM_NEVER (~(unsigned long) 0) -/* timeout on non-blocking call: */ -#define NLM_TIMEOUT (7 * HZ) - -struct nlm_block { - struct kref b_count; /* Reference count */ - struct list_head b_list; /* linked list of all blocks */ - struct list_head b_flist; /* linked list (per file) */ - struct nlm_rqst * b_call; /* RPC args & callback info */ - struct svc_serv * b_daemon; /* NLM service */ - struct nlm_host * b_host; /* host handle for RPC clnt */ - unsigned long b_when; /* next re-xmit */ - unsigned int b_id; /* block id */ - unsigned char b_granted; /* VFS granted lock */ - struct nlm_file * b_file; /* file in question */ - struct cache_req * b_cache_req; /* deferred request handling */ - struct cache_deferred_req * b_deferred_req; - unsigned int b_flags; /* block flags */ -#define B_QUEUED 1 /* lock queued */ -#define B_GOT_CALLBACK 2 /* got lock or conflicting lock */ -#define B_TIMED_OUT 4 /* filesystem too slow to respond */ -}; - -/* - * Global variables - */ -extern const struct rpc_program nlm_program; -extern const struct svc_procedure nlmsvc_procedures[24]; -#ifdef CONFIG_LOCKD_V4 -extern const struct svc_procedure nlmsvc_procedures4[24]; -#endif -extern int nlmsvc_grace_period; -extern unsigned long nlm_timeout; -extern bool nsm_use_hostnames; -extern u32 nsm_local_state; - -extern struct timer_list nlmsvc_retry; - -/* - * Lockd client functions - */ -struct nlm_rqst * 
nlm_alloc_call(struct nlm_host *host); -int nlm_async_call(struct nlm_rqst *, u32, const struct rpc_call_ops *); -int nlm_async_reply(struct nlm_rqst *, u32, const struct rpc_call_ops *); -void nlmclnt_release_call(struct nlm_rqst *); -void nlmclnt_prepare_block(struct nlm_wait *block, struct nlm_host *host, - struct file_lock *fl); -void nlmclnt_queue_block(struct nlm_wait *block); -__be32 nlmclnt_dequeue_block(struct nlm_wait *block); -int nlmclnt_wait(struct nlm_wait *block, struct nlm_rqst *req, long timeout); -__be32 nlmclnt_grant(const struct sockaddr *addr, - const struct nlm_lock *lock); -void nlmclnt_recovery(struct nlm_host *); -int nlmclnt_reclaim(struct nlm_host *, struct file_lock *, - struct nlm_rqst *); -void nlmclnt_next_cookie(struct nlm_cookie *); - -#ifdef CONFIG_LOCKD_V4 -extern const struct rpc_version nlm_version4; -#endif - -/* - * Host cache - */ -struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, - const size_t salen, - const unsigned short protocol, - const u32 version, - const char *hostname, - int noresvport, - struct net *net, - const struct cred *cred); -void nlmclnt_release_host(struct nlm_host *); -struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, - const char *hostname, - const size_t hostname_len); -void nlmsvc_release_host(struct nlm_host *); -struct rpc_clnt * nlm_bind_host(struct nlm_host *); -void nlm_rebind_host(struct nlm_host *); -struct nlm_host * nlm_get_host(struct nlm_host *); -void nlm_shutdown_hosts(void); -void nlm_shutdown_hosts_net(struct net *net); -void nlm_host_rebooted(const struct net *net, - const struct nlm_reboot *); - -/* - * Host monitoring - */ -int nsm_monitor(const struct nlm_host *host); -void nsm_unmonitor(const struct nlm_host *host); - -struct nsm_handle *nsm_get_handle(const struct net *net, - const struct sockaddr *sap, - const size_t salen, - const char *hostname, - const size_t hostname_len); -struct nsm_handle *nsm_reboot_lookup(const struct net *net, - const 
struct nlm_reboot *info); -void nsm_release(struct nsm_handle *nsm); - -/* - * This is used in garbage collection and resource reclaim - * A return value != 0 means destroy the lock/block/share - */ -typedef int (*nlm_host_match_fn_t)(void *cur, struct nlm_host *ref); - -/* - * Server-side lock handling - */ -int lock_to_openmode(struct file_lock *); -__be32 nlmsvc_lock(struct svc_rqst *, struct nlm_file *, - struct nlm_host *, struct nlm_lock *, int, - struct nlm_cookie *, int); -__be32 nlmsvc_unlock(struct net *net, struct nlm_file *, struct nlm_lock *); -__be32 nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, - struct nlm_host *host, struct nlm_lock *lock, - struct nlm_lock *conflock); -__be32 nlmsvc_cancel_blocked(struct net *net, struct nlm_file *, struct nlm_lock *); -void nlmsvc_retry_blocked(struct svc_rqst *rqstp); -void nlmsvc_traverse_blocks(struct nlm_host *, struct nlm_file *, - nlm_host_match_fn_t match); -void nlmsvc_grant_reply(struct nlm_cookie *, __be32); -void nlmsvc_release_call(struct nlm_rqst *); -void nlmsvc_locks_init_private(struct file_lock *, struct nlm_host *, pid_t); - -/* - * File handling for the server personality - */ -__be32 nlm_lookup_file(struct svc_rqst *, struct nlm_file **, - struct nlm_lock *); -void nlm_release_file(struct nlm_file *); -void nlmsvc_put_lockowner(struct nlm_lockowner *); -void nlmsvc_release_lockowner(struct nlm_lock *); -void nlmsvc_mark_resources(struct net *); -void nlmsvc_free_host_resources(struct nlm_host *); -void nlmsvc_invalidate_all(void); - -static inline struct file *nlmsvc_file_file(const struct nlm_file *file) -{ - return file->f_file[O_RDONLY] ? 
- file->f_file[O_RDONLY] : file->f_file[O_WRONLY]; -} - -static inline struct inode *nlmsvc_file_inode(struct nlm_file *file) -{ - return file_inode(nlmsvc_file_file(file)); -} - -static inline bool -nlmsvc_file_cannot_lock(const struct nlm_file *file) -{ - return exportfs_cannot_lock(nlmsvc_file_file(file)->f_path.dentry->d_sb->s_export_op); -} - -static inline int __nlm_privileged_request4(const struct sockaddr *sap) -{ - const struct sockaddr_in *sin = (struct sockaddr_in *)sap; - - if (ntohs(sin->sin_port) > 1023) - return 0; - - return ipv4_is_loopback(sin->sin_addr.s_addr); -} - -#if IS_ENABLED(CONFIG_IPV6) -static inline int __nlm_privileged_request6(const struct sockaddr *sap) -{ - const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; - - if (ntohs(sin6->sin6_port) > 1023) - return 0; - - if (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_MAPPED) - return ipv4_is_loopback(sin6->sin6_addr.s6_addr32[3]); - - return ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LOOPBACK; -} -#else /* IS_ENABLED(CONFIG_IPV6) */ -static inline int __nlm_privileged_request6(const struct sockaddr *sap) -{ - return 0; -} -#endif /* IS_ENABLED(CONFIG_IPV6) */ - -/* - * Ensure incoming requests are from local privileged callers. - * - * Return TRUE if sender is local and is connecting via a privileged port; - * otherwise return FALSE. - */ -static inline int nlm_privileged_requester(const struct svc_rqst *rqstp) -{ - const struct sockaddr *sap = svc_addr(rqstp); - - switch (sap->sa_family) { - case AF_INET: - return __nlm_privileged_request4(sap); - case AF_INET6: - return __nlm_privileged_request6(sap); - default: - return 0; - } -} - -/* - * Compare two NLM locks. - * When the second lock is of type F_UNLCK, this acts like a wildcard. 
- */ -static inline int nlm_compare_locks(const struct file_lock *fl1, - const struct file_lock *fl2) -{ - return file_inode(fl1->c.flc_file) == file_inode(fl2->c.flc_file) - && fl1->c.flc_pid == fl2->c.flc_pid - && fl1->c.flc_owner == fl2->c.flc_owner - && fl1->fl_start == fl2->fl_start - && fl1->fl_end == fl2->fl_end - &&(fl1->c.flc_type == fl2->c.flc_type || fl2->c.flc_type == F_UNLCK); -} - -extern const struct lock_manager_operations nlmsvc_lock_operations; - -#endif /* LINUX_LOCKD_LOCKD_H */ -- cgit v1.2.3 From 236f3171ac690f632e13d391f47c68c3a8519bd2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:31 -0500 Subject: lockd: Remove lockd/debug.h The lockd include structure has unnecessary indirection. The header include/linux/lockd/debug.h is consumed only by fs/lockd/lockd.h, creating an extra compilation dependency and making the code harder to navigate. Fold the debug.h definitions directly into lockd.h and remove the now-redundant header. This reduces the include tree depth and makes the debug-related definitions easier to find when working on lockd internals. Build-tested with lockd built as module and built-in. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/lockd.h | 24 +++++++++++++++++++++++- include/linux/lockd/debug.h | 40 ---------------------------------------- 2 files changed, 23 insertions(+), 41 deletions(-) delete mode 100644 include/linux/lockd/debug.h (limited to 'include') diff --git a/fs/lockd/lockd.h b/fs/lockd/lockd.h index 9bcf89765a69..460ccb701749 100644 --- a/fs/lockd/lockd.h +++ b/fs/lockd/lockd.h @@ -16,9 +16,31 @@ #include #include #include -#include +#include #include +/* + * Enable lockd debugging. + * Requires CONFIG_SUNRPC_DEBUG. 
+ */ +#undef ifdebug +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) +# define ifdebug(flag) if (unlikely(nlm_debug & NLMDBG_##flag)) +#else +# define ifdebug(flag) if (0) +#endif + +#define NLMDBG_SVC 0x0001 +#define NLMDBG_CLIENT 0x0002 +#define NLMDBG_CLNTLOCK 0x0004 +#define NLMDBG_SVCLOCK 0x0008 +#define NLMDBG_MONITOR 0x0010 +#define NLMDBG_CLNTSUBS 0x0020 +#define NLMDBG_SVCSUBS 0x0040 +#define NLMDBG_HOSTCACHE 0x0080 +#define NLMDBG_XDR 0x0100 +#define NLMDBG_ALL 0x7fff + /* * Version string */ diff --git a/include/linux/lockd/debug.h b/include/linux/lockd/debug.h deleted file mode 100644 index eede2ab5246f..000000000000 --- a/include/linux/lockd/debug.h +++ /dev/null @@ -1,40 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * linux/include/linux/lockd/debug.h - * - * Debugging stuff. - * - * Copyright (C) 1996 Olaf Kirch - */ - -#ifndef LINUX_LOCKD_DEBUG_H -#define LINUX_LOCKD_DEBUG_H - -#include - -/* - * Enable lockd debugging. - * Requires RPC_DEBUG. - */ -#undef ifdebug -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define ifdebug(flag) if (unlikely(nlm_debug & NLMDBG_##flag)) -#else -# define ifdebug(flag) if (0) -#endif - -/* - * Debug flags - */ -#define NLMDBG_SVC 0x0001 -#define NLMDBG_CLIENT 0x0002 -#define NLMDBG_CLNTLOCK 0x0004 -#define NLMDBG_SVCLOCK 0x0008 -#define NLMDBG_MONITOR 0x0010 -#define NLMDBG_CLNTSUBS 0x0020 -#define NLMDBG_SVCSUBS 0x0040 -#define NLMDBG_HOSTCACHE 0x0080 -#define NLMDBG_XDR 0x0100 -#define NLMDBG_ALL 0x7fff - -#endif /* LINUX_LOCKD_DEBUG_H */ -- cgit v1.2.3 From 615384a24b1e6b0f091ebc1dfbf7ec8b4c27fa81 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:32 -0500 Subject: lockd: Move xdr.h from include/linux/lockd/ to fs/lockd/ The lockd subsystem unnecessarily exposes internal NLM XDR type definitions through the global include path. These definitions are not used by any code outside fs/lockd/, making them inappropriate for include/linux/lockd/. 
Moving xdr.h to fs/lockd/ narrows the API surface and clarifies that these types are internal implementation details. The comment in linux/lockd/bind.h stating xdr.h was needed for "xdr-encoded error codes" is stale: no lockd API consumers use those codes. Forward declarations for struct nfs_fh and struct file_lock are added to bind.h because their definitions were previously pulled in transitively through xdr.h. Additionally, nfs3proc.c and proc.c need explicit includes of filelock.h for FL_CLOSE and for accessing struct file_lock members, respectively. Built and tested with lockd client/server operations. No functional change. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/lockd.h | 2 +- fs/lockd/xdr.h | 111 ++++++++++++++++++++++++++++++++++++++++++++ fs/nfs/nfs3proc.c | 1 + fs/nfs/proc.c | 1 + include/linux/lockd/bind.h | 5 +- include/linux/lockd/xdr.h | 113 --------------------------------------------- 6 files changed, 116 insertions(+), 117 deletions(-) create mode 100644 fs/lockd/xdr.h delete mode 100644 include/linux/lockd/xdr.h (limited to 'include') diff --git a/fs/lockd/lockd.h b/fs/lockd/lockd.h index 460ccb701749..6f83b9a7257f 100644 --- a/fs/lockd/lockd.h +++ b/fs/lockd/lockd.h @@ -15,7 +15,7 @@ #include #include #include -#include +#include "xdr.h" #include #include diff --git a/fs/lockd/xdr.h b/fs/lockd/xdr.h new file mode 100644 index 000000000000..af821ecf2a4e --- /dev/null +++ b/fs/lockd/xdr.h @@ -0,0 +1,111 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * XDR types for the NLM protocol + * + * Copyright (C) 1996 Olaf Kirch + */ + +#ifndef _LOCKD_XDR_H +#define _LOCKD_XDR_H + +#include +#include +#include +#include + +#define SM_MAXSTRLEN 1024 +#define SM_PRIV_SIZE 16 + +struct nsm_private { + unsigned char data[SM_PRIV_SIZE]; +}; + +struct svc_rqst; + +#define NLM_MAXCOOKIELEN 32 +#define NLM_MAXSTRLEN 1024 + +#define nlm_granted cpu_to_be32(NLM_LCK_GRANTED) +#define nlm_lck_denied cpu_to_be32(NLM_LCK_DENIED) +#define 
nlm_lck_denied_nolocks cpu_to_be32(NLM_LCK_DENIED_NOLOCKS) +#define nlm_lck_blocked cpu_to_be32(NLM_LCK_BLOCKED) +#define nlm_lck_denied_grace_period cpu_to_be32(NLM_LCK_DENIED_GRACE_PERIOD) + +/* Lock info passed via NLM */ +struct nlm_lock { + char * caller; + unsigned int len; /* length of "caller" */ + struct nfs_fh fh; + struct xdr_netobj oh; + u32 svid; + u64 lock_start; + u64 lock_len; + struct file_lock fl; +}; + +/* + * NLM cookies. Technically they can be 1K, but Linux only uses 8 bytes. + * FreeBSD uses 16, Apple Mac OS X 10.3 uses 20. Therefore we set it to + * 32 bytes. + */ + +struct nlm_cookie +{ + unsigned char data[NLM_MAXCOOKIELEN]; + unsigned int len; +}; + +/* + * Generic lockd arguments for all but sm_notify + */ +struct nlm_args { + struct nlm_cookie cookie; + struct nlm_lock lock; + u32 block; + u32 reclaim; + u32 state; + u32 monitor; + u32 fsm_access; + u32 fsm_mode; +}; + +/* + * Generic lockd result + */ +struct nlm_res { + struct nlm_cookie cookie; + __be32 status; + struct nlm_lock lock; +}; + +/* + * statd callback when client has rebooted + */ +struct nlm_reboot { + char *mon; + unsigned int len; + u32 state; + struct nsm_private priv; +}; + +/* + * Contents of statd callback when monitored host rebooted + */ +#define NLMSVC_XDRSIZE sizeof(struct nlm_args) + +bool nlmsvc_decode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlmsvc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlmsvc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlmsvc_decode_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlmsvc_decode_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlmsvc_decode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlmsvc_decode_reboot(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlmsvc_decode_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlmsvc_decode_notify(struct svc_rqst *rqstp, struct xdr_stream 
*xdr); + +bool nlmsvc_encode_testres(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlmsvc_encode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlmsvc_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlmsvc_encode_shareres(struct svc_rqst *rqstp, struct xdr_stream *xdr); + +#endif /* _LOCKD_XDR_H */ diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index be2aebf62056..95d7cd564b74 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 8c3d2efa2636..70795684b8e8 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include "internal.h" diff --git a/include/linux/lockd/bind.h b/include/linux/lockd/bind.h index 077da0696f12..ba9258c96bfd 100644 --- a/include/linux/lockd/bind.h +++ b/include/linux/lockd/bind.h @@ -11,10 +11,9 @@ #define LINUX_LOCKD_BIND_H #include -/* need xdr-encoded error codes too, so... 
*/ -#include -/* Dummy declarations */ +struct file_lock; +struct nfs_fh; struct svc_rqst; struct rpc_task; struct rpc_clnt; diff --git a/include/linux/lockd/xdr.h b/include/linux/lockd/xdr.h deleted file mode 100644 index 292e4e38d17d..000000000000 --- a/include/linux/lockd/xdr.h +++ /dev/null @@ -1,113 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * linux/include/linux/lockd/xdr.h - * - * XDR types for the NLM protocol - * - * Copyright (C) 1996 Olaf Kirch - */ - -#ifndef LOCKD_XDR_H -#define LOCKD_XDR_H - -#include -#include -#include -#include - -#define SM_MAXSTRLEN 1024 -#define SM_PRIV_SIZE 16 - -struct nsm_private { - unsigned char data[SM_PRIV_SIZE]; -}; - -struct svc_rqst; - -#define NLM_MAXCOOKIELEN 32 -#define NLM_MAXSTRLEN 1024 - -#define nlm_granted cpu_to_be32(NLM_LCK_GRANTED) -#define nlm_lck_denied cpu_to_be32(NLM_LCK_DENIED) -#define nlm_lck_denied_nolocks cpu_to_be32(NLM_LCK_DENIED_NOLOCKS) -#define nlm_lck_blocked cpu_to_be32(NLM_LCK_BLOCKED) -#define nlm_lck_denied_grace_period cpu_to_be32(NLM_LCK_DENIED_GRACE_PERIOD) - -/* Lock info passed via NLM */ -struct nlm_lock { - char * caller; - unsigned int len; /* length of "caller" */ - struct nfs_fh fh; - struct xdr_netobj oh; - u32 svid; - u64 lock_start; - u64 lock_len; - struct file_lock fl; -}; - -/* - * NLM cookies. Technically they can be 1K, but Linux only uses 8 bytes. - * FreeBSD uses 16, Apple Mac OS X 10.3 uses 20. Therefore we set it to - * 32 bytes. 
- */ - -struct nlm_cookie -{ - unsigned char data[NLM_MAXCOOKIELEN]; - unsigned int len; -}; - -/* - * Generic lockd arguments for all but sm_notify - */ -struct nlm_args { - struct nlm_cookie cookie; - struct nlm_lock lock; - u32 block; - u32 reclaim; - u32 state; - u32 monitor; - u32 fsm_access; - u32 fsm_mode; -}; - -/* - * Generic lockd result - */ -struct nlm_res { - struct nlm_cookie cookie; - __be32 status; - struct nlm_lock lock; -}; - -/* - * statd callback when client has rebooted - */ -struct nlm_reboot { - char *mon; - unsigned int len; - u32 state; - struct nsm_private priv; -}; - -/* - * Contents of statd callback when monitored host rebooted - */ -#define NLMSVC_XDRSIZE sizeof(struct nlm_args) - -bool nlmsvc_decode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlmsvc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlmsvc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlmsvc_decode_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlmsvc_decode_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlmsvc_decode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlmsvc_decode_reboot(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlmsvc_decode_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlmsvc_decode_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr); - -bool nlmsvc_encode_testres(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlmsvc_encode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlmsvc_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlmsvc_encode_shareres(struct svc_rqst *rqstp, struct xdr_stream *xdr); - -#endif /* LOCKD_XDR_H */ -- cgit v1.2.3 From 5829352e568d24dd04ae112128a4f44748d073bc Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Jan 2026 10:19:33 -0500 Subject: lockd: Make linux/lockd/nlm.h an internal header The NLM protocol constants and status codes in nlm.h 
are needed only by lockd's internal implementation. NFS client code and NFSD interact with lockd through the stable API in bind.h and have no direct use for protocol-level definitions. Exposing these definitions globally via bind.h creates unnecessary coupling between lockd internals and its consumers. Moving nlm.h from include/linux/lockd/ to fs/lockd/ clarifies the API boundary: bind.h provides the lockd service interface, while nlm.h remains available only to code within fs/lockd/ that implements the protocol. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/lockd/lockd.h | 1 + fs/lockd/nlm.h | 56 ++++++++++++++++++++++++++++++++++++++++++++ fs/lockd/svclock.c | 1 - include/linux/lockd/bind.h | 2 -- include/linux/lockd/nlm.h | 58 ---------------------------------------------- 5 files changed, 57 insertions(+), 61 deletions(-) create mode 100644 fs/lockd/nlm.h delete mode 100644 include/linux/lockd/nlm.h (limited to 'include') diff --git a/fs/lockd/lockd.h b/fs/lockd/lockd.h index 6f83b9a7257f..e73c6b348154 100644 --- a/fs/lockd/lockd.h +++ b/fs/lockd/lockd.h @@ -14,6 +14,7 @@ #include #include #include +#include "nlm.h" #include #include "xdr.h" #include diff --git a/fs/lockd/nlm.h b/fs/lockd/nlm.h new file mode 100644 index 000000000000..47be65d0111f --- /dev/null +++ b/fs/lockd/nlm.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Declarations for the Network Lock Manager protocol. 
+ * + * Copyright (C) 1996, Olaf Kirch + */ + +#ifndef _LOCKD_NLM_H +#define _LOCKD_NLM_H + + +/* Maximum file offset in file_lock.fl_end */ +# define NLM_OFFSET_MAX ((s32) 0x7fffffff) +# define NLM4_OFFSET_MAX ((s64) ((~(u64)0) >> 1)) + +/* Return states for NLM */ +enum { + NLM_LCK_GRANTED = 0, + NLM_LCK_DENIED = 1, + NLM_LCK_DENIED_NOLOCKS = 2, + NLM_LCK_BLOCKED = 3, + NLM_LCK_DENIED_GRACE_PERIOD = 4, +#ifdef CONFIG_LOCKD_V4 + NLM_DEADLCK = 5, + NLM_ROFS = 6, + NLM_STALE_FH = 7, + NLM_FBIG = 8, + NLM_FAILED = 9, +#endif +}; + +#define NLM_PROGRAM 100021 + +#define NLMPROC_NULL 0 +#define NLMPROC_TEST 1 +#define NLMPROC_LOCK 2 +#define NLMPROC_CANCEL 3 +#define NLMPROC_UNLOCK 4 +#define NLMPROC_GRANTED 5 +#define NLMPROC_TEST_MSG 6 +#define NLMPROC_LOCK_MSG 7 +#define NLMPROC_CANCEL_MSG 8 +#define NLMPROC_UNLOCK_MSG 9 +#define NLMPROC_GRANTED_MSG 10 +#define NLMPROC_TEST_RES 11 +#define NLMPROC_LOCK_RES 12 +#define NLMPROC_CANCEL_RES 13 +#define NLMPROC_UNLOCK_RES 14 +#define NLMPROC_GRANTED_RES 15 +#define NLMPROC_NSM_NOTIFY 16 /* statd callback */ +#define NLMPROC_SHARE 20 +#define NLMPROC_UNSHARE 21 +#define NLMPROC_NM_LOCK 22 +#define NLMPROC_FREE_ALL 23 + +#endif /* _LOCKD_NLM_H */ diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 1c800fffe69c..e687103e42d1 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -28,7 +28,6 @@ #include #include #include -#include #include "lockd.h" diff --git a/include/linux/lockd/bind.h b/include/linux/lockd/bind.h index ba9258c96bfd..b614e0deea72 100644 --- a/include/linux/lockd/bind.h +++ b/include/linux/lockd/bind.h @@ -10,8 +10,6 @@ #ifndef LINUX_LOCKD_BIND_H #define LINUX_LOCKD_BIND_H -#include - struct file_lock; struct nfs_fh; struct svc_rqst; diff --git a/include/linux/lockd/nlm.h b/include/linux/lockd/nlm.h deleted file mode 100644 index 6e343ef760dc..000000000000 --- a/include/linux/lockd/nlm.h +++ /dev/null @@ -1,58 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * 
linux/include/linux/lockd/nlm.h - * - * Declarations for the Network Lock Manager protocol. - * - * Copyright (C) 1996, Olaf Kirch - */ - -#ifndef LINUX_LOCKD_NLM_H -#define LINUX_LOCKD_NLM_H - - -/* Maximum file offset in file_lock.fl_end */ -# define NLM_OFFSET_MAX ((s32) 0x7fffffff) -# define NLM4_OFFSET_MAX ((s64) ((~(u64)0) >> 1)) - -/* Return states for NLM */ -enum { - NLM_LCK_GRANTED = 0, - NLM_LCK_DENIED = 1, - NLM_LCK_DENIED_NOLOCKS = 2, - NLM_LCK_BLOCKED = 3, - NLM_LCK_DENIED_GRACE_PERIOD = 4, -#ifdef CONFIG_LOCKD_V4 - NLM_DEADLCK = 5, - NLM_ROFS = 6, - NLM_STALE_FH = 7, - NLM_FBIG = 8, - NLM_FAILED = 9, -#endif -}; - -#define NLM_PROGRAM 100021 - -#define NLMPROC_NULL 0 -#define NLMPROC_TEST 1 -#define NLMPROC_LOCK 2 -#define NLMPROC_CANCEL 3 -#define NLMPROC_UNLOCK 4 -#define NLMPROC_GRANTED 5 -#define NLMPROC_TEST_MSG 6 -#define NLMPROC_LOCK_MSG 7 -#define NLMPROC_CANCEL_MSG 8 -#define NLMPROC_UNLOCK_MSG 9 -#define NLMPROC_GRANTED_MSG 10 -#define NLMPROC_TEST_RES 11 -#define NLMPROC_LOCK_RES 12 -#define NLMPROC_CANCEL_RES 13 -#define NLMPROC_UNLOCK_RES 14 -#define NLMPROC_GRANTED_RES 15 -#define NLMPROC_NSM_NOTIFY 16 /* statd callback */ -#define NLMPROC_SHARE 20 -#define NLMPROC_UNSHARE 21 -#define NLMPROC_NM_LOCK 22 -#define NLMPROC_FREE_ALL 23 - -#endif /* LINUX_LOCKD_NLM_H */ -- cgit v1.2.3 From adcc59114ccd402259c089b0fea24da5e4974563 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 4 Feb 2026 21:21:50 +0100 Subject: sunrpc: Kill RPC_IFDEBUG() RPC_IFDEBUG() is used in only two places. In one the user of the definition is guarded by ifdeffery, in the second one it's implied due to dprintk() usage. Kill the macro and move the ifdeffery to the regular condition with the variable defined inside, while in the second case add the same conditional and move the respective code there. 
Reviewed-by: Jeff Layton Signed-off-by: Andy Shevchenko Signed-off-by: Chuck Lever --- fs/nfsd/nfsfh.c | 9 ++++++--- include/linux/sunrpc/debug.h | 2 -- net/sunrpc/xprtrdma/svc_rdma_transport.c | 27 ++++++++++++++------------- 3 files changed, 20 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index ed85dd43da18..68b629fbaaeb 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -105,9 +105,12 @@ static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp, { /* Check if the request originated from a secure port. */ if (rqstp && !nfsd_originating_port_ok(rqstp, cred, exp)) { - RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); - dprintk("nfsd: request from insecure port %s!\n", - svc_print_addr(rqstp, buf, sizeof(buf))); + if (IS_ENABLED(CONFIG_SUNRPC_DEBUG)) { + char buf[RPC_MAX_ADDRBUFLEN]; + + dprintk("nfsd: request from insecure port %s!\n", + svc_print_addr(rqstp, buf, sizeof(buf))); + } return nfserr_perm; } diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h index eb4bd62df319..93d1a11ffbfb 100644 --- a/include/linux/sunrpc/debug.h +++ b/include/linux/sunrpc/debug.h @@ -49,12 +49,10 @@ do { \ } \ } while (0) -# define RPC_IFDEBUG(x) x #else # define ifdebug(fac) if (0) # define dfprintk(fac, fmt, ...) do {} while (0) # define dfprintk_rcu(fac, fmt, ...) 
do {} while (0) -# define RPC_IFDEBUG(x) #endif /* diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 9b623849723e..f2d72181a6fe 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -414,7 +414,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) struct ib_qp_init_attr qp_attr; struct ib_device *dev; int ret = 0; - RPC_IFDEBUG(struct sockaddr *sap); listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt); clear_bit(XPT_CONN, &xprt->xpt_flags); @@ -560,18 +559,20 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) goto errout; } -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) - dprintk("svcrdma: new connection accepted on device %s:\n", dev->name); - sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr; - dprintk(" local address : %pIS:%u\n", sap, rpc_get_port(sap)); - sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr; - dprintk(" remote address : %pIS:%u\n", sap, rpc_get_port(sap)); - dprintk(" max_sge : %d\n", newxprt->sc_max_send_sges); - dprintk(" sq_depth : %d\n", newxprt->sc_sq_depth); - dprintk(" rdma_rw_ctxs : %d\n", ctxts); - dprintk(" max_requests : %d\n", newxprt->sc_max_requests); - dprintk(" ord : %d\n", conn_param.initiator_depth); -#endif + if (IS_ENABLED(CONFIG_SUNRPC_DEBUG)) { + struct sockaddr *sap; + + dprintk("svcrdma: new connection accepted on device %s:\n", dev->name); + sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr; + dprintk(" local address : %pIS:%u\n", sap, rpc_get_port(sap)); + sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr; + dprintk(" remote address : %pIS:%u\n", sap, rpc_get_port(sap)); + dprintk(" max_sge : %d\n", newxprt->sc_max_send_sges); + dprintk(" sq_depth : %d\n", newxprt->sc_sq_depth); + dprintk(" rdma_rw_ctxs : %d\n", ctxts); + dprintk(" max_requests : %d\n", newxprt->sc_max_requests); + dprintk(" ord : %d\n", 
conn_param.initiator_depth); + } return &newxprt->sc_xprt; -- cgit v1.2.3 From 6f57293abb8d087de830dd3f02e66d94b3e59973 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 4 Feb 2026 21:21:51 +0100 Subject: sunrpc: Fix compilation error (`make W=1`) when dprintk() is no-op Clang compiler is not happy about set but unused variables: .../flexfilelayout/flexfilelayoutdev.c:56:9: error: variable 'ret' set but not used [-Werror,-Wunused-but-set-variable] .../flexfilelayout/flexfilelayout.c:1505:6: error: variable 'err' set but not used [-Werror,-Wunused-but-set-variable] .../nfs4proc.c:9244:12: error: variable 'ptr' set but not used [-Werror,-Wunused-but-set-variable] Fix these by forwarding parameters of dprintk() to no_printk(). The positive side-effect is a format-string checker enabled even for the cases when dprintk() is no-op. Fixes: d67ae825a59d ("pnfs/flexfiles: Add the FlexFile Layout Driver") Fixes: fc931582c260 ("nfs41: create_session operation") Acked-by: Geert Uytterhoeven Reviewed-by: Jeff Layton Signed-off-by: Andy Shevchenko Signed-off-by: Chuck Lever --- fs/lockd/svclock.c | 5 +++++ include/linux/sunrpc/debug.h | 8 ++++++-- include/linux/sunrpc/sched.h | 3 --- 3 files changed, 11 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index e687103e42d1..ee23f5802af1 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -74,6 +74,11 @@ static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie) return buf; } +#else +static inline const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie) +{ + return "???"; +} #endif /* diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h index 93d1a11ffbfb..ab61bed2f7af 100644 --- a/include/linux/sunrpc/debug.h +++ b/include/linux/sunrpc/debug.h @@ -38,6 +38,8 @@ extern unsigned int nlm_debug; do { \ ifdebug(fac) \ __sunrpc_printk(fmt, ##__VA_ARGS__); \ + else \ + no_printk(fmt, ##__VA_ARGS__); \ } while (0) # define 
dfprintk_rcu(fac, fmt, ...) \ @@ -46,13 +48,15 @@ do { \ rcu_read_lock(); \ __sunrpc_printk(fmt, ##__VA_ARGS__); \ rcu_read_unlock(); \ + } else { \ + no_printk(fmt, ##__VA_ARGS__); \ } \ } while (0) #else # define ifdebug(fac) if (0) -# define dfprintk(fac, fmt, ...) do {} while (0) -# define dfprintk_rcu(fac, fmt, ...) do {} while (0) +# define dfprintk(fac, fmt, ...) no_printk(fmt, ##__VA_ARGS__) +# define dfprintk_rcu(fac, fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif /* diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index ccba79ebf893..0dbdf3722537 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -95,10 +95,7 @@ struct rpc_task { int tk_rpc_status; /* Result of last RPC operation */ unsigned short tk_flags; /* misc flags */ unsigned short tk_timeouts; /* maj timeouts */ - -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) || IS_ENABLED(CONFIG_TRACEPOINTS) unsigned short tk_pid; /* debugging aid */ -#endif unsigned char tk_priority : 2,/* Task priority */ tk_garb_retry : 2, tk_cred_retry : 2; -- cgit v1.2.3 From f52792f484ba2316853736856dde19b7e7458861 Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Fri, 13 Feb 2026 10:36:30 -0800 Subject: NFSD: Enforce timeout on layout recall and integrate lease manager fencing When a layout conflict triggers a recall, enforcing a timeout is necessary to prevent excessive nfsd threads from being blocked in __break_lease ensuring the server continues servicing incoming requests efficiently. This patch introduces a new function to lease_manager_operations: lm_breaker_timedout: Invoked when a lease recall times out and is about to be disposed of. This function enables the lease manager to inform the caller whether the file_lease should remain on the flc_list or be disposed of. For the NFSD lease manager, this function now handles layout recall timeouts. 
If the layout type supports fencing and the client has not been fenced, a fence operation is triggered to prevent the client from accessing the block device. While the fencing operation is in progress, the conflicting file_lease remains on the flc_list until fencing is complete. This guarantees that no other clients can access the file, and the client with exclusive access is properly blocked before disposal. Signed-off-by: Dai Ngo Signed-off-by: Chuck Lever --- .../admin-guide/nfs/pnfs-block-server.rst | 30 ++++ Documentation/admin-guide/nfs/pnfs-scsi-server.rst | 31 +++++ Documentation/filesystems/locking.rst | 2 + fs/locks.c | 26 +++- fs/nfsd/blocklayout.c | 42 +++++- fs/nfsd/nfs4layouts.c | 152 ++++++++++++++++++++- fs/nfsd/nfs4state.c | 1 + fs/nfsd/pnfs.h | 5 +- fs/nfsd/state.h | 6 + include/linux/filelock.h | 1 + 10 files changed, 279 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/Documentation/admin-guide/nfs/pnfs-block-server.rst b/Documentation/admin-guide/nfs/pnfs-block-server.rst index 20fe9f5117fe..b4f5997009af 100644 --- a/Documentation/admin-guide/nfs/pnfs-block-server.rst +++ b/Documentation/admin-guide/nfs/pnfs-block-server.rst @@ -40,3 +40,33 @@ how to translate the device into a serial number from SCSI EVPD 0x80:: echo "fencing client ${CLIENT} serial ${EVPD}" >> /var/log/pnfsd-fence.log EOF + +If the nfsd server needs to fence a non-responding client and the +fencing operation fails, the server logs a warning message in the +system log with the following format: + + FENCE failed client[IP_address] clid[#n] device[dev_name] + + Where: + + IP_address: refers to the IP address of the affected client. + #n: indicates the unique client identifier. + dev_name: specifies the name of the block device related + to the fencing attempt. + +The server will repeatedly retry the operation indefinitely. During +this time, access to the affected file is restricted for all other +clients. 
This is to prevent potential data corruption if multiple +clients access the same file simultaneously. + +To restore access to the affected file for other clients, the admin +needs to take the following actions: + + . shutdown or power off the client being fenced. + . manually expire the client to release all its state on the server: + + echo 'expire' > /proc/fs/nfsd/clients/clid/ctl + + Where: + + clid: is the unique client identifier displayed in the system log. diff --git a/Documentation/admin-guide/nfs/pnfs-scsi-server.rst b/Documentation/admin-guide/nfs/pnfs-scsi-server.rst index b2eec2288329..db34afbf67a9 100644 --- a/Documentation/admin-guide/nfs/pnfs-scsi-server.rst +++ b/Documentation/admin-guide/nfs/pnfs-scsi-server.rst @@ -22,3 +22,34 @@ option and the underlying SCSI device support persistent reservations. On the client make sure the kernel has the CONFIG_PNFS_BLOCK option enabled, and the file system is mounted using the NFSv4.1 protocol version (mount -o vers=4.1). + +If the nfsd server needs to fence a non-responding client and the +fencing operation fails, the server logs a warning message in the +system log with the following format: + + FENCE failed client[IP_address] clid[#n] device[dev_name] + + Where: + + IP_address: refers to the IP address of the affected client. + #n: indicates the unique client identifier. + dev_name: specifies the name of the block device related + to the fencing attempt. + +The server will repeatedly retry the operation indefinitely. During +this time, access to the affected file is restricted for all other +clients. This is to prevent potential data corruption if multiple +clients access the same file simultaneously. + +To restore access to the affected file for other clients, the admin +needs to take the following actions: + + . shutdown or power off the client being fenced. + . manually expire the client to release all its state on the server: + + echo 'expire' > /proc/fs/nfsd/clients/clid/ctl
+ + Where: + + clid: is the unique client identifier displayed in the system log. + diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 8025df6e6499..8421ea21bd35 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -398,6 +398,7 @@ prototypes:: bool (*lm_breaker_owns_lease)(struct file_lock *); bool (*lm_lock_expirable)(struct file_lock *); void (*lm_expire_lock)(void); + bool (*lm_breaker_timedout)(struct file_lease *); locking rules: @@ -412,6 +413,7 @@ lm_breaker_owns_lease: yes no no lm_lock_expirable yes no no lm_expire_lock no no yes lm_open_conflict yes no no +lm_breaker_timedout yes no no ====================== ============= ================= ========= buffer_head diff --git a/fs/locks.c b/fs/locks.c index d13ec930b7bb..8e44b1f6c15a 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1534,6 +1534,7 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose) { struct file_lock_context *ctx = inode->i_flctx; struct file_lease *fl, *tmp; + bool remove; lockdep_assert_held(&ctx->flc_lock); @@ -1541,8 +1542,19 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose) trace_time_out_leases(inode, fl); if (past_time(fl->fl_downgrade_time)) lease_modify(fl, F_RDLCK, dispose); - if (past_time(fl->fl_break_time)) - lease_modify(fl, F_UNLCK, dispose); + + remove = true; + if (past_time(fl->fl_break_time)) { + /* + * Consult the lease manager when a lease break times + * out to determine whether the lease should be disposed + * of. 
+ */ + if (fl->fl_lmops && fl->fl_lmops->lm_breaker_timedout) + remove = fl->fl_lmops->lm_breaker_timedout(fl); + if (remove) + lease_modify(fl, F_UNLCK, dispose); + } } } @@ -1670,9 +1682,13 @@ int __break_lease(struct inode *inode, unsigned int flags) restart: fl = list_first_entry(&ctx->flc_lease, struct file_lease, c.flc_list); break_time = fl->fl_break_time; - if (break_time != 0) - break_time -= jiffies; - if (break_time == 0) + if (break_time != 0) { + if (time_after(jiffies, break_time)) { + fl->fl_break_time = jiffies + lease_break_time * HZ; + break_time = lease_break_time * HZ; + } else + break_time -= jiffies; + } else break_time++; locks_insert_block(&fl->c, &new_fl->c, leases_conflict); trace_break_lease_block(inode, new_fl); diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 8b987fca1e60..9d829c84f374 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -297,6 +297,7 @@ static inline int nfsd4_scsi_fence_insert(struct nfs4_client *clp, ret = 0; } xa_unlock(xa); + clp->cl_fence_retry_warn = false; return ret; } @@ -443,15 +444,33 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode, struct svc_rqst *rqstp, return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps); } -static void +/* + * Perform the fence operation to prevent the client from accessing the + * block device. If a fence operation is already in progress, wait for + * it to complete before checking the NFSD_MDS_PR_FENCED flag. Once the + * operation is complete, check the flag. If NFSD_MDS_PR_FENCED is set, + * update the layout stateid by setting the ls_fenced flag to indicate + * that the client has been fenced. + * + * The cl_fence_mutex ensures that the fence operation has been fully + * completed, rather than just in progress, when returning from this + * function. + * + * Return true if client was fenced otherwise return false. 
+ */ +static bool nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file) { struct nfs4_client *clp = ls->ls_stid.sc_client; struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb->s_bdev; int status; + bool ret; - if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) - return; + mutex_lock(&clp->cl_fence_mutex); + if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) { + mutex_unlock(&clp->cl_fence_mutex); + return true; + } status = bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY, nfsd4_scsi_pr_key(clp), @@ -470,13 +489,22 @@ nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file) * PR_STS_RESERVATION_CONFLICT, which would cause an infinite * retry loop. */ - if (status < 0 || - status == PR_STS_PATH_FAILED || - status == PR_STS_PATH_FAST_FAILED || - status == PR_STS_RETRY_PATH_FAILURE) + switch (status) { + case 0: + case PR_STS_IOERR: + case PR_STS_RESERVATION_CONFLICT: + ret = true; + break; + default: + /* retry-able and other errors */ + ret = false; nfsd4_scsi_fence_clear(clp, bdev->bd_dev); + break; + } + mutex_unlock(&clp->cl_fence_mutex); trace_nfsd_pnfs_fence(clp, bdev->bd_disk->disk_name, status); + return ret; } const struct nfsd4_layout_ops scsi_layout_ops = { diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index ad7af8cfcf1f..69e41105efdd 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -27,6 +27,8 @@ static struct kmem_cache *nfs4_layout_stateid_cache; static const struct nfsd4_callback_ops nfsd4_cb_layout_ops; static const struct lease_manager_operations nfsd4_layouts_lm_ops; +static void nfsd4_layout_fence_worker(struct work_struct *work); + const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = { #ifdef CONFIG_NFSD_FLEXFILELAYOUT [LAYOUT_FLEX_FILES] = &ff_layout_ops, @@ -177,6 +179,13 @@ nfsd4_free_layout_stateid(struct nfs4_stid *stid) trace_nfsd_layoutstate_free(&ls->ls_stid.sc_stateid); + spin_lock(&ls->ls_lock); + if 
(delayed_work_pending(&ls->ls_fence_work)) { + spin_unlock(&ls->ls_lock); + cancel_delayed_work_sync(&ls->ls_fence_work); + } else + spin_unlock(&ls->ls_lock); + spin_lock(&clp->cl_lock); list_del_init(&ls->ls_perclnt); spin_unlock(&clp->cl_lock); @@ -271,6 +280,10 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, list_add(&ls->ls_perfile, &fp->fi_lo_states); spin_unlock(&fp->fi_lock); + ls->ls_fenced = false; + ls->ls_fence_delay = 0; + INIT_DELAYED_WORK(&ls->ls_fence_work, nfsd4_layout_fence_worker); + trace_nfsd_layoutstate_alloc(&ls->ls_stid.sc_stateid); return ls; } @@ -747,11 +760,9 @@ static bool nfsd4_layout_lm_break(struct file_lease *fl) { /* - * We don't want the locks code to timeout the lease for us; - * we'll remove it ourself if a layout isn't returned - * in time: + * Enforce break lease timeout to prevent NFSD + * thread from hanging in __break_lease. */ - fl->fl_break_time = 0; nfsd4_recall_file_layout(fl->c.flc_owner); return false; } @@ -782,10 +793,143 @@ nfsd4_layout_lm_open_conflict(struct file *filp, int arg) return 0; } +static void +nfsd4_layout_fence_worker(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct nfs4_layout_stateid *ls = container_of(dwork, + struct nfs4_layout_stateid, ls_fence_work); + struct nfsd_file *nf; + struct block_device *bdev; + struct nfs4_client *clp; + struct nfsd_net *nn; + + /* + * The workqueue clears WORK_STRUCT_PENDING before invoking + * this callback. Re-arm immediately so that + * delayed_work_pending() returns true while the fence + * operation is in progress, preventing + * lm_breaker_timedout() from taking a duplicate reference. 
+ */ + mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0); + + spin_lock(&ls->ls_lock); + if (list_empty(&ls->ls_layouts)) { + spin_unlock(&ls->ls_lock); +dispose: + cancel_delayed_work(&ls->ls_fence_work); + /* unlock the lease so that tasks waiting on it can proceed */ + nfsd4_close_layout(ls); + + ls->ls_fenced = true; + nfs4_put_stid(&ls->ls_stid); + return; + } + spin_unlock(&ls->ls_lock); + + rcu_read_lock(); + nf = nfsd_file_get(ls->ls_file); + rcu_read_unlock(); + if (!nf) + goto dispose; + + clp = ls->ls_stid.sc_client; + nn = net_generic(clp->net, nfsd_net_id); + bdev = nf->nf_file->f_path.mnt->mnt_sb->s_bdev; + if (nfsd4_layout_ops[ls->ls_layout_type]->fence_client(ls, nf)) { + /* fenced ok */ + nfsd_file_put(nf); + pr_warn("%s: FENCED client[%pISpc] clid[%d] to device[%s]\n", + __func__, (struct sockaddr *)&clp->cl_addr, + clp->cl_clientid.cl_id - nn->clientid_base, + bdev->bd_disk->disk_name); + goto dispose; + } + /* fence failed */ + nfsd_file_put(nf); + + if (!clp->cl_fence_retry_warn) { + pr_warn("%s: FENCE failed client[%pISpc] clid[%d] device[%s]\n", + __func__, (struct sockaddr *)&clp->cl_addr, + clp->cl_clientid.cl_id - nn->clientid_base, + bdev->bd_disk->disk_name); + clp->cl_fence_retry_warn = true; + } + /* + * The fence worker retries the fencing operation indefinitely to + * prevent data corruption. The admin needs to take the following + * actions to restore access to the file for other clients: + * + * . shutdown or power off the client being fenced. + * . manually expire the client to release all its state on the server; + * echo 'expire' > /proc/fs/nfsd/clients/clid/ctl'. + * + * Where: + * + * clid: is the unique client identifier displayed in + * the warning message above. 
+ */ + if (!ls->ls_fence_delay) + ls->ls_fence_delay = HZ; + else + ls->ls_fence_delay = min(ls->ls_fence_delay << 1, + MAX_FENCE_DELAY); + mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, ls->ls_fence_delay); +} + +/** + * nfsd4_layout_lm_breaker_timedout - The layout recall has timed out. + * @fl: file to check + * + * If the layout type supports a fence operation, schedule a worker to + * fence the client from accessing the block device. + * + * This function runs under the protection of the spin_lock flc_lock. + * At this time, the file_lease associated with the layout stateid is + * on the flc_list. A reference count is incremented on the layout + * stateid to prevent it from being freed while the fence worker is + * executing. Once the fence worker finishes its operation, it releases + * this reference. + * + * The fence worker continues to run until either the client has been + * fenced or the layout becomes invalid. The layout can become invalid + * as a result of a LAYOUTRETURN or when the CB_LAYOUT recall callback + * has completed. + * + * Return true if the file_lease should be disposed of by the caller; + * otherwise, return false. + */ +static bool +nfsd4_layout_lm_breaker_timedout(struct file_lease *fl) +{ + struct nfs4_layout_stateid *ls = fl->c.flc_owner; + + if ((!nfsd4_layout_ops[ls->ls_layout_type]->fence_client) || + ls->ls_fenced) + return true; + if (delayed_work_pending(&ls->ls_fence_work)) + return false; + /* + * Make sure layout has not been returned yet before + * taking a reference count on the layout stateid. 
+ */ + spin_lock(&ls->ls_lock); + if (list_empty(&ls->ls_layouts) || + !refcount_inc_not_zero(&ls->ls_stid.sc_count)) { + spin_unlock(&ls->ls_lock); + return true; + } + spin_unlock(&ls->ls_lock); + + mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0); + return false; +} + static const struct lease_manager_operations nfsd4_layouts_lm_ops = { .lm_break = nfsd4_layout_lm_break, .lm_change = nfsd4_layout_lm_change, .lm_open_conflict = nfsd4_layout_lm_open_conflict, + .lm_breaker_timedout = nfsd4_layout_lm_breaker_timedout, }; int diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 1b4c101ff04b..1d31f2bb2162 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2386,6 +2386,7 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name, #endif #ifdef CONFIG_NFSD_SCSILAYOUT xa_init(&clp->cl_dev_fences); + mutex_init(&clp->cl_fence_mutex); #endif INIT_LIST_HEAD(&clp->async_copies); spin_lock_init(&clp->async_lock); diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h index db9af780438b..f7bee4dc5d3d 100644 --- a/fs/nfsd/pnfs.h +++ b/fs/nfsd/pnfs.h @@ -11,6 +11,9 @@ struct xdr_stream; +/* Cap exponential backoff between fence retries at 3 minutes */ +#define MAX_FENCE_DELAY ((unsigned int)(3 * 60 * HZ)) + struct nfsd4_deviceid_map { struct list_head hash; u64 idx; @@ -38,7 +41,7 @@ struct nfsd4_layout_ops { struct svc_rqst *rqstp, struct nfsd4_layoutcommit *lcp); - void (*fence_client)(struct nfs4_layout_stateid *ls, + bool (*fence_client)(struct nfs4_layout_stateid *ls, struct nfsd_file *file); }; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 99aeaab9cf2b..ec1c5467012e 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -456,6 +456,7 @@ struct nfs4_client { struct list_head cl_lru; /* tail queue */ #ifdef CONFIG_NFSD_PNFS struct list_head cl_lo_states; /* outstanding layout states */ + bool cl_fence_retry_warn; #endif struct xdr_netobj cl_name; /* id generated by client */ nfs4_verifier cl_verifier; /* generated by client */ @@ -529,6 +530,7 
@@ struct nfs4_client { time64_t cl_ra_time; #ifdef CONFIG_NFSD_SCSILAYOUT struct xarray cl_dev_fences; + struct mutex cl_fence_mutex; #endif }; @@ -745,6 +747,10 @@ struct nfs4_layout_stateid { stateid_t ls_recall_sid; bool ls_recalled; struct mutex ls_mutex; + + struct delayed_work ls_fence_work; + unsigned int ls_fence_delay; + bool ls_fenced; }; static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s) diff --git a/include/linux/filelock.h b/include/linux/filelock.h index d2c9740e26a8..5f0a2fb31450 100644 --- a/include/linux/filelock.h +++ b/include/linux/filelock.h @@ -50,6 +50,7 @@ struct lease_manager_operations { void (*lm_setup)(struct file_lease *, void **); bool (*lm_breaker_owns_lease)(struct file_lease *); int (*lm_open_conflict)(struct file *, int); + bool (*lm_breaker_timedout)(struct file_lease *fl); }; struct lock_manager { -- cgit v1.2.3 From 5bc37b759ec0cdde2c652a2637d704f2d6306617 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 17 Feb 2026 17:06:53 -0500 Subject: Documentation: Add the RPC language description of NLM version 4 In order to generate source code to encode and decode NLMv4 protocol elements, include a copy of the RPC language description of NLMv4 for xdrgen to process. The language description is an amalgam of RFC 1813 and the Open Group's XNFS specification: https://pubs.opengroup.org/onlinepubs/9629799/chap10.htm The C code committed here was generated from the new nlm4.x file using tools/net/sunrpc/xdrgen/xdrgen. The goals of replacing hand-written XDR functions with ones that are tool-generated are to improve memory safety and make XDR encoding and decoding less brittle to maintain. The xdrgen utility derives both the type definitions and the encode/decode functions directly from protocol specifications, using names and symbols familiar to anyone who knows those specs. 
Unlike hand-written code that can inadvertently diverge from the specification, xdrgen guarantees that the generated code matches the specification exactly. We would eventually like xdrgen to generate Rust code as well, making the conversion of the kernel's NFS stacks to use Rust just a little easier for us. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- Documentation/sunrpc/xdr/nlm4.x | 211 +++++++++++ fs/lockd/Makefile | 30 +- fs/lockd/nlm4xdr_gen.c | 724 +++++++++++++++++++++++++++++++++++++ fs/lockd/nlm4xdr_gen.h | 32 ++ include/linux/sunrpc/xdrgen/nlm4.h | 233 ++++++++++++ 5 files changed, 1229 insertions(+), 1 deletion(-) create mode 100644 Documentation/sunrpc/xdr/nlm4.x create mode 100644 fs/lockd/nlm4xdr_gen.c create mode 100644 fs/lockd/nlm4xdr_gen.h create mode 100644 include/linux/sunrpc/xdrgen/nlm4.h (limited to 'include') diff --git a/Documentation/sunrpc/xdr/nlm4.x b/Documentation/sunrpc/xdr/nlm4.x new file mode 100644 index 000000000000..0c44a80ef674 --- /dev/null +++ b/Documentation/sunrpc/xdr/nlm4.x @@ -0,0 +1,211 @@ +/* + * This file was extracted by hand from + * https://www.rfc-editor.org/rfc/rfc1813.html . + * + * Note that RFC 1813 is Informational. Its official date of + * publication (June 1995) is before the IETF required its RFCs to + * carry an explicit copyright or other IP ownership notices. + * + * Note also that RFC 1813 does not specify the whole NLM4 protocol. + * In particular, the argument and result types are not present in + * that document, and had to be reverse-engineered. 
+ */ + +/* + * The NLMv4 protocol + */ + +pragma header nlm4; + +/* + * The following definitions are missing in RFC 1813, + * but can be found in the OpenNetworking Network Lock + * Manager protocol: + * + * https://pubs.opengroup.org/onlinepubs/9629799/chap10.htm + */ + +const LM_MAXSTRLEN = 1024; + +const LM_MAXNAMELEN = 1025; + +const MAXNETOBJ_SZ = 1024; + +typedef opaque netobj; + +enum fsh4_mode { + fsm_DN = 0, /* deny none */ + fsm_DR = 1, /* deny read */ + fsm_DW = 2, /* deny write */ + fsm_DRW = 3 /* deny read/write */ +}; + +enum fsh4_access { + fsa_NONE = 0, /* for completeness */ + fsa_R = 1, /* read-only */ + fsa_W = 2, /* write-only */ + fsa_RW = 3 /* read/write */ +}; + +/* + * The following definitions come from the OpenNetworking + * Network Status Monitor protocol: + * + * https://pubs.opengroup.org/onlinepubs/9629799/chap11.htm + */ + +const SM_MAXSTRLEN = 1024; + +/* + * The NLM protocol as extracted from: + * https://tools.ietf.org/html/rfc1813 Appendix II + */ + +typedef unsigned hyper uint64; + +typedef hyper int64; + +typedef unsigned long uint32; + +typedef long int32; + +enum nlm4_stats { + NLM4_GRANTED = 0, + NLM4_DENIED = 1, + NLM4_DENIED_NOLOCKS = 2, + NLM4_BLOCKED = 3, + NLM4_DENIED_GRACE_PERIOD = 4, + NLM4_DEADLCK = 5, + NLM4_ROFS = 6, + NLM4_STALE_FH = 7, + NLM4_FBIG = 8, + NLM4_FAILED = 9 +}; + +pragma big_endian nlm4_stats; + +struct nlm4_holder { + bool exclusive; + int32 svid; + netobj oh; + uint64 l_offset; + uint64 l_len; +}; + +union nlm4_testrply switch (nlm4_stats stat) { + case NLM4_DENIED: + nlm4_holder holder; + default: + void; +}; + +struct nlm4_stat { + nlm4_stats stat; +}; + +struct nlm4_res { + netobj cookie; + nlm4_stat stat; +}; + +struct nlm4_testres { + netobj cookie; + nlm4_testrply stat; +}; + +struct nlm4_lock { + string caller_name; + netobj fh; + netobj oh; + int32 svid; + uint64 l_offset; + uint64 l_len; +}; + +struct nlm4_lockargs { + netobj cookie; + bool block; + bool exclusive; + nlm4_lock alock; + 
bool reclaim; + int32 state; +}; + +struct nlm4_cancargs { + netobj cookie; + bool block; + bool exclusive; + nlm4_lock alock; +}; + +struct nlm4_testargs { + netobj cookie; + bool exclusive; + nlm4_lock alock; +}; + +struct nlm4_unlockargs { + netobj cookie; + nlm4_lock alock; +}; + +struct nlm4_share { + string caller_name; + netobj fh; + netobj oh; + fsh4_mode mode; + fsh4_access access; +}; + +struct nlm4_shareargs { + netobj cookie; + nlm4_share share; + bool reclaim; +}; + +struct nlm4_shareres { + netobj cookie; + nlm4_stats stat; + int32 sequence; +}; + +struct nlm4_notify { + string name; + int32 state; +}; + +/* + * Argument for the Linux-private SM_NOTIFY procedure + */ +const SM_PRIV_SIZE = 16; + +struct nlm4_notifyargs { + nlm4_notify notify; + opaque private[SM_PRIV_SIZE]; +}; + +program NLM4_PROG { + version NLM4_VERS { + void NLMPROC4_NULL(void) = 0; + nlm4_testres NLMPROC4_TEST(nlm4_testargs) = 1; + nlm4_res NLMPROC4_LOCK(nlm4_lockargs) = 2; + nlm4_res NLMPROC4_CANCEL(nlm4_cancargs) = 3; + nlm4_res NLMPROC4_UNLOCK(nlm4_unlockargs) = 4; + nlm4_res NLMPROC4_GRANTED(nlm4_testargs) = 5; + void NLMPROC4_TEST_MSG(nlm4_testargs) = 6; + void NLMPROC4_LOCK_MSG(nlm4_lockargs) = 7; + void NLMPROC4_CANCEL_MSG(nlm4_cancargs) = 8; + void NLMPROC4_UNLOCK_MSG(nlm4_unlockargs) = 9; + void NLMPROC4_GRANTED_MSG(nlm4_testargs) = 10; + void NLMPROC4_TEST_RES(nlm4_testres) = 11; + void NLMPROC4_LOCK_RES(nlm4_res) = 12; + void NLMPROC4_CANCEL_RES(nlm4_res) = 13; + void NLMPROC4_UNLOCK_RES(nlm4_res) = 14; + void NLMPROC4_GRANTED_RES(nlm4_res) = 15; + void NLMPROC4_SM_NOTIFY(nlm4_notifyargs) = 16; + nlm4_shareres NLMPROC4_SHARE(nlm4_shareargs) = 20; + nlm4_shareres NLMPROC4_UNSHARE(nlm4_shareargs) = 21; + nlm4_res NLMPROC4_NM_LOCK(nlm4_lockargs) = 22; + void NLMPROC4_FREE_ALL(nlm4_notify) = 23; + } = 4; +} = 100021; diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile index 51bbe22d21e3..8e9d18a4348c 100644 --- a/fs/lockd/Makefile +++ b/fs/lockd/Makefile @@ -9,5 +9,33 @@ 
obj-$(CONFIG_LOCKD) += lockd.o lockd-y := clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \ svcshare.o svcproc.o svcsubs.o mon.o trace.o xdr.o netlink.o -lockd-$(CONFIG_LOCKD_V4) += clnt4xdr.o xdr4.o svc4proc.o +lockd-$(CONFIG_LOCKD_V4) += clnt4xdr.o xdr4.o svc4proc.o nlm4xdr_gen.o lockd-$(CONFIG_PROC_FS) += procfs.o + +# +# XDR code generation (requires Python and additional packages) +# +# The generated *xdr_gen.{h,c} files are checked into git. Normal kernel +# builds do not require the xdrgen tool or its Python dependencies. +# +# Developers modifying .x files in Documentation/sunrpc/xdr/ should run +# "make xdrgen" to regenerate the affected files. +# +.PHONY: xdrgen + +XDRGEN = ../../tools/net/sunrpc/xdrgen/xdrgen + +XDRGEN_DEFINITIONS = ../../include/linux/sunrpc/xdrgen/nlm4.h +XDRGEN_DECLARATIONS = nlm4xdr_gen.h +XDRGEN_SOURCE = nlm4xdr_gen.c + +xdrgen: $(XDRGEN_DEFINITIONS) $(XDRGEN_DECLARATIONS) $(XDRGEN_SOURCE) + +../../include/linux/sunrpc/xdrgen/nlm4.h: ../../Documentation/sunrpc/xdr/nlm4.x + $(XDRGEN) definitions $< > $@ + +nlm4xdr_gen.h: ../../Documentation/sunrpc/xdr/nlm4.x + $(XDRGEN) declarations $< > $@ + +nlm4xdr_gen.c: ../../Documentation/sunrpc/xdr/nlm4.x + $(XDRGEN) source --peer server $< > $@ diff --git a/fs/lockd/nlm4xdr_gen.c b/fs/lockd/nlm4xdr_gen.c new file mode 100644 index 000000000000..1c8c221db456 --- /dev/null +++ b/fs/lockd/nlm4xdr_gen.c @@ -0,0 +1,724 @@ +// SPDX-License-Identifier: GPL-2.0 +// Generated by xdrgen. Manual edits will be lost. 
+// XDR specification file: ../../Documentation/sunrpc/xdr/nlm4.x +// XDR specification modification time: Thu Dec 25 13:10:19 2025 + +#include + +#include "nlm4xdr_gen.h" + +static bool __maybe_unused +xdrgen_decode_netobj(struct xdr_stream *xdr, netobj *ptr) +{ + return xdrgen_decode_opaque(xdr, ptr, MAXNETOBJ_SZ); +} + +static bool __maybe_unused +xdrgen_decode_fsh4_mode(struct xdr_stream *xdr, fsh4_mode *ptr) +{ + u32 val; + + if (xdr_stream_decode_u32(xdr, &val) < 0) + return false; + *ptr = val; + return true; +} + +static bool __maybe_unused +xdrgen_decode_fsh4_access(struct xdr_stream *xdr, fsh4_access *ptr) +{ + u32 val; + + if (xdr_stream_decode_u32(xdr, &val) < 0) + return false; + *ptr = val; + return true; +} + +static bool __maybe_unused +xdrgen_decode_uint64(struct xdr_stream *xdr, uint64 *ptr) +{ + return xdrgen_decode_unsigned_hyper(xdr, ptr); +} + +static bool __maybe_unused +xdrgen_decode_int64(struct xdr_stream *xdr, int64 *ptr) +{ + return xdrgen_decode_hyper(xdr, ptr); +} + +static bool __maybe_unused +xdrgen_decode_uint32(struct xdr_stream *xdr, uint32 *ptr) +{ + return xdrgen_decode_unsigned_long(xdr, ptr); +} + +static bool __maybe_unused +xdrgen_decode_int32(struct xdr_stream *xdr, int32 *ptr) +{ + return xdrgen_decode_long(xdr, ptr); +} + +static bool __maybe_unused +xdrgen_decode_nlm4_stats(struct xdr_stream *xdr, nlm4_stats *ptr) +{ + return xdr_stream_decode_be32(xdr, ptr) == 0; +} + +static bool __maybe_unused +xdrgen_decode_nlm4_holder(struct xdr_stream *xdr, struct nlm4_holder *ptr) +{ + if (!xdrgen_decode_bool(xdr, &ptr->exclusive)) + return false; + if (!xdrgen_decode_int32(xdr, &ptr->svid)) + return false; + if (!xdrgen_decode_netobj(xdr, &ptr->oh)) + return false; + if (!xdrgen_decode_uint64(xdr, &ptr->l_offset)) + return false; + if (!xdrgen_decode_uint64(xdr, &ptr->l_len)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_decode_nlm4_testrply(struct xdr_stream *xdr, struct nlm4_testrply *ptr) +{ + if 
(!xdrgen_decode_nlm4_stats(xdr, &ptr->stat)) + return false; + switch (ptr->stat) { + case __constant_cpu_to_be32(NLM4_DENIED): + if (!xdrgen_decode_nlm4_holder(xdr, &ptr->u.holder)) + return false; + break; + default: + break; + } + return true; +} + +static bool __maybe_unused +xdrgen_decode_nlm4_stat(struct xdr_stream *xdr, struct nlm4_stat *ptr) +{ + if (!xdrgen_decode_nlm4_stats(xdr, &ptr->stat)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_decode_nlm4_res(struct xdr_stream *xdr, struct nlm4_res *ptr) +{ + if (!xdrgen_decode_netobj(xdr, &ptr->cookie)) + return false; + if (!xdrgen_decode_nlm4_stat(xdr, &ptr->stat)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_decode_nlm4_testres(struct xdr_stream *xdr, struct nlm4_testres *ptr) +{ + if (!xdrgen_decode_netobj(xdr, &ptr->cookie)) + return false; + if (!xdrgen_decode_nlm4_testrply(xdr, &ptr->stat)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_decode_nlm4_lock(struct xdr_stream *xdr, struct nlm4_lock *ptr) +{ + if (!xdrgen_decode_string(xdr, (string *)ptr, LM_MAXSTRLEN)) + return false; + if (!xdrgen_decode_netobj(xdr, &ptr->fh)) + return false; + if (!xdrgen_decode_netobj(xdr, &ptr->oh)) + return false; + if (!xdrgen_decode_int32(xdr, &ptr->svid)) + return false; + if (!xdrgen_decode_uint64(xdr, &ptr->l_offset)) + return false; + if (!xdrgen_decode_uint64(xdr, &ptr->l_len)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_decode_nlm4_lockargs(struct xdr_stream *xdr, struct nlm4_lockargs *ptr) +{ + if (!xdrgen_decode_netobj(xdr, &ptr->cookie)) + return false; + if (!xdrgen_decode_bool(xdr, &ptr->block)) + return false; + if (!xdrgen_decode_bool(xdr, &ptr->exclusive)) + return false; + if (!xdrgen_decode_nlm4_lock(xdr, &ptr->alock)) + return false; + if (!xdrgen_decode_bool(xdr, &ptr->reclaim)) + return false; + if (!xdrgen_decode_int32(xdr, &ptr->state)) + return false; + return true; +} + +static bool 
__maybe_unused +xdrgen_decode_nlm4_cancargs(struct xdr_stream *xdr, struct nlm4_cancargs *ptr) +{ + if (!xdrgen_decode_netobj(xdr, &ptr->cookie)) + return false; + if (!xdrgen_decode_bool(xdr, &ptr->block)) + return false; + if (!xdrgen_decode_bool(xdr, &ptr->exclusive)) + return false; + if (!xdrgen_decode_nlm4_lock(xdr, &ptr->alock)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_decode_nlm4_testargs(struct xdr_stream *xdr, struct nlm4_testargs *ptr) +{ + if (!xdrgen_decode_netobj(xdr, &ptr->cookie)) + return false; + if (!xdrgen_decode_bool(xdr, &ptr->exclusive)) + return false; + if (!xdrgen_decode_nlm4_lock(xdr, &ptr->alock)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_decode_nlm4_unlockargs(struct xdr_stream *xdr, struct nlm4_unlockargs *ptr) +{ + if (!xdrgen_decode_netobj(xdr, &ptr->cookie)) + return false; + if (!xdrgen_decode_nlm4_lock(xdr, &ptr->alock)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_decode_nlm4_share(struct xdr_stream *xdr, struct nlm4_share *ptr) +{ + if (!xdrgen_decode_string(xdr, (string *)ptr, LM_MAXSTRLEN)) + return false; + if (!xdrgen_decode_netobj(xdr, &ptr->fh)) + return false; + if (!xdrgen_decode_netobj(xdr, &ptr->oh)) + return false; + if (!xdrgen_decode_fsh4_mode(xdr, &ptr->mode)) + return false; + if (!xdrgen_decode_fsh4_access(xdr, &ptr->access)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_decode_nlm4_shareargs(struct xdr_stream *xdr, struct nlm4_shareargs *ptr) +{ + if (!xdrgen_decode_netobj(xdr, &ptr->cookie)) + return false; + if (!xdrgen_decode_nlm4_share(xdr, &ptr->share)) + return false; + if (!xdrgen_decode_bool(xdr, &ptr->reclaim)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_decode_nlm4_shareres(struct xdr_stream *xdr, struct nlm4_shareres *ptr) +{ + if (!xdrgen_decode_netobj(xdr, &ptr->cookie)) + return false; + if (!xdrgen_decode_nlm4_stats(xdr, &ptr->stat)) + return false; + if 
(!xdrgen_decode_int32(xdr, &ptr->sequence)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_decode_nlm4_notify(struct xdr_stream *xdr, struct nlm4_notify *ptr) +{ + if (!xdrgen_decode_string(xdr, (string *)ptr, LM_MAXNAMELEN)) + return false; + if (!xdrgen_decode_int32(xdr, &ptr->state)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_decode_nlm4_notifyargs(struct xdr_stream *xdr, struct nlm4_notifyargs *ptr) +{ + if (!xdrgen_decode_nlm4_notify(xdr, &ptr->notify)) + return false; + if (xdr_stream_decode_opaque_fixed(xdr, ptr->private, SM_PRIV_SIZE) < 0) + return false; + return true; +} + +/** + * nlm4_svc_decode_void - Decode a void argument + * @rqstp: RPC transaction context + * @xdr: source XDR data stream + * + * Return values: + * %true: procedure arguments decoded successfully + * %false: decode failed + */ +bool nlm4_svc_decode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr) +{ + return xdrgen_decode_void(xdr); +} + +/** + * nlm4_svc_decode_nlm4_testargs - Decode a nlm4_testargs argument + * @rqstp: RPC transaction context + * @xdr: source XDR data stream + * + * Return values: + * %true: procedure arguments decoded successfully + * %false: decode failed + */ +bool nlm4_svc_decode_nlm4_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) +{ + struct nlm4_testargs *argp = rqstp->rq_argp; + + return xdrgen_decode_nlm4_testargs(xdr, argp); +} + +/** + * nlm4_svc_decode_nlm4_lockargs - Decode a nlm4_lockargs argument + * @rqstp: RPC transaction context + * @xdr: source XDR data stream + * + * Return values: + * %true: procedure arguments decoded successfully + * %false: decode failed + */ +bool nlm4_svc_decode_nlm4_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) +{ + struct nlm4_lockargs *argp = rqstp->rq_argp; + + return xdrgen_decode_nlm4_lockargs(xdr, argp); +} + +/** + * nlm4_svc_decode_nlm4_cancargs - Decode a nlm4_cancargs argument + * @rqstp: RPC transaction context + * @xdr: source XDR 
data stream + * + * Return values: + * %true: procedure arguments decoded successfully + * %false: decode failed + */ +bool nlm4_svc_decode_nlm4_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) +{ + struct nlm4_cancargs *argp = rqstp->rq_argp; + + return xdrgen_decode_nlm4_cancargs(xdr, argp); +} + +/** + * nlm4_svc_decode_nlm4_unlockargs - Decode a nlm4_unlockargs argument + * @rqstp: RPC transaction context + * @xdr: source XDR data stream + * + * Return values: + * %true: procedure arguments decoded successfully + * %false: decode failed + */ +bool nlm4_svc_decode_nlm4_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) +{ + struct nlm4_unlockargs *argp = rqstp->rq_argp; + + return xdrgen_decode_nlm4_unlockargs(xdr, argp); +} + +/** + * nlm4_svc_decode_nlm4_testres - Decode a nlm4_testres argument + * @rqstp: RPC transaction context + * @xdr: source XDR data stream + * + * Return values: + * %true: procedure arguments decoded successfully + * %false: decode failed + */ +bool nlm4_svc_decode_nlm4_testres(struct svc_rqst *rqstp, struct xdr_stream *xdr) +{ + struct nlm4_testres *argp = rqstp->rq_argp; + + return xdrgen_decode_nlm4_testres(xdr, argp); +} + +/** + * nlm4_svc_decode_nlm4_res - Decode a nlm4_res argument + * @rqstp: RPC transaction context + * @xdr: source XDR data stream + * + * Return values: + * %true: procedure arguments decoded successfully + * %false: decode failed + */ +bool nlm4_svc_decode_nlm4_res(struct svc_rqst *rqstp, struct xdr_stream *xdr) +{ + struct nlm4_res *argp = rqstp->rq_argp; + + return xdrgen_decode_nlm4_res(xdr, argp); +} + +/** + * nlm4_svc_decode_nlm4_notifyargs - Decode a nlm4_notifyargs argument + * @rqstp: RPC transaction context + * @xdr: source XDR data stream + * + * Return values: + * %true: procedure arguments decoded successfully + * %false: decode failed + */ +bool nlm4_svc_decode_nlm4_notifyargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) +{ + struct nlm4_notifyargs *argp = rqstp->rq_argp; + + 
return xdrgen_decode_nlm4_notifyargs(xdr, argp); +} + +/** + * nlm4_svc_decode_nlm4_shareargs - Decode a nlm4_shareargs argument + * @rqstp: RPC transaction context + * @xdr: source XDR data stream + * + * Return values: + * %true: procedure arguments decoded successfully + * %false: decode failed + */ +bool nlm4_svc_decode_nlm4_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) +{ + struct nlm4_shareargs *argp = rqstp->rq_argp; + + return xdrgen_decode_nlm4_shareargs(xdr, argp); +} + +/** + * nlm4_svc_decode_nlm4_notify - Decode a nlm4_notify argument + * @rqstp: RPC transaction context + * @xdr: source XDR data stream + * + * Return values: + * %true: procedure arguments decoded successfully + * %false: decode failed + */ +bool nlm4_svc_decode_nlm4_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr) +{ + struct nlm4_notify *argp = rqstp->rq_argp; + + return xdrgen_decode_nlm4_notify(xdr, argp); +} + +static bool __maybe_unused +xdrgen_encode_netobj(struct xdr_stream *xdr, const netobj value) +{ + return xdr_stream_encode_opaque(xdr, value.data, value.len) >= 0; +} + +static bool __maybe_unused +xdrgen_encode_fsh4_mode(struct xdr_stream *xdr, fsh4_mode value) +{ + return xdr_stream_encode_u32(xdr, value) == XDR_UNIT; +} + +static bool __maybe_unused +xdrgen_encode_fsh4_access(struct xdr_stream *xdr, fsh4_access value) +{ + return xdr_stream_encode_u32(xdr, value) == XDR_UNIT; +} + +static bool __maybe_unused +xdrgen_encode_uint64(struct xdr_stream *xdr, const uint64 value) +{ + return xdrgen_encode_unsigned_hyper(xdr, value); +} + +static bool __maybe_unused +xdrgen_encode_int64(struct xdr_stream *xdr, const int64 value) +{ + return xdrgen_encode_hyper(xdr, value); +} + +static bool __maybe_unused +xdrgen_encode_uint32(struct xdr_stream *xdr, const uint32 value) +{ + return xdrgen_encode_unsigned_long(xdr, value); +} + +static bool __maybe_unused +xdrgen_encode_int32(struct xdr_stream *xdr, const int32 value) +{ + return xdrgen_encode_long(xdr, value); 
+} + +static bool __maybe_unused +xdrgen_encode_nlm4_stats(struct xdr_stream *xdr, nlm4_stats value) +{ + return xdr_stream_encode_be32(xdr, value) == XDR_UNIT; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_holder(struct xdr_stream *xdr, const struct nlm4_holder *value) +{ + if (!xdrgen_encode_bool(xdr, value->exclusive)) + return false; + if (!xdrgen_encode_int32(xdr, value->svid)) + return false; + if (!xdrgen_encode_netobj(xdr, value->oh)) + return false; + if (!xdrgen_encode_uint64(xdr, value->l_offset)) + return false; + if (!xdrgen_encode_uint64(xdr, value->l_len)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_testrply(struct xdr_stream *xdr, const struct nlm4_testrply *ptr) +{ + if (!xdrgen_encode_nlm4_stats(xdr, ptr->stat)) + return false; + switch (ptr->stat) { + case __constant_cpu_to_be32(NLM4_DENIED): + if (!xdrgen_encode_nlm4_holder(xdr, &ptr->u.holder)) + return false; + break; + default: + break; + } + return true; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_stat(struct xdr_stream *xdr, const struct nlm4_stat *value) +{ + if (!xdrgen_encode_nlm4_stats(xdr, value->stat)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_res(struct xdr_stream *xdr, const struct nlm4_res *value) +{ + if (!xdrgen_encode_netobj(xdr, value->cookie)) + return false; + if (!xdrgen_encode_nlm4_stat(xdr, &value->stat)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_testres(struct xdr_stream *xdr, const struct nlm4_testres *value) +{ + if (!xdrgen_encode_netobj(xdr, value->cookie)) + return false; + if (!xdrgen_encode_nlm4_testrply(xdr, &value->stat)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_lock(struct xdr_stream *xdr, const struct nlm4_lock *value) +{ + if (value->caller_name.len > LM_MAXSTRLEN) + return false; + if (xdr_stream_encode_opaque(xdr, value->caller_name.data, value->caller_name.len) < 0) + return 
false; + if (!xdrgen_encode_netobj(xdr, value->fh)) + return false; + if (!xdrgen_encode_netobj(xdr, value->oh)) + return false; + if (!xdrgen_encode_int32(xdr, value->svid)) + return false; + if (!xdrgen_encode_uint64(xdr, value->l_offset)) + return false; + if (!xdrgen_encode_uint64(xdr, value->l_len)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_lockargs(struct xdr_stream *xdr, const struct nlm4_lockargs *value) +{ + if (!xdrgen_encode_netobj(xdr, value->cookie)) + return false; + if (!xdrgen_encode_bool(xdr, value->block)) + return false; + if (!xdrgen_encode_bool(xdr, value->exclusive)) + return false; + if (!xdrgen_encode_nlm4_lock(xdr, &value->alock)) + return false; + if (!xdrgen_encode_bool(xdr, value->reclaim)) + return false; + if (!xdrgen_encode_int32(xdr, value->state)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_cancargs(struct xdr_stream *xdr, const struct nlm4_cancargs *value) +{ + if (!xdrgen_encode_netobj(xdr, value->cookie)) + return false; + if (!xdrgen_encode_bool(xdr, value->block)) + return false; + if (!xdrgen_encode_bool(xdr, value->exclusive)) + return false; + if (!xdrgen_encode_nlm4_lock(xdr, &value->alock)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_testargs(struct xdr_stream *xdr, const struct nlm4_testargs *value) +{ + if (!xdrgen_encode_netobj(xdr, value->cookie)) + return false; + if (!xdrgen_encode_bool(xdr, value->exclusive)) + return false; + if (!xdrgen_encode_nlm4_lock(xdr, &value->alock)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_unlockargs(struct xdr_stream *xdr, const struct nlm4_unlockargs *value) +{ + if (!xdrgen_encode_netobj(xdr, value->cookie)) + return false; + if (!xdrgen_encode_nlm4_lock(xdr, &value->alock)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_share(struct xdr_stream *xdr, const struct nlm4_share *value) +{ + 
if (value->caller_name.len > LM_MAXSTRLEN) + return false; + if (xdr_stream_encode_opaque(xdr, value->caller_name.data, value->caller_name.len) < 0) + return false; + if (!xdrgen_encode_netobj(xdr, value->fh)) + return false; + if (!xdrgen_encode_netobj(xdr, value->oh)) + return false; + if (!xdrgen_encode_fsh4_mode(xdr, value->mode)) + return false; + if (!xdrgen_encode_fsh4_access(xdr, value->access)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_shareargs(struct xdr_stream *xdr, const struct nlm4_shareargs *value) +{ + if (!xdrgen_encode_netobj(xdr, value->cookie)) + return false; + if (!xdrgen_encode_nlm4_share(xdr, &value->share)) + return false; + if (!xdrgen_encode_bool(xdr, value->reclaim)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_shareres(struct xdr_stream *xdr, const struct nlm4_shareres *value) +{ + if (!xdrgen_encode_netobj(xdr, value->cookie)) + return false; + if (!xdrgen_encode_nlm4_stats(xdr, value->stat)) + return false; + if (!xdrgen_encode_int32(xdr, value->sequence)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_notify(struct xdr_stream *xdr, const struct nlm4_notify *value) +{ + if (value->name.len > LM_MAXNAMELEN) + return false; + if (xdr_stream_encode_opaque(xdr, value->name.data, value->name.len) < 0) + return false; + if (!xdrgen_encode_int32(xdr, value->state)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_notifyargs(struct xdr_stream *xdr, const struct nlm4_notifyargs *value) +{ + if (!xdrgen_encode_nlm4_notify(xdr, &value->notify)) + return false; + if (xdr_stream_encode_opaque_fixed(xdr, value->private, SM_PRIV_SIZE) < 0) + return false; + return true; +} + +/** + * nlm4_svc_encode_void - Encode a void result + * @rqstp: RPC transaction context + * @xdr: target XDR data stream + * + * Return values: + * %true: procedure results encoded successfully + * %false: encode failed + */ 
+bool nlm4_svc_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr) +{ + return xdrgen_encode_void(xdr); +} + +/** + * nlm4_svc_encode_nlm4_testres - Encode a nlm4_testres result + * @rqstp: RPC transaction context + * @xdr: target XDR data stream + * + * Return values: + * %true: procedure results encoded successfully + * %false: encode failed + */ +bool nlm4_svc_encode_nlm4_testres(struct svc_rqst *rqstp, struct xdr_stream *xdr) +{ + struct nlm4_testres *resp = rqstp->rq_resp; + + return xdrgen_encode_nlm4_testres(xdr, resp); +} + +/** + * nlm4_svc_encode_nlm4_res - Encode a nlm4_res result + * @rqstp: RPC transaction context + * @xdr: target XDR data stream + * + * Return values: + * %true: procedure results encoded successfully + * %false: encode failed + */ +bool nlm4_svc_encode_nlm4_res(struct svc_rqst *rqstp, struct xdr_stream *xdr) +{ + struct nlm4_res *resp = rqstp->rq_resp; + + return xdrgen_encode_nlm4_res(xdr, resp); +} + +/** + * nlm4_svc_encode_nlm4_shareres - Encode a nlm4_shareres result + * @rqstp: RPC transaction context + * @xdr: target XDR data stream + * + * Return values: + * %true: procedure results encoded successfully + * %false: encode failed + */ +bool nlm4_svc_encode_nlm4_shareres(struct svc_rqst *rqstp, struct xdr_stream *xdr) +{ + struct nlm4_shareres *resp = rqstp->rq_resp; + + return xdrgen_encode_nlm4_shareres(xdr, resp); +} diff --git a/fs/lockd/nlm4xdr_gen.h b/fs/lockd/nlm4xdr_gen.h new file mode 100644 index 000000000000..b6008b296a3e --- /dev/null +++ b/fs/lockd/nlm4xdr_gen.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Generated by xdrgen. Manual edits will be lost. 
*/ +/* XDR specification file: ../../Documentation/sunrpc/xdr/nlm4.x */ +/* XDR specification modification time: Thu Dec 25 13:10:19 2025 */ + +#ifndef _LINUX_XDRGEN_NLM4_DECL_H +#define _LINUX_XDRGEN_NLM4_DECL_H + +#include + +#include +#include +#include +#include + +bool nlm4_svc_decode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4_svc_decode_nlm4_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4_svc_decode_nlm4_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4_svc_decode_nlm4_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4_svc_decode_nlm4_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4_svc_decode_nlm4_testres(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4_svc_decode_nlm4_res(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4_svc_decode_nlm4_notifyargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4_svc_decode_nlm4_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4_svc_decode_nlm4_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr); + +bool nlm4_svc_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4_svc_encode_nlm4_testres(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4_svc_encode_nlm4_res(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4_svc_encode_nlm4_shareres(struct svc_rqst *rqstp, struct xdr_stream *xdr); + +#endif /* _LINUX_XDRGEN_NLM4_DECL_H */ diff --git a/include/linux/sunrpc/xdrgen/nlm4.h b/include/linux/sunrpc/xdrgen/nlm4.h new file mode 100644 index 000000000000..e95e8f105624 --- /dev/null +++ b/include/linux/sunrpc/xdrgen/nlm4.h @@ -0,0 +1,233 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Generated by xdrgen. Manual edits will be lost. 
*/ +/* XDR specification file: ../../Documentation/sunrpc/xdr/nlm4.x */ +/* XDR specification modification time: Thu Dec 25 13:10:19 2025 */ + +#ifndef _LINUX_XDRGEN_NLM4_DEF_H +#define _LINUX_XDRGEN_NLM4_DEF_H + +#include +#include + +enum { LM_MAXSTRLEN = 1024 }; + +enum { LM_MAXNAMELEN = 1025 }; + +enum { MAXNETOBJ_SZ = 1024 }; + +typedef opaque netobj; + +enum fsh4_mode { + fsm_DN = 0, + fsm_DR = 1, + fsm_DW = 2, + fsm_DRW = 3, +}; + +typedef enum fsh4_mode fsh4_mode; + +enum fsh4_access { + fsa_NONE = 0, + fsa_R = 1, + fsa_W = 2, + fsa_RW = 3, +}; + +typedef enum fsh4_access fsh4_access; + +enum { SM_MAXSTRLEN = 1024 }; + +typedef u64 uint64; + +typedef s64 int64; + +typedef u32 uint32; + +typedef s32 int32; + +enum nlm4_stats { + NLM4_GRANTED = 0, + NLM4_DENIED = 1, + NLM4_DENIED_NOLOCKS = 2, + NLM4_BLOCKED = 3, + NLM4_DENIED_GRACE_PERIOD = 4, + NLM4_DEADLCK = 5, + NLM4_ROFS = 6, + NLM4_STALE_FH = 7, + NLM4_FBIG = 8, + NLM4_FAILED = 9, +}; + +typedef __be32 nlm4_stats; + +struct nlm4_holder { + bool exclusive; + int32 svid; + netobj oh; + uint64 l_offset; + uint64 l_len; +}; + +struct nlm4_testrply { + nlm4_stats stat; + union { + struct nlm4_holder holder; + } u; +}; + +struct nlm4_stat { + nlm4_stats stat; +}; + +struct nlm4_res { + netobj cookie; + struct nlm4_stat stat; +}; + +struct nlm4_testres { + netobj cookie; + struct nlm4_testrply stat; +}; + +struct nlm4_lock { + string caller_name; + netobj fh; + netobj oh; + int32 svid; + uint64 l_offset; + uint64 l_len; +}; + +struct nlm4_lockargs { + netobj cookie; + bool block; + bool exclusive; + struct nlm4_lock alock; + bool reclaim; + int32 state; +}; + +struct nlm4_cancargs { + netobj cookie; + bool block; + bool exclusive; + struct nlm4_lock alock; +}; + +struct nlm4_testargs { + netobj cookie; + bool exclusive; + struct nlm4_lock alock; +}; + +struct nlm4_unlockargs { + netobj cookie; + struct nlm4_lock alock; +}; + +struct nlm4_share { + string caller_name; + netobj fh; + netobj oh; + fsh4_mode mode; 
+ fsh4_access access; +}; + +struct nlm4_shareargs { + netobj cookie; + struct nlm4_share share; + bool reclaim; +}; + +struct nlm4_shareres { + netobj cookie; + nlm4_stats stat; + int32 sequence; +}; + +struct nlm4_notify { + string name; + int32 state; +}; + +enum { SM_PRIV_SIZE = 16 }; + +struct nlm4_notifyargs { + struct nlm4_notify notify; + u8 private[SM_PRIV_SIZE]; +}; + +enum { + NLMPROC4_NULL = 0, + NLMPROC4_TEST = 1, + NLMPROC4_LOCK = 2, + NLMPROC4_CANCEL = 3, + NLMPROC4_UNLOCK = 4, + NLMPROC4_GRANTED = 5, + NLMPROC4_TEST_MSG = 6, + NLMPROC4_LOCK_MSG = 7, + NLMPROC4_CANCEL_MSG = 8, + NLMPROC4_UNLOCK_MSG = 9, + NLMPROC4_GRANTED_MSG = 10, + NLMPROC4_TEST_RES = 11, + NLMPROC4_LOCK_RES = 12, + NLMPROC4_CANCEL_RES = 13, + NLMPROC4_UNLOCK_RES = 14, + NLMPROC4_GRANTED_RES = 15, + NLMPROC4_SM_NOTIFY = 16, + NLMPROC4_SHARE = 20, + NLMPROC4_UNSHARE = 21, + NLMPROC4_NM_LOCK = 22, + NLMPROC4_FREE_ALL = 23, +}; + +#ifndef NLM4_PROG +#define NLM4_PROG (100021) +#endif + +#define NLM4_netobj_sz (XDR_unsigned_int + XDR_QUADLEN(MAXNETOBJ_SZ)) +#define NLM4_fsh4_mode_sz (XDR_int) +#define NLM4_fsh4_access_sz (XDR_int) +#define NLM4_uint64_sz \ + (XDR_unsigned_hyper) +#define NLM4_int64_sz \ + (XDR_hyper) +#define NLM4_uint32_sz \ + (XDR_unsigned_long) +#define NLM4_int32_sz \ + (XDR_long) +#define NLM4_nlm4_stats_sz (XDR_int) +#define NLM4_nlm4_holder_sz \ + (XDR_bool + NLM4_int32_sz + NLM4_netobj_sz + NLM4_uint64_sz + NLM4_uint64_sz) +#define NLM4_nlm4_testrply_sz \ + (NLM4_nlm4_stats_sz + NLM4_nlm4_holder_sz) +#define NLM4_nlm4_stat_sz \ + (NLM4_nlm4_stats_sz) +#define NLM4_nlm4_res_sz \ + (NLM4_netobj_sz + NLM4_nlm4_stat_sz) +#define NLM4_nlm4_testres_sz \ + (NLM4_netobj_sz + NLM4_nlm4_testrply_sz) +#define NLM4_nlm4_lock_sz \ + (XDR_unsigned_int + XDR_QUADLEN(LM_MAXSTRLEN) + NLM4_netobj_sz + NLM4_netobj_sz + NLM4_int32_sz + NLM4_uint64_sz + NLM4_uint64_sz) +#define NLM4_nlm4_lockargs_sz \ + (NLM4_netobj_sz + XDR_bool + XDR_bool + NLM4_nlm4_lock_sz + XDR_bool + 
NLM4_int32_sz) +#define NLM4_nlm4_cancargs_sz \ + (NLM4_netobj_sz + XDR_bool + XDR_bool + NLM4_nlm4_lock_sz) +#define NLM4_nlm4_testargs_sz \ + (NLM4_netobj_sz + XDR_bool + NLM4_nlm4_lock_sz) +#define NLM4_nlm4_unlockargs_sz \ + (NLM4_netobj_sz + NLM4_nlm4_lock_sz) +#define NLM4_nlm4_share_sz \ + (XDR_unsigned_int + XDR_QUADLEN(LM_MAXSTRLEN) + NLM4_netobj_sz + NLM4_netobj_sz + NLM4_fsh4_mode_sz + NLM4_fsh4_access_sz) +#define NLM4_nlm4_shareargs_sz \ + (NLM4_netobj_sz + NLM4_nlm4_share_sz + XDR_bool) +#define NLM4_nlm4_shareres_sz \ + (NLM4_netobj_sz + NLM4_nlm4_stats_sz + NLM4_int32_sz) +#define NLM4_nlm4_notify_sz \ + (XDR_unsigned_int + XDR_QUADLEN(LM_MAXNAMELEN) + NLM4_int32_sz) +#define NLM4_nlm4_notifyargs_sz \ + (NLM4_nlm4_notify_sz + XDR_QUADLEN(SM_PRIV_SIZE)) +#define NLM4_MAX_ARGS_SZ \ + (NLM4_nlm4_lockargs_sz) + +#endif /* _LINUX_XDRGEN_NLM4_DEF_H */ -- cgit v1.2.3 From 6b4f16a532e794e0df90baf15173e2166f863864 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 21 Feb 2026 13:39:59 -0500 Subject: sunrpc: Add XPT flags missing from SVC_XPRT_FLAG_LIST Commit eccbbc7c00a5 ("nfsd: don't use sv_nrthreads in connection limiting calculations.") and commit 898374fdd7f0 ("nfsd: unregister with rpcbind when deleting a transport") added new XPT flags but neglected to update the show_svc_xprt_flags() macro. 
Signed-off-by: Chuck Lever --- include/trace/events/sunrpc.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 750ecce56930..ff855197880d 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -1933,7 +1933,9 @@ TRACE_EVENT(svc_stats_latency, svc_xprt_flag(CONG_CTRL) \ svc_xprt_flag(HANDSHAKE) \ svc_xprt_flag(TLS_SESSION) \ - svc_xprt_flag_end(PEER_AUTH) + svc_xprt_flag(PEER_AUTH) \ + svc_xprt_flag(PEER_VALID) \ + svc_xprt_flag_end(RPCB_UNREG) #undef svc_xprt_flag #undef svc_xprt_flag_end -- cgit v1.2.3 From 17c1d66579ff27a7a8f2f407d1425272ff6fdd8c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 23 Feb 2026 12:09:59 -0500 Subject: sunrpc: convert queue_lock from global spinlock to per-cache-detail lock The global queue_lock serializes all upcall queue operations across every cache_detail instance. Convert it to a per-cache-detail spinlock so that different caches (e.g. auth.unix.ip vs nfsd.fh) no longer contend with each other on queue operations. 
Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/cache.h | 1 + net/sunrpc/cache.c | 47 ++++++++++++++++++++++---------------------- 2 files changed, 24 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index e783132e481f..3d32dd1f7b05 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -113,6 +113,7 @@ struct cache_detail { /* fields for communication over channel */ struct list_head queue; + spinlock_t queue_lock; atomic_t writers; /* how many time is /channel open */ time64_t last_close; /* if no writers, when did last close */ diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 86b3fd5a429d..1cfaae488c6c 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -400,6 +400,7 @@ void sunrpc_init_cache_detail(struct cache_detail *cd) { spin_lock_init(&cd->hash_lock); INIT_LIST_HEAD(&cd->queue); + spin_lock_init(&cd->queue_lock); spin_lock(&cache_list_lock); cd->nextcheck = 0; cd->entries = 0; @@ -803,8 +804,6 @@ void cache_clean_deferred(void *owner) * */ -static DEFINE_SPINLOCK(queue_lock); - struct cache_queue { struct list_head list; int reader; /* if 0, then request */ @@ -847,7 +846,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, inode_lock(inode); /* protect against multiple concurrent * readers on this file */ again: - spin_lock(&queue_lock); + spin_lock(&cd->queue_lock); /* need to find next request */ while (rp->q.list.next != &cd->queue && list_entry(rp->q.list.next, struct cache_queue, list) @@ -856,7 +855,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, list_move(&rp->q.list, next); } if (rp->q.list.next == &cd->queue) { - spin_unlock(&queue_lock); + spin_unlock(&cd->queue_lock); inode_unlock(inode); WARN_ON_ONCE(rp->offset); return 0; @@ -865,7 +864,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, 
WARN_ON_ONCE(rq->q.reader); if (rp->offset == 0) rq->readers++; - spin_unlock(&queue_lock); + spin_unlock(&cd->queue_lock); if (rq->len == 0) { err = cache_request(cd, rq); @@ -876,9 +875,9 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, if (rp->offset == 0 && !test_bit(CACHE_PENDING, &rq->item->flags)) { err = -EAGAIN; - spin_lock(&queue_lock); + spin_lock(&cd->queue_lock); list_move(&rp->q.list, &rq->q.list); - spin_unlock(&queue_lock); + spin_unlock(&cd->queue_lock); } else { if (rp->offset + count > rq->len) count = rq->len - rp->offset; @@ -888,26 +887,26 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, rp->offset += count; if (rp->offset >= rq->len) { rp->offset = 0; - spin_lock(&queue_lock); + spin_lock(&cd->queue_lock); list_move(&rp->q.list, &rq->q.list); - spin_unlock(&queue_lock); + spin_unlock(&cd->queue_lock); } err = 0; } out: if (rp->offset == 0) { /* need to release rq */ - spin_lock(&queue_lock); + spin_lock(&cd->queue_lock); rq->readers--; if (rq->readers == 0 && !test_bit(CACHE_PENDING, &rq->item->flags)) { list_del(&rq->q.list); - spin_unlock(&queue_lock); + spin_unlock(&cd->queue_lock); cache_put(rq->item, cd); kfree(rq->buf); kfree(rq); } else - spin_unlock(&queue_lock); + spin_unlock(&cd->queue_lock); } if (err == -EAGAIN) goto again; @@ -988,7 +987,7 @@ static __poll_t cache_poll(struct file *filp, poll_table *wait, if (!rp) return mask; - spin_lock(&queue_lock); + spin_lock(&cd->queue_lock); for (cq= &rp->q; &cq->list != &cd->queue; cq = list_entry(cq->list.next, struct cache_queue, list)) @@ -996,7 +995,7 @@ static __poll_t cache_poll(struct file *filp, poll_table *wait, mask |= EPOLLIN | EPOLLRDNORM; break; } - spin_unlock(&queue_lock); + spin_unlock(&cd->queue_lock); return mask; } @@ -1011,7 +1010,7 @@ static int cache_ioctl(struct inode *ino, struct file *filp, if (cmd != FIONREAD || !rp) return -EINVAL; - spin_lock(&queue_lock); + spin_lock(&cd->queue_lock); /* only find 
the length remaining in current request, * or the length of the next request @@ -1024,7 +1023,7 @@ static int cache_ioctl(struct inode *ino, struct file *filp, len = cr->len - rp->offset; break; } - spin_unlock(&queue_lock); + spin_unlock(&cd->queue_lock); return put_user(len, (int __user *)arg); } @@ -1046,9 +1045,9 @@ static int cache_open(struct inode *inode, struct file *filp, rp->offset = 0; rp->q.reader = 1; - spin_lock(&queue_lock); + spin_lock(&cd->queue_lock); list_add(&rp->q.list, &cd->queue); - spin_unlock(&queue_lock); + spin_unlock(&cd->queue_lock); } if (filp->f_mode & FMODE_WRITE) atomic_inc(&cd->writers); @@ -1064,7 +1063,7 @@ static int cache_release(struct inode *inode, struct file *filp, if (rp) { struct cache_request *rq = NULL; - spin_lock(&queue_lock); + spin_lock(&cd->queue_lock); if (rp->offset) { struct cache_queue *cq; for (cq = &rp->q; &cq->list != &cd->queue; @@ -1086,7 +1085,7 @@ static int cache_release(struct inode *inode, struct file *filp, rp->offset = 0; } list_del(&rp->q.list); - spin_unlock(&queue_lock); + spin_unlock(&cd->queue_lock); if (rq) { cache_put(rq->item, cd); @@ -1113,7 +1112,7 @@ static void cache_dequeue(struct cache_detail *detail, struct cache_head *ch) struct cache_request *cr; LIST_HEAD(dequeued); - spin_lock(&queue_lock); + spin_lock(&detail->queue_lock); list_for_each_entry_safe(cq, tmp, &detail->queue, list) if (!cq->reader) { cr = container_of(cq, struct cache_request, q); @@ -1126,7 +1125,7 @@ static void cache_dequeue(struct cache_detail *detail, struct cache_head *ch) continue; list_move(&cr->q.list, &dequeued); } - spin_unlock(&queue_lock); + spin_unlock(&detail->queue_lock); while (!list_empty(&dequeued)) { cr = list_entry(dequeued.next, struct cache_request, q.list); list_del(&cr->q.list); @@ -1251,7 +1250,7 @@ static int cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h) crq->buf = buf; crq->len = 0; crq->readers = 0; - spin_lock(&queue_lock); + spin_lock(&detail->queue_lock); if 
(test_bit(CACHE_PENDING, &h->flags)) { crq->item = cache_get(h); list_add_tail(&crq->q.list, &detail->queue); @@ -1259,7 +1258,7 @@ static int cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h) } else /* Lost a race, no longer PENDING, so don't enqueue */ ret = -EAGAIN; - spin_unlock(&queue_lock); + spin_unlock(&detail->queue_lock); wake_up(&queue_wait); if (ret == -EAGAIN) { kfree(buf); -- cgit v1.2.3 From 552d0e17ea042fc4f959c4543cbbd0e54de7a8e9 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 23 Feb 2026 12:10:00 -0500 Subject: sunrpc: convert queue_wait from global to per-cache-detail waitqueue The queue_wait waitqueue is currently a file-scoped global, so a wake_up for one cache_detail wakes pollers on all caches. Convert it to a per-cache-detail field so that only pollers on the relevant cache are woken. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/cache.h | 2 ++ net/sunrpc/cache.c | 7 +++---- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index 3d32dd1f7b05..031379efba24 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -16,6 +16,7 @@ #include #include #include +#include /* * Each cache requires: @@ -114,6 +115,7 @@ struct cache_detail { /* fields for communication over channel */ struct list_head queue; spinlock_t queue_lock; + wait_queue_head_t queue_wait; atomic_t writers; /* how many time is /channel open */ time64_t last_close; /* if no writers, when did last close */ diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 1cfaae488c6c..fd02dca1f07a 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -401,6 +401,7 @@ void sunrpc_init_cache_detail(struct cache_detail *cd) spin_lock_init(&cd->hash_lock); INIT_LIST_HEAD(&cd->queue); spin_lock_init(&cd->queue_lock); + init_waitqueue_head(&cd->queue_wait); spin_lock(&cache_list_lock); cd->nextcheck = 0; cd->entries 
= 0; @@ -970,8 +971,6 @@ out: return ret; } -static DECLARE_WAIT_QUEUE_HEAD(queue_wait); - static __poll_t cache_poll(struct file *filp, poll_table *wait, struct cache_detail *cd) { @@ -979,7 +978,7 @@ static __poll_t cache_poll(struct file *filp, poll_table *wait, struct cache_reader *rp = filp->private_data; struct cache_queue *cq; - poll_wait(filp, &queue_wait, wait); + poll_wait(filp, &cd->queue_wait, wait); /* alway allow write */ mask = EPOLLOUT | EPOLLWRNORM; @@ -1259,7 +1258,7 @@ static int cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h) /* Lost a race, no longer PENDING, so don't enqueue */ ret = -EAGAIN; spin_unlock(&detail->queue_lock); - wake_up(&queue_wait); + wake_up(&detail->queue_wait); if (ret == -EAGAIN) { kfree(buf); kfree(crq); -- cgit v1.2.3 From facc4e3c80420e3466003ce09b576e005b56a015 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 23 Feb 2026 12:10:01 -0500 Subject: sunrpc: split cache_detail queue into request and reader lists Replace the single interleaved queue (which mixed cache_request and cache_reader entries distinguished by a ->reader flag) with two dedicated lists: cd->requests for upcall requests and cd->readers for open file handles. Readers now track their position via a monotonically increasing sequence number (next_seqno) rather than by their position in the shared list. Each cache_request is assigned a seqno when enqueued, and a new cache_next_request() helper finds the next request at or after a given seqno. This eliminates the cache_queue wrapper struct entirely, simplifies the reader-skipping loops in cache_read/cache_poll/cache_ioctl/ cache_release, and makes the data flow easier to reason about. 
Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/cache.h | 4 +- net/sunrpc/cache.c | 143 ++++++++++++++++++------------------------- 2 files changed, 62 insertions(+), 85 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index 031379efba24..b1e595c2615b 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -113,9 +113,11 @@ struct cache_detail { int entries; /* fields for communication over channel */ - struct list_head queue; + struct list_head requests; + struct list_head readers; spinlock_t queue_lock; wait_queue_head_t queue_wait; + u64 next_seqno; atomic_t writers; /* how many time is /channel open */ time64_t last_close; /* if no writers, when did last close */ diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index fd02dca1f07a..7081c1214e6c 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -399,9 +399,11 @@ static struct delayed_work cache_cleaner; void sunrpc_init_cache_detail(struct cache_detail *cd) { spin_lock_init(&cd->hash_lock); - INIT_LIST_HEAD(&cd->queue); + INIT_LIST_HEAD(&cd->requests); + INIT_LIST_HEAD(&cd->readers); spin_lock_init(&cd->queue_lock); init_waitqueue_head(&cd->queue_wait); + cd->next_seqno = 0; spin_lock(&cache_list_lock); cd->nextcheck = 0; cd->entries = 0; @@ -796,29 +798,20 @@ void cache_clean_deferred(void *owner) * On read, you get a full request, or block. * On write, an update request is processed. * Poll works if anything to read, and always allows write. - * - * Implemented by linked list of requests. Each open file has - * a ->private that also exists in this list. New requests are added - * to the end and may wakeup and preceding readers. - * New readers are added to the head. If, on read, an item is found with - * CACHE_UPCALLING clear, we free it from the list. 
- * */ -struct cache_queue { - struct list_head list; - int reader; /* if 0, then request */ -}; struct cache_request { - struct cache_queue q; + struct list_head list; struct cache_head *item; - char * buf; + char *buf; int len; int readers; + u64 seqno; }; struct cache_reader { - struct cache_queue q; + struct list_head list; int offset; /* if non-0, we have a refcnt on next request */ + u64 next_seqno; }; static int cache_request(struct cache_detail *detail, @@ -833,6 +826,17 @@ static int cache_request(struct cache_detail *detail, return PAGE_SIZE - len; } +static struct cache_request * +cache_next_request(struct cache_detail *cd, u64 seqno) +{ + struct cache_request *rq; + + list_for_each_entry(rq, &cd->requests, list) + if (rq->seqno >= seqno) + return rq; + return NULL; +} + static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos, struct cache_detail *cd) { @@ -849,20 +853,13 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, again: spin_lock(&cd->queue_lock); /* need to find next request */ - while (rp->q.list.next != &cd->queue && - list_entry(rp->q.list.next, struct cache_queue, list) - ->reader) { - struct list_head *next = rp->q.list.next; - list_move(&rp->q.list, next); - } - if (rp->q.list.next == &cd->queue) { + rq = cache_next_request(cd, rp->next_seqno); + if (!rq) { spin_unlock(&cd->queue_lock); inode_unlock(inode); WARN_ON_ONCE(rp->offset); return 0; } - rq = container_of(rp->q.list.next, struct cache_request, q.list); - WARN_ON_ONCE(rq->q.reader); if (rp->offset == 0) rq->readers++; spin_unlock(&cd->queue_lock); @@ -876,9 +873,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, if (rp->offset == 0 && !test_bit(CACHE_PENDING, &rq->item->flags)) { err = -EAGAIN; - spin_lock(&cd->queue_lock); - list_move(&rp->q.list, &rq->q.list); - spin_unlock(&cd->queue_lock); + rp->next_seqno = rq->seqno + 1; } else { if (rp->offset + count > rq->len) count = rq->len - 
rp->offset; @@ -888,9 +883,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, rp->offset += count; if (rp->offset >= rq->len) { rp->offset = 0; - spin_lock(&cd->queue_lock); - list_move(&rp->q.list, &rq->q.list); - spin_unlock(&cd->queue_lock); + rp->next_seqno = rq->seqno + 1; } err = 0; } @@ -901,7 +894,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, rq->readers--; if (rq->readers == 0 && !test_bit(CACHE_PENDING, &rq->item->flags)) { - list_del(&rq->q.list); + list_del(&rq->list); spin_unlock(&cd->queue_lock); cache_put(rq->item, cd); kfree(rq->buf); @@ -976,7 +969,6 @@ static __poll_t cache_poll(struct file *filp, poll_table *wait, { __poll_t mask; struct cache_reader *rp = filp->private_data; - struct cache_queue *cq; poll_wait(filp, &cd->queue_wait, wait); @@ -988,12 +980,8 @@ static __poll_t cache_poll(struct file *filp, poll_table *wait, spin_lock(&cd->queue_lock); - for (cq= &rp->q; &cq->list != &cd->queue; - cq = list_entry(cq->list.next, struct cache_queue, list)) - if (!cq->reader) { - mask |= EPOLLIN | EPOLLRDNORM; - break; - } + if (cache_next_request(cd, rp->next_seqno)) + mask |= EPOLLIN | EPOLLRDNORM; spin_unlock(&cd->queue_lock); return mask; } @@ -1004,7 +992,7 @@ static int cache_ioctl(struct inode *ino, struct file *filp, { int len = 0; struct cache_reader *rp = filp->private_data; - struct cache_queue *cq; + struct cache_request *rq; if (cmd != FIONREAD || !rp) return -EINVAL; @@ -1014,14 +1002,9 @@ static int cache_ioctl(struct inode *ino, struct file *filp, /* only find the length remaining in current request, * or the length of the next request */ - for (cq= &rp->q; &cq->list != &cd->queue; - cq = list_entry(cq->list.next, struct cache_queue, list)) - if (!cq->reader) { - struct cache_request *cr = - container_of(cq, struct cache_request, q); - len = cr->len - rp->offset; - break; - } + rq = cache_next_request(cd, rp->next_seqno); + if (rq) + len = rq->len - rp->offset; 
spin_unlock(&cd->queue_lock); return put_user(len, (int __user *)arg); @@ -1042,10 +1025,10 @@ static int cache_open(struct inode *inode, struct file *filp, return -ENOMEM; } rp->offset = 0; - rp->q.reader = 1; + rp->next_seqno = 0; spin_lock(&cd->queue_lock); - list_add(&rp->q.list, &cd->queue); + list_add(&rp->list, &cd->readers); spin_unlock(&cd->queue_lock); } if (filp->f_mode & FMODE_WRITE) @@ -1064,26 +1047,21 @@ static int cache_release(struct inode *inode, struct file *filp, spin_lock(&cd->queue_lock); if (rp->offset) { - struct cache_queue *cq; - for (cq = &rp->q; &cq->list != &cd->queue; - cq = list_entry(cq->list.next, - struct cache_queue, list)) - if (!cq->reader) { - struct cache_request *cr = - container_of(cq, - struct cache_request, q); - cr->readers--; - if (cr->readers == 0 && - !test_bit(CACHE_PENDING, - &cr->item->flags)) { - list_del(&cr->q.list); - rq = cr; - } - break; + struct cache_request *cr; + + cr = cache_next_request(cd, rp->next_seqno); + if (cr) { + cr->readers--; + if (cr->readers == 0 && + !test_bit(CACHE_PENDING, + &cr->item->flags)) { + list_del(&cr->list); + rq = cr; } + } rp->offset = 0; } - list_del(&rp->q.list); + list_del(&rp->list); spin_unlock(&cd->queue_lock); if (rq) { @@ -1107,27 +1085,24 @@ static int cache_release(struct inode *inode, struct file *filp, static void cache_dequeue(struct cache_detail *detail, struct cache_head *ch) { - struct cache_queue *cq, *tmp; - struct cache_request *cr; + struct cache_request *cr, *tmp; LIST_HEAD(dequeued); spin_lock(&detail->queue_lock); - list_for_each_entry_safe(cq, tmp, &detail->queue, list) - if (!cq->reader) { - cr = container_of(cq, struct cache_request, q); - if (cr->item != ch) - continue; - if (test_bit(CACHE_PENDING, &ch->flags)) - /* Lost a race and it is pending again */ - break; - if (cr->readers != 0) - continue; - list_move(&cr->q.list, &dequeued); - } + list_for_each_entry_safe(cr, tmp, &detail->requests, list) { + if (cr->item != ch) + continue; + if 
(test_bit(CACHE_PENDING, &ch->flags)) + /* Lost a race and it is pending again */ + break; + if (cr->readers != 0) + continue; + list_move(&cr->list, &dequeued); + } spin_unlock(&detail->queue_lock); while (!list_empty(&dequeued)) { - cr = list_entry(dequeued.next, struct cache_request, q.list); - list_del(&cr->q.list); + cr = list_entry(dequeued.next, struct cache_request, list); + list_del(&cr->list); cache_put(cr->item, detail); kfree(cr->buf); kfree(cr); @@ -1245,14 +1220,14 @@ static int cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h) return -EAGAIN; } - crq->q.reader = 0; crq->buf = buf; crq->len = 0; crq->readers = 0; spin_lock(&detail->queue_lock); if (test_bit(CACHE_PENDING, &h->flags)) { crq->item = cache_get(h); - list_add_tail(&crq->q.list, &detail->queue); + crq->seqno = detail->next_seqno++; + list_add_tail(&crq->list, &detail->requests); trace_cache_entry_upcall(detail, h); } else /* Lost a race, no longer PENDING, so don't enqueue */ -- cgit v1.2.3 From 62346217fd722510c3551858ad7d0fcfab8cce7e Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Wed, 25 Feb 2026 07:51:36 -0500 Subject: NFSD: Add a key for signing filehandles A future patch will enable NFSD to sign filehandles by appending a Message Authentication Code (MAC). To do this, NFSD requires a secret 128-bit key that can persist across reboots. A persisted key allows the server to accept filehandles after a restart. Enable NFSD to be configured with this key via the netlink interface. 
Link: https://lore.kernel.org/linux-nfs/cover.1772022373.git.bcodding@hammerspace.com Signed-off-by: Benjamin Coddington Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- Documentation/netlink/specs/nfsd.yaml | 6 ++++++ fs/nfsd/netlink.c | 5 +++-- fs/nfsd/netns.h | 1 + fs/nfsd/nfsctl.c | 38 ++++++++++++++++++++++++++++++++++- fs/nfsd/trace.h | 22 ++++++++++++++++++++ include/uapi/linux/nfsd_netlink.h | 1 + 6 files changed, 70 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/Documentation/netlink/specs/nfsd.yaml b/Documentation/netlink/specs/nfsd.yaml index f87b5a05e5e9..8ab43c8253b2 100644 --- a/Documentation/netlink/specs/nfsd.yaml +++ b/Documentation/netlink/specs/nfsd.yaml @@ -81,6 +81,11 @@ attribute-sets: - name: min-threads type: u32 + - + name: fh-key + type: binary + checks: + exact-len: 16 - name: version attributes: @@ -163,6 +168,7 @@ operations: - leasetime - scope - min-threads + - fh-key - name: threads-get doc: get the maximum number of running threads diff --git a/fs/nfsd/netlink.c b/fs/nfsd/netlink.c index 887525964451..81c943345d13 100644 --- a/fs/nfsd/netlink.c +++ b/fs/nfsd/netlink.c @@ -24,12 +24,13 @@ const struct nla_policy nfsd_version_nl_policy[NFSD_A_VERSION_ENABLED + 1] = { }; /* NFSD_CMD_THREADS_SET - do */ -static const struct nla_policy nfsd_threads_set_nl_policy[NFSD_A_SERVER_MIN_THREADS + 1] = { +static const struct nla_policy nfsd_threads_set_nl_policy[NFSD_A_SERVER_FH_KEY + 1] = { [NFSD_A_SERVER_THREADS] = { .type = NLA_U32, }, [NFSD_A_SERVER_GRACETIME] = { .type = NLA_U32, }, [NFSD_A_SERVER_LEASETIME] = { .type = NLA_U32, }, [NFSD_A_SERVER_SCOPE] = { .type = NLA_NUL_STRING, }, [NFSD_A_SERVER_MIN_THREADS] = { .type = NLA_U32, }, + [NFSD_A_SERVER_FH_KEY] = NLA_POLICY_EXACT_LEN(16), }; /* NFSD_CMD_VERSION_SET - do */ @@ -58,7 +59,7 @@ static const struct genl_split_ops nfsd_nl_ops[] = { .cmd = NFSD_CMD_THREADS_SET, .doit = nfsd_nl_threads_set_doit, .policy = nfsd_threads_set_nl_policy, - .maxattr = 
NFSD_A_SERVER_MIN_THREADS, + .maxattr = NFSD_A_SERVER_FH_KEY, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, { diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 3a89d4708e8a..6ad3fe5d7e12 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -227,6 +227,7 @@ struct nfsd_net { spinlock_t local_clients_lock; struct list_head local_clients; #endif + siphash_key_t *fh_key; }; /* Simple check to find out if a given net was properly initialized */ diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 0bf01ae411c5..20ec00f323b4 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1581,6 +1581,32 @@ out_unlock: return ret; } +/** + * nfsd_nl_fh_key_set - helper to copy fh_key from userspace + * @attr: nlattr NFSD_A_SERVER_FH_KEY + * @nn: nfsd_net + * + * Callers should hold nfsd_mutex, returns 0 on success or negative errno. + * Callers must ensure the server is shut down (sv_nrthreads == 0), + * userspace documentation asserts the key may only be set when the server + * is not running. 
+ */ +static int nfsd_nl_fh_key_set(const struct nlattr *attr, struct nfsd_net *nn) +{ + siphash_key_t *fh_key = nn->fh_key; + + if (!fh_key) { + fh_key = kmalloc(sizeof(siphash_key_t), GFP_KERNEL); + if (!fh_key) + return -ENOMEM; + nn->fh_key = fh_key; + } + + fh_key->key[0] = get_unaligned_le64(nla_data(attr)); + fh_key->key[1] = get_unaligned_le64(nla_data(attr) + 8); + return 0; +} + /** * nfsd_nl_threads_set_doit - set the number of running threads * @skb: reply buffer @@ -1622,7 +1648,8 @@ int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NFSD_A_SERVER_GRACETIME] || info->attrs[NFSD_A_SERVER_LEASETIME] || - info->attrs[NFSD_A_SERVER_SCOPE]) { + info->attrs[NFSD_A_SERVER_SCOPE] || + info->attrs[NFSD_A_SERVER_FH_KEY]) { ret = -EBUSY; if (nn->nfsd_serv && nn->nfsd_serv->sv_nrthreads) goto out_unlock; @@ -1651,6 +1678,14 @@ int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info) attr = info->attrs[NFSD_A_SERVER_SCOPE]; if (attr) scope = nla_data(attr); + + attr = info->attrs[NFSD_A_SERVER_FH_KEY]; + if (attr) { + ret = nfsd_nl_fh_key_set(attr, nn); + trace_nfsd_ctl_fh_key_set((const char *)nn->fh_key, ret); + if (ret) + goto out_unlock; + } } attr = info->attrs[NFSD_A_SERVER_MIN_THREADS]; @@ -2237,6 +2272,7 @@ static __net_exit void nfsd_net_exit(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); + kfree_sensitive(nn->fh_key); nfsd_proc_stat_shutdown(net); percpu_counter_destroy_many(nn->counter, NFSD_STATS_COUNTERS_NUM); nfsd_idmap_shutdown(net); diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index d1d0b0dd0545..185a998996a0 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -2240,6 +2240,28 @@ TRACE_EVENT(nfsd_end_grace, ) ); +TRACE_EVENT(nfsd_ctl_fh_key_set, + TP_PROTO( + const char *key, + int result + ), + TP_ARGS(key, result), + TP_STRUCT__entry( + __field(u32, key_hash) + __field(int, result) + ), + TP_fast_assign( + if (key) + __entry->key_hash = ~crc32_le(0xFFFFFFFF, key, 
16); + else + __entry->key_hash = 0; + __entry->result = result; + ), + TP_printk("key=0x%08x result=%d", + __entry->key_hash, __entry->result + ) +); + DECLARE_EVENT_CLASS(nfsd_copy_class, TP_PROTO( const struct nfsd4_copy *copy diff --git a/include/uapi/linux/nfsd_netlink.h b/include/uapi/linux/nfsd_netlink.h index e9efbc9e63d8..97c7447f4d14 100644 --- a/include/uapi/linux/nfsd_netlink.h +++ b/include/uapi/linux/nfsd_netlink.h @@ -36,6 +36,7 @@ enum { NFSD_A_SERVER_LEASETIME, NFSD_A_SERVER_SCOPE, NFSD_A_SERVER_MIN_THREADS, + NFSD_A_SERVER_FH_KEY, __NFSD_A_SERVER_MAX, NFSD_A_SERVER_MAX = (__NFSD_A_SERVER_MAX - 1) -- cgit v1.2.3 From a002ad8a9bc89c084bc40933065c88336700837e Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Wed, 25 Feb 2026 07:51:37 -0500 Subject: NFSD/export: Add sign_fh export option In order to signal that filehandles on this export should be signed, add a "sign_fh" export option. Filehandle signing can help the server defend against certain filehandle guessing attacks. Setting the "sign_fh" export option sets NFSEXP_SIGN_FH. In a future patch NFSD uses this signal to append a MAC onto filehandles for that export. While we're in here, tidy a few stray expflags to more closely align to the export flag order. 
Link: https://lore.kernel.org/linux-nfs/cover.1772022373.git.bcodding@hammerspace.com Signed-off-by: Benjamin Coddington Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/export.c | 5 +++-- include/uapi/linux/nfsd/export.h | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 8e8a76a44ff0..7f4a51b832ef 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1362,13 +1362,14 @@ static struct flags { { NFSEXP_ASYNC, {"async", "sync"}}, { NFSEXP_GATHERED_WRITES, {"wdelay", "no_wdelay"}}, { NFSEXP_NOREADDIRPLUS, {"nordirplus", ""}}, + { NFSEXP_SECURITY_LABEL, {"security_label", ""}}, + { NFSEXP_SIGN_FH, {"sign_fh", ""}}, { NFSEXP_NOHIDE, {"nohide", ""}}, - { NFSEXP_CROSSMOUNT, {"crossmnt", ""}}, { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}}, { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}}, + { NFSEXP_CROSSMOUNT, {"crossmnt", ""}}, { NFSEXP_V4ROOT, {"v4root", ""}}, { NFSEXP_PNFS, {"pnfs", ""}}, - { NFSEXP_SECURITY_LABEL, {"security_label", ""}}, { 0, {"", ""}} }; diff --git a/include/uapi/linux/nfsd/export.h b/include/uapi/linux/nfsd/export.h index a73ca3703abb..de647cf166c3 100644 --- a/include/uapi/linux/nfsd/export.h +++ b/include/uapi/linux/nfsd/export.h @@ -34,7 +34,7 @@ #define NFSEXP_GATHERED_WRITES 0x0020 #define NFSEXP_NOREADDIRPLUS 0x0040 #define NFSEXP_SECURITY_LABEL 0x0080 -/* 0x100 currently unused */ +#define NFSEXP_SIGN_FH 0x0100 #define NFSEXP_NOHIDE 0x0200 #define NFSEXP_NOSUBTREECHECK 0x0400 #define NFSEXP_NOAUTHNLM 0x0800 /* Don't authenticate NLM requests - just trust */ @@ -55,7 +55,7 @@ #define NFSEXP_PNFS 0x20000 /* All flags that we claim to support. (Note we don't support NOACL.) 
*/ -#define NFSEXP_ALLFLAGS 0x3FEFF +#define NFSEXP_ALLFLAGS 0x3FFFF /* The flags that may vary depending on security flavor: */ #define NFSEXP_SECINFO_FLAGS (NFSEXP_READONLY | NFSEXP_ROOTSQUASH \ -- cgit v1.2.3 From ee66b9e3e1c69efc986f3932555f07121c3460a7 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 26 Feb 2026 09:47:35 -0500 Subject: SUNRPC: Allocate a separate Reply page array struct svc_rqst uses a single dynamically-allocated page array (rq_pages) for both the incoming RPC Call message and the outgoing RPC Reply message. rq_respages is a sliding pointer into rq_pages that each transport receive path must compute based on how many pages the Call consumed. This boundary tracking is a source of confusion and bugs, and prevents an RPC transaction from having both a large Call and a large Reply simultaneously. Allocate rq_respages as its own page array, eliminating the boundary arithmetic. This decouples Call and Reply buffer lifetimes, following the precedent set by rq_bvec (a separate dynamically- allocated array for I/O vectors). Each svc_rqst now pins twice as many pages as before. For a server running 16 threads with a 1MB maximum payload, the additional cost is roughly 16MB of pinned memory. The new dynamic svc thread count facility keeps this overhead minimal on an idle server. A subsequent patch in this series limits per-request repopulation to only the pages released during the previous RPC, avoiding a full-array scan on each call to svc_alloc_arg(). Note: We've considered several alternatives to maintaining a full second array. Each alternative reintroduces either boundary logic complexity or I/O-path allocation pressure. rq_next_page is initialized in svc_alloc_arg() and svc_process() during Reply construction, and in svc_rdma_recvfrom() as a precaution on error paths. Transport receive paths no longer compute it from the Call size. 
Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 47 ++++++++++++++++----------------- net/sunrpc/svc.c | 29 ++++++++++++++++---- net/sunrpc/svc_xprt.c | 36 ++++++++++++++++++------- net/sunrpc/svcsock.c | 6 ----- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 15 +++-------- 5 files changed, 77 insertions(+), 56 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 62152e4f3bcc..3b1a98ab5cba 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -134,25 +134,24 @@ enum { extern u32 svc_max_payload(const struct svc_rqst *rqstp); /* - * RPC Requests and replies are stored in one or more pages. - * We maintain an array of pages for each server thread. - * Requests are copied into these pages as they arrive. Remaining - * pages are available to write the reply into. + * RPC Call and Reply messages each have their own page array. + * rq_pages holds the incoming Call message; rq_respages holds + * the outgoing Reply message. Both arrays are sized to + * svc_serv_maxpages() entries and are allocated dynamically. * - * Pages are sent using ->sendmsg with MSG_SPLICE_PAGES so each server thread - * needs to allocate more to replace those used in sending. To help keep track - * of these pages we have a receive list where all pages initialy live, and a - * send list where pages are moved to when there are to be part of a reply. + * Pages are sent using ->sendmsg with MSG_SPLICE_PAGES so each + * server thread needs to allocate more to replace those used in + * sending. * - * We use xdr_buf for holding responses as it fits well with NFS - * read responses (that have a header, and some data pages, and possibly - * a tail) and means we can share some client side routines. + * xdr_buf holds responses; the structure fits NFS read responses + * (header, data pages, optional tail) and enables sharing of + * client-side routines. 
* - * The xdr_buf.head kvec always points to the first page in the rq_*pages - * list. The xdr_buf.pages pointer points to the second page on that - * list. xdr_buf.tail points to the end of the first page. - * This assumes that the non-page part of an rpc reply will fit - * in a page - NFSd ensures this. lockd also has no trouble. + * The xdr_buf.head kvec always points to the first page in the + * rq_*pages list. The xdr_buf.pages pointer points to the second + * page on that list. xdr_buf.tail points to the end of the first + * page. This assumes that the non-page part of an rpc reply will + * fit in a page - NFSd ensures this. lockd also has no trouble. */ /** @@ -162,10 +161,10 @@ extern u32 svc_max_payload(const struct svc_rqst *rqstp); * Returns a count of pages or vectors that can hold the maximum * size RPC message for @serv. * - * Each request/reply pair can have at most one "payload", plus two - * pages, one for the request, and one for the reply. - * nfsd_splice_actor() might need an extra page when a READ payload - * is not page-aligned. + * Each page array can hold at most one payload plus two + * overhead pages (one for the RPC header, one for tail data). + * nfsd_splice_actor() might need an extra page when a READ + * payload is not page-aligned. 
*/ static inline unsigned long svc_serv_maxpages(const struct svc_serv *serv) { @@ -204,11 +203,11 @@ struct svc_rqst { struct xdr_stream rq_res_stream; struct folio *rq_scratch_folio; struct xdr_buf rq_res; - unsigned long rq_maxpages; /* num of entries in rq_pages */ - struct page * *rq_pages; - struct page * *rq_respages; /* points into rq_pages */ + unsigned long rq_maxpages; /* entries per page array */ + struct page * *rq_pages; /* Call buffer pages */ + struct page * *rq_respages; /* Reply buffer pages */ struct page * *rq_next_page; /* next reply page to use */ - struct page * *rq_page_end; /* one past the last page */ + struct page * *rq_page_end; /* one past the last reply page */ struct folio_batch rq_fbatch; struct bio_vec *rq_bvec; diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index f7ec02457328..9abef638b1e0 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -638,13 +638,23 @@ svc_init_buffer(struct svc_rqst *rqstp, const struct svc_serv *serv, int node) { rqstp->rq_maxpages = svc_serv_maxpages(serv); - /* rq_pages' last entry is NULL for historical reasons. 
*/ + /* +1 for a NULL sentinel readable by nfsd_splice_actor() */ rqstp->rq_pages = kcalloc_node(rqstp->rq_maxpages + 1, sizeof(struct page *), GFP_KERNEL, node); if (!rqstp->rq_pages) return false; + /* +1 for a NULL sentinel at rq_page_end (see svc_rqst_replace_page) */ + rqstp->rq_respages = kcalloc_node(rqstp->rq_maxpages + 1, + sizeof(struct page *), + GFP_KERNEL, node); + if (!rqstp->rq_respages) { + kfree(rqstp->rq_pages); + rqstp->rq_pages = NULL; + return false; + } + return true; } @@ -656,10 +666,19 @@ svc_release_buffer(struct svc_rqst *rqstp) { unsigned long i; - for (i = 0; i < rqstp->rq_maxpages; i++) - if (rqstp->rq_pages[i]) - put_page(rqstp->rq_pages[i]); - kfree(rqstp->rq_pages); + if (rqstp->rq_pages) { + for (i = 0; i < rqstp->rq_maxpages; i++) + if (rqstp->rq_pages[i]) + put_page(rqstp->rq_pages[i]); + kfree(rqstp->rq_pages); + } + + if (rqstp->rq_respages) { + for (i = 0; i < rqstp->rq_maxpages; i++) + if (rqstp->rq_respages[i]) + put_page(rqstp->rq_respages[i]); + kfree(rqstp->rq_respages); + } } static void diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 56a663b8939f..e027765f4307 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -650,14 +650,13 @@ static void svc_check_conn_limits(struct svc_serv *serv) } } -static bool svc_alloc_arg(struct svc_rqst *rqstp) +static bool svc_fill_pages(struct svc_rqst *rqstp, struct page **pages, + unsigned long npages) { - struct xdr_buf *arg = &rqstp->rq_arg; - unsigned long pages, filled, ret; + unsigned long filled, ret; - pages = rqstp->rq_maxpages; - for (filled = 0; filled < pages; filled = ret) { - ret = alloc_pages_bulk(GFP_KERNEL, pages, rqstp->rq_pages); + for (filled = 0; filled < npages; filled = ret) { + ret = alloc_pages_bulk(GFP_KERNEL, npages, pages); if (ret > filled) /* Made progress, don't sleep yet */ continue; @@ -667,11 +666,29 @@ static bool svc_alloc_arg(struct svc_rqst *rqstp) set_current_state(TASK_RUNNING); return false; } - 
trace_svc_alloc_arg_err(pages, ret); + trace_svc_alloc_arg_err(npages, ret); memalloc_retry_wait(GFP_KERNEL); } - rqstp->rq_page_end = &rqstp->rq_pages[pages]; - rqstp->rq_pages[pages] = NULL; /* this might be seen in nfsd_splice_actor() */ + return true; +} + +static bool svc_alloc_arg(struct svc_rqst *rqstp) +{ + struct xdr_buf *arg = &rqstp->rq_arg; + unsigned long pages; + + pages = rqstp->rq_maxpages; + + if (!svc_fill_pages(rqstp, rqstp->rq_pages, pages)) + return false; + if (!svc_fill_pages(rqstp, rqstp->rq_respages, pages)) + return false; + rqstp->rq_next_page = rqstp->rq_respages; + rqstp->rq_page_end = &rqstp->rq_respages[pages]; + /* svc_rqst_replace_page() dereferences *rq_next_page even + * at rq_page_end; NULL prevents releasing a garbage page. + */ + rqstp->rq_page_end[0] = NULL; /* Make arg->head point to first page and arg->pages point to rest */ arg->head[0].iov_base = page_address(rqstp->rq_pages[0]); @@ -1277,7 +1294,6 @@ static noinline int svc_deferred_recv(struct svc_rqst *rqstp) rqstp->rq_addrlen = dr->addrlen; /* Save off transport header len in case we get deferred again */ rqstp->rq_daddr = dr->daddr; - rqstp->rq_respages = rqstp->rq_pages; rqstp->rq_xprt_ctxt = dr->xprt_ctxt; dr->xprt_ctxt = NULL; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index f28c6076f7e8..c86f28f720f7 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -351,8 +351,6 @@ static ssize_t svc_tcp_read_msg(struct svc_rqst *rqstp, size_t buflen, for (i = 0, t = 0; t < buflen; i++, t += PAGE_SIZE) bvec_set_page(&bvec[i], rqstp->rq_pages[i], PAGE_SIZE, 0); - rqstp->rq_respages = &rqstp->rq_pages[i]; - rqstp->rq_next_page = rqstp->rq_respages + 1; iov_iter_bvec(&msg.msg_iter, ITER_DEST, bvec, i, buflen); if (seek) { @@ -677,13 +675,9 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) if (len <= rqstp->rq_arg.head[0].iov_len) { rqstp->rq_arg.head[0].iov_len = len; rqstp->rq_arg.page_len = 0; - rqstp->rq_respages = rqstp->rq_pages+1; } else { 
rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len; - rqstp->rq_respages = rqstp->rq_pages + 1 + - DIV_ROUND_UP(rqstp->rq_arg.page_len, PAGE_SIZE); } - rqstp->rq_next_page = rqstp->rq_respages+1; if (serv->sv_stats) serv->sv_stats->netudpcnt++; diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index e7e4a39ca6c6..3081a37a5896 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -861,18 +861,12 @@ static noinline void svc_rdma_read_complete(struct svc_rqst *rqstp, unsigned int i; /* Transfer the Read chunk pages into @rqstp.rq_pages, replacing - * the rq_pages that were already allocated for this rqstp. + * the receive buffer pages already allocated for this rqstp. */ - release_pages(rqstp->rq_respages, ctxt->rc_page_count); + release_pages(rqstp->rq_pages, ctxt->rc_page_count); for (i = 0; i < ctxt->rc_page_count; i++) rqstp->rq_pages[i] = ctxt->rc_pages[i]; - /* Update @rqstp's result send buffer to start after the - * last page in the RDMA Read payload. - */ - rqstp->rq_respages = &rqstp->rq_pages[ctxt->rc_page_count]; - rqstp->rq_next_page = rqstp->rq_respages + 1; - /* Prevent svc_rdma_recv_ctxt_put() from releasing the * pages in ctxt::rc_pages a second time. */ @@ -931,10 +925,9 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) struct svc_rdma_recv_ctxt *ctxt; int ret; - /* Prevent svc_xprt_release() from releasing pages in rq_pages - * when returning 0 or an error. + /* Precaution: a zero page count on error return causes + * svc_rqst_release_pages() to release nothing. */ - rqstp->rq_respages = rqstp->rq_pages; rqstp->rq_next_page = rqstp->rq_respages; rqstp->rq_xprt_ctxt = NULL; -- cgit v1.2.3 From 7ed7504287a627834f2a35ef04e5dfd26d1c8986 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 26 Feb 2026 09:47:38 -0500 Subject: SUNRPC: Track consumed rq_pages entries The rq_pages array holds pages allocated for incoming RPC requests. 
Two transport receive paths NULL entries in rq_pages to prevent svc_rqst_release_pages() from freeing pages that the transport has taken ownership of: - svc_tcp_save_pages() moves partial request data pages to svsk->sk_pages during multi-fragment TCP reassembly. - svc_rdma_clear_rqst_pages() moves request data pages to head->rc_pages because they are targets of active RDMA Read WRs. A new rq_pages_nfree field in struct svc_rqst records how many entries were NULLed. svc_alloc_arg() uses it to refill only those entries rather than scanning the full rq_pages array. In steady state, the transport NULLs a handful of entries per RPC, so the allocator visits only those entries instead of the full ~259 slots (for 1MB messages). Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 10 ++++++++++ net/sunrpc/svc.c | 1 + net/sunrpc/svc_xprt.c | 11 ++++++++--- net/sunrpc/svcsock.c | 1 + net/sunrpc/xprtrdma/svc_rdma_rw.c | 1 + 5 files changed, 21 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 3b1a98ab5cba..c3399cf64524 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -143,6 +143,15 @@ extern u32 svc_max_payload(const struct svc_rqst *rqstp); * server thread needs to allocate more to replace those used in * sending. * + * rq_pages request page contract: + * + * Transport receive paths that move request data pages out of + * rq_pages -- TCP multi-fragment reassembly (svc_tcp_save_pages) + * and RDMA Read I/O (svc_rdma_clear_rqst_pages) -- NULL those + * entries to prevent svc_rqst_release_pages() from freeing pages + * still in transport use, and set rq_pages_nfree to the count. + * svc_alloc_arg() refills only that many rq_pages entries. + * * xdr_buf holds responses; the structure fits NFS read responses * (header, data pages, optional tail) and enables sharing of * client-side routines. 
@@ -204,6 +213,7 @@ struct svc_rqst { struct folio *rq_scratch_folio; struct xdr_buf rq_res; unsigned long rq_maxpages; /* entries per page array */ + unsigned long rq_pages_nfree; /* rq_pages entries NULLed by transport */ struct page * *rq_pages; /* Call buffer pages */ struct page * *rq_respages; /* Reply buffer pages */ struct page * *rq_next_page; /* next reply page to use */ diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 0ce16e9abdf6..6e57e35fa6d6 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -655,6 +655,7 @@ svc_init_buffer(struct svc_rqst *rqstp, const struct svc_serv *serv, int node) return false; } + rqstp->rq_pages_nfree = rqstp->rq_maxpages; return true; } diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index e027765f4307..795b5729525f 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -675,12 +675,17 @@ static bool svc_fill_pages(struct svc_rqst *rqstp, struct page **pages, static bool svc_alloc_arg(struct svc_rqst *rqstp) { struct xdr_buf *arg = &rqstp->rq_arg; - unsigned long pages; + unsigned long pages, nfree; pages = rqstp->rq_maxpages; - if (!svc_fill_pages(rqstp, rqstp->rq_pages, pages)) - return false; + nfree = rqstp->rq_pages_nfree; + if (nfree) { + if (!svc_fill_pages(rqstp, rqstp->rq_pages, nfree)) + return false; + rqstp->rq_pages_nfree = 0; + } + if (!svc_fill_pages(rqstp, rqstp->rq_respages, pages)) return false; rqstp->rq_next_page = rqstp->rq_respages; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index c86f28f720f7..2ce43f9995f1 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1009,6 +1009,7 @@ static void svc_tcp_save_pages(struct svc_sock *svsk, struct svc_rqst *rqstp) svsk->sk_pages[i] = rqstp->rq_pages[i]; rqstp->rq_pages[i] = NULL; } + rqstp->rq_pages_nfree = npages; } static void svc_tcp_clear_pages(struct svc_sock *svsk) diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 4ec2f9ae06aa..cf4a1762b629 100644 --- 
a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -1107,6 +1107,7 @@ static void svc_rdma_clear_rqst_pages(struct svc_rqst *rqstp, head->rc_pages[i] = rqstp->rq_pages[i]; rqstp->rq_pages[i] = NULL; } + rqstp->rq_pages_nfree = head->rc_page_count; } /** -- cgit v1.2.3 From d7f3efd9ff474867b04e1ea784690f02450a245b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 26 Feb 2026 09:47:39 -0500 Subject: SUNRPC: Optimize rq_respages allocation in svc_alloc_arg svc_alloc_arg() invokes alloc_pages_bulk() with the full rq_maxpages count (~259 for 1MB messages) for the rq_respages array, causing a full-array scan despite most slots holding valid pages. svc_rqst_release_pages() NULLs only the range [rq_respages, rq_next_page) after each RPC, so only that range contains NULL entries. Limit the rq_respages fill in svc_alloc_arg() to that range instead of scanning the full array. svc_init_buffer() initializes rq_next_page to span the entire rq_respages array, so the first svc_alloc_arg() call fills all slots. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 4 ++++ net/sunrpc/svc.c | 1 + net/sunrpc/svc_xprt.c | 8 +++++++- 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index c3399cf64524..669c944eaf7f 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -152,6 +152,10 @@ extern u32 svc_max_payload(const struct svc_rqst *rqstp); * still in transport use, and set rq_pages_nfree to the count. * svc_alloc_arg() refills only that many rq_pages entries. * + * For rq_respages, svc_rqst_release_pages() NULLs entries in + * [rq_respages, rq_next_page) after each RPC. svc_alloc_arg() + * refills only that range. + * * xdr_buf holds responses; the structure fits NFS read responses * (header, data pages, optional tail) and enables sharing of * client-side routines. 
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 6e57e35fa6d6..5e0b5ec2fd52 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -656,6 +656,7 @@ svc_init_buffer(struct svc_rqst *rqstp, const struct svc_serv *serv, int node) } rqstp->rq_pages_nfree = rqstp->rq_maxpages; + rqstp->rq_next_page = rqstp->rq_respages + rqstp->rq_maxpages; return true; } diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 795b5729525f..b16e710926c1 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -686,8 +686,14 @@ static bool svc_alloc_arg(struct svc_rqst *rqstp) rqstp->rq_pages_nfree = 0; } - if (!svc_fill_pages(rqstp, rqstp->rq_respages, pages)) + if (WARN_ON_ONCE(rqstp->rq_next_page < rqstp->rq_respages)) return false; + nfree = rqstp->rq_next_page - rqstp->rq_respages; + if (nfree) { + if (!svc_fill_pages(rqstp, rqstp->rq_respages, nfree)) + return false; + } + rqstp->rq_next_page = rqstp->rq_respages; rqstp->rq_page_end = &rqstp->rq_respages[pages]; /* svc_rqst_replace_page() dereferences *rq_next_page even -- cgit v1.2.3 From ccc89b9d1ed233349cfe8d87b842e7351b74d8de Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 27 Feb 2026 09:03:28 -0500 Subject: svcrdma: Add fair queuing for Send Queue access When the Send Queue fills, multiple threads may wait for SQ slots. The previous implementation had no ordering guarantee, allowing starvation when one thread repeatedly acquires slots while others wait indefinitely. Introduce a ticket-based fair queuing system. Each waiter takes a ticket number and is served in FIFO order. This ensures forward progress for all waiters when SQ capacity is constrained. The implementation has two phases: 1. Fast path: attempt to reserve SQ slots without waiting 2. 
Slow path: take a ticket, wait for turn, then wait for slots The ticket system adds two atomic counters to the transport: - sc_sq_ticket_head: next ticket to issue - sc_sq_ticket_tail: ticket currently being served A dedicated wait queue (sc_sq_ticket_wait) handles ticket ordering, separate from sc_send_wait which handles SQ capacity. This separation ensures that send completions (the high-frequency wake source) wake only the current ticket holder rather than all queued waiters. Ticket handoff wakes only the ticket wait queue, and each ticket holder that exits via connection close propagates the wake to the next waiter in line. When a waiter successfully reserves slots, it advances the tail counter and wakes the next waiter. This creates an orderly handoff that prevents starvation while maintaining good throughput on the fast path when contention is low. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 10 ++ net/sunrpc/xprtrdma/svc_rdma_rw.c | 37 ++----- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 162 +++++++++++++++++++++++-------- net/sunrpc/xprtrdma/svc_rdma_transport.c | 6 +- 4 files changed, 146 insertions(+), 69 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 57f4fd94166a..658b8498177e 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -84,6 +84,9 @@ struct svcxprt_rdma { atomic_t sc_sq_avail; /* SQEs ready to be consumed */ unsigned int sc_sq_depth; /* Depth of SQ */ + atomic_t sc_sq_ticket_head; /* Next ticket to issue */ + atomic_t sc_sq_ticket_tail; /* Ticket currently serving */ + wait_queue_head_t sc_sq_ticket_wait; /* Ticket ordering waitlist */ __be32 sc_fc_credits; /* Forward credits */ u32 sc_max_requests; /* Max requests */ u32 sc_max_bc_requests;/* Backward credits */ @@ -306,6 +309,13 @@ extern void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, struct svc_rdma_recv_ctxt *rctxt, int status); extern void 
svc_rdma_wake_send_waiters(struct svcxprt_rdma *rdma, int avail); +extern int svc_rdma_sq_wait(struct svcxprt_rdma *rdma, + const struct rpc_rdma_cid *cid, int sqecount); +extern int svc_rdma_post_send_err(struct svcxprt_rdma *rdma, + const struct rpc_rdma_cid *cid, + const struct ib_send_wr *bad_wr, + const struct ib_send_wr *first_wr, + int sqecount, int ret); extern int svc_rdma_sendto(struct svc_rqst *); extern int svc_rdma_result_payload(struct svc_rqst *rqstp, unsigned int offset, unsigned int length); diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index cf4a1762b629..97bce806974b 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -405,34 +405,17 @@ static int svc_rdma_post_chunk_ctxt(struct svcxprt_rdma *rdma, cqe = NULL; } - do { - if (atomic_sub_return(cc->cc_sqecount, - &rdma->sc_sq_avail) > 0) { - cc->cc_posttime = ktime_get(); - ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr); - if (ret) - break; - return 0; - } - - percpu_counter_inc(&svcrdma_stat_sq_starve); - trace_svcrdma_sq_full(rdma, &cc->cc_cid); - atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); - wait_event(rdma->sc_send_wait, - atomic_read(&rdma->sc_sq_avail) > cc->cc_sqecount); - trace_svcrdma_sq_retry(rdma, &cc->cc_cid); - } while (1); - - trace_svcrdma_sq_post_err(rdma, &cc->cc_cid, ret); - svc_xprt_deferred_close(&rdma->sc_xprt); - - /* If even one was posted, there will be a completion. */ - if (bad_wr != first_wr) - return 0; + ret = svc_rdma_sq_wait(rdma, &cc->cc_cid, cc->cc_sqecount); + if (ret < 0) + return ret; - atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); - wake_up(&rdma->sc_send_wait); - return -ENOTCONN; + cc->cc_posttime = ktime_get(); + ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr); + if (ret) + return svc_rdma_post_send_err(rdma, &cc->cc_cid, bad_wr, + first_wr, cc->cc_sqecount, + ret); + return 0; } /* Build a bvec that covers one kvec in an xdr_buf. 
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 17c8429da9d5..02559947272a 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -294,6 +294,117 @@ void svc_rdma_wake_send_waiters(struct svcxprt_rdma *rdma, int avail) wake_up(&rdma->sc_send_wait); } +/** + * svc_rdma_sq_wait - Wait for SQ slots using fair queuing + * @rdma: controlling transport + * @cid: completion ID for tracing + * @sqecount: number of SQ entries needed + * + * A ticket-based system ensures fair ordering when multiple threads + * wait for Send Queue capacity. Each waiter takes a ticket and is + * served in order, preventing starvation. + * + * Protocol invariant: every ticket holder must increment + * sc_sq_ticket_tail exactly once, whether the reservation + * succeeds or the connection closes. Failing to advance the + * tail stalls all subsequent waiters. + * + * The ticket counters are signed 32-bit atomics. After + * wrapping through INT_MAX, the equality check + * (tail == ticket) remains correct because both counters + * advance monotonically and the comparison uses exact + * equality rather than relational operators. + * + * Return values: + * %0: SQ slots were reserved successfully + * %-ENOTCONN: The connection was lost + */ +int svc_rdma_sq_wait(struct svcxprt_rdma *rdma, + const struct rpc_rdma_cid *cid, int sqecount) +{ + int ticket; + + /* Fast path: try to reserve SQ slots without waiting. + * + * A failed reservation temporarily understates sc_sq_avail + * until the compensating atomic_add restores it. A Send + * completion arriving in that window sees a lower count + * than reality, but the value self-corrects once the add + * completes. No ordering guarantee is needed here because + * the slow path serializes all contended waiters. 
+ */ + if (likely(atomic_sub_return(sqecount, &rdma->sc_sq_avail) >= 0)) + return 0; + atomic_add(sqecount, &rdma->sc_sq_avail); + + /* Slow path: take a ticket and wait in line */ + ticket = atomic_fetch_inc(&rdma->sc_sq_ticket_head); + + percpu_counter_inc(&svcrdma_stat_sq_starve); + trace_svcrdma_sq_full(rdma, cid); + + /* Wait until all earlier tickets have been served */ + wait_event(rdma->sc_sq_ticket_wait, + test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags) || + atomic_read(&rdma->sc_sq_ticket_tail) == ticket); + if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) + goto out_close; + + /* It's our turn. Wait for enough SQ slots to be available. */ + while (atomic_sub_return(sqecount, &rdma->sc_sq_avail) < 0) { + atomic_add(sqecount, &rdma->sc_sq_avail); + + wait_event(rdma->sc_send_wait, + test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags) || + atomic_read(&rdma->sc_sq_avail) >= sqecount); + if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) + goto out_close; + } + + /* Slots reserved successfully. Let the next waiter proceed. 
*/ + atomic_inc(&rdma->sc_sq_ticket_tail); + wake_up(&rdma->sc_sq_ticket_wait); + trace_svcrdma_sq_retry(rdma, cid); + return 0; + +out_close: + atomic_inc(&rdma->sc_sq_ticket_tail); + wake_up(&rdma->sc_sq_ticket_wait); + return -ENOTCONN; +} + +/** + * svc_rdma_post_send_err - Handle ib_post_send failure + * @rdma: controlling transport + * @cid: completion ID for tracing + * @bad_wr: first WR that was not posted + * @first_wr: first WR in the chain + * @sqecount: number of SQ entries that were reserved + * @ret: error code from ib_post_send + * + * Return values: + * %0: At least one WR was posted; a completion handles cleanup + * %-ENOTCONN: No WRs were posted; SQ slots are released + */ +int svc_rdma_post_send_err(struct svcxprt_rdma *rdma, + const struct rpc_rdma_cid *cid, + const struct ib_send_wr *bad_wr, + const struct ib_send_wr *first_wr, + int sqecount, int ret) +{ + trace_svcrdma_sq_post_err(rdma, cid, ret); + svc_xprt_deferred_close(&rdma->sc_xprt); + + /* If even one WR was posted, a Send completion will + * return the reserved SQ slots. + */ + if (bad_wr != first_wr) + return 0; + + svc_rdma_wake_send_waiters(rdma, sqecount); + return -ENOTCONN; +} + /** * svc_rdma_wc_send - Invoked by RDMA provider for each polled Send WC * @cq: Completion Queue context @@ -336,11 +447,6 @@ flushed: * that these values remain available after the ib_post_send() call. * In some error flow cases, svc_rdma_wc_send() releases @ctxt. * - * Note there is potential for starvation when the Send Queue is - * full because there is no order to when waiting threads are - * awoken. The transport is typically provisioned with a deep - * enough Send Queue that SQ exhaustion should be a rare event. 
- * * Return values: * %0: @ctxt's WR chain was posted successfully * %-ENOTCONN: The connection was lost @@ -362,42 +468,16 @@ int svc_rdma_post_send(struct svcxprt_rdma *rdma, send_wr->sg_list[0].length, DMA_TO_DEVICE); - /* If the SQ is full, wait until an SQ entry is available */ - while (!test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) { - if (atomic_sub_return(sqecount, &rdma->sc_sq_avail) < 0) { - svc_rdma_wake_send_waiters(rdma, sqecount); - - /* When the transport is torn down, assume - * ib_drain_sq() will trigger enough Send - * completions to wake us. The XPT_CLOSE test - * above should then cause the while loop to - * exit. - */ - percpu_counter_inc(&svcrdma_stat_sq_starve); - trace_svcrdma_sq_full(rdma, &cid); - wait_event(rdma->sc_send_wait, - atomic_read(&rdma->sc_sq_avail) > 0); - trace_svcrdma_sq_retry(rdma, &cid); - continue; - } - - trace_svcrdma_post_send(ctxt); - ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr); - if (ret) { - trace_svcrdma_sq_post_err(rdma, &cid, ret); - svc_xprt_deferred_close(&rdma->sc_xprt); - - /* If even one WR was posted, there will be a - * Send completion that bumps sc_sq_avail. 
- */ - if (bad_wr == first_wr) { - svc_rdma_wake_send_waiters(rdma, sqecount); - break; - } - } - return 0; - } - return -ENOTCONN; + ret = svc_rdma_sq_wait(rdma, &cid, sqecount); + if (ret < 0) + return ret; + + trace_svcrdma_post_send(ctxt); + ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr); + if (ret) + return svc_rdma_post_send_err(rdma, &cid, bad_wr, + first_wr, sqecount, ret); + return 0; } /** diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index f2d72181a6fe..f18bc60d9f4f 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -179,6 +179,7 @@ static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv, init_llist_head(&cma_xprt->sc_recv_ctxts); init_llist_head(&cma_xprt->sc_rw_ctxts); init_waitqueue_head(&cma_xprt->sc_send_wait); + init_waitqueue_head(&cma_xprt->sc_sq_ticket_wait); spin_lock_init(&cma_xprt->sc_lock); spin_lock_init(&cma_xprt->sc_rq_dto_lock); @@ -477,6 +478,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) if (newxprt->sc_sq_depth > dev->attrs.max_qp_wr) newxprt->sc_sq_depth = dev->attrs.max_qp_wr; atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth); + atomic_set(&newxprt->sc_sq_ticket_head, 0); + atomic_set(&newxprt->sc_sq_ticket_tail, 0); newxprt->sc_pd = ib_alloc_pd(dev, 0); if (IS_ERR(newxprt->sc_pd)) { @@ -649,7 +652,8 @@ static int svc_rdma_has_wspace(struct svc_xprt *xprt) * If there are already waiters on the SQ, * return false. */ - if (waitqueue_active(&rdma->sc_send_wait)) + if (waitqueue_active(&rdma->sc_send_wait) || + waitqueue_active(&rdma->sc_sq_ticket_wait)) return 0; /* Otherwise return true. 
*/ -- cgit v1.2.3 From d16f060f3ee297424c0aba047b1d49208adb9318 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 27 Feb 2026 09:03:31 -0500 Subject: svcrdma: Add Write chunk WRs to the RPC's Send WR chain Previously, Write chunk RDMA Writes were posted via a separate ib_post_send() call with their own completion handler. Each Write chunk incurred a doorbell and generated a completion event. Link Write chunk WRs onto the RPC Reply's Send WR chain so that a single ib_post_send() call posts both the RDMA Writes and the Send WR. A single completion event signals that all operations have finished. This reduces both doorbell rate and completion rate, as well as eliminating the latency of a round-trip between the Write chunk completion and the subsequent Send WR posting. The lifecycle of Write chunk resources changes: previously, the svc_rdma_write_done() completion handler released Write chunk resources when RDMA Writes completed. With WR chaining, resources remain live until the Send completion. A new sc_write_info_list tracks Write chunk metadata attached to each Send context, and svc_rdma_write_chunk_release() frees these resources when the Send context is released. The svc_rdma_write_done() handler now handles only error cases. On success it returns immediately since the Send completion handles resource release. On failure (WR flush), it closes the connection to signal to the client that the RPC Reply is incomplete. 
Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 13 +++-- net/sunrpc/xprtrdma/svc_rdma_rw.c | 94 +++++++++++++++++++++++++++-------- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 10 +++- 3 files changed, 91 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 658b8498177e..df6e08aaad57 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -216,6 +216,7 @@ struct svc_rdma_recv_ctxt { */ struct svc_rdma_write_info { struct svcxprt_rdma *wi_rdma; + struct list_head wi_list; const struct svc_rdma_chunk *wi_chunk; @@ -244,7 +245,10 @@ struct svc_rdma_send_ctxt { struct ib_cqe sc_cqe; struct xdr_buf sc_hdrbuf; struct xdr_stream sc_stream; + + struct list_head sc_write_info_list; struct svc_rdma_write_info sc_reply_info; + void *sc_xprt_buf; int sc_page_count; int sc_cur_sge_no; @@ -277,11 +281,14 @@ extern void svc_rdma_cc_init(struct svcxprt_rdma *rdma, extern void svc_rdma_cc_release(struct svcxprt_rdma *rdma, struct svc_rdma_chunk_ctxt *cc, enum dma_data_direction dir); +extern void svc_rdma_write_chunk_release(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt); extern void svc_rdma_reply_chunk_release(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt); -extern int svc_rdma_send_write_list(struct svcxprt_rdma *rdma, - const struct svc_rdma_recv_ctxt *rctxt, - const struct xdr_buf *xdr); +extern int svc_rdma_prepare_write_list(struct svcxprt_rdma *rdma, + const struct svc_rdma_recv_ctxt *rctxt, + struct svc_rdma_send_ctxt *sctxt, + const struct xdr_buf *xdr); extern int svc_rdma_prepare_reply_chunk(struct svcxprt_rdma *rdma, const struct svc_rdma_pcl *write_pcl, const struct svc_rdma_pcl *reply_pcl, diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 97bce806974b..ebc90c12c835 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -251,6 +251,28 @@ 
static void svc_rdma_write_info_free(struct svc_rdma_write_info *info) queue_work(svcrdma_wq, &info->wi_work); } +/** + * svc_rdma_write_chunk_release - Release Write chunk I/O resources + * @rdma: controlling transport + * @ctxt: Send context that is being released + * + * Write chunk resources remain live until Send completion because + * Write WRs are chained to the Send WR. This function releases all + * write_info structures accumulated on @ctxt->sc_write_info_list. + */ +void svc_rdma_write_chunk_release(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt) +{ + struct svc_rdma_write_info *info; + + while (!list_empty(&ctxt->sc_write_info_list)) { + info = list_first_entry(&ctxt->sc_write_info_list, + struct svc_rdma_write_info, wi_list); + list_del(&info->wi_list); + svc_rdma_write_info_free(info); + } +} + /** * svc_rdma_reply_chunk_release - Release Reply chunk I/O resources * @rdma: controlling transport @@ -307,13 +329,11 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) struct ib_cqe *cqe = wc->wr_cqe; struct svc_rdma_chunk_ctxt *cc = container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe); - struct svc_rdma_write_info *info = - container_of(cc, struct svc_rdma_write_info, wi_cc); switch (wc->status) { case IB_WC_SUCCESS: trace_svcrdma_wc_write(&cc->cc_cid); - break; + return; case IB_WC_WR_FLUSH_ERR: trace_svcrdma_wc_write_flush(wc, &cc->cc_cid); break; @@ -321,12 +341,11 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) trace_svcrdma_wc_write_err(wc, &cc->cc_cid); } - svc_rdma_wake_send_waiters(rdma, cc->cc_sqecount); - - if (unlikely(wc->status != IB_WC_SUCCESS)) - svc_xprt_deferred_close(&rdma->sc_xprt); - - svc_rdma_write_info_free(info); + /* The RDMA Write has flushed, so the client won't get + * some of the outgoing RPC message. Signal the loss + * to the client by closing the connection. 
+ */ + svc_xprt_deferred_close(&rdma->sc_xprt); } /** @@ -600,13 +619,27 @@ static int svc_rdma_xb_write(const struct xdr_buf *xdr, void *data) return xdr->len; } -static int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, - const struct svc_rdma_chunk *chunk, - const struct xdr_buf *xdr) +/* + * svc_rdma_prepare_write_chunk - Link Write WRs for @chunk onto @sctxt's chain + * + * Write WRs are prepended to the Send WR chain so that a single + * ib_post_send() posts both RDMA Writes and the final Send. Only + * the first WR in each chunk gets a CQE for error detection; + * subsequent WRs complete without individual completion events. + * The Send WR's signaled completion indicates all chained + * operations have finished. + */ +static int svc_rdma_prepare_write_chunk(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *sctxt, + const struct svc_rdma_chunk *chunk, + const struct xdr_buf *xdr) { struct svc_rdma_write_info *info; struct svc_rdma_chunk_ctxt *cc; + struct ib_send_wr *first_wr; struct xdr_buf payload; + struct list_head *pos; + struct ib_cqe *cqe; int ret; if (xdr_buf_subsegment(xdr, &payload, chunk->ch_position, @@ -622,10 +655,25 @@ static int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, if (ret != payload.len) goto out_err; - trace_svcrdma_post_write_chunk(&cc->cc_cid, cc->cc_sqecount); - ret = svc_rdma_post_chunk_ctxt(rdma, cc); - if (ret < 0) + ret = -EINVAL; + if (unlikely(sctxt->sc_sqecount + cc->cc_sqecount > rdma->sc_sq_depth)) goto out_err; + + first_wr = sctxt->sc_wr_chain; + cqe = &cc->cc_cqe; + list_for_each(pos, &cc->cc_rwctxts) { + struct svc_rdma_rw_ctxt *rwc; + + rwc = list_entry(pos, struct svc_rdma_rw_ctxt, rw_list); + first_wr = rdma_rw_ctx_wrs(&rwc->rw_ctx, rdma->sc_qp, + rdma->sc_port_num, cqe, first_wr); + cqe = NULL; + } + sctxt->sc_wr_chain = first_wr; + sctxt->sc_sqecount += cc->cc_sqecount; + list_add(&info->wi_list, &sctxt->sc_write_info_list); + + trace_svcrdma_post_write_chunk(&cc->cc_cid, cc->cc_sqecount); 
return 0; out_err: @@ -634,17 +682,19 @@ out_err: } /** - * svc_rdma_send_write_list - Send all chunks on the Write list + * svc_rdma_prepare_write_list - Construct WR chain for sending Write list * @rdma: controlling RDMA transport * @rctxt: Write list provisioned by the client + * @sctxt: Send WR resources * @xdr: xdr_buf containing an RPC Reply message * - * Returns zero on success, or a negative errno if one or more - * Write chunks could not be sent. + * Returns zero on success, or a negative errno if WR chain + * construction fails for one or more Write chunks. */ -int svc_rdma_send_write_list(struct svcxprt_rdma *rdma, - const struct svc_rdma_recv_ctxt *rctxt, - const struct xdr_buf *xdr) +int svc_rdma_prepare_write_list(struct svcxprt_rdma *rdma, + const struct svc_rdma_recv_ctxt *rctxt, + struct svc_rdma_send_ctxt *sctxt, + const struct xdr_buf *xdr) { struct svc_rdma_chunk *chunk; int ret; @@ -652,7 +702,7 @@ int svc_rdma_send_write_list(struct svcxprt_rdma *rdma, pcl_for_each_chunk(chunk, &rctxt->rc_write_pcl) { if (!chunk->ch_payload_length) break; - ret = svc_rdma_send_write_chunk(rdma, chunk, xdr); + ret = svc_rdma_prepare_write_chunk(rdma, sctxt, chunk, xdr); if (ret < 0) return ret; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index bef68efa7034..8b3f0c8c14b2 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -150,6 +150,7 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) ctxt->sc_send_wr.sg_list = ctxt->sc_sges; ctxt->sc_send_wr.send_flags = IB_SEND_SIGNALED; ctxt->sc_cqe.done = svc_rdma_wc_send; + INIT_LIST_HEAD(&ctxt->sc_write_info_list); ctxt->sc_xprt_buf = buffer; xdr_buf_init(&ctxt->sc_hdrbuf, ctxt->sc_xprt_buf, rdma->sc_max_req_size); @@ -237,6 +238,7 @@ static void svc_rdma_send_ctxt_release(struct svcxprt_rdma *rdma, struct ib_device *device = rdma->sc_cm_id->device; unsigned int i; + svc_rdma_write_chunk_release(rdma, ctxt); 
svc_rdma_reply_chunk_release(rdma, ctxt); if (ctxt->sc_page_count) @@ -1054,6 +1056,12 @@ void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, sctxt->sc_send_wr.num_sge = 1; sctxt->sc_send_wr.opcode = IB_WR_SEND; sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len; + + /* Ensure only the error message is posted, not any previously + * prepared Write chunk WRs. + */ + sctxt->sc_wr_chain = &sctxt->sc_send_wr; + sctxt->sc_sqecount = 1; if (svc_rdma_post_send(rdma, sctxt)) goto put_ctxt; return; @@ -1101,7 +1109,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) if (!p) goto put_ctxt; - ret = svc_rdma_send_write_list(rdma, rctxt, &rqstp->rq_res); + ret = svc_rdma_prepare_write_list(rdma, rctxt, sctxt, &rqstp->rq_res); if (ret < 0) goto put_ctxt; -- cgit v1.2.3 From 3603bf99062c6d563df4fba3848f829d5401d959 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 28 Feb 2026 14:09:22 -0800 Subject: SUNRPC: xdr.h: fix all kernel-doc warnings Correct a function parameter name (s/page/folio/) and add function return value sections for multiple functions to eliminate kernel-doc warnings: Warning: include/linux/sunrpc/xdr.h:298 function parameter 'folio' not described in 'xdr_set_scratch_folio' Warning: include/linux/sunrpc/xdr.h:337 No description found for return value of 'xdr_stream_remaining' Warning: include/linux/sunrpc/xdr.h:357 No description found for return value of 'xdr_align_size' Warning: include/linux/sunrpc/xdr.h:374 No description found for return value of 'xdr_pad_size' Warning: include/linux/sunrpc/xdr.h:387 No description found for return value of 'xdr_stream_encode_item_present' Signed-off-by: Randy Dunlap Signed-off-by: Chuck Lever --- include/linux/sunrpc/xdr.h | 48 +++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 152597750f55..b639a6fafcbc 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ 
-290,7 +290,7 @@ xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buflen) /** * xdr_set_scratch_folio - Attach a scratch buffer for decoding data * @xdr: pointer to xdr_stream struct - * @page: an anonymous folio + * @folio: an anonymous folio * * See xdr_set_scratch_buffer(). */ @@ -330,7 +330,7 @@ static inline void xdr_commit_encode(struct xdr_stream *xdr) * xdr_stream_remaining - Return the number of bytes remaining in the stream * @xdr: pointer to struct xdr_stream * - * Return value: + * Returns: * Number of bytes remaining in @xdr before xdr->end */ static inline size_t @@ -350,7 +350,7 @@ ssize_t xdr_stream_encode_opaque_auth(struct xdr_stream *xdr, u32 flavor, * xdr_align_size - Calculate padded size of an object * @n: Size of an object being XDR encoded (in bytes) * - * Return value: + * Returns: * Size (in bytes) of the object including xdr padding */ static inline size_t @@ -368,7 +368,7 @@ xdr_align_size(size_t n) * This implementation avoids the need for conditional * branches or modulo division. 
* - * Return value: + * Returns: * Size (in bytes) of the needed XDR pad */ static inline size_t xdr_pad_size(size_t n) @@ -380,7 +380,7 @@ static inline size_t xdr_pad_size(size_t n) * xdr_stream_encode_item_present - Encode a "present" list item * @xdr: pointer to xdr_stream * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -399,7 +399,7 @@ static inline ssize_t xdr_stream_encode_item_present(struct xdr_stream *xdr) * xdr_stream_encode_item_absent - Encode a "not present" list item * @xdr: pointer to xdr_stream * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -419,7 +419,7 @@ static inline int xdr_stream_encode_item_absent(struct xdr_stream *xdr) * @p: address in a buffer into which to encode * @n: boolean value to encode * - * Return value: + * Returns: * Address of item following the encoded boolean */ static inline __be32 *xdr_encode_bool(__be32 *p, u32 n) @@ -433,7 +433,7 @@ static inline __be32 *xdr_encode_bool(__be32 *p, u32 n) * @xdr: pointer to xdr_stream * @n: boolean value to encode * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -453,7 +453,7 @@ static inline int xdr_stream_encode_bool(struct xdr_stream *xdr, __u32 n) * @xdr: pointer to xdr_stream * @n: integer to encode * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -474,7 +474,7 @@ xdr_stream_encode_u32(struct xdr_stream *xdr, __u32 n) * @xdr: pointer to xdr_stream * @n: integer to encode * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -495,7 +495,7 @@ xdr_stream_encode_be32(struct xdr_stream *xdr, __be32 n) * @xdr: pointer to xdr_stream * @n: 64-bit integer to 
encode * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -517,7 +517,7 @@ xdr_stream_encode_u64(struct xdr_stream *xdr, __u64 n) * @ptr: pointer to void pointer * @len: size of object * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -542,7 +542,7 @@ xdr_stream_encode_opaque_inline(struct xdr_stream *xdr, void **ptr, size_t len) * @ptr: pointer to opaque data object * @len: size of object pointed to by @ptr * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -563,7 +563,7 @@ xdr_stream_encode_opaque_fixed(struct xdr_stream *xdr, const void *ptr, size_t l * @ptr: pointer to opaque data object * @len: size of object pointed to by @ptr * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -585,7 +585,7 @@ xdr_stream_encode_opaque(struct xdr_stream *xdr, const void *ptr, size_t len) * @array: array of integers * @array_size: number of elements in @array * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -608,7 +608,7 @@ xdr_stream_encode_uint32_array(struct xdr_stream *xdr, * xdr_item_is_absent - symbolically handle XDR discriminators * @p: pointer to undecoded discriminator * - * Return values: + * Returns: * %true if the following XDR item is absent * %false if the following XDR item is present */ @@ -621,7 +621,7 @@ static inline bool xdr_item_is_absent(const __be32 *p) * xdr_item_is_present - symbolically handle XDR discriminators * @p: pointer to undecoded discriminator * - * Return values: + * Returns: * %true if the following XDR item is present * %false if the following XDR item is absent */ @@ -635,7 +635,7 @@ static inline bool 
xdr_item_is_present(const __be32 *p) * @xdr: pointer to xdr_stream * @ptr: pointer to a u32 in which to store the result * - * Return values: + * Returns: * %0 on success * %-EBADMSG on XDR buffer overflow */ @@ -656,7 +656,7 @@ xdr_stream_decode_bool(struct xdr_stream *xdr, __u32 *ptr) * @xdr: pointer to xdr_stream * @ptr: location to store integer * - * Return values: + * Returns: * %0 on success * %-EBADMSG on XDR buffer overflow */ @@ -677,7 +677,7 @@ xdr_stream_decode_u32(struct xdr_stream *xdr, __u32 *ptr) * @xdr: pointer to xdr_stream * @ptr: location to store integer * - * Return values: + * Returns: * %0 on success * %-EBADMSG on XDR buffer overflow */ @@ -698,7 +698,7 @@ xdr_stream_decode_be32(struct xdr_stream *xdr, __be32 *ptr) * @xdr: pointer to xdr_stream * @ptr: location to store 64-bit integer * - * Return values: + * Returns: * %0 on success * %-EBADMSG on XDR buffer overflow */ @@ -720,7 +720,7 @@ xdr_stream_decode_u64(struct xdr_stream *xdr, __u64 *ptr) * @ptr: location to store data * @len: size of buffer pointed to by @ptr * - * Return values: + * Returns: * %0 on success * %-EBADMSG on XDR buffer overflow */ @@ -746,7 +746,7 @@ xdr_stream_decode_opaque_fixed(struct xdr_stream *xdr, void *ptr, size_t len) * on @xdr. It is therefore expected that the object it points to should * be processed immediately. 
* - * Return values: + * Returns: * On success, returns size of object stored in *@ptr * %-EBADMSG on XDR buffer overflow * %-EMSGSIZE if the size of the object would exceed @maxlen @@ -777,7 +777,7 @@ xdr_stream_decode_opaque_inline(struct xdr_stream *xdr, void **ptr, size_t maxle * @array: location to store the integer array or NULL * @array_size: number of elements to store * - * Return values: + * Returns: * On success, returns number of elements stored in @array * %-EBADMSG on XDR buffer overflow * %-EMSGSIZE if the size of the array exceeds @array_size -- cgit v1.2.3 From 4e2866b2baaddfff6069a2f18fc134c1d5a08f2b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 11 Mar 2026 12:18:54 -0400 Subject: SUNRPC: Add svc_rqst_page_release() helper svc_rqst_replace_page() releases displaced pages through a per-rqst folio batch, but exposes the add-or-flush sequence directly. svc_tcp_restore_pages() releases displaced pages individually with put_page(). Introduce svc_rqst_page_release() to encapsulate the batched release mechanism. Convert svc_rqst_replace_page() and svc_tcp_restore_pages() to use it. The latter now benefits from the same batched release that svc_rqst_replace_page() already uses. Reviewed-by: Christoph Hellwig Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 15 +++++++++++++++ net/sunrpc/svc.c | 7 ++----- net/sunrpc/svcsock.c | 2 +- 3 files changed, 18 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 669c944eaf7f..1ebd9c7efa70 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -498,6 +498,21 @@ int svc_generic_rpcbind_set(struct net *net, #define RPC_MAX_ADDRBUFLEN (63U) +/** + * svc_rqst_page_release - release a page associated with an RPC transaction + * @rqstp: RPC transaction context + * @page: page to release + * + * Released pages are batched and freed together, reducing + * allocator pressure under heavy RPC workloads. 
+ */ +static inline void svc_rqst_page_release(struct svc_rqst *rqstp, + struct page *page) +{ + if (!folio_batch_add(&rqstp->rq_fbatch, page_folio(page))) + __folio_batch_release(&rqstp->rq_fbatch); +} + /* * When we want to reduce the size of the reserved space in the response * buffer, we need to take into account the size of any checksum data that diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 5e0b5ec2fd52..576fa42e7abf 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -976,11 +976,8 @@ bool svc_rqst_replace_page(struct svc_rqst *rqstp, struct page *page) return false; } - if (*rqstp->rq_next_page) { - if (!folio_batch_add(&rqstp->rq_fbatch, - page_folio(*rqstp->rq_next_page))) - __folio_batch_release(&rqstp->rq_fbatch); - } + if (*rqstp->rq_next_page) + svc_rqst_page_release(rqstp, *rqstp->rq_next_page); get_page(page); *(rqstp->rq_next_page++) = page; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 2ce43f9995f1..7be3de1a1aed 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -988,7 +988,7 @@ static size_t svc_tcp_restore_pages(struct svc_sock *svsk, npages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; for (i = 0; i < npages; i++) { if (rqstp->rq_pages[i] != NULL) - put_page(rqstp->rq_pages[i]); + svc_rqst_page_release(rqstp, rqstp->rq_pages[i]); BUG_ON(svsk->sk_pages[i] == NULL); rqstp->rq_pages[i] = svsk->sk_pages[i]; svsk->sk_pages[i] = NULL; -- cgit v1.2.3