78 files changed, 4421 insertions, 1724 deletions
diff --git a/Documentation/admin-guide/nfs/pnfs-block-server.rst b/Documentation/admin-guide/nfs/pnfs-block-server.rst index 20fe9f5117fe..7667dd2e17f1 100644 --- a/Documentation/admin-guide/nfs/pnfs-block-server.rst +++ b/Documentation/admin-guide/nfs/pnfs-block-server.rst @@ -40,3 +40,33 @@ how to translate the device into a serial number from SCSI EVPD 0x80:: echo "fencing client ${CLIENT} serial ${EVPD}" >> /var/log/pnfsd-fence.log EOF + +If the nfsd server needs to fence a non-responding client and the +fencing operation fails, the server logs a warning message in the +system log with the following format: + + FENCE failed client[IP_address] clid[#n] device[dev_name] + + where: + + - IP_address: refers to the IP address of the affected client. + - #n: indicates the unique client identifier. + - dev_name: specifies the name of the block device related + to the fencing attempt. + +The server will retry the operation indefinitely. During +this time, access to the affected file is restricted for all other +clients. This is to prevent potential data corruption if multiple +clients access the same file simultaneously. + +To restore access to the affected file for other clients, the admin +needs to take the following actions: + + - shut down or power off the client being fenced. + - manually expire the client to release all its state on the server:: + + echo 'expire' > /proc/fs/nfsd/clients/clid/ctl + + where: + + - clid: is the unique client identifier displayed in the system log. diff --git a/Documentation/admin-guide/nfs/pnfs-scsi-server.rst b/Documentation/admin-guide/nfs/pnfs-scsi-server.rst index b2eec2288329..b202508d281d 100644 --- a/Documentation/admin-guide/nfs/pnfs-scsi-server.rst +++ b/Documentation/admin-guide/nfs/pnfs-scsi-server.rst @@ -22,3 +22,34 @@ option and the underlying SCSI device support persistent reservations. On the client make sure the kernel has the CONFIG_PNFS_BLOCK option enabled, and the file system is mounted using the NFSv4.1 protocol version (mount -o vers=4.1). + +If the nfsd server needs to fence a non-responding client and the +fencing operation fails, the server logs a warning message in the +system log with the following format: + + FENCE failed client[IP_address] clid[#n] device[dev_name] + + where: + + - IP_address: refers to the IP address of the affected client. + - #n: indicates the unique client identifier. + - dev_name: specifies the name of the block device related + to the fencing attempt. + +The server will retry the operation indefinitely. During +this time, access to the affected file is restricted for all other +clients. This is to prevent potential data corruption if multiple +clients access the same file simultaneously. + +To restore access to the affected file for other clients, the admin +needs to take the following actions: + + - shut down or power off the client being fenced. + - manually expire the client to release all its state on the server:: + + echo 'expire' > /proc/fs/nfsd/clients/clid/ctl + + where: + + - clid: is the unique client identifier displayed in the system log. 
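For illustration, a recovery session on the server might look like the sketch below. Only the log-message format and the 'expire' write come from the documentation above; the use of dmesg to find the message, the example client id (#5), and the example addresses are assumptions::

    # locate the warning emitted when fencing fails
    dmesg | grep 'FENCE failed'
    #   FENCE failed client[192.0.2.17] clid[#5] device[sdb]

    # after the client 192.0.2.17 has been shut down or powered off,
    # release its state on the server so other clients regain access
    echo 'expire' > /proc/fs/nfsd/clients/5/ctl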
+ diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 8025df6e6499..8421ea21bd35 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -398,6 +398,7 @@ prototypes:: bool (*lm_breaker_owns_lease)(struct file_lock *); bool (*lm_lock_expirable)(struct file_lock *); void (*lm_expire_lock)(void); + bool (*lm_breaker_timedout)(struct file_lease *); locking rules: @@ -412,6 +413,7 @@ lm_breaker_owns_lease: yes no no lm_lock_expirable yes no no lm_expire_lock no no yes lm_open_conflict yes no no +lm_breaker_timedout yes no no ====================== ============= ================= ========= buffer_head diff --git a/Documentation/filesystems/nfs/exporting.rst b/Documentation/filesystems/nfs/exporting.rst index a01d9b9b5bc3..4aa59b0bf253 100644 --- a/Documentation/filesystems/nfs/exporting.rst +++ b/Documentation/filesystems/nfs/exporting.rst @@ -206,3 +206,88 @@ following flags are defined: all of an inode's dirty data on last close. Exports that behave this way should set EXPORT_OP_FLUSH_ON_CLOSE so that NFSD knows to skip waiting for writeback when closing such files. + +Signed Filehandles +------------------ + +To protect against filehandle guessing attacks, the Linux NFS server can be +configured to sign filehandles with a Message Authentication Code (MAC). + +Standard NFS filehandles are often predictable. If an attacker can guess +a valid filehandle for a file they do not have permission to access via +directory traversal, they may be able to bypass path-based permissions +(though they still remain subject to inode-level permissions). + +Signed filehandles prevent this by appending a MAC to the filehandle +before it is sent to the client. Upon receiving a filehandle back from a +client, the server re-calculates the MAC using its internal key and +verifies it against the one provided. If the signatures do not match, +the server treats the filehandle as invalid (returning NFS[34]ERR_STALE). + +Note that signing filehandles provides integrity and authenticity but +not confidentiality. The contents of the filehandle remain visible to +the client; they simply cannot be forged or modified. + +Configuration +~~~~~~~~~~~~~ + +To enable signed filehandles, the administrator must provide a signing +key to the kernel and enable the "sign_fh" export option. + +1. Providing a Key + The signing key is managed via the nfsd netlink interface. This key + is per-network-namespace and must be set before any exports using + "sign_fh" become active. + +2. Export Options + The feature is controlled on a per-export basis in /etc/exports: + + sign_fh + Enables signing for all filehandles generated under this export. + + no_sign_fh + (Default) Disables signing. + +Key Management and Rotation +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The security of this mechanism relies entirely on the secrecy of the +signing key. + +Initial Setup: + The key should be generated using a high-quality random source and + loaded early in the boot process or during the nfs-server startup + sequence. + +Changing Keys: + If a key is changed while clients have active mounts, existing + filehandles held by those clients will become invalid, resulting in + "Stale file handle" errors on the client side. + +Safe Rotation: + Currently, there is no mechanism for "graceful" key rotation + (maintaining multiple valid keys). Changing the key is an atomic + operation that immediately invalidates all previous signatures. 
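As a concrete sketch of the configuration steps described above: the key file location, the export path, and the exact option placement in /etc/exports are illustrative assumptions; only the 16-byte key length (see the fh-key netlink attribute added further below), the netlink-based key loading, and the sign_fh/no_sign_fh option names come from this patch::

    # generate a 16-byte signing key from a high-quality random source
    dd if=/dev/urandom bs=16 count=1 of=/etc/nfs.fh_key
    chmod 600 /etc/nfs.fh_key

    # the key is handed to the kernel through the nfsd netlink
    # interface (fh-key attribute) during nfs-server startup; the
    # userspace tool used for this is not shown in this excerpt

    # /etc/exports: enable filehandle signing for one export
    /srv/export  *(rw,sync,sign_fh)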
+ +Transitioning Exports +~~~~~~~~~~~~~~~~~~~~~ + +When adding or removing the "sign_fh" flag from an active export, the +following behaviors should be expected: + ++-------------------+---------------------------------------------------+ +| Change | Result for Existing Clients | ++===================+===================================================+ +| Adding sign_fh | Clients holding unsigned filehandles will find | +| | them rejected, as the server now expects a | +| | signature. | ++-------------------+---------------------------------------------------+ +| Removing sign_fh | Clients holding signed filehandles will find them | +| | rejected, as the server now expects the | +| | filehandle to end at its traditional boundary | +| | without a MAC. | ++-------------------+---------------------------------------------------+ + +Because filehandles are often cached persistently by clients, adding or +removing this option should generally be done during a scheduled maintenance +window involving a NFS client unmount/remount. diff --git a/Documentation/netlink/specs/nfsd.yaml b/Documentation/netlink/specs/nfsd.yaml index f87b5a05e5e9..8ab43c8253b2 100644 --- a/Documentation/netlink/specs/nfsd.yaml +++ b/Documentation/netlink/specs/nfsd.yaml @@ -81,6 +81,11 @@ attribute-sets: - name: min-threads type: u32 + - + name: fh-key + type: binary + checks: + exact-len: 16 - name: version attributes: @@ -163,6 +168,7 @@ operations: - leasetime - scope - min-threads + - fh-key - name: threads-get doc: get the maximum number of running threads diff --git a/Documentation/sunrpc/xdr/nlm4.x b/Documentation/sunrpc/xdr/nlm4.x new file mode 100644 index 000000000000..0c44a80ef674 --- /dev/null +++ b/Documentation/sunrpc/xdr/nlm4.x @@ -0,0 +1,211 @@ +/* + * This file was extracted by hand from + * https://www.rfc-editor.org/rfc/rfc1813.html . + * + * Note that RFC 1813 is Informational. Its official date of + * publication (June 1995) is before the IETF required its RFCs to + * carry an explicit copyright or other IP ownership notices. + * + * Note also that RFC 1813 does not specify the whole NLM4 protocol. + * In particular, the argument and result types are not present in + * that document, and had to be reverse-engineered. 
+ */ + +/* + * The NLMv4 protocol + */ + +pragma header nlm4; + +/* + * The following definitions are missing in RFC 1813, + * but can be found in the OpenNetworking Network Lock + * Manager protocol: + * + * https://pubs.opengroup.org/onlinepubs/9629799/chap10.htm + */ + +const LM_MAXSTRLEN = 1024; + +const LM_MAXNAMELEN = 1025; + +const MAXNETOBJ_SZ = 1024; + +typedef opaque netobj<MAXNETOBJ_SZ>; + +enum fsh4_mode { + fsm_DN = 0, /* deny none */ + fsm_DR = 1, /* deny read */ + fsm_DW = 2, /* deny write */ + fsm_DRW = 3 /* deny read/write */ +}; + +enum fsh4_access { + fsa_NONE = 0, /* for completeness */ + fsa_R = 1, /* read-only */ + fsa_W = 2, /* write-only */ + fsa_RW = 3 /* read/write */ +}; + +/* + * The following definitions come from the OpenNetworking + * Network Status Monitor protocol: + * + * https://pubs.opengroup.org/onlinepubs/9629799/chap11.htm + */ + +const SM_MAXSTRLEN = 1024; + +/* + * The NLM protocol as extracted from: + * https://tools.ietf.org/html/rfc1813 Appendix II + */ + +typedef unsigned hyper uint64; + +typedef hyper int64; + +typedef unsigned long uint32; + +typedef long int32; + +enum nlm4_stats { + NLM4_GRANTED = 0, + NLM4_DENIED = 1, + NLM4_DENIED_NOLOCKS = 2, + NLM4_BLOCKED = 3, + NLM4_DENIED_GRACE_PERIOD = 4, + NLM4_DEADLCK = 5, + NLM4_ROFS = 6, + NLM4_STALE_FH = 7, + NLM4_FBIG = 8, + NLM4_FAILED = 9 +}; + +pragma big_endian nlm4_stats; + +struct nlm4_holder { + bool exclusive; + int32 svid; + netobj oh; + uint64 l_offset; + uint64 l_len; +}; + +union nlm4_testrply switch (nlm4_stats stat) { + case NLM4_DENIED: + nlm4_holder holder; + default: + void; +}; + +struct nlm4_stat { + nlm4_stats stat; +}; + +struct nlm4_res { + netobj cookie; + nlm4_stat stat; +}; + +struct nlm4_testres { + netobj cookie; + nlm4_testrply stat; +}; + +struct nlm4_lock { + string caller_name<LM_MAXSTRLEN>; + netobj fh; + netobj oh; + int32 svid; + uint64 l_offset; + uint64 l_len; +}; + +struct nlm4_lockargs { + netobj cookie; + bool block; + bool exclusive; + nlm4_lock alock; + bool reclaim; + int32 state; +}; + +struct nlm4_cancargs { + netobj cookie; + bool block; + bool exclusive; + nlm4_lock alock; +}; + +struct nlm4_testargs { + netobj cookie; + bool exclusive; + nlm4_lock alock; +}; + +struct nlm4_unlockargs { + netobj cookie; + nlm4_lock alock; +}; + +struct nlm4_share { + string caller_name<LM_MAXSTRLEN>; + netobj fh; + netobj oh; + fsh4_mode mode; + fsh4_access access; +}; + +struct nlm4_shareargs { + netobj cookie; + nlm4_share share; + bool reclaim; +}; + +struct nlm4_shareres { + netobj cookie; + nlm4_stats stat; + int32 sequence; +}; + +struct nlm4_notify { + string name<LM_MAXNAMELEN>; + int32 state; +}; + +/* + * Argument for the Linux-private SM_NOTIFY procedure + */ +const SM_PRIV_SIZE = 16; + +struct nlm4_notifyargs { + nlm4_notify notify; + opaque private[SM_PRIV_SIZE]; +}; + +program NLM4_PROG { + version NLM4_VERS { + void NLMPROC4_NULL(void) = 0; + nlm4_testres NLMPROC4_TEST(nlm4_testargs) = 1; + nlm4_res NLMPROC4_LOCK(nlm4_lockargs) = 2; + nlm4_res NLMPROC4_CANCEL(nlm4_cancargs) = 3; + nlm4_res NLMPROC4_UNLOCK(nlm4_unlockargs) = 4; + nlm4_res NLMPROC4_GRANTED(nlm4_testargs) = 5; + void NLMPROC4_TEST_MSG(nlm4_testargs) = 6; + void NLMPROC4_LOCK_MSG(nlm4_lockargs) = 7; + void NLMPROC4_CANCEL_MSG(nlm4_cancargs) = 8; + void NLMPROC4_UNLOCK_MSG(nlm4_unlockargs) = 9; + void NLMPROC4_GRANTED_MSG(nlm4_testargs) = 10; + void NLMPROC4_TEST_RES(nlm4_testres) = 11; + void NLMPROC4_LOCK_RES(nlm4_res) = 12; + void NLMPROC4_CANCEL_RES(nlm4_res) = 13; + void 
NLMPROC4_UNLOCK_RES(nlm4_res) = 14; + void NLMPROC4_GRANTED_RES(nlm4_res) = 15; + void NLMPROC4_SM_NOTIFY(nlm4_notifyargs) = 16; + nlm4_shareres NLMPROC4_SHARE(nlm4_shareargs) = 20; + nlm4_shareres NLMPROC4_UNSHARE(nlm4_shareargs) = 21; + nlm4_res NLMPROC4_NM_LOCK(nlm4_lockargs) = 22; + void NLMPROC4_FREE_ALL(nlm4_notify) = 23; + } = 4; +} = 100021; diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile index 51bbe22d21e3..808f0f2a7be1 100644 --- a/fs/lockd/Makefile +++ b/fs/lockd/Makefile @@ -9,5 +9,33 @@ obj-$(CONFIG_LOCKD) += lockd.o lockd-y := clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \ svcshare.o svcproc.o svcsubs.o mon.o trace.o xdr.o netlink.o -lockd-$(CONFIG_LOCKD_V4) += clnt4xdr.o xdr4.o svc4proc.o +lockd-$(CONFIG_LOCKD_V4) += clnt4xdr.o svc4proc.o nlm4xdr_gen.o lockd-$(CONFIG_PROC_FS) += procfs.o + +# +# XDR code generation (requires Python and additional packages) +# +# The generated *xdr_gen.{h,c} files are checked into git. Normal kernel +# builds do not require the xdrgen tool or its Python dependencies. +# +# Developers modifying .x files in Documentation/sunrpc/xdr/ should run +# "make xdrgen" to regenerate the affected files. +# +.PHONY: xdrgen + +XDRGEN = ../../tools/net/sunrpc/xdrgen/xdrgen + +XDRGEN_DEFINITIONS = ../../include/linux/sunrpc/xdrgen/nlm4.h +XDRGEN_DECLARATIONS = nlm4xdr_gen.h +XDRGEN_SOURCE = nlm4xdr_gen.c + +xdrgen: $(XDRGEN_DEFINITIONS) $(XDRGEN_DECLARATIONS) $(XDRGEN_SOURCE) + +../../include/linux/sunrpc/xdrgen/nlm4.h: ../../Documentation/sunrpc/xdr/nlm4.x + $(XDRGEN) definitions $< > $@ + +nlm4xdr_gen.h: ../../Documentation/sunrpc/xdr/nlm4.x + $(XDRGEN) declarations $< > $@ + +nlm4xdr_gen.c: ../../Documentation/sunrpc/xdr/nlm4.x + $(XDRGEN) source --peer server $< > $@ diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c index 527458db4525..2058733eacf8 100644 --- a/fs/lockd/clnt4xdr.c +++ b/fs/lockd/clnt4xdr.c @@ -13,7 +13,8 @@ #include <linux/sunrpc/xdr.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/stats.h> -#include <linux/lockd/lockd.h> + +#include "lockd.h" #include <uapi/linux/nfs3.h> @@ -284,7 +285,7 @@ static int decode_nlm4_holder(struct xdr_stream *xdr, struct nlm_res *result) fl->c.flc_type = exclusive != 0 ? 
F_WRLCK : F_RDLCK; p = xdr_decode_hyper(p, &l_offset); xdr_decode_hyper(p, &l_len); - nlm4svc_set_file_lock_range(fl, l_offset, l_len); + lockd_set_file_lock_range4(fl, l_offset, l_len); error = 0; out: return error; diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 85bc0f3e91df..8fa30c42c92a 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -15,9 +15,9 @@ #include <linux/sunrpc/addr.h> #include <linux/sunrpc/svc.h> #include <linux/sunrpc/svc_xprt.h> -#include <linux/lockd/lockd.h> #include <linux/kthread.h> +#include "lockd.h" #include "trace.h" #define NLMDBG_FACILITY NLMDBG_CLIENT diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index fb4d0752c9bb..7f211008a5d2 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -18,8 +18,8 @@ #include <linux/freezer.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/svc.h> -#include <linux/lockd/lockd.h> +#include "lockd.h" #include "trace.h" #define NLMDBG_FACILITY NLMDBG_CLIENT diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c index 6ea3448d2d31..65555f5224b1 100644 --- a/fs/lockd/clntxdr.c +++ b/fs/lockd/clntxdr.c @@ -15,7 +15,8 @@ #include <linux/sunrpc/xdr.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/stats.h> -#include <linux/lockd/lockd.h> + +#include "lockd.h" #include <uapi/linux/nfs2.h> diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 1a9582a10a86..ea8a8e166f7e 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -16,13 +16,13 @@ #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/addr.h> #include <linux/sunrpc/svc.h> -#include <linux/lockd/lockd.h> #include <linux/mutex.h> #include <linux/sunrpc/svc_xprt.h> #include <net/ipv6.h> +#include "lockd.h" #include "netns.h" #define NLMDBG_FACILITY NLMDBG_HOSTCACHE @@ -306,6 +306,35 @@ void nlmclnt_release_host(struct nlm_host *host) } } +/* Callback for rpc_cancel_tasks() - matches all tasks for cancellation */ +static bool nlmclnt_match_all(const struct rpc_task *task, const void *data) +{ + return true; +} + +/** + * nlmclnt_shutdown_rpc_clnt - safely shut down NLM client RPC operations + * @host: nlm_host to shut down + * + * Cancels outstanding RPC tasks and marks the client as shut down. + * Synchronizes with nlmclnt_release_host() via nlm_host_mutex to prevent + * races between shutdown and host destruction. Safe to call if h_rpcclnt + * is NULL or already shut down. + */ +void nlmclnt_shutdown_rpc_clnt(struct nlm_host *host) +{ + struct rpc_clnt *clnt; + + mutex_lock(&nlm_host_mutex); + clnt = host->h_rpcclnt; + if (clnt) { + clnt->cl_shutdown = 1; + rpc_cancel_tasks(clnt, -EIO, nlmclnt_match_all, NULL); + } + mutex_unlock(&nlm_host_mutex); +} +EXPORT_SYMBOL_GPL(nlmclnt_shutdown_rpc_clnt); + /** * nlmsvc_lookup_host - Find an NLM host handle matching a remote client * @rqstp: incoming NLM request diff --git a/include/linux/lockd/lockd.h b/fs/lockd/lockd.h index 330e38776bb2..a7c85ab6d4b5 100644 --- a/include/linux/lockd/lockd.h +++ b/fs/lockd/lockd.h @@ -1,16 +1,10 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * linux/include/linux/lockd/lockd.h - * - * General-purpose lockd include file. - * * Copyright (C) 1996 Olaf Kirch <okir@monad.swb.de> */ -#ifndef LINUX_LOCKD_LOCKD_H -#define LINUX_LOCKD_LOCKD_H - -/* XXX: a lot of this should really be under fs/lockd. 
*/ +#ifndef _LOCKD_LOCKD_H +#define _LOCKD_LOCKD_H #include <linux/exportfs.h> #include <linux/in.h> @@ -20,15 +14,35 @@ #include <linux/kref.h> #include <linux/refcount.h> #include <linux/utsname.h> +#include "nlm.h" #include <linux/lockd/bind.h> -#include <linux/lockd/xdr.h> -#ifdef CONFIG_LOCKD_V4 -#include <linux/lockd/xdr4.h> -#endif -#include <linux/lockd/debug.h> +#include "xdr.h" +#include <linux/sunrpc/debug.h> #include <linux/sunrpc/svc.h> /* + * Enable lockd debugging. + * Requires CONFIG_SUNRPC_DEBUG. + */ +#undef ifdebug +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) +# define ifdebug(flag) if (unlikely(nlm_debug & NLMDBG_##flag)) +#else +# define ifdebug(flag) if (0) +#endif + +#define NLMDBG_SVC 0x0001 +#define NLMDBG_CLIENT 0x0002 +#define NLMDBG_CLNTLOCK 0x0004 +#define NLMDBG_SVCLOCK 0x0008 +#define NLMDBG_MONITOR 0x0010 +#define NLMDBG_CLNTSUBS 0x0020 +#define NLMDBG_SVCSUBS 0x0040 +#define NLMDBG_HOSTCACHE 0x0080 +#define NLMDBG_XDR 0x0100 +#define NLMDBG_ALL 0x7fff + +/* * Version string */ #define LOCKD_VERSION "0.5" @@ -38,6 +52,22 @@ */ #define LOCKD_DFLT_TIMEO 10 +/* error codes new to NLMv4 */ +#define nlm4_deadlock cpu_to_be32(NLM_DEADLCK) +#define nlm4_rofs cpu_to_be32(NLM_ROFS) +#define nlm4_stale_fh cpu_to_be32(NLM_STALE_FH) +#define nlm4_fbig cpu_to_be32(NLM_FBIG) +#define nlm4_failed cpu_to_be32(NLM_FAILED) + +/* + * Internal-use status codes, not to be placed on the wire. + * Version handlers translate these to appropriate wire values. + */ +#define nlm__int__drop_reply cpu_to_be32(30000) +#define nlm__int__deadlock cpu_to_be32(30001) +#define nlm__int__stale_fh cpu_to_be32(30002) +#define nlm__int__failed cpu_to_be32(30003) + /* * Lockd host handle (used both by the client and server personality). */ @@ -149,6 +179,8 @@ struct nlm_rqst { void * a_callback_data; /* sent to nlmclnt_operations callbacks */ }; +struct nlm_share; + /* * This struct describes a file held open by lockd on behalf of * an NFS client. @@ -196,9 +228,10 @@ struct nlm_block { * Global variables */ extern const struct rpc_program nlm_program; -extern const struct svc_procedure nlmsvc_procedures[24]; +extern const struct svc_version nlmsvc_version1; +extern const struct svc_version nlmsvc_version3; #ifdef CONFIG_LOCKD_V4 -extern const struct svc_procedure nlmsvc_procedures4[24]; +extern const struct svc_version nlmsvc_version4; #endif extern int nlmsvc_grace_period; extern unsigned long nlm_timeout; @@ -226,6 +259,10 @@ int nlmclnt_reclaim(struct nlm_host *, struct file_lock *, struct nlm_rqst *); void nlmclnt_next_cookie(struct nlm_cookie *); +#ifdef CONFIG_LOCKD_V4 +extern const struct rpc_version nlm_version4; +#endif + /* * Host cache */ @@ -289,6 +326,7 @@ void nlmsvc_traverse_blocks(struct nlm_host *, struct nlm_file *, void nlmsvc_grant_reply(struct nlm_cookie *, __be32); void nlmsvc_release_call(struct nlm_rqst *); void nlmsvc_locks_init_private(struct file_lock *, struct nlm_host *, pid_t); +int nlmsvc_dispatch(struct svc_rqst *rqstp); /* * File handling for the server personality @@ -302,12 +340,6 @@ void nlmsvc_mark_resources(struct net *); void nlmsvc_free_host_resources(struct nlm_host *); void nlmsvc_invalidate_all(void); -/* - * Cluster failover support - */ -int nlmsvc_unlock_all_by_sb(struct super_block *sb); -int nlmsvc_unlock_all_by_ip(struct sockaddr *server_addr); - static inline struct file *nlmsvc_file_file(const struct nlm_file *file) { return file->f_file[O_RDONLY] ? 
@@ -390,6 +422,31 @@ static inline int nlm_compare_locks(const struct file_lock *fl1, &&(fl1->c.flc_type == fl2->c.flc_type || fl2->c.flc_type == F_UNLCK); } +/** + * lockd_set_file_lock_range4 - set the byte range of a file_lock + * @fl: file_lock whose length fields are to be initialized + * @off: starting offset of the lock, in bytes + * @len: length of the byte range, in bytes, or zero + * + * The NLMv4 protocol represents lock byte ranges as (start, length), + * where length zero means "lock to end of file." The kernel's file_lock + * structure uses (start, end) representation. Convert from NLMv4 format + * to file_lock format, clamping the starting offset and treating + * arithmetic overflow as "lock to EOF." + */ +static inline void +lockd_set_file_lock_range4(struct file_lock *fl, u64 off, u64 len) +{ + u64 clamped_off = (off > OFFSET_MAX) ? OFFSET_MAX : off; + s64 end = clamped_off + len - 1; + + fl->fl_start = clamped_off; + if (len == 0 || end < 0) + fl->fl_end = OFFSET_MAX; + else + fl->fl_end = end; +} + extern const struct lock_manager_operations nlmsvc_lock_operations; -#endif /* LINUX_LOCKD_LOCKD_H */ +#endif /* _LOCKD_LOCKD_H */ diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index b8fc732e1c67..3d3ee88ca4dc 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -16,10 +16,10 @@ #include <linux/sunrpc/addr.h> #include <linux/sunrpc/xprtsock.h> #include <linux/sunrpc/svc.h> -#include <linux/lockd/lockd.h> #include <linux/unaligned.h> +#include "lockd.h" #include "netns.h" #define NLMDBG_FACILITY NLMDBG_MONITOR diff --git a/include/linux/lockd/nlm.h b/fs/lockd/nlm.h index 6e343ef760dc..47be65d0111f 100644 --- a/include/linux/lockd/nlm.h +++ b/fs/lockd/nlm.h @@ -1,14 +1,12 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * linux/include/linux/lockd/nlm.h - * * Declarations for the Network Lock Manager protocol. * * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de> */ -#ifndef LINUX_LOCKD_NLM_H -#define LINUX_LOCKD_NLM_H +#ifndef _LOCKD_NLM_H +#define _LOCKD_NLM_H /* Maximum file offset in file_lock.fl_end */ @@ -55,4 +53,4 @@ enum { #define NLMPROC_NM_LOCK 22 #define NLMPROC_FREE_ALL 23 -#endif /* LINUX_LOCKD_NLM_H */ +#endif /* _LOCKD_NLM_H */ diff --git a/fs/lockd/nlm4xdr_gen.c b/fs/lockd/nlm4xdr_gen.c new file mode 100644 index 000000000000..1c8c221db456 --- /dev/null +++ b/fs/lockd/nlm4xdr_gen.c @@ -0,0 +1,724 @@ +// SPDX-License-Identifier: GPL-2.0 +// Generated by xdrgen. Manual edits will be lost. 
+// XDR specification file: ../../Documentation/sunrpc/xdr/nlm4.x +// XDR specification modification time: Thu Dec 25 13:10:19 2025 + +#include <linux/sunrpc/svc.h> + +#include "nlm4xdr_gen.h" + +static bool __maybe_unused +xdrgen_decode_netobj(struct xdr_stream *xdr, netobj *ptr) +{ + return xdrgen_decode_opaque(xdr, ptr, MAXNETOBJ_SZ); +} + +static bool __maybe_unused +xdrgen_decode_fsh4_mode(struct xdr_stream *xdr, fsh4_mode *ptr) +{ + u32 val; + + if (xdr_stream_decode_u32(xdr, &val) < 0) + return false; + *ptr = val; + return true; +} + +static bool __maybe_unused +xdrgen_decode_fsh4_access(struct xdr_stream *xdr, fsh4_access *ptr) +{ + u32 val; + + if (xdr_stream_decode_u32(xdr, &val) < 0) + return false; + *ptr = val; + return true; +} + +static bool __maybe_unused +xdrgen_decode_uint64(struct xdr_stream *xdr, uint64 *ptr) +{ + return xdrgen_decode_unsigned_hyper(xdr, ptr); +} + +static bool __maybe_unused +xdrgen_decode_int64(struct xdr_stream *xdr, int64 *ptr) +{ + return xdrgen_decode_hyper(xdr, ptr); +} + +static bool __maybe_unused +xdrgen_decode_uint32(struct xdr_stream *xdr, uint32 *ptr) +{ + return xdrgen_decode_unsigned_long(xdr, ptr); +} + +static bool __maybe_unused +xdrgen_decode_int32(struct xdr_stream *xdr, int32 *ptr) +{ + return xdrgen_decode_long(xdr, ptr); +} + +static bool __maybe_unused +xdrgen_decode_nlm4_stats(struct xdr_stream *xdr, nlm4_stats *ptr) +{ + return xdr_stream_decode_be32(xdr, ptr) == 0; +} + +static bool __maybe_unused +xdrgen_decode_nlm4_holder(struct xdr_stream *xdr, struct nlm4_holder *ptr) +{ + if (!xdrgen_decode_bool(xdr, &ptr->exclusive)) + return false; + if (!xdrgen_decode_int32(xdr, &ptr->svid)) + return false; + if (!xdrgen_decode_netobj(xdr, &ptr->oh)) + return false; + if (!xdrgen_decode_uint64(xdr, &ptr->l_offset)) + return false; + if (!xdrgen_decode_uint64(xdr, &ptr->l_len)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_decode_nlm4_testrply(struct xdr_stream *xdr, struct nlm4_testrply *ptr) +{ + if (!xdrgen_decode_nlm4_stats(xdr, &ptr->stat)) + return false; + switch (ptr->stat) { + case __constant_cpu_to_be32(NLM4_DENIED): + if (!xdrgen_decode_nlm4_holder(xdr, &ptr->u.holder)) + return false; + break; + default: + break; + } + return true; +} + +static bool __maybe_unused +xdrgen_decode_nlm4_stat(struct xdr_stream *xdr, struct nlm4_stat *ptr) +{ + if (!xdrgen_decode_nlm4_stats(xdr, &ptr->stat)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_decode_nlm4_res(struct xdr_stream *xdr, struct nlm4_res *ptr) +{ + if (!xdrgen_decode_netobj(xdr, &ptr->cookie)) + return false; + if (!xdrgen_decode_nlm4_stat(xdr, &ptr->stat)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_decode_nlm4_testres(struct xdr_stream *xdr, struct nlm4_testres *ptr) +{ + if (!xdrgen_decode_netobj(xdr, &ptr->cookie)) + return false; + if (!xdrgen_decode_nlm4_testrply(xdr, &ptr->stat)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_decode_nlm4_lock(struct xdr_stream *xdr, struct nlm4_lock *ptr) +{ + if (!xdrgen_decode_string(xdr, (string *)ptr, LM_MAXSTRLEN)) + return false; + if (!xdrgen_decode_netobj(xdr, &ptr->fh)) + return false; + if (!xdrgen_decode_netobj(xdr, &ptr->oh)) + return false; + if (!xdrgen_decode_int32(xdr, &ptr->svid)) + return false; + if (!xdrgen_decode_uint64(xdr, &ptr->l_offset)) + return false; + if (!xdrgen_decode_uint64(xdr, &ptr->l_len)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_decode_nlm4_lockargs(struct 
xdr_stream *xdr, struct nlm4_lockargs *ptr) +{ + if (!xdrgen_decode_netobj(xdr, &ptr->cookie)) + return false; + if (!xdrgen_decode_bool(xdr, &ptr->block)) + return false; + if (!xdrgen_decode_bool(xdr, &ptr->exclusive)) + return false; + if (!xdrgen_decode_nlm4_lock(xdr, &ptr->alock)) + return false; + if (!xdrgen_decode_bool(xdr, &ptr->reclaim)) + return false; + if (!xdrgen_decode_int32(xdr, &ptr->state)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_decode_nlm4_cancargs(struct xdr_stream *xdr, struct nlm4_cancargs *ptr) +{ + if (!xdrgen_decode_netobj(xdr, &ptr->cookie)) + return false; + if (!xdrgen_decode_bool(xdr, &ptr->block)) + return false; + if (!xdrgen_decode_bool(xdr, &ptr->exclusive)) + return false; + if (!xdrgen_decode_nlm4_lock(xdr, &ptr->alock)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_decode_nlm4_testargs(struct xdr_stream *xdr, struct nlm4_testargs *ptr) +{ + if (!xdrgen_decode_netobj(xdr, &ptr->cookie)) + return false; + if (!xdrgen_decode_bool(xdr, &ptr->exclusive)) + return false; + if (!xdrgen_decode_nlm4_lock(xdr, &ptr->alock)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_decode_nlm4_unlockargs(struct xdr_stream *xdr, struct nlm4_unlockargs *ptr) +{ + if (!xdrgen_decode_netobj(xdr, &ptr->cookie)) + return false; + if (!xdrgen_decode_nlm4_lock(xdr, &ptr->alock)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_decode_nlm4_share(struct xdr_stream *xdr, struct nlm4_share *ptr) +{ + if (!xdrgen_decode_string(xdr, (string *)ptr, LM_MAXSTRLEN)) + return false; + if (!xdrgen_decode_netobj(xdr, &ptr->fh)) + return false; + if (!xdrgen_decode_netobj(xdr, &ptr->oh)) + return false; + if (!xdrgen_decode_fsh4_mode(xdr, &ptr->mode)) + return false; + if (!xdrgen_decode_fsh4_access(xdr, &ptr->access)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_decode_nlm4_shareargs(struct xdr_stream *xdr, struct nlm4_shareargs *ptr) +{ + if (!xdrgen_decode_netobj(xdr, &ptr->cookie)) + return false; + if (!xdrgen_decode_nlm4_share(xdr, &ptr->share)) + return false; + if (!xdrgen_decode_bool(xdr, &ptr->reclaim)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_decode_nlm4_shareres(struct xdr_stream *xdr, struct nlm4_shareres *ptr) +{ + if (!xdrgen_decode_netobj(xdr, &ptr->cookie)) + return false; + if (!xdrgen_decode_nlm4_stats(xdr, &ptr->stat)) + return false; + if (!xdrgen_decode_int32(xdr, &ptr->sequence)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_decode_nlm4_notify(struct xdr_stream *xdr, struct nlm4_notify *ptr) +{ + if (!xdrgen_decode_string(xdr, (string *)ptr, LM_MAXNAMELEN)) + return false; + if (!xdrgen_decode_int32(xdr, &ptr->state)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_decode_nlm4_notifyargs(struct xdr_stream *xdr, struct nlm4_notifyargs *ptr) +{ + if (!xdrgen_decode_nlm4_notify(xdr, &ptr->notify)) + return false; + if (xdr_stream_decode_opaque_fixed(xdr, ptr->private, SM_PRIV_SIZE) < 0) + return false; + return true; +} + +/** + * nlm4_svc_decode_void - Decode a void argument + * @rqstp: RPC transaction context + * @xdr: source XDR data stream + * + * Return values: + * %true: procedure arguments decoded successfully + * %false: decode failed + */ +bool nlm4_svc_decode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr) +{ + return xdrgen_decode_void(xdr); +} + +/** + * nlm4_svc_decode_nlm4_testargs - Decode a nlm4_testargs argument + * @rqstp: RPC transaction 
context + * @xdr: source XDR data stream + * + * Return values: + * %true: procedure arguments decoded successfully + * %false: decode failed + */ +bool nlm4_svc_decode_nlm4_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) +{ + struct nlm4_testargs *argp = rqstp->rq_argp; + + return xdrgen_decode_nlm4_testargs(xdr, argp); +} + +/** + * nlm4_svc_decode_nlm4_lockargs - Decode a nlm4_lockargs argument + * @rqstp: RPC transaction context + * @xdr: source XDR data stream + * + * Return values: + * %true: procedure arguments decoded successfully + * %false: decode failed + */ +bool nlm4_svc_decode_nlm4_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) +{ + struct nlm4_lockargs *argp = rqstp->rq_argp; + + return xdrgen_decode_nlm4_lockargs(xdr, argp); +} + +/** + * nlm4_svc_decode_nlm4_cancargs - Decode a nlm4_cancargs argument + * @rqstp: RPC transaction context + * @xdr: source XDR data stream + * + * Return values: + * %true: procedure arguments decoded successfully + * %false: decode failed + */ +bool nlm4_svc_decode_nlm4_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) +{ + struct nlm4_cancargs *argp = rqstp->rq_argp; + + return xdrgen_decode_nlm4_cancargs(xdr, argp); +} + +/** + * nlm4_svc_decode_nlm4_unlockargs - Decode a nlm4_unlockargs argument + * @rqstp: RPC transaction context + * @xdr: source XDR data stream + * + * Return values: + * %true: procedure arguments decoded successfully + * %false: decode failed + */ +bool nlm4_svc_decode_nlm4_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) +{ + struct nlm4_unlockargs *argp = rqstp->rq_argp; + + return xdrgen_decode_nlm4_unlockargs(xdr, argp); +} + +/** + * nlm4_svc_decode_nlm4_testres - Decode a nlm4_testres argument + * @rqstp: RPC transaction context + * @xdr: source XDR data stream + * + * Return values: + * %true: procedure arguments decoded successfully + * %false: decode failed + */ +bool nlm4_svc_decode_nlm4_testres(struct svc_rqst *rqstp, struct xdr_stream *xdr) +{ + struct nlm4_testres *argp = rqstp->rq_argp; + + return xdrgen_decode_nlm4_testres(xdr, argp); +} + +/** + * nlm4_svc_decode_nlm4_res - Decode a nlm4_res argument + * @rqstp: RPC transaction context + * @xdr: source XDR data stream + * + * Return values: + * %true: procedure arguments decoded successfully + * %false: decode failed + */ +bool nlm4_svc_decode_nlm4_res(struct svc_rqst *rqstp, struct xdr_stream *xdr) +{ + struct nlm4_res *argp = rqstp->rq_argp; + + return xdrgen_decode_nlm4_res(xdr, argp); +} + +/** + * nlm4_svc_decode_nlm4_notifyargs - Decode a nlm4_notifyargs argument + * @rqstp: RPC transaction context + * @xdr: source XDR data stream + * + * Return values: + * %true: procedure arguments decoded successfully + * %false: decode failed + */ +bool nlm4_svc_decode_nlm4_notifyargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) +{ + struct nlm4_notifyargs *argp = rqstp->rq_argp; + + return xdrgen_decode_nlm4_notifyargs(xdr, argp); +} + +/** + * nlm4_svc_decode_nlm4_shareargs - Decode a nlm4_shareargs argument + * @rqstp: RPC transaction context + * @xdr: source XDR data stream + * + * Return values: + * %true: procedure arguments decoded successfully + * %false: decode failed + */ +bool nlm4_svc_decode_nlm4_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) +{ + struct nlm4_shareargs *argp = rqstp->rq_argp; + + return xdrgen_decode_nlm4_shareargs(xdr, argp); +} + +/** + * nlm4_svc_decode_nlm4_notify - Decode a nlm4_notify argument + * @rqstp: RPC transaction context + * @xdr: source XDR data stream + * + * Return 
values: + * %true: procedure arguments decoded successfully + * %false: decode failed + */ +bool nlm4_svc_decode_nlm4_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr) +{ + struct nlm4_notify *argp = rqstp->rq_argp; + + return xdrgen_decode_nlm4_notify(xdr, argp); +} + +static bool __maybe_unused +xdrgen_encode_netobj(struct xdr_stream *xdr, const netobj value) +{ + return xdr_stream_encode_opaque(xdr, value.data, value.len) >= 0; +} + +static bool __maybe_unused +xdrgen_encode_fsh4_mode(struct xdr_stream *xdr, fsh4_mode value) +{ + return xdr_stream_encode_u32(xdr, value) == XDR_UNIT; +} + +static bool __maybe_unused +xdrgen_encode_fsh4_access(struct xdr_stream *xdr, fsh4_access value) +{ + return xdr_stream_encode_u32(xdr, value) == XDR_UNIT; +} + +static bool __maybe_unused +xdrgen_encode_uint64(struct xdr_stream *xdr, const uint64 value) +{ + return xdrgen_encode_unsigned_hyper(xdr, value); +} + +static bool __maybe_unused +xdrgen_encode_int64(struct xdr_stream *xdr, const int64 value) +{ + return xdrgen_encode_hyper(xdr, value); +} + +static bool __maybe_unused +xdrgen_encode_uint32(struct xdr_stream *xdr, const uint32 value) +{ + return xdrgen_encode_unsigned_long(xdr, value); +} + +static bool __maybe_unused +xdrgen_encode_int32(struct xdr_stream *xdr, const int32 value) +{ + return xdrgen_encode_long(xdr, value); +} + +static bool __maybe_unused +xdrgen_encode_nlm4_stats(struct xdr_stream *xdr, nlm4_stats value) +{ + return xdr_stream_encode_be32(xdr, value) == XDR_UNIT; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_holder(struct xdr_stream *xdr, const struct nlm4_holder *value) +{ + if (!xdrgen_encode_bool(xdr, value->exclusive)) + return false; + if (!xdrgen_encode_int32(xdr, value->svid)) + return false; + if (!xdrgen_encode_netobj(xdr, value->oh)) + return false; + if (!xdrgen_encode_uint64(xdr, value->l_offset)) + return false; + if (!xdrgen_encode_uint64(xdr, value->l_len)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_testrply(struct xdr_stream *xdr, const struct nlm4_testrply *ptr) +{ + if (!xdrgen_encode_nlm4_stats(xdr, ptr->stat)) + return false; + switch (ptr->stat) { + case __constant_cpu_to_be32(NLM4_DENIED): + if (!xdrgen_encode_nlm4_holder(xdr, &ptr->u.holder)) + return false; + break; + default: + break; + } + return true; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_stat(struct xdr_stream *xdr, const struct nlm4_stat *value) +{ + if (!xdrgen_encode_nlm4_stats(xdr, value->stat)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_res(struct xdr_stream *xdr, const struct nlm4_res *value) +{ + if (!xdrgen_encode_netobj(xdr, value->cookie)) + return false; + if (!xdrgen_encode_nlm4_stat(xdr, &value->stat)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_testres(struct xdr_stream *xdr, const struct nlm4_testres *value) +{ + if (!xdrgen_encode_netobj(xdr, value->cookie)) + return false; + if (!xdrgen_encode_nlm4_testrply(xdr, &value->stat)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_lock(struct xdr_stream *xdr, const struct nlm4_lock *value) +{ + if (value->caller_name.len > LM_MAXSTRLEN) + return false; + if (xdr_stream_encode_opaque(xdr, value->caller_name.data, value->caller_name.len) < 0) + return false; + if (!xdrgen_encode_netobj(xdr, value->fh)) + return false; + if (!xdrgen_encode_netobj(xdr, value->oh)) + return false; + if (!xdrgen_encode_int32(xdr, value->svid)) + return false; + if 
(!xdrgen_encode_uint64(xdr, value->l_offset)) + return false; + if (!xdrgen_encode_uint64(xdr, value->l_len)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_lockargs(struct xdr_stream *xdr, const struct nlm4_lockargs *value) +{ + if (!xdrgen_encode_netobj(xdr, value->cookie)) + return false; + if (!xdrgen_encode_bool(xdr, value->block)) + return false; + if (!xdrgen_encode_bool(xdr, value->exclusive)) + return false; + if (!xdrgen_encode_nlm4_lock(xdr, &value->alock)) + return false; + if (!xdrgen_encode_bool(xdr, value->reclaim)) + return false; + if (!xdrgen_encode_int32(xdr, value->state)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_cancargs(struct xdr_stream *xdr, const struct nlm4_cancargs *value) +{ + if (!xdrgen_encode_netobj(xdr, value->cookie)) + return false; + if (!xdrgen_encode_bool(xdr, value->block)) + return false; + if (!xdrgen_encode_bool(xdr, value->exclusive)) + return false; + if (!xdrgen_encode_nlm4_lock(xdr, &value->alock)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_testargs(struct xdr_stream *xdr, const struct nlm4_testargs *value) +{ + if (!xdrgen_encode_netobj(xdr, value->cookie)) + return false; + if (!xdrgen_encode_bool(xdr, value->exclusive)) + return false; + if (!xdrgen_encode_nlm4_lock(xdr, &value->alock)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_unlockargs(struct xdr_stream *xdr, const struct nlm4_unlockargs *value) +{ + if (!xdrgen_encode_netobj(xdr, value->cookie)) + return false; + if (!xdrgen_encode_nlm4_lock(xdr, &value->alock)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_share(struct xdr_stream *xdr, const struct nlm4_share *value) +{ + if (value->caller_name.len > LM_MAXSTRLEN) + return false; + if (xdr_stream_encode_opaque(xdr, value->caller_name.data, value->caller_name.len) < 0) + return false; + if (!xdrgen_encode_netobj(xdr, value->fh)) + return false; + if (!xdrgen_encode_netobj(xdr, value->oh)) + return false; + if (!xdrgen_encode_fsh4_mode(xdr, value->mode)) + return false; + if (!xdrgen_encode_fsh4_access(xdr, value->access)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_shareargs(struct xdr_stream *xdr, const struct nlm4_shareargs *value) +{ + if (!xdrgen_encode_netobj(xdr, value->cookie)) + return false; + if (!xdrgen_encode_nlm4_share(xdr, &value->share)) + return false; + if (!xdrgen_encode_bool(xdr, value->reclaim)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_shareres(struct xdr_stream *xdr, const struct nlm4_shareres *value) +{ + if (!xdrgen_encode_netobj(xdr, value->cookie)) + return false; + if (!xdrgen_encode_nlm4_stats(xdr, value->stat)) + return false; + if (!xdrgen_encode_int32(xdr, value->sequence)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_notify(struct xdr_stream *xdr, const struct nlm4_notify *value) +{ + if (value->name.len > LM_MAXNAMELEN) + return false; + if (xdr_stream_encode_opaque(xdr, value->name.data, value->name.len) < 0) + return false; + if (!xdrgen_encode_int32(xdr, value->state)) + return false; + return true; +} + +static bool __maybe_unused +xdrgen_encode_nlm4_notifyargs(struct xdr_stream *xdr, const struct nlm4_notifyargs *value) +{ + if (!xdrgen_encode_nlm4_notify(xdr, &value->notify)) + return false; + if (xdr_stream_encode_opaque_fixed(xdr, value->private, SM_PRIV_SIZE) < 0) + return false; + 
return true; +} + +/** + * nlm4_svc_encode_void - Encode a void result + * @rqstp: RPC transaction context + * @xdr: target XDR data stream + * + * Return values: + * %true: procedure results encoded successfully + * %false: encode failed + */ +bool nlm4_svc_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr) +{ + return xdrgen_encode_void(xdr); +} + +/** + * nlm4_svc_encode_nlm4_testres - Encode a nlm4_testres result + * @rqstp: RPC transaction context + * @xdr: target XDR data stream + * + * Return values: + * %true: procedure results encoded successfully + * %false: encode failed + */ +bool nlm4_svc_encode_nlm4_testres(struct svc_rqst *rqstp, struct xdr_stream *xdr) +{ + struct nlm4_testres *resp = rqstp->rq_resp; + + return xdrgen_encode_nlm4_testres(xdr, resp); +} + +/** + * nlm4_svc_encode_nlm4_res - Encode a nlm4_res result + * @rqstp: RPC transaction context + * @xdr: target XDR data stream + * + * Return values: + * %true: procedure results encoded successfully + * %false: encode failed + */ +bool nlm4_svc_encode_nlm4_res(struct svc_rqst *rqstp, struct xdr_stream *xdr) +{ + struct nlm4_res *resp = rqstp->rq_resp; + + return xdrgen_encode_nlm4_res(xdr, resp); +} + +/** + * nlm4_svc_encode_nlm4_shareres - Encode a nlm4_shareres result + * @rqstp: RPC transaction context + * @xdr: target XDR data stream + * + * Return values: + * %true: procedure results encoded successfully + * %false: encode failed + */ +bool nlm4_svc_encode_nlm4_shareres(struct svc_rqst *rqstp, struct xdr_stream *xdr) +{ + struct nlm4_shareres *resp = rqstp->rq_resp; + + return xdrgen_encode_nlm4_shareres(xdr, resp); +} diff --git a/fs/lockd/nlm4xdr_gen.h b/fs/lockd/nlm4xdr_gen.h new file mode 100644 index 000000000000..b6008b296a3e --- /dev/null +++ b/fs/lockd/nlm4xdr_gen.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Generated by xdrgen. Manual edits will be lost. 
*/ +/* XDR specification file: ../../Documentation/sunrpc/xdr/nlm4.x */ +/* XDR specification modification time: Thu Dec 25 13:10:19 2025 */ + +#ifndef _LINUX_XDRGEN_NLM4_DECL_H +#define _LINUX_XDRGEN_NLM4_DECL_H + +#include <linux/types.h> + +#include <linux/sunrpc/xdr.h> +#include <linux/sunrpc/xdrgen/_defs.h> +#include <linux/sunrpc/xdrgen/_builtins.h> +#include <linux/sunrpc/xdrgen/nlm4.h> + +bool nlm4_svc_decode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4_svc_decode_nlm4_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4_svc_decode_nlm4_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4_svc_decode_nlm4_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4_svc_decode_nlm4_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4_svc_decode_nlm4_testres(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4_svc_decode_nlm4_res(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4_svc_decode_nlm4_notifyargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4_svc_decode_nlm4_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4_svc_decode_nlm4_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr); + +bool nlm4_svc_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4_svc_encode_nlm4_testres(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4_svc_encode_nlm4_res(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4_svc_encode_nlm4_shareres(struct svc_rqst *rqstp, struct xdr_stream *xdr); + +#endif /* _LINUX_XDRGEN_NLM4_DECL_H */ diff --git a/include/linux/lockd/share.h b/fs/lockd/share.h index 1f18a9faf645..20ea8ee49168 100644 --- a/include/linux/lockd/share.h +++ b/fs/lockd/share.h @@ -1,14 +1,15 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * linux/include/linux/lockd/share.h - * * DOS share management for lockd. 
* * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de> */ -#ifndef LINUX_LOCKD_SHARE_H -#define LINUX_LOCKD_SHARE_H +#ifndef _LOCKD_SHARE_H +#define _LOCKD_SHARE_H + +/* Synthetic svid for lockowner lookup during share operations */ +#define LOCKD_SHARE_SVID (~(u32)0) /* * DOS share for a specific file @@ -22,11 +23,11 @@ struct nlm_share { u32 s_mode; /* deny mode */ }; -__be32 nlmsvc_share_file(struct nlm_host *, struct nlm_file *, - struct nlm_args *); -__be32 nlmsvc_unshare_file(struct nlm_host *, struct nlm_file *, - struct nlm_args *); +__be32 nlmsvc_share_file(struct nlm_host *host, struct nlm_file *file, + struct xdr_netobj *oh, u32 access, u32 mode); +__be32 nlmsvc_unshare_file(struct nlm_host *host, struct nlm_file *file, + struct xdr_netobj *oh); void nlmsvc_traverse_shares(struct nlm_host *, struct nlm_file *, nlm_host_match_fn_t); -#endif /* LINUX_LOCKD_SHARE_H */ +#endif /* _LOCKD_SHARE_H */ diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index dcd80c4e74c9..490551369ef2 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -36,15 +36,14 @@ #include <net/ip.h> #include <net/addrconf.h> #include <net/ipv6.h> -#include <linux/lockd/lockd.h> #include <linux/nfs.h> +#include "lockd.h" #include "netns.h" #include "procfs.h" #include "netlink.h" #define NLMDBG_FACILITY NLMDBG_SVC -#define LOCKD_BUFSIZE (1024 + NLMSVC_XDRSIZE) static struct svc_program nlmsvc_program; @@ -319,6 +318,7 @@ static struct notifier_block lockd_inet6addr_notifier = { static int lockd_get(void) { struct svc_serv *serv; + unsigned int bufsize; int error; if (nlmsvc_serv) { @@ -334,7 +334,15 @@ static int lockd_get(void) printk(KERN_WARNING "lockd_up: no pid, %d users??\n", nlmsvc_users); - serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, lockd); +#ifdef CONFIG_LOCKD_V4 + bufsize = 1024 + max3(nlmsvc_version1.vs_xdrsize, + nlmsvc_version3.vs_xdrsize, + nlmsvc_version4.vs_xdrsize); +#else + bufsize = 1024 + max(nlmsvc_version1.vs_xdrsize, + nlmsvc_version3.vs_xdrsize); +#endif + serv = svc_create(&nlmsvc_program, bufsize, lockd); if (!serv) { printk(KERN_WARNING "lockd_up: create service failed\n"); return -ENOMEM; @@ -640,7 +648,7 @@ module_exit(exit_nlm); * %0: Processing complete; do not send a Reply * %1: Processing complete; send Reply in rqstp->rq_res */ -static int nlmsvc_dispatch(struct svc_rqst *rqstp) +int nlmsvc_dispatch(struct svc_rqst *rqstp) { const struct svc_procedure *procp = rqstp->rq_procinfo; __be32 *statp = rqstp->rq_accept_statp; @@ -671,40 +679,6 @@ out_encode_err: /* * Define NLM program and procedures */ -static DEFINE_PER_CPU_ALIGNED(unsigned long, nlmsvc_version1_count[17]); -static const struct svc_version nlmsvc_version1 = { - .vs_vers = 1, - .vs_nproc = 17, - .vs_proc = nlmsvc_procedures, - .vs_count = nlmsvc_version1_count, - .vs_dispatch = nlmsvc_dispatch, - .vs_xdrsize = NLMSVC_XDRSIZE, -}; - -static DEFINE_PER_CPU_ALIGNED(unsigned long, - nlmsvc_version3_count[ARRAY_SIZE(nlmsvc_procedures)]); -static const struct svc_version nlmsvc_version3 = { - .vs_vers = 3, - .vs_nproc = ARRAY_SIZE(nlmsvc_procedures), - .vs_proc = nlmsvc_procedures, - .vs_count = nlmsvc_version3_count, - .vs_dispatch = nlmsvc_dispatch, - .vs_xdrsize = NLMSVC_XDRSIZE, -}; - -#ifdef CONFIG_LOCKD_V4 -static DEFINE_PER_CPU_ALIGNED(unsigned long, - nlmsvc_version4_count[ARRAY_SIZE(nlmsvc_procedures4)]); -static const struct svc_version nlmsvc_version4 = { - .vs_vers = 4, - .vs_nproc = ARRAY_SIZE(nlmsvc_procedures4), - .vs_proc = nlmsvc_procedures4, - .vs_count = nlmsvc_version4_count, - .vs_dispatch = 
nlmsvc_dispatch, - .vs_xdrsize = NLMSVC_XDRSIZE, -}; -#endif - static const struct svc_version *nlmsvc_version[] = { [1] = &nlmsvc_version1, [3] = &nlmsvc_version3, diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 4b6f18d97734..5de41e249534 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -10,257 +10,528 @@ #include <linux/types.h> #include <linux/time.h> -#include <linux/lockd/lockd.h> -#include <linux/lockd/share.h> #include <linux/sunrpc/svc_xprt.h> -#define NLMDBG_FACILITY NLMDBG_CLIENT +#include "lockd.h" /* - * Obtain client and file from arguments + * xdr.h defines SM_MAXSTRLEN and SM_PRIV_SIZE as macros. + * nlm4xdr_gen.h defines them as enum constants. Undefine the + * macros to allow the xdrgen enum definitions to be used. */ -static __be32 -nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, - struct nlm_host **hostp, struct nlm_file **filp) -{ - struct nlm_host *host = NULL; - struct nlm_file *file = NULL; - struct nlm_lock *lock = &argp->lock; - __be32 error = 0; +#undef SM_MAXSTRLEN +#undef SM_PRIV_SIZE - /* nfsd callbacks must have been installed for this procedure */ - if (!nlmsvc_ops) - return nlm_lck_denied_nolocks; +#include "share.h" +#include "nlm4xdr_gen.h" - if (lock->lock_start > OFFSET_MAX || - (lock->lock_len && ((lock->lock_len - 1) > (OFFSET_MAX - lock->lock_start)))) - return nlm4_fbig; +/* + * Wrapper structures combine xdrgen types with legacy nlm_lock. + * The xdrgen field must be first so the structure can be cast + * to its XDR type for the RPC dispatch layer. + */ +struct nlm4_testargs_wrapper { + struct nlm4_testargs xdrgen; + struct nlm_lock lock; +}; - /* Obtain host handle */ - if (!(host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len)) - || (argp->monitor && nsm_monitor(host) < 0)) - goto no_locks; - *hostp = host; - - /* Obtain file pointer. Not used by FREE_ALL call. */ - if (filp != NULL) { - int mode = lock_to_openmode(&lock->fl); - - lock->fl.c.flc_flags = FL_POSIX; - - error = nlm_lookup_file(rqstp, &file, lock); - if (error) - goto no_locks; - *filp = file; - - /* Set up the missing parts of the file_lock structure */ - lock->fl.c.flc_file = file->f_file[mode]; - lock->fl.c.flc_pid = current->tgid; - lock->fl.fl_start = (loff_t)lock->lock_start; - lock->fl.fl_end = lock->lock_len ? 
- (loff_t)(lock->lock_start + lock->lock_len - 1) : - OFFSET_MAX; - lock->fl.fl_lmops = &nlmsvc_lock_operations; - nlmsvc_locks_init_private(&lock->fl, host, (pid_t)lock->svid); - if (!lock->fl.c.flc_owner) { - /* lockowner allocation has failed */ - nlmsvc_release_host(host); - return nlm_lck_denied_nolocks; - } - } +static_assert(offsetof(struct nlm4_testargs_wrapper, xdrgen) == 0); - return 0; +struct nlm4_lockargs_wrapper { + struct nlm4_lockargs xdrgen; + struct nlm_cookie cookie; + struct nlm_lock lock; +}; -no_locks: - nlmsvc_release_host(host); - if (error) - return error; - return nlm_lck_denied_nolocks; -} +static_assert(offsetof(struct nlm4_lockargs_wrapper, xdrgen) == 0); -/* - * NULL: Test for presence of service - */ -static __be32 -nlm4svc_proc_null(struct svc_rqst *rqstp) -{ - dprintk("lockd: NULL called\n"); - return rpc_success; -} +struct nlm4_cancargs_wrapper { + struct nlm4_cancargs xdrgen; + struct nlm_lock lock; +}; -/* - * TEST: Check for conflicting lock - */ -static __be32 -__nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp) -{ - struct nlm_args *argp = rqstp->rq_argp; - struct nlm_host *host; - struct nlm_file *file; - __be32 rc = rpc_success; +static_assert(offsetof(struct nlm4_cancargs_wrapper, xdrgen) == 0); - dprintk("lockd: TEST4 called\n"); - resp->cookie = argp->cookie; +struct nlm4_unlockargs_wrapper { + struct nlm4_unlockargs xdrgen; + struct nlm_lock lock; +}; - /* Obtain client and file */ - if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) - return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; +static_assert(offsetof(struct nlm4_unlockargs_wrapper, xdrgen) == 0); - /* Now check for conflicting locks */ - resp->status = nlmsvc_testlock(rqstp, file, host, &argp->lock, - &resp->lock); - if (resp->status == nlm_drop_reply) - rc = rpc_drop_reply; - else - dprintk("lockd: TEST4 status %d\n", ntohl(resp->status)); +struct nlm4_notifyargs_wrapper { + struct nlm4_notifyargs xdrgen; + struct nlm_reboot reboot; +}; - nlmsvc_release_lockowner(&argp->lock); - nlmsvc_release_host(host); - nlm_release_file(file); - return rc; -} +static_assert(offsetof(struct nlm4_notifyargs_wrapper, xdrgen) == 0); -static __be32 -nlm4svc_proc_test(struct svc_rqst *rqstp) -{ - return __nlm4svc_proc_test(rqstp, rqstp->rq_resp); -} +struct nlm4_notify_wrapper { + struct nlm4_notify xdrgen; +}; -static __be32 -__nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_res *resp) -{ - struct nlm_args *argp = rqstp->rq_argp; - struct nlm_host *host; - struct nlm_file *file; - __be32 rc = rpc_success; +static_assert(offsetof(struct nlm4_notify_wrapper, xdrgen) == 0); - dprintk("lockd: LOCK called\n"); +struct nlm4_testres_wrapper { + struct nlm4_testres xdrgen; + struct nlm_lock lock; +}; - resp->cookie = argp->cookie; +struct nlm4_shareargs_wrapper { + struct nlm4_shareargs xdrgen; + struct nlm_lock lock; +}; - /* Obtain client and file */ - if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) - return resp->status == nlm_drop_reply ? 
rpc_drop_reply :rpc_success; +static_assert(offsetof(struct nlm4_shareargs_wrapper, xdrgen) == 0); - /* Now try to lock the file */ - resp->status = nlmsvc_lock(rqstp, file, host, &argp->lock, - argp->block, &argp->cookie, - argp->reclaim); - if (resp->status == nlm_drop_reply) - rc = rpc_drop_reply; - else - dprintk("lockd: LOCK status %d\n", ntohl(resp->status)); +static_assert(offsetof(struct nlm4_testres_wrapper, xdrgen) == 0); - nlmsvc_release_lockowner(&argp->lock); - nlmsvc_release_host(host); - nlm_release_file(file); - return rc; +struct nlm4_res_wrapper { + struct nlm4_res xdrgen; + struct nlm_cookie cookie; +}; + +static_assert(offsetof(struct nlm4_res_wrapper, xdrgen) == 0); + +struct nlm4_shareres_wrapper { + struct nlm4_shareres xdrgen; +}; + +static_assert(offsetof(struct nlm4_shareres_wrapper, xdrgen) == 0); + +static __be32 +nlm4_netobj_to_cookie(struct nlm_cookie *cookie, netobj *object) +{ + if (object->len > NLM_MAXCOOKIELEN) + return nlm_lck_denied_nolocks; + cookie->len = object->len; + memcpy(cookie->data, object->data, object->len); + return nlm_granted; } static __be32 -nlm4svc_proc_lock(struct svc_rqst *rqstp) +nlm4_lock_to_nlm_lock(struct nlm_lock *lock, struct nlm4_lock *alock) { - return __nlm4svc_proc_lock(rqstp, rqstp->rq_resp); + if (alock->fh.len > NFS_MAXFHSIZE) + return nlm_lck_denied; + lock->fh.size = alock->fh.len; + memcpy(lock->fh.data, alock->fh.data, alock->fh.len); + lock->oh.len = alock->oh.len; + lock->oh.data = alock->oh.data; + lock->svid = alock->svid; + locks_init_lock(&lock->fl); + lockd_set_file_lock_range4(&lock->fl, alock->l_offset, alock->l_len); + return nlm_granted; +} + +static struct nlm_host * +nlm4svc_lookup_host(struct svc_rqst *rqstp, string caller, bool monitored) +{ + struct nlm_host *host; + + if (!nlmsvc_ops) + return NULL; + host = nlmsvc_lookup_host(rqstp, caller.data, caller.len); + if (!host) + return NULL; + if (monitored && nsm_monitor(host) < 0) { + nlmsvc_release_host(host); + return NULL; + } + return host; } static __be32 -__nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_res *resp) +nlm4svc_lookup_file(struct svc_rqst *rqstp, struct nlm_host *host, + struct nlm_lock *lock, struct nlm_file **filp, + struct nlm4_lock *xdr_lock, unsigned char type) { - struct nlm_args *argp = rqstp->rq_argp; - struct nlm_host *host; - struct nlm_file *file; + struct file_lock *fl = &lock->fl; + struct nlm_file *file = NULL; + __be32 error; - dprintk("lockd: CANCEL called\n"); + if (xdr_lock->fh.len > NFS_MAXFHSIZE) + return nlm_lck_denied_nolocks; + lock->fh.size = xdr_lock->fh.len; + memcpy(lock->fh.data, xdr_lock->fh.data, xdr_lock->fh.len); - resp->cookie = argp->cookie; + lock->oh.len = xdr_lock->oh.len; + lock->oh.data = xdr_lock->oh.data; - /* Don't accept requests during grace period */ - if (locks_in_grace(SVC_NET(rqstp))) { - resp->status = nlm_lck_denied_grace_period; - return rpc_success; - } + lock->svid = xdr_lock->svid; + lock->lock_start = xdr_lock->l_offset; + lock->lock_len = xdr_lock->l_len; - /* Obtain client and file */ - if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) - return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + if (lock->lock_start > OFFSET_MAX || + (lock->lock_len && ((lock->lock_len - 1) > (OFFSET_MAX - lock->lock_start)))) + return nlm4_fbig; - /* Try to cancel request. 
*/ - resp->status = nlmsvc_cancel_blocked(SVC_NET(rqstp), file, &argp->lock); + locks_init_lock(fl); + fl->c.flc_type = type; + lockd_set_file_lock_range4(fl, lock->lock_start, lock->lock_len); + + error = nlm_lookup_file(rqstp, &file, lock); + switch (error) { + case nlm_granted: + break; + case nlm__int__stale_fh: + return nlm4_stale_fh; + case nlm__int__failed: + return nlm4_failed; + default: + return error; + } + *filp = file; + + fl->c.flc_flags = FL_POSIX; + fl->c.flc_file = file->f_file[lock_to_openmode(fl)]; + fl->c.flc_pid = current->tgid; + fl->fl_lmops = &nlmsvc_lock_operations; + nlmsvc_locks_init_private(fl, host, (pid_t)lock->svid); + if (!fl->c.flc_owner) + return nlm_lck_denied_nolocks; - dprintk("lockd: CANCEL status %d\n", ntohl(resp->status)); - nlmsvc_release_lockowner(&argp->lock); - nlmsvc_release_host(host); - nlm_release_file(file); - return rpc_success; + return nlm_granted; } +/** + * nlm4svc_proc_null - NULL: Test for presence of service + * @rqstp: RPC transaction context + * + * Returns: + * %rpc_success: RPC executed successfully + * + * RPC synopsis: + * void NLMPROC4_NULL(void) = 0; + */ static __be32 -nlm4svc_proc_cancel(struct svc_rqst *rqstp) +nlm4svc_proc_null(struct svc_rqst *rqstp) { - return __nlm4svc_proc_cancel(rqstp, rqstp->rq_resp); + return rpc_success; } -/* - * UNLOCK: release a lock +/** + * nlm4svc_proc_test - TEST: Check for conflicting lock + * @rqstp: RPC transaction context + * + * Returns: + * %rpc_success: RPC executed successfully. + * %rpc_drop_reply: Do not send an RPC reply. + * + * RPC synopsis: + * nlm4_testres NLMPROC4_TEST(nlm4_testargs) = 1; + * + * Permissible procedure status codes: + * %NLM4_GRANTED: The server would be able to grant the + * requested lock. + * %NLM4_DENIED: The requested lock conflicted with existing + * lock reservations for the file. + * %NLM4_DENIED_NOLOCKS: The server could not allocate the resources + * needed to process the request. + * %NLM4_DENIED_GRACE_PERIOD: The server has recently restarted and is + * re-establishing existing locks, and is not + * yet ready to accept normal service requests. + * + * The Linux NLM server implementation also returns: + * %NLM4_STALE_FH: The request specified an invalid file handle. + * %NLM4_FBIG: The request specified a length or offset + * that exceeds the range supported by the + * server. + * %NLM4_FAILED: The request failed for an unspecified reason. */ -static __be32 -__nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_res *resp) +static __be32 nlm4svc_proc_test(struct svc_rqst *rqstp) { - struct nlm_args *argp = rqstp->rq_argp; + struct nlm4_testargs_wrapper *argp = rqstp->rq_argp; + unsigned char type = argp->xdrgen.exclusive ? 
F_WRLCK : F_RDLCK; + struct nlm4_testres_wrapper *resp = rqstp->rq_resp; + struct nlm_file *file = NULL; struct nlm_host *host; - struct nlm_file *file; - dprintk("lockd: UNLOCK called\n"); + resp->xdrgen.cookie = argp->xdrgen.cookie; - resp->cookie = argp->cookie; + resp->xdrgen.stat.stat = nlm_lck_denied_nolocks; + host = nlm4svc_lookup_host(rqstp, argp->xdrgen.alock.caller_name, false); + if (!host) + goto out; - /* Don't accept new lock requests during grace period */ - if (locks_in_grace(SVC_NET(rqstp))) { - resp->status = nlm_lck_denied_grace_period; - return rpc_success; + resp->xdrgen.stat.stat = nlm4svc_lookup_file(rqstp, host, &argp->lock, + &file, &argp->xdrgen.alock, + type); + if (resp->xdrgen.stat.stat) + goto out; + + resp->xdrgen.stat.stat = nlmsvc_testlock(rqstp, file, host, + &argp->lock, &resp->lock); + nlmsvc_release_lockowner(&argp->lock); + + if (resp->xdrgen.stat.stat == nlm_lck_denied) { + struct nlm_lock *conf = &resp->lock; + struct nlm4_holder *holder = &resp->xdrgen.stat.u.holder; + + holder->exclusive = (conf->fl.c.flc_type != F_RDLCK); + holder->svid = conf->svid; + holder->oh.len = conf->oh.len; + holder->oh.data = conf->oh.data; + holder->l_offset = conf->fl.fl_start; + if (conf->fl.fl_end == OFFSET_MAX) + holder->l_len = 0; + else + holder->l_len = conf->fl.fl_end - conf->fl.fl_start + 1; } - /* Obtain client and file */ - if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) - return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; +out: + if (file) + nlm_release_file(file); + nlmsvc_release_host(host); + return resp->xdrgen.stat.stat == nlm__int__drop_reply ? + rpc_drop_reply : rpc_success; +} - /* Now try to remove the lock */ - resp->status = nlmsvc_unlock(SVC_NET(rqstp), file, &argp->lock); +static __be32 +nlm4svc_do_lock(struct svc_rqst *rqstp, bool monitored) +{ + struct nlm4_lockargs_wrapper *argp = rqstp->rq_argp; + unsigned char type = argp->xdrgen.exclusive ? F_WRLCK : F_RDLCK; + struct nlm4_res_wrapper *resp = rqstp->rq_resp; + struct nlm_file *file = NULL; + struct nlm_host *host = NULL; + + resp->xdrgen.cookie = argp->xdrgen.cookie; + + resp->xdrgen.stat.stat = nlm4_netobj_to_cookie(&argp->cookie, + &argp->xdrgen.cookie); + if (resp->xdrgen.stat.stat) + goto out; + + resp->xdrgen.stat.stat = nlm_lck_denied_nolocks; + host = nlm4svc_lookup_host(rqstp, argp->xdrgen.alock.caller_name, + monitored); + if (!host) + goto out; + + resp->xdrgen.stat.stat = nlm4svc_lookup_file(rqstp, host, &argp->lock, + &file, &argp->xdrgen.alock, + type); + if (resp->xdrgen.stat.stat) + goto out; + + resp->xdrgen.stat.stat = nlmsvc_lock(rqstp, file, host, &argp->lock, + argp->xdrgen.block, &argp->cookie, + argp->xdrgen.reclaim); + if (resp->xdrgen.stat.stat == nlm__int__deadlock) + resp->xdrgen.stat.stat = nlm4_deadlock; - dprintk("lockd: UNLOCK status %d\n", ntohl(resp->status)); nlmsvc_release_lockowner(&argp->lock); + +out: + if (file) + nlm_release_file(file); nlmsvc_release_host(host); - nlm_release_file(file); - return rpc_success; + return resp->xdrgen.stat.stat == nlm__int__drop_reply ? + rpc_drop_reply : rpc_success; } +/** + * nlm4svc_proc_lock - LOCK: Establish a monitored lock + * @rqstp: RPC transaction context + * + * Returns: + * %rpc_success: RPC executed successfully. + * %rpc_drop_reply: Do not send an RPC reply. + * + * RPC synopsis: + * nlm4_res NLMPROC4_LOCK(nlm4_lockargs) = 2; + * + * Permissible procedure status codes: + * %NLM4_GRANTED: The requested lock was granted. 
+ * %NLM4_DENIED: The requested lock conflicted with existing + * lock reservations for the file. + * %NLM4_DENIED_NOLOCKS: The server could not allocate the resources + * needed to process the request. + * %NLM4_BLOCKED: The blocking request cannot be granted + * immediately. The server will send an + * NLMPROC4_GRANTED callback to the client when + * the lock can be granted. + * %NLM4_DENIED_GRACE_PERIOD: The server has recently restarted and is + * re-establishing existing locks, and is not + * yet ready to accept normal service requests. + * + * The Linux NLM server implementation also returns: + * %NLM4_DEADLCK: The request could not be granted and + * blocking would cause a deadlock. + * %NLM4_STALE_FH: The request specified an invalid file handle. + * %NLM4_FBIG: The request specified a length or offset + * that exceeds the range supported by the + * server. + * %NLM4_FAILED: The request failed for an unspecified reason. + */ static __be32 -nlm4svc_proc_unlock(struct svc_rqst *rqstp) +nlm4svc_proc_lock(struct svc_rqst *rqstp) { - return __nlm4svc_proc_unlock(rqstp, rqstp->rq_resp); + return nlm4svc_do_lock(rqstp, true); } -/* - * GRANTED: A server calls us to tell that a process' lock request - * was granted +/** + * nlm4svc_proc_cancel - CANCEL: Cancel an outstanding blocked lock request + * @rqstp: RPC transaction context + * + * Returns: + * %rpc_success: RPC executed successfully + * %rpc_drop_reply: Do not send an RPC reply + * + * RPC synopsis: + * nlm4_res NLMPROC4_CANCEL(nlm4_cancargs) = 3; + * + * Permissible procedure status codes: + * %NLM4_LCK_GRANTED: The requested lock was canceled. + * %NLM4_LCK_DENIED: There was no lock to cancel. + * %NLM4_DENIED_GRACE_PERIOD: The server has recently restarted and is + * re-establishing existing locks, and is not + * yet ready to accept normal service requests. + * + * The Linux NLM server implementation also returns: + * %NLM4_DENIED_NOLOCKS: A needed resource could not be allocated. + * %NLM4_STALE_FH: The request specified an invalid file handle. + * %NLM4_FBIG: The request specified a length or offset + * that exceeds the range supported by the + * server. + * %NLM4_FAILED: The request failed for an unspecified reason. */ static __be32 -__nlm4svc_proc_granted(struct svc_rqst *rqstp, struct nlm_res *resp) +nlm4svc_proc_cancel(struct svc_rqst *rqstp) { - struct nlm_args *argp = rqstp->rq_argp; + struct nlm4_cancargs_wrapper *argp = rqstp->rq_argp; + unsigned char type = argp->xdrgen.exclusive ? F_WRLCK : F_RDLCK; + struct nlm4_res_wrapper *resp = rqstp->rq_resp; + struct net *net = SVC_NET(rqstp); + struct nlm_host *host = NULL; + struct nlm_file *file = NULL; + + resp->xdrgen.cookie = argp->xdrgen.cookie; + + resp->xdrgen.stat.stat = nlm_lck_denied_grace_period; + if (locks_in_grace(net)) + goto out; + + resp->xdrgen.stat.stat = nlm_lck_denied_nolocks; + host = nlm4svc_lookup_host(rqstp, argp->xdrgen.alock.caller_name, false); + if (!host) + goto out; + + resp->xdrgen.stat.stat = nlm4svc_lookup_file(rqstp, host, &argp->lock, + &file, &argp->xdrgen.alock, + type); + if (resp->xdrgen.stat.stat) + goto out; + + resp->xdrgen.stat.stat = nlmsvc_cancel_blocked(net, file, &argp->lock); + nlmsvc_release_lockowner(&argp->lock); - resp->cookie = argp->cookie; +out: + if (file) + nlm_release_file(file); + nlmsvc_release_host(host); + return resp->xdrgen.stat.stat == nlm__int__drop_reply ? 
+ rpc_drop_reply : rpc_success; +} - dprintk("lockd: GRANTED called\n"); - resp->status = nlmclnt_grant(svc_addr(rqstp), &argp->lock); - dprintk("lockd: GRANTED status %d\n", ntohl(resp->status)); - return rpc_success; +/** + * nlm4svc_proc_unlock - UNLOCK: Remove a lock + * @rqstp: RPC transaction context + * + * Returns: + * %rpc_success: RPC executed successfully. + * %rpc_drop_reply: Do not send an RPC reply. + * + * RPC synopsis: + * nlm4_res NLMPROC4_UNLOCK(nlm4_unlockargs) = 4; + * + * Permissible procedure status codes: + * %NLM4_GRANTED: The requested lock was released. + * %NLM4_DENIED_GRACE_PERIOD: The server has recently restarted and is + * re-establishing existing locks, and is not + * yet ready to accept normal service requests. + * + * The Linux NLM server implementation also returns: + * %NLM4_DENIED_NOLOCKS: A needed resource could not be allocated. + * %NLM4_STALE_FH: The request specified an invalid file handle. + * %NLM4_FBIG: The request specified a length or offset + * that exceeds the range supported by the + * server. + * %NLM4_FAILED: The request failed for an unspecified reason. + */ +static __be32 +nlm4svc_proc_unlock(struct svc_rqst *rqstp) +{ + struct nlm4_unlockargs_wrapper *argp = rqstp->rq_argp; + struct nlm4_res_wrapper *resp = rqstp->rq_resp; + struct net *net = SVC_NET(rqstp); + struct nlm_host *host = NULL; + struct nlm_file *file = NULL; + + resp->xdrgen.cookie = argp->xdrgen.cookie; + + resp->xdrgen.stat.stat = nlm_lck_denied_grace_period; + if (locks_in_grace(net)) + goto out; + + resp->xdrgen.stat.stat = nlm_lck_denied_nolocks; + host = nlm4svc_lookup_host(rqstp, argp->xdrgen.alock.caller_name, false); + if (!host) + goto out; + + resp->xdrgen.stat.stat = nlm4svc_lookup_file(rqstp, host, &argp->lock, + &file, &argp->xdrgen.alock, + F_UNLCK); + if (resp->xdrgen.stat.stat) + goto out; + + resp->xdrgen.stat.stat = nlmsvc_unlock(net, file, &argp->lock); + nlmsvc_release_lockowner(&argp->lock); + +out: + if (file) + nlm_release_file(file); + nlmsvc_release_host(host); + return resp->xdrgen.stat.stat == nlm__int__drop_reply ? + rpc_drop_reply : rpc_success; } +/** + * nlm4svc_proc_granted - GRANTED: Server grants a previously blocked lock + * @rqstp: RPC transaction context + * + * Returns: + * %rpc_success: RPC executed successfully. + * + * RPC synopsis: + * nlm4_res NLMPROC4_GRANTED(nlm4_testargs) = 5; + * + * Permissible procedure status codes: + * %NLM4_GRANTED: The requested lock was granted. + * %NLM4_DENIED: The server could not allocate the resources + * needed to process the request. + * %NLM4_DENIED_GRACE_PERIOD: The server has recently restarted and is + * re-establishing existing locks, and is not + * yet ready to accept normal service requests. + */ static __be32 nlm4svc_proc_granted(struct svc_rqst *rqstp) { - return __nlm4svc_proc_granted(rqstp, rqstp->rq_resp); + struct nlm4_testargs_wrapper *argp = rqstp->rq_argp; + struct nlm4_res_wrapper *resp = rqstp->rq_resp; + + resp->xdrgen.cookie = argp->xdrgen.cookie; + + resp->xdrgen.stat.stat = nlm4_lock_to_nlm_lock(&argp->lock, + &argp->xdrgen.alock); + if (resp->xdrgen.stat.stat) + goto out; + + resp->xdrgen.stat.stat = nlmclnt_grant(svc_addr(rqstp), &argp->lock); + +out: + return rpc_success; } /* @@ -281,24 +552,17 @@ static const struct rpc_call_ops nlm4svc_callback_ops = { }; /* - * `Async' versions of the above service routines. They aren't really, - * because we send the callback before the reply proper. I hope this - * doesn't break any clients. 
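The rewritten nlm4svc_callback() below no longer looks up the peer itself: its caller resolves and pins the nlm_host, and the helper then owns that reference and drops it via nlmsvc_release_host() before returning. A minimal userspace sketch of this hand-off convention, with invented type and function names (only the ownership rule is taken from the patch):

#include <stdio.h>
#include <stdlib.h>

struct peer {
        int refcount;
        const char *name;
};

static struct peer *peer_alloc(const char *name)
{
        struct peer *p = calloc(1, sizeof(*p));

        if (!p)
                return NULL;
        p->refcount = 1;
        p->name = name;
        return p;
}

static struct peer *peer_get(struct peer *p)
{
        p->refcount++;
        return p;
}

static void peer_put(struct peer *p)
{
        if (--p->refcount == 0) {
                printf("releasing %s\n", p->name);
                free(p);
        }
}

/*
 * Takes ownership of the caller's reference to @p and releases it
 * before returning, on both success and error paths.
 */
static int dispatch_callback(struct peer *p, int proc)
{
        printf("queueing async procedure %d for %s\n", proc, p->name);
        /* ... allocate the call and hand it to the RPC layer here ... */
        peer_put(p);
        return 0;
}

int main(void)
{
        struct peer *p = peer_alloc("client.example");

        if (!p)
                return 1;
        /* The lookup path pins a reference, then hands it off. */
        dispatch_callback(peer_get(p), 12);
        peer_put(p);        /* drop the reference held since peer_alloc() */
        return 0;
}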
+ * Dispatch an async callback RPC to a client with a pre-resolved host. + * Caller provides a reference to @host; this function takes ownership + * and releases it via nlmsvc_release_host() before returning. */ -static __be32 nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, - __be32 (*func)(struct svc_rqst *, struct nlm_res *)) +static __be32 +nlm4svc_callback(struct svc_rqst *rqstp, struct nlm_host *host, u32 proc, + __be32 (*func)(struct svc_rqst *, struct nlm_res *)) { - struct nlm_args *argp = rqstp->rq_argp; - struct nlm_host *host; struct nlm_rqst *call; __be32 stat; - host = nlmsvc_lookup_host(rqstp, - argp->lock.caller, - argp->lock.len); - if (host == NULL) - return rpc_system_err; - call = nlm_alloc_call(host); nlmsvc_release_host(host); if (call == NULL) @@ -316,433 +580,845 @@ static __be32 nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, return rpc_success; } -static __be32 nlm4svc_proc_test_msg(struct svc_rqst *rqstp) +static __be32 +__nlm4svc_proc_test_msg(struct svc_rqst *rqstp, struct nlm_res *resp) { - dprintk("lockd: TEST_MSG called\n"); - return nlm4svc_callback(rqstp, NLMPROC_TEST_RES, __nlm4svc_proc_test); -} + struct nlm4_testargs_wrapper *argp = rqstp->rq_argp; + unsigned char type = argp->xdrgen.exclusive ? F_WRLCK : F_RDLCK; + struct nlm_lockowner *owner; + struct nlm_file *file = NULL; + struct nlm_host *host = NULL; + + resp->status = nlm_lck_denied_nolocks; + if (nlm4_netobj_to_cookie(&resp->cookie, &argp->xdrgen.cookie)) + goto out; + + host = nlm4svc_lookup_host(rqstp, argp->xdrgen.alock.caller_name, false); + if (!host) + goto out; + + resp->status = nlm4svc_lookup_file(rqstp, host, &argp->lock, + &file, &argp->xdrgen.alock, type); + if (resp->status) + goto out; + + owner = argp->lock.fl.c.flc_owner; + resp->status = nlmsvc_testlock(rqstp, file, host, &argp->lock, + &resp->lock); + nlmsvc_put_lockowner(owner); -static __be32 nlm4svc_proc_lock_msg(struct svc_rqst *rqstp) -{ - dprintk("lockd: LOCK_MSG called\n"); - return nlm4svc_callback(rqstp, NLMPROC_LOCK_RES, __nlm4svc_proc_lock); +out: + if (file) + nlm_release_file(file); + nlmsvc_release_host(host); + return resp->status == nlm__int__drop_reply ? rpc_drop_reply : rpc_success; } -static __be32 nlm4svc_proc_cancel_msg(struct svc_rqst *rqstp) +/** + * nlm4svc_proc_test_msg - TEST_MSG: Check for conflicting lock + * @rqstp: RPC transaction context + * + * Returns: + * %rpc_success: RPC executed successfully. + * %rpc_system_err: RPC execution failed. + * + * RPC synopsis: + * void NLMPROC4_TEST_MSG(nlm4_testargs) = 6; + * + * The response to this request is delivered via the TEST_RES procedure. + */ +static __be32 nlm4svc_proc_test_msg(struct svc_rqst *rqstp) { - dprintk("lockd: CANCEL_MSG called\n"); - return nlm4svc_callback(rqstp, NLMPROC_CANCEL_RES, __nlm4svc_proc_cancel); + struct nlm4_testargs_wrapper *argp = rqstp->rq_argp; + struct nlm_host *host; + + host = nlm4svc_lookup_host(rqstp, argp->xdrgen.alock.caller_name, false); + if (!host) + return rpc_system_err; + + return nlm4svc_callback(rqstp, host, NLMPROC4_TEST_RES, + __nlm4svc_proc_test_msg); } -static __be32 nlm4svc_proc_unlock_msg(struct svc_rqst *rqstp) +static __be32 +__nlm4svc_proc_lock_msg(struct svc_rqst *rqstp, struct nlm_res *resp) { - dprintk("lockd: UNLOCK_MSG called\n"); - return nlm4svc_callback(rqstp, NLMPROC_UNLOCK_RES, __nlm4svc_proc_unlock); + struct nlm4_lockargs_wrapper *argp = rqstp->rq_argp; + unsigned char type = argp->xdrgen.exclusive ? 
F_WRLCK : F_RDLCK; + struct nlm_file *file = NULL; + struct nlm_host *host = NULL; + + resp->status = nlm_lck_denied_nolocks; + if (nlm4_netobj_to_cookie(&resp->cookie, &argp->xdrgen.cookie)) + goto out; + + host = nlm4svc_lookup_host(rqstp, argp->xdrgen.alock.caller_name, true); + if (!host) + goto out; + + resp->status = nlm4svc_lookup_file(rqstp, host, &argp->lock, + &file, &argp->xdrgen.alock, type); + if (resp->status) + goto out; + + resp->status = nlmsvc_lock(rqstp, file, host, &argp->lock, + argp->xdrgen.block, &resp->cookie, + argp->xdrgen.reclaim); + nlmsvc_release_lockowner(&argp->lock); + +out: + if (file) + nlm_release_file(file); + nlmsvc_release_host(host); + return resp->status == nlm__int__drop_reply ? + rpc_drop_reply : rpc_success; } -static __be32 nlm4svc_proc_granted_msg(struct svc_rqst *rqstp) +/** + * nlm4svc_proc_lock_msg - LOCK_MSG: Establish a monitored lock + * @rqstp: RPC transaction context + * + * Returns: + * %rpc_success: RPC executed successfully. + * %rpc_system_err: RPC execution failed. + * + * RPC synopsis: + * void NLMPROC4_LOCK_MSG(nlm4_lockargs) = 7; + * + * The response to this request is delivered via the LOCK_RES procedure. + */ +static __be32 nlm4svc_proc_lock_msg(struct svc_rqst *rqstp) { - dprintk("lockd: GRANTED_MSG called\n"); - return nlm4svc_callback(rqstp, NLMPROC_GRANTED_RES, __nlm4svc_proc_granted); + struct nlm4_lockargs_wrapper *argp = rqstp->rq_argp; + struct nlm_host *host; + + host = nlm4svc_lookup_host(rqstp, argp->xdrgen.alock.caller_name, true); + if (!host) + return rpc_system_err; + + return nlm4svc_callback(rqstp, host, NLMPROC4_LOCK_RES, + __nlm4svc_proc_lock_msg); } -/* - * SHARE: create a DOS share or alter existing share. - */ static __be32 -nlm4svc_proc_share(struct svc_rqst *rqstp) +__nlm4svc_proc_cancel_msg(struct svc_rqst *rqstp, struct nlm_res *resp) { - struct nlm_args *argp = rqstp->rq_argp; - struct nlm_res *resp = rqstp->rq_resp; - struct nlm_host *host; - struct nlm_file *file; - - dprintk("lockd: SHARE called\n"); + struct nlm4_cancargs_wrapper *argp = rqstp->rq_argp; + unsigned char type = argp->xdrgen.exclusive ? F_WRLCK : F_RDLCK; + struct net *net = SVC_NET(rqstp); + struct nlm_file *file = NULL; + struct nlm_host *host = NULL; + + resp->status = nlm_lck_denied_nolocks; + if (nlm4_netobj_to_cookie(&resp->cookie, &argp->xdrgen.cookie)) + goto out; + + resp->status = nlm_lck_denied_grace_period; + if (locks_in_grace(net)) + goto out; + + resp->status = nlm_lck_denied_nolocks; + host = nlm4svc_lookup_host(rqstp, argp->xdrgen.alock.caller_name, false); + if (!host) + goto out; + + resp->status = nlm4svc_lookup_file(rqstp, host, &argp->lock, + &file, &argp->xdrgen.alock, type); + if (resp->status) + goto out; + + resp->status = nlmsvc_cancel_blocked(net, file, &argp->lock); + nlmsvc_release_lockowner(&argp->lock); - resp->cookie = argp->cookie; +out: + if (file) + nlm_release_file(file); + nlmsvc_release_host(host); + return resp->status == nlm__int__drop_reply ? + rpc_drop_reply : rpc_success; +} - /* Don't accept new lock requests during grace period */ - if (locks_in_grace(SVC_NET(rqstp)) && !argp->reclaim) { - resp->status = nlm_lck_denied_grace_period; - return rpc_success; - } +/** + * nlm4svc_proc_cancel_msg - CANCEL_MSG: Cancel an outstanding lock request + * @rqstp: RPC transaction context + * + * Returns: + * %rpc_success: RPC executed successfully. + * %rpc_system_err: RPC execution failed. 
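Like the synchronous procedures earlier in this file, the *_MSG helpers here work with internal-only status values (the nlm__int__* constants, defined elsewhere in this series) that are translated or filtered just before a reply is encoded; svcproc.c's cast_status() warns when a value at or above 30000 would otherwise leak to the wire. A rough standalone sketch of that convention, using made-up numeric values:

#include <stdint.h>
#include <stdio.h>

/* Protocol-visible status codes (wire values). */
enum { ST_GRANTED = 0, ST_DENIED = 1, ST_DENIED_NOLOCKS = 2 };

/*
 * Internal-only codes live in a range no protocol revision uses,
 * so a leak to the wire is easy to spot.
 */
enum {
        ST_INT_BASE       = 30000,
        ST_INT_DROP_REPLY = 30001,
        ST_INT_STALE_FH   = 30002,
        ST_INT_FAILED     = 30003,
};

/* Translate internal codes to something the peer understands. */
static uint32_t cast_status(uint32_t status)
{
        switch (status) {
        case ST_INT_STALE_FH:
        case ST_INT_FAILED:
                return ST_DENIED_NOLOCKS;
        default:
                if (status >= ST_INT_BASE)
                        fprintf(stderr, "unhandled internal status %u\n", status);
                return status;
        }
}

int main(void)
{
        uint32_t handler_status = ST_INT_STALE_FH;  /* e.g. file handle no longer valid */

        if (handler_status == ST_INT_DROP_REPLY) {
                puts("deferring: no reply sent");
                return 0;
        }
        printf("encoding status %u\n", cast_status(handler_status));
        return 0;
}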
+ * + * RPC synopsis: + * void NLMPROC4_CANCEL_MSG(nlm4_cancargs) = 8; + * + * The response to this request is delivered via the CANCEL_RES procedure. + */ +static __be32 nlm4svc_proc_cancel_msg(struct svc_rqst *rqstp) +{ + struct nlm4_cancargs_wrapper *argp = rqstp->rq_argp; + struct nlm_host *host; - /* Obtain client and file */ - if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) - return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + host = nlm4svc_lookup_host(rqstp, argp->xdrgen.alock.caller_name, false); + if (!host) + return rpc_system_err; - /* Now try to create the share */ - resp->status = nlmsvc_share_file(host, file, argp); + return nlm4svc_callback(rqstp, host, NLMPROC4_CANCEL_RES, + __nlm4svc_proc_cancel_msg); +} - dprintk("lockd: SHARE status %d\n", ntohl(resp->status)); +static __be32 +__nlm4svc_proc_unlock_msg(struct svc_rqst *rqstp, struct nlm_res *resp) +{ + struct nlm4_unlockargs_wrapper *argp = rqstp->rq_argp; + struct net *net = SVC_NET(rqstp); + struct nlm_file *file = NULL; + struct nlm_host *host = NULL; + + resp->status = nlm_lck_denied_nolocks; + if (nlm4_netobj_to_cookie(&resp->cookie, &argp->xdrgen.cookie)) + goto out; + + resp->status = nlm_lck_denied_grace_period; + if (locks_in_grace(net)) + goto out; + + resp->status = nlm_lck_denied_nolocks; + host = nlm4svc_lookup_host(rqstp, argp->xdrgen.alock.caller_name, false); + if (!host) + goto out; + + resp->status = nlm4svc_lookup_file(rqstp, host, &argp->lock, + &file, &argp->xdrgen.alock, F_UNLCK); + if (resp->status) + goto out; + + resp->status = nlmsvc_unlock(net, file, &argp->lock); nlmsvc_release_lockowner(&argp->lock); + +out: + if (file) + nlm_release_file(file); nlmsvc_release_host(host); - nlm_release_file(file); - return rpc_success; + return resp->status == nlm__int__drop_reply ? + rpc_drop_reply : rpc_success; } -/* - * UNSHARE: Release a DOS share. +/** + * nlm4svc_proc_unlock_msg - UNLOCK_MSG: Remove an existing lock + * @rqstp: RPC transaction context + * + * Returns: + * %rpc_success: RPC executed successfully. + * %rpc_system_err: RPC execution failed. + * + * RPC synopsis: + * void NLMPROC4_UNLOCK_MSG(nlm4_unlockargs) = 9; + * + * The response to this request is delivered via the UNLOCK_RES procedure. */ -static __be32 -nlm4svc_proc_unshare(struct svc_rqst *rqstp) +static __be32 nlm4svc_proc_unlock_msg(struct svc_rqst *rqstp) { - struct nlm_args *argp = rqstp->rq_argp; - struct nlm_res *resp = rqstp->rq_resp; - struct nlm_host *host; - struct nlm_file *file; + struct nlm4_unlockargs_wrapper *argp = rqstp->rq_argp; + struct nlm_host *host; - dprintk("lockd: UNSHARE called\n"); + host = nlm4svc_lookup_host(rqstp, argp->xdrgen.alock.caller_name, false); + if (!host) + return rpc_system_err; - resp->cookie = argp->cookie; + return nlm4svc_callback(rqstp, host, NLMPROC4_UNLOCK_RES, + __nlm4svc_proc_unlock_msg); +} - /* Don't accept requests during grace period */ - if (locks_in_grace(SVC_NET(rqstp))) { - resp->status = nlm_lck_denied_grace_period; - return rpc_success; - } +static __be32 +__nlm4svc_proc_granted_msg(struct svc_rqst *rqstp, struct nlm_res *resp) +{ + struct nlm4_testargs_wrapper *argp = rqstp->rq_argp; - /* Obtain client and file */ - if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) - return resp->status == nlm_drop_reply ? 
rpc_drop_reply :rpc_success; + resp->status = nlm_lck_denied; + if (nlm4_netobj_to_cookie(&resp->cookie, &argp->xdrgen.cookie)) + goto out; - /* Now try to lock the file */ - resp->status = nlmsvc_unshare_file(host, file, argp); + if (nlm4_lock_to_nlm_lock(&argp->lock, &argp->xdrgen.alock)) + goto out; - dprintk("lockd: UNSHARE status %d\n", ntohl(resp->status)); - nlmsvc_release_lockowner(&argp->lock); - nlmsvc_release_host(host); - nlm_release_file(file); + resp->status = nlmclnt_grant(svc_addr(rqstp), &argp->lock); + +out: return rpc_success; } -/* - * NM_LOCK: Create an unmonitored lock +/** + * nlm4svc_proc_granted_msg - GRANTED_MSG: Blocked lock has been granted + * @rqstp: RPC transaction context + * + * Returns: + * %rpc_success: RPC executed successfully. + * %rpc_system_err: RPC execution failed. + * + * RPC synopsis: + * void NLMPROC4_GRANTED_MSG(nlm4_testargs) = 10; + * + * The response to this request is delivered via the GRANTED_RES procedure. */ -static __be32 -nlm4svc_proc_nm_lock(struct svc_rqst *rqstp) +static __be32 nlm4svc_proc_granted_msg(struct svc_rqst *rqstp) { - struct nlm_args *argp = rqstp->rq_argp; + struct nlm4_testargs_wrapper *argp = rqstp->rq_argp; + struct nlm_host *host; - dprintk("lockd: NM_LOCK called\n"); + host = nlm4svc_lookup_host(rqstp, argp->xdrgen.alock.caller_name, false); + if (!host) + return rpc_system_err; - argp->monitor = 0; /* just clean the monitor flag */ - return nlm4svc_proc_lock(rqstp); + return nlm4svc_callback(rqstp, host, NLMPROC4_GRANTED_RES, + __nlm4svc_proc_granted_msg); } -/* - * FREE_ALL: Release all locks and shares held by client +/** + * nlm4svc_proc_granted_res - GRANTED_RES: Lock Granted result + * @rqstp: RPC transaction context + * + * Returns: + * %rpc_success: RPC executed successfully. + * + * RPC synopsis: + * void NLMPROC4_GRANTED_RES(nlm4_res) = 15; */ -static __be32 -nlm4svc_proc_free_all(struct svc_rqst *rqstp) +static __be32 nlm4svc_proc_granted_res(struct svc_rqst *rqstp) { - struct nlm_args *argp = rqstp->rq_argp; - struct nlm_host *host; + struct nlm4_res_wrapper *argp = rqstp->rq_argp; - /* Obtain client */ - if (nlm4svc_retrieve_args(rqstp, argp, &host, NULL)) + if (!nlmsvc_ops) return rpc_success; - nlmsvc_free_host_resources(host); - nlmsvc_release_host(host); + if (nlm4_netobj_to_cookie(&argp->cookie, &argp->xdrgen.cookie)) + return rpc_success; + nlmsvc_grant_reply(&argp->cookie, argp->xdrgen.stat.stat); + return rpc_success; } -/* - * SM_NOTIFY: private callback from statd (not part of official NLM proto) +/** + * nlm4svc_proc_sm_notify - SM_NOTIFY: Peer has rebooted + * @rqstp: RPC transaction context + * + * Returns: + * %rpc_success: RPC executed successfully. + * %rpc_system_err: RPC execution failed. + * + * The SM_NOTIFY procedure is a private callback from Linux statd and is + * not part of the official NLM protocol. 
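Because an accepted SM_NOTIFY can tear down a client's lock state, the handler below only honours it when nlm_privileged_requester() approves the sender. The patch does not show that helper; the sketch assumes the common policy of requiring a reserved (below 1024) source port, so treat the details as illustrative:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define RESERVED_PORT_CEILING 1024

/* Accept a notification only if it arrives from a reserved source port. */
static bool requester_is_privileged(const struct sockaddr *sap)
{
        switch (sap->sa_family) {
        case AF_INET: {
                const struct sockaddr_in *sin = (const struct sockaddr_in *)sap;

                return ntohs(sin->sin_port) < RESERVED_PORT_CEILING;
        }
        case AF_INET6: {
                const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)sap;

                return ntohs(sin6->sin6_port) < RESERVED_PORT_CEILING;
        }
        default:
                return false;
        }
}

int main(void)
{
        struct sockaddr_in peer;

        memset(&peer, 0, sizeof(peer));
        peer.sin_family = AF_INET;
        peer.sin_port = htons(45678);        /* unprivileged source port */

        if (!requester_is_privileged((struct sockaddr *)&peer))
                fprintf(stderr, "rejected NSM callback from unprivileged port\n");
        return 0;
}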
+ * + * RPC synopsis: + * void NLMPROC4_SM_NOTIFY(nlm4_notifyargs) = 16; */ -static __be32 -nlm4svc_proc_sm_notify(struct svc_rqst *rqstp) +static __be32 nlm4svc_proc_sm_notify(struct svc_rqst *rqstp) { - struct nlm_reboot *argp = rqstp->rq_argp; - - dprintk("lockd: SM_NOTIFY called\n"); + struct nlm4_notifyargs_wrapper *argp = rqstp->rq_argp; + struct nlm_reboot *reboot = &argp->reboot; if (!nlm_privileged_requester(rqstp)) { char buf[RPC_MAX_ADDRBUFLEN]; - printk(KERN_WARNING "lockd: rejected NSM callback from %s\n", - svc_print_addr(rqstp, buf, sizeof(buf))); + + pr_warn("lockd: rejected NSM callback from %s\n", + svc_print_addr(rqstp, buf, sizeof(buf))); return rpc_system_err; } - nlm_host_rebooted(SVC_NET(rqstp), argp); + reboot->len = argp->xdrgen.notify.name.len; + reboot->mon = (char *)argp->xdrgen.notify.name.data; + reboot->state = argp->xdrgen.notify.state; + memcpy(&reboot->priv.data, argp->xdrgen.private, + sizeof(reboot->priv.data)); + + nlm_host_rebooted(SVC_NET(rqstp), reboot); + return rpc_success; } -/* - * client sent a GRANTED_RES, let's remove the associated block +/** + * nlm4svc_proc_unused - stub for unused procedures + * @rqstp: RPC transaction context + * + * Returns: + * %rpc_proc_unavail: Program can't support procedure. */ -static __be32 -nlm4svc_proc_granted_res(struct svc_rqst *rqstp) +static __be32 nlm4svc_proc_unused(struct svc_rqst *rqstp) { - struct nlm_res *argp = rqstp->rq_argp; + return rpc_proc_unavail; +} - if (!nlmsvc_ops) - return rpc_success; +/** + * nlm4svc_proc_share - SHARE: Open a file using DOS file-sharing modes + * @rqstp: RPC transaction context + * + * Returns: + * %rpc_success: RPC executed successfully. + * %rpc_drop_reply: Do not send an RPC reply. + * + * RPC synopsis: + * nlm4_shareres NLMPROC4_SHARE(nlm4_shareargs) = 20; + * + * Permissible procedure status codes: + * %NLM4_GRANTED: The requested share lock was granted. + * %NLM4_DENIED: The requested lock conflicted with existing + * lock reservations for the file. + * %NLM4_DENIED_GRACE_PERIOD: The server has recently restarted and is + * re-establishing existing locks, and is not + * yet ready to accept normal service requests. + * + * The Linux NLM server implementation also returns: + * %NLM4_DENIED_NOLOCKS: A needed resource could not be allocated. + * %NLM4_STALE_FH: The request specified an invalid file handle. + * %NLM4_FBIG: The request specified a length or offset + * that exceeds the range supported by the + * server. + * %NLM4_FAILED: The request failed for an unspecified reason. 
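The NLM4_FBIG case in the list above comes from nlm4svc_lookup_file() earlier in this file, which rejects offset/length pairs whose last byte would not fit in the server's maximum file offset (OFFSET_MAX). The comparison is arranged so it cannot itself overflow; the same arithmetic restated as a small self-contained check (constant and function names are illustrative):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define MAX_FILE_OFFSET INT64_MAX        /* stand-in for the kernel's OFFSET_MAX */

/*
 * A length of zero means "to end of file". Otherwise the last byte is
 * start + len - 1, which must not exceed MAX_FILE_OFFSET. Rearranging to
 * (len - 1) > (MAX - start) avoids computing start + len, which could
 * wrap for hostile 64-bit inputs.
 */
static bool range_is_representable(uint64_t start, uint64_t len)
{
        if (start > (uint64_t)MAX_FILE_OFFSET)
                return false;
        if (len && (len - 1) > (uint64_t)MAX_FILE_OFFSET - start)
                return false;
        return true;
}

int main(void)
{
        assert(range_is_representable(0, 0));                    /* whole file */
        assert(range_is_representable(100, 1));                  /* single byte */
        assert(!range_is_representable(UINT64_MAX, 1));          /* offset too large */
        assert(!range_is_representable((uint64_t)MAX_FILE_OFFSET, 2));  /* end overflows */
        return 0;
}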
+ */ +static __be32 nlm4svc_proc_share(struct svc_rqst *rqstp) +{ + struct nlm4_shareargs_wrapper *argp = rqstp->rq_argp; + struct nlm4_shareres_wrapper *resp = rqstp->rq_resp; + struct nlm_lock *lock = &argp->lock; + struct nlm_host *host = NULL; + struct nlm_file *file = NULL; + struct nlm4_lock xdr_lock = { + .fh = argp->xdrgen.share.fh, + .oh = argp->xdrgen.share.oh, + .svid = LOCKD_SHARE_SVID, + }; + + resp->xdrgen.cookie = argp->xdrgen.cookie; + + resp->xdrgen.stat = nlm_lck_denied_grace_period; + if (locks_in_grace(SVC_NET(rqstp)) && !argp->xdrgen.reclaim) + goto out; + + resp->xdrgen.stat = nlm_lck_denied_nolocks; + host = nlm4svc_lookup_host(rqstp, argp->xdrgen.share.caller_name, true); + if (!host) + goto out; + + resp->xdrgen.stat = nlm4svc_lookup_file(rqstp, host, lock, &file, + &xdr_lock, F_RDLCK); + if (resp->xdrgen.stat) + goto out; + + resp->xdrgen.stat = nlmsvc_share_file(host, file, &lock->oh, + argp->xdrgen.share.access, + argp->xdrgen.share.mode); + + nlmsvc_release_lockowner(lock); + +out: + if (file) + nlm_release_file(file); + nlmsvc_release_host(host); + return resp->xdrgen.stat == nlm__int__drop_reply ? + rpc_drop_reply : rpc_success; +} - dprintk("lockd: GRANTED_RES called\n"); +/** + * nlm4svc_proc_unshare - UNSHARE: Release a share reservation + * @rqstp: RPC transaction context + * + * Returns: + * %rpc_success: RPC executed successfully. + * %rpc_drop_reply: Do not send an RPC reply. + * + * RPC synopsis: + * nlm4_shareres NLMPROC4_UNSHARE(nlm4_shareargs) = 21; + * + * Permissible procedure status codes: + * %NLM4_GRANTED: The share reservation was released. + * %NLM4_DENIED_GRACE_PERIOD: The server has recently restarted and is + * re-establishing existing locks, and is not + * yet ready to accept normal service requests. + * + * The Linux NLM server implementation also returns: + * %NLM4_DENIED_NOLOCKS: A needed resource could not be allocated. + * %NLM4_STALE_FH: The request specified an invalid file handle. + * %NLM4_FBIG: The request specified a length or offset + * that exceeds the range supported by the + * server. + * %NLM4_FAILED: The request failed for an unspecified reason. + */ +static __be32 nlm4svc_proc_unshare(struct svc_rqst *rqstp) +{ + struct nlm4_shareargs_wrapper *argp = rqstp->rq_argp; + struct nlm4_shareres_wrapper *resp = rqstp->rq_resp; + struct nlm_lock *lock = &argp->lock; + struct nlm4_lock xdr_lock = { + .fh = argp->xdrgen.share.fh, + .oh = argp->xdrgen.share.oh, + .svid = LOCKD_SHARE_SVID, + }; + struct nlm_host *host = NULL; + struct nlm_file *file = NULL; + + resp->xdrgen.cookie = argp->xdrgen.cookie; + + resp->xdrgen.stat = nlm_lck_denied_grace_period; + if (locks_in_grace(SVC_NET(rqstp))) + goto out; + + resp->xdrgen.stat = nlm_lck_denied_nolocks; + host = nlm4svc_lookup_host(rqstp, argp->xdrgen.share.caller_name, true); + if (!host) + goto out; + + resp->xdrgen.stat = nlm4svc_lookup_file(rqstp, host, lock, &file, + &xdr_lock, F_RDLCK); + if (resp->xdrgen.stat) + goto out; + + resp->xdrgen.stat = nlmsvc_unshare_file(host, file, &lock->oh); + + nlmsvc_release_lockowner(lock); + +out: + if (file) + nlm_release_file(file); + nlmsvc_release_host(host); + return resp->xdrgen.stat == nlm__int__drop_reply ? + rpc_drop_reply : rpc_success; +} - nlmsvc_grant_reply(&argp->cookie, argp->status); - return rpc_success; +/** + * nlm4svc_proc_nm_lock - NM_LOCK: Establish a non-monitored lock + * @rqstp: RPC transaction context + * + * Returns: + * %rpc_success: RPC executed successfully. + * %rpc_drop_reply: Do not send an RPC reply. 
+ * + * RPC synopsis: + * nlm4_res NLMPROC4_NM_LOCK(nlm4_lockargs) = 22; + * + * Permissible procedure status codes: + * %NLM4_GRANTED: The requested lock was granted. + * %NLM4_DENIED: The requested lock conflicted with existing + * lock reservations for the file. + * %NLM4_DENIED_NOLOCKS: The server could not allocate the resources + * needed to process the request. + * %NLM4_BLOCKED: The blocking request cannot be granted + * immediately. The server will send an + * NLMPROC4_GRANTED callback to the client when + * the lock can be granted. + * %NLM4_DENIED_GRACE_PERIOD: The server has recently restarted and is + * re-establishing existing locks, and is not + * yet ready to accept normal service requests. + * + * The Linux NLM server implementation also returns: + * %NLM4_DEADLCK: The request could not be granted and + * blocking would cause a deadlock. + * %NLM4_STALE_FH: The request specified an invalid file handle. + * %NLM4_FBIG: The request specified a length or offset + * that exceeds the range supported by the + * server. + * %NLM4_FAILED: The request failed for an unspecified reason. + */ +static __be32 nlm4svc_proc_nm_lock(struct svc_rqst *rqstp) +{ + return nlm4svc_do_lock(rqstp, false); } -static __be32 -nlm4svc_proc_unused(struct svc_rqst *rqstp) +/** + * nlm4svc_proc_free_all - FREE_ALL: Discard client's lock and share state + * @rqstp: RPC transaction context + * + * Returns: + * %rpc_success: RPC executed successfully. + * + * RPC synopsis: + * void NLMPROC4_FREE_ALL(nlm4_notify) = 23; + */ +static __be32 nlm4svc_proc_free_all(struct svc_rqst *rqstp) { - return rpc_proc_unavail; + struct nlm4_notify_wrapper *argp = rqstp->rq_argp; + struct nlm_host *host; + + host = nlm4svc_lookup_host(rqstp, argp->xdrgen.name, false); + if (!host) + goto out; + + nlmsvc_free_host_resources(host); + + nlmsvc_release_host(host); + +out: + return rpc_success; } /* - * NLM Server procedures. + * NLMv4 Server procedures. 
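The table below sizes each procedure's argument and result buffers by the *_wrapper types declared at the top of this file. Each wrapper embeds the xdrgen-generated structure as its first member, and the static_assert(offsetof(...) == 0) checks enforce that layout, presumably so one buffer can serve both the generated XDR code and the handler's extra bookkeeping. A toy userspace sketch of the layout rule, with invented names:

#include <assert.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

/* What an XDR decoder would fill in (generated-code stand-in). */
struct wire_args {
        unsigned int exclusive;
        unsigned long long l_offset;
        unsigned long long l_len;
};

/* Handler-side bookkeeping that decoded arguments get wrapped in. */
struct wire_args_wrapper {
        struct wire_args xdrgen;        /* must stay first */
        char cookie[32];                /* scratch space owned by the handler */
};

/* The decoder may treat a wrapper pointer as a wire_args pointer. */
static_assert(offsetof(struct wire_args_wrapper, xdrgen) == 0,
              "xdrgen member must be at offset zero");

static void decode(struct wire_args *args)
{
        args->exclusive = 1;
        args->l_offset = 4096;
        args->l_len = 0;
}

int main(void)
{
        struct wire_args_wrapper buf;

        memset(&buf, 0, sizeof(buf));
        decode((struct wire_args *)&buf);        /* safe because of the offset-zero rule */
        printf("exclusive=%u offset=%llu\n",
               buf.xdrgen.exclusive, buf.xdrgen.l_offset);
        return 0;
}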
*/ -struct nlm_void { int dummy; }; - -#define Ck (1+XDR_QUADLEN(NLM_MAXCOOKIELEN)) /* cookie */ -#define No (1+1024/4) /* netobj */ -#define St 1 /* status */ -#define Rg 4 /* range (offset + length) */ - -const struct svc_procedure nlmsvc_procedures4[24] = { - [NLMPROC_NULL] = { - .pc_func = nlm4svc_proc_null, - .pc_decode = nlm4svc_decode_void, - .pc_encode = nlm4svc_encode_void, - .pc_argsize = sizeof(struct nlm_void), - .pc_argzero = sizeof(struct nlm_void), - .pc_ressize = sizeof(struct nlm_void), - .pc_xdrressize = St, - .pc_name = "NULL", +static const struct svc_procedure nlm4svc_procedures[24] = { + [NLMPROC4_NULL] = { + .pc_func = nlm4svc_proc_null, + .pc_decode = nlm4_svc_decode_void, + .pc_encode = nlm4_svc_encode_void, + .pc_argsize = XDR_void, + .pc_argzero = 0, + .pc_ressize = 0, + .pc_xdrressize = XDR_void, + .pc_name = "NULL", }, - [NLMPROC_TEST] = { - .pc_func = nlm4svc_proc_test, - .pc_decode = nlm4svc_decode_testargs, - .pc_encode = nlm4svc_encode_testres, - .pc_argsize = sizeof(struct nlm_args), - .pc_argzero = sizeof(struct nlm_args), - .pc_ressize = sizeof(struct nlm_res), - .pc_xdrressize = Ck+St+2+No+Rg, - .pc_name = "TEST", + [NLMPROC4_TEST] = { + .pc_func = nlm4svc_proc_test, + .pc_decode = nlm4_svc_decode_nlm4_testargs, + .pc_encode = nlm4_svc_encode_nlm4_testres, + .pc_argsize = sizeof(struct nlm4_testargs_wrapper), + .pc_argzero = 0, + .pc_ressize = sizeof(struct nlm4_testres_wrapper), + .pc_xdrressize = NLM4_nlm4_testres_sz, + .pc_name = "TEST", }, - [NLMPROC_LOCK] = { - .pc_func = nlm4svc_proc_lock, - .pc_decode = nlm4svc_decode_lockargs, - .pc_encode = nlm4svc_encode_res, - .pc_argsize = sizeof(struct nlm_args), - .pc_argzero = sizeof(struct nlm_args), - .pc_ressize = sizeof(struct nlm_res), - .pc_xdrressize = Ck+St, - .pc_name = "LOCK", + [NLMPROC4_LOCK] = { + .pc_func = nlm4svc_proc_lock, + .pc_decode = nlm4_svc_decode_nlm4_lockargs, + .pc_encode = nlm4_svc_encode_nlm4_res, + .pc_argsize = sizeof(struct nlm4_lockargs_wrapper), + .pc_argzero = 0, + .pc_ressize = sizeof(struct nlm4_res_wrapper), + .pc_xdrressize = NLM4_nlm4_res_sz, + .pc_name = "LOCK", }, - [NLMPROC_CANCEL] = { - .pc_func = nlm4svc_proc_cancel, - .pc_decode = nlm4svc_decode_cancargs, - .pc_encode = nlm4svc_encode_res, - .pc_argsize = sizeof(struct nlm_args), - .pc_argzero = sizeof(struct nlm_args), - .pc_ressize = sizeof(struct nlm_res), - .pc_xdrressize = Ck+St, - .pc_name = "CANCEL", + [NLMPROC4_CANCEL] = { + .pc_func = nlm4svc_proc_cancel, + .pc_decode = nlm4_svc_decode_nlm4_cancargs, + .pc_encode = nlm4_svc_encode_nlm4_res, + .pc_argsize = sizeof(struct nlm4_cancargs_wrapper), + .pc_argzero = 0, + .pc_ressize = sizeof(struct nlm4_res_wrapper), + .pc_xdrressize = NLM4_nlm4_res_sz, + .pc_name = "CANCEL", }, - [NLMPROC_UNLOCK] = { - .pc_func = nlm4svc_proc_unlock, - .pc_decode = nlm4svc_decode_unlockargs, - .pc_encode = nlm4svc_encode_res, - .pc_argsize = sizeof(struct nlm_args), - .pc_argzero = sizeof(struct nlm_args), - .pc_ressize = sizeof(struct nlm_res), - .pc_xdrressize = Ck+St, - .pc_name = "UNLOCK", + [NLMPROC4_UNLOCK] = { + .pc_func = nlm4svc_proc_unlock, + .pc_decode = nlm4_svc_decode_nlm4_unlockargs, + .pc_encode = nlm4_svc_encode_nlm4_res, + .pc_argsize = sizeof(struct nlm4_unlockargs_wrapper), + .pc_argzero = 0, + .pc_ressize = sizeof(struct nlm4_res_wrapper), + .pc_xdrressize = NLM4_nlm4_res_sz, + .pc_name = "UNLOCK", }, - [NLMPROC_GRANTED] = { - .pc_func = nlm4svc_proc_granted, - .pc_decode = nlm4svc_decode_testargs, - .pc_encode = nlm4svc_encode_res, - .pc_argsize = 
sizeof(struct nlm_args), - .pc_argzero = sizeof(struct nlm_args), - .pc_ressize = sizeof(struct nlm_res), - .pc_xdrressize = Ck+St, - .pc_name = "GRANTED", + [NLMPROC4_GRANTED] = { + .pc_func = nlm4svc_proc_granted, + .pc_decode = nlm4_svc_decode_nlm4_testargs, + .pc_encode = nlm4_svc_encode_nlm4_res, + .pc_argsize = sizeof(struct nlm4_testargs_wrapper), + .pc_argzero = 0, + .pc_ressize = sizeof(struct nlm4_res_wrapper), + .pc_xdrressize = NLM4_nlm4_res_sz, + .pc_name = "GRANTED", }, - [NLMPROC_TEST_MSG] = { - .pc_func = nlm4svc_proc_test_msg, - .pc_decode = nlm4svc_decode_testargs, - .pc_encode = nlm4svc_encode_void, - .pc_argsize = sizeof(struct nlm_args), - .pc_argzero = sizeof(struct nlm_args), - .pc_ressize = sizeof(struct nlm_void), - .pc_xdrressize = St, - .pc_name = "TEST_MSG", + [NLMPROC4_TEST_MSG] = { + .pc_func = nlm4svc_proc_test_msg, + .pc_decode = nlm4_svc_decode_nlm4_testargs, + .pc_encode = nlm4_svc_encode_void, + .pc_argsize = sizeof(struct nlm4_testargs_wrapper), + .pc_argzero = 0, + .pc_ressize = 0, + .pc_xdrressize = XDR_void, + .pc_name = "TEST_MSG", }, - [NLMPROC_LOCK_MSG] = { - .pc_func = nlm4svc_proc_lock_msg, - .pc_decode = nlm4svc_decode_lockargs, - .pc_encode = nlm4svc_encode_void, - .pc_argsize = sizeof(struct nlm_args), - .pc_argzero = sizeof(struct nlm_args), - .pc_ressize = sizeof(struct nlm_void), - .pc_xdrressize = St, - .pc_name = "LOCK_MSG", + [NLMPROC4_LOCK_MSG] = { + .pc_func = nlm4svc_proc_lock_msg, + .pc_decode = nlm4_svc_decode_nlm4_lockargs, + .pc_encode = nlm4_svc_encode_void, + .pc_argsize = sizeof(struct nlm4_lockargs_wrapper), + .pc_argzero = 0, + .pc_ressize = 0, + .pc_xdrressize = XDR_void, + .pc_name = "LOCK_MSG", }, - [NLMPROC_CANCEL_MSG] = { - .pc_func = nlm4svc_proc_cancel_msg, - .pc_decode = nlm4svc_decode_cancargs, - .pc_encode = nlm4svc_encode_void, - .pc_argsize = sizeof(struct nlm_args), - .pc_argzero = sizeof(struct nlm_args), - .pc_ressize = sizeof(struct nlm_void), - .pc_xdrressize = St, - .pc_name = "CANCEL_MSG", + [NLMPROC4_CANCEL_MSG] = { + .pc_func = nlm4svc_proc_cancel_msg, + .pc_decode = nlm4_svc_decode_nlm4_cancargs, + .pc_encode = nlm4_svc_encode_void, + .pc_argsize = sizeof(struct nlm4_cancargs_wrapper), + .pc_argzero = 0, + .pc_ressize = 0, + .pc_xdrressize = XDR_void, + .pc_name = "CANCEL_MSG", }, - [NLMPROC_UNLOCK_MSG] = { - .pc_func = nlm4svc_proc_unlock_msg, - .pc_decode = nlm4svc_decode_unlockargs, - .pc_encode = nlm4svc_encode_void, - .pc_argsize = sizeof(struct nlm_args), - .pc_argzero = sizeof(struct nlm_args), - .pc_ressize = sizeof(struct nlm_void), - .pc_xdrressize = St, - .pc_name = "UNLOCK_MSG", + [NLMPROC4_UNLOCK_MSG] = { + .pc_func = nlm4svc_proc_unlock_msg, + .pc_decode = nlm4_svc_decode_nlm4_unlockargs, + .pc_encode = nlm4_svc_encode_void, + .pc_argsize = sizeof(struct nlm4_unlockargs_wrapper), + .pc_argzero = 0, + .pc_ressize = 0, + .pc_xdrressize = XDR_void, + .pc_name = "UNLOCK_MSG", }, - [NLMPROC_GRANTED_MSG] = { - .pc_func = nlm4svc_proc_granted_msg, - .pc_decode = nlm4svc_decode_testargs, - .pc_encode = nlm4svc_encode_void, - .pc_argsize = sizeof(struct nlm_args), - .pc_argzero = sizeof(struct nlm_args), - .pc_ressize = sizeof(struct nlm_void), - .pc_xdrressize = St, - .pc_name = "GRANTED_MSG", + [NLMPROC4_GRANTED_MSG] = { + .pc_func = nlm4svc_proc_granted_msg, + .pc_decode = nlm4_svc_decode_nlm4_testargs, + .pc_encode = nlm4_svc_encode_void, + .pc_argsize = sizeof(struct nlm4_testargs_wrapper), + .pc_argzero = 0, + .pc_ressize = 0, + .pc_xdrressize = XDR_void, + .pc_name = "GRANTED_MSG", }, - 
[NLMPROC_TEST_RES] = { - .pc_func = nlm4svc_proc_null, - .pc_decode = nlm4svc_decode_void, - .pc_encode = nlm4svc_encode_void, - .pc_argsize = sizeof(struct nlm_res), - .pc_argzero = sizeof(struct nlm_res), - .pc_ressize = sizeof(struct nlm_void), - .pc_xdrressize = St, - .pc_name = "TEST_RES", + [NLMPROC4_TEST_RES] = { + .pc_func = nlm4svc_proc_null, + .pc_decode = nlm4_svc_decode_nlm4_testres, + .pc_encode = nlm4_svc_encode_void, + .pc_argsize = sizeof(struct nlm4_testres), + .pc_argzero = 0, + .pc_ressize = 0, + .pc_xdrressize = XDR_void, + .pc_name = "TEST_RES", }, - [NLMPROC_LOCK_RES] = { - .pc_func = nlm4svc_proc_null, - .pc_decode = nlm4svc_decode_void, - .pc_encode = nlm4svc_encode_void, - .pc_argsize = sizeof(struct nlm_res), - .pc_argzero = sizeof(struct nlm_res), - .pc_ressize = sizeof(struct nlm_void), - .pc_xdrressize = St, - .pc_name = "LOCK_RES", + [NLMPROC4_LOCK_RES] = { + .pc_func = nlm4svc_proc_null, + .pc_decode = nlm4_svc_decode_nlm4_res, + .pc_encode = nlm4_svc_encode_void, + .pc_argsize = sizeof(struct nlm4_res), + .pc_argzero = 0, + .pc_ressize = 0, + .pc_xdrressize = XDR_void, + .pc_name = "LOCK_RES", }, - [NLMPROC_CANCEL_RES] = { - .pc_func = nlm4svc_proc_null, - .pc_decode = nlm4svc_decode_void, - .pc_encode = nlm4svc_encode_void, - .pc_argsize = sizeof(struct nlm_res), - .pc_argzero = sizeof(struct nlm_res), - .pc_ressize = sizeof(struct nlm_void), - .pc_xdrressize = St, - .pc_name = "CANCEL_RES", + [NLMPROC4_CANCEL_RES] = { + .pc_func = nlm4svc_proc_null, + .pc_decode = nlm4_svc_decode_nlm4_res, + .pc_encode = nlm4_svc_encode_void, + .pc_argsize = sizeof(struct nlm4_res), + .pc_argzero = 0, + .pc_ressize = 0, + .pc_xdrressize = XDR_void, + .pc_name = "CANCEL_RES", }, - [NLMPROC_UNLOCK_RES] = { - .pc_func = nlm4svc_proc_null, - .pc_decode = nlm4svc_decode_void, - .pc_encode = nlm4svc_encode_void, - .pc_argsize = sizeof(struct nlm_res), - .pc_argzero = sizeof(struct nlm_res), - .pc_ressize = sizeof(struct nlm_void), - .pc_xdrressize = St, - .pc_name = "UNLOCK_RES", + [NLMPROC4_UNLOCK_RES] = { + .pc_func = nlm4svc_proc_null, + .pc_decode = nlm4_svc_decode_nlm4_res, + .pc_encode = nlm4_svc_encode_void, + .pc_argsize = sizeof(struct nlm4_res), + .pc_argzero = 0, + .pc_ressize = 0, + .pc_xdrressize = XDR_void, + .pc_name = "UNLOCK_RES", }, - [NLMPROC_GRANTED_RES] = { - .pc_func = nlm4svc_proc_granted_res, - .pc_decode = nlm4svc_decode_res, - .pc_encode = nlm4svc_encode_void, - .pc_argsize = sizeof(struct nlm_res), - .pc_argzero = sizeof(struct nlm_res), - .pc_ressize = sizeof(struct nlm_void), - .pc_xdrressize = St, - .pc_name = "GRANTED_RES", + [NLMPROC4_GRANTED_RES] = { + .pc_func = nlm4svc_proc_granted_res, + .pc_decode = nlm4_svc_decode_nlm4_res, + .pc_encode = nlm4_svc_encode_void, + .pc_argsize = sizeof(struct nlm4_res_wrapper), + .pc_argzero = 0, + .pc_ressize = 0, + .pc_xdrressize = XDR_void, + .pc_name = "GRANTED_RES", }, - [NLMPROC_NSM_NOTIFY] = { - .pc_func = nlm4svc_proc_sm_notify, - .pc_decode = nlm4svc_decode_reboot, - .pc_encode = nlm4svc_encode_void, - .pc_argsize = sizeof(struct nlm_reboot), - .pc_argzero = sizeof(struct nlm_reboot), - .pc_ressize = sizeof(struct nlm_void), - .pc_xdrressize = St, - .pc_name = "SM_NOTIFY", + [NLMPROC4_SM_NOTIFY] = { + .pc_func = nlm4svc_proc_sm_notify, + .pc_decode = nlm4_svc_decode_nlm4_notifyargs, + .pc_encode = nlm4_svc_encode_void, + .pc_argsize = sizeof(struct nlm4_notifyargs_wrapper), + .pc_argzero = 0, + .pc_ressize = 0, + .pc_xdrressize = XDR_void, + .pc_name = "SM_NOTIFY", }, [17] = { - .pc_func = 
nlm4svc_proc_unused, - .pc_decode = nlm4svc_decode_void, - .pc_encode = nlm4svc_encode_void, - .pc_argsize = sizeof(struct nlm_void), - .pc_argzero = sizeof(struct nlm_void), - .pc_ressize = sizeof(struct nlm_void), - .pc_xdrressize = 0, - .pc_name = "UNUSED", + .pc_func = nlm4svc_proc_unused, + .pc_decode = nlm4_svc_decode_void, + .pc_encode = nlm4_svc_encode_void, + .pc_argsize = 0, + .pc_argzero = 0, + .pc_ressize = 0, + .pc_xdrressize = XDR_void, + .pc_name = "UNUSED", }, [18] = { - .pc_func = nlm4svc_proc_unused, - .pc_decode = nlm4svc_decode_void, - .pc_encode = nlm4svc_encode_void, - .pc_argsize = sizeof(struct nlm_void), - .pc_argzero = sizeof(struct nlm_void), - .pc_ressize = sizeof(struct nlm_void), - .pc_xdrressize = 0, - .pc_name = "UNUSED", + .pc_func = nlm4svc_proc_unused, + .pc_decode = nlm4_svc_decode_void, + .pc_encode = nlm4_svc_encode_void, + .pc_argsize = 0, + .pc_argzero = 0, + .pc_ressize = 0, + .pc_xdrressize = XDR_void, + .pc_name = "UNUSED", }, [19] = { - .pc_func = nlm4svc_proc_unused, - .pc_decode = nlm4svc_decode_void, - .pc_encode = nlm4svc_encode_void, - .pc_argsize = sizeof(struct nlm_void), - .pc_argzero = sizeof(struct nlm_void), - .pc_ressize = sizeof(struct nlm_void), - .pc_xdrressize = 0, - .pc_name = "UNUSED", + .pc_func = nlm4svc_proc_unused, + .pc_decode = nlm4_svc_decode_void, + .pc_encode = nlm4_svc_encode_void, + .pc_argsize = 0, + .pc_argzero = 0, + .pc_ressize = 0, + .pc_xdrressize = XDR_void, + .pc_name = "UNUSED", }, - [NLMPROC_SHARE] = { - .pc_func = nlm4svc_proc_share, - .pc_decode = nlm4svc_decode_shareargs, - .pc_encode = nlm4svc_encode_shareres, - .pc_argsize = sizeof(struct nlm_args), - .pc_argzero = sizeof(struct nlm_args), - .pc_ressize = sizeof(struct nlm_res), - .pc_xdrressize = Ck+St+1, - .pc_name = "SHARE", + [NLMPROC4_SHARE] = { + .pc_func = nlm4svc_proc_share, + .pc_decode = nlm4_svc_decode_nlm4_shareargs, + .pc_encode = nlm4_svc_encode_nlm4_shareres, + .pc_argsize = sizeof(struct nlm4_shareargs_wrapper), + .pc_argzero = 0, + .pc_ressize = sizeof(struct nlm4_shareres_wrapper), + .pc_xdrressize = NLM4_nlm4_shareres_sz, + .pc_name = "SHARE", }, - [NLMPROC_UNSHARE] = { - .pc_func = nlm4svc_proc_unshare, - .pc_decode = nlm4svc_decode_shareargs, - .pc_encode = nlm4svc_encode_shareres, - .pc_argsize = sizeof(struct nlm_args), - .pc_argzero = sizeof(struct nlm_args), - .pc_ressize = sizeof(struct nlm_res), - .pc_xdrressize = Ck+St+1, - .pc_name = "UNSHARE", + [NLMPROC4_UNSHARE] = { + .pc_func = nlm4svc_proc_unshare, + .pc_decode = nlm4_svc_decode_nlm4_shareargs, + .pc_encode = nlm4_svc_encode_nlm4_shareres, + .pc_argsize = sizeof(struct nlm4_shareargs_wrapper), + .pc_argzero = 0, + .pc_ressize = sizeof(struct nlm4_shareres_wrapper), + .pc_xdrressize = NLM4_nlm4_shareres_sz, + .pc_name = "UNSHARE", }, - [NLMPROC_NM_LOCK] = { - .pc_func = nlm4svc_proc_nm_lock, - .pc_decode = nlm4svc_decode_lockargs, - .pc_encode = nlm4svc_encode_res, - .pc_argsize = sizeof(struct nlm_args), - .pc_argzero = sizeof(struct nlm_args), - .pc_ressize = sizeof(struct nlm_res), - .pc_xdrressize = Ck+St, - .pc_name = "NM_LOCK", + [NLMPROC4_NM_LOCK] = { + .pc_func = nlm4svc_proc_nm_lock, + .pc_decode = nlm4_svc_decode_nlm4_lockargs, + .pc_encode = nlm4_svc_encode_nlm4_res, + .pc_argsize = sizeof(struct nlm4_lockargs_wrapper), + .pc_argzero = 0, + .pc_ressize = sizeof(struct nlm4_res_wrapper), + .pc_xdrressize = NLM4_nlm4_res_sz, + .pc_name = "NM_LOCK", }, - [NLMPROC_FREE_ALL] = { - .pc_func = nlm4svc_proc_free_all, - .pc_decode = nlm4svc_decode_notify, - .pc_encode 
= nlm4svc_encode_void, - .pc_argsize = sizeof(struct nlm_args), - .pc_argzero = sizeof(struct nlm_args), - .pc_ressize = sizeof(struct nlm_void), - .pc_xdrressize = St, - .pc_name = "FREE_ALL", + [NLMPROC4_FREE_ALL] = { + .pc_func = nlm4svc_proc_free_all, + .pc_decode = nlm4_svc_decode_nlm4_notify, + .pc_encode = nlm4_svc_encode_void, + .pc_argsize = sizeof(struct nlm4_notify_wrapper), + .pc_argzero = 0, + .pc_ressize = 0, + .pc_xdrressize = XDR_void, + .pc_name = "FREE_ALL", }, }; + +/* + * Storage requirements for XDR arguments and results + */ +union nlm4svc_xdrstore { + struct nlm4_testargs_wrapper testargs; + struct nlm4_lockargs_wrapper lockargs; + struct nlm4_cancargs_wrapper cancargs; + struct nlm4_unlockargs_wrapper unlockargs; + struct nlm4_notifyargs_wrapper notifyargs; + struct nlm4_shareargs_wrapper shareargs; + struct nlm4_notify_wrapper notify; + struct nlm4_testres_wrapper testres; + struct nlm4_res_wrapper res; + struct nlm4_shareres_wrapper shareres; +}; + +static DEFINE_PER_CPU_ALIGNED(unsigned long, + nlm4svc_call_counters[ARRAY_SIZE(nlm4svc_procedures)]); + +const struct svc_version nlmsvc_version4 = { + .vs_vers = 4, + .vs_nproc = ARRAY_SIZE(nlm4svc_procedures), + .vs_proc = nlm4svc_procedures, + .vs_count = nlm4svc_call_counters, + .vs_dispatch = nlmsvc_dispatch, + .vs_xdrsize = sizeof(union nlm4svc_xdrstore), +}; diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 0b6be8b8aeb1..b98b1d0ada35 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -28,16 +28,10 @@ #include <linux/sched.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/svc_xprt.h> -#include <linux/lockd/nlm.h> -#include <linux/lockd/lockd.h> -#define NLMDBG_FACILITY NLMDBG_SVCLOCK +#include "lockd.h" -#ifdef CONFIG_LOCKD_V4 -#define nlm_deadlock nlm4_deadlock -#else -#define nlm_deadlock nlm_lck_denied -#endif +#define NLMDBG_FACILITY NLMDBG_SVCLOCK static void nlmsvc_release_block(struct nlm_block *block); static void nlmsvc_insert_block(struct nlm_block *block, unsigned long); @@ -80,6 +74,11 @@ static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie) return buf; } +#else +static inline const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie) +{ + return "???"; +} #endif /* @@ -463,7 +462,7 @@ nlmsvc_defer_lock_rqst(struct svc_rqst *rqstp, struct nlm_block *block) block->b_deferred_req = rqstp->rq_chandle.defer(block->b_cache_req); if (block->b_deferred_req != NULL) - status = nlm_drop_reply; + status = nlm__int__drop_reply; } dprintk("lockd: nlmsvc_defer_lock_rqst block %p flags %d status %d\n", block, block->b_flags, ntohl(status)); @@ -531,7 +530,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, ret = nlm_lck_denied; goto out; } - ret = nlm_drop_reply; + ret = nlm__int__drop_reply; goto out; } @@ -589,7 +588,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, goto out; case -EDEADLK: nlmsvc_remove_block(block); - ret = nlm_deadlock; + ret = nlm__int__deadlock; goto out; default: /* includes ENOLCK */ nlmsvc_remove_block(block); diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 5817ef272332..749abf8886ba 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -10,39 +10,52 @@ #include <linux/types.h> #include <linux/time.h> -#include <linux/lockd/lockd.h> -#include <linux/lockd/share.h> #include <linux/sunrpc/svc_xprt.h> +#include "lockd.h" +#include "share.h" + #define NLMDBG_FACILITY NLMDBG_CLIENT #ifdef CONFIG_LOCKD_V4 -static __be32 -cast_to_nlm(__be32 status, u32 vers) +static inline __be32 cast_status(__be32 status) { - 
/* Note: status is assumed to be in network byte order !!! */ - if (vers != 4){ - switch (status) { - case nlm_granted: - case nlm_lck_denied: - case nlm_lck_denied_nolocks: - case nlm_lck_blocked: - case nlm_lck_denied_grace_period: - case nlm_drop_reply: - break; - case nlm4_deadlock: - status = nlm_lck_denied; - break; - default: - status = nlm_lck_denied_nolocks; - } + switch (status) { + case nlm_granted: + case nlm_lck_denied: + case nlm_lck_denied_nolocks: + case nlm_lck_blocked: + case nlm_lck_denied_grace_period: + case nlm__int__drop_reply: + break; + case nlm__int__deadlock: + status = nlm_lck_denied; + break; + default: + status = nlm_lck_denied_nolocks; } - return (status); + return status; } -#define cast_status(status) (cast_to_nlm(status, rqstp->rq_vers)) #else -#define cast_status(status) (status) +static inline __be32 cast_status(__be32 status) +{ + switch (status) { + case nlm__int__deadlock: + status = nlm_lck_denied; + break; + case nlm__int__stale_fh: + case nlm__int__failed: + status = nlm_lck_denied_nolocks; + break; + default: + if (be32_to_cpu(status) >= 30000) + pr_warn_once("lockd: unhandled internal status %u\n", + be32_to_cpu(status)); + break; + } + return status; +} #endif /* @@ -124,12 +137,13 @@ __nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp) /* Obtain client and file */ if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) - return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + return resp->status == nlm__int__drop_reply ? + rpc_drop_reply : rpc_success; /* Now check for conflicting locks */ resp->status = cast_status(nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock)); - if (resp->status == nlm_drop_reply) + if (resp->status == nlm__int__drop_reply) rc = rpc_drop_reply; else dprintk("lockd: TEST status %d vers %d\n", @@ -161,13 +175,14 @@ __nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_res *resp) /* Obtain client and file */ if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) - return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + return resp->status == nlm__int__drop_reply ? + rpc_drop_reply : rpc_success; /* Now try to lock the file */ resp->status = cast_status(nlmsvc_lock(rqstp, file, host, &argp->lock, argp->block, &argp->cookie, argp->reclaim)); - if (resp->status == nlm_drop_reply) + if (resp->status == nlm__int__drop_reply) rc = rpc_drop_reply; else dprintk("lockd: LOCK status %d\n", ntohl(resp->status)); @@ -204,7 +219,8 @@ __nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_res *resp) /* Obtain client and file */ if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) - return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + return resp->status == nlm__int__drop_reply ? + rpc_drop_reply : rpc_success; /* Try to cancel request. */ resp->status = cast_status(nlmsvc_cancel_blocked(net, file, &argp->lock)); @@ -245,7 +261,8 @@ __nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_res *resp) /* Obtain client and file */ if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) - return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + return resp->status == nlm__int__drop_reply ? 
+ rpc_drop_reply : rpc_success; /* Now try to remove the lock */ resp->status = cast_status(nlmsvc_unlock(net, file, &argp->lock)); @@ -402,10 +419,13 @@ nlmsvc_proc_share(struct svc_rqst *rqstp) /* Obtain client and file */ if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) - return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + return resp->status == nlm__int__drop_reply ? + rpc_drop_reply : rpc_success; /* Now try to create the share */ - resp->status = cast_status(nlmsvc_share_file(host, file, argp)); + resp->status = cast_status(nlmsvc_share_file(host, file, &argp->lock.oh, + argp->fsm_access, + argp->fsm_mode)); dprintk("lockd: SHARE status %d\n", ntohl(resp->status)); nlmsvc_release_lockowner(&argp->lock); @@ -437,10 +457,12 @@ nlmsvc_proc_unshare(struct svc_rqst *rqstp) /* Obtain client and file */ if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) - return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + return resp->status == nlm__int__drop_reply ? + rpc_drop_reply : rpc_success; /* Now try to unshare the file */ - resp->status = cast_status(nlmsvc_unshare_file(host, file, argp)); + resp->status = cast_status(nlmsvc_unshare_file(host, file, + &argp->lock.oh)); dprintk("lockd: UNSHARE status %d\n", ntohl(resp->status)); nlmsvc_release_lockowner(&argp->lock); @@ -536,7 +558,7 @@ struct nlm_void { int dummy; }; #define No (1+1024/4) /* Net Obj */ #define Rg 2 /* range - offset + size */ -const struct svc_procedure nlmsvc_procedures[24] = { +static const struct svc_procedure nlmsvc_procedures[24] = { [NLMPROC_NULL] = { .pc_func = nlmsvc_proc_null, .pc_decode = nlmsvc_decode_void, @@ -778,3 +800,39 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_name = "FREE_ALL", }, }; + +/* + * Storage requirements for XDR arguments and results + */ +union nlmsvc_xdrstore { + struct nlm_args args; + struct nlm_res res; + struct nlm_reboot reboot; +}; + +/* + * NLMv1 defines only procedures 1 - 15. Linux lockd also implements + * procedures 0 (NULL) and 16 (SM_NOTIFY). + */ +static DEFINE_PER_CPU_ALIGNED(unsigned long, nlm1svc_call_counters[17]); + +const struct svc_version nlmsvc_version1 = { + .vs_vers = 1, + .vs_nproc = 17, + .vs_proc = nlmsvc_procedures, + .vs_count = nlm1svc_call_counters, + .vs_dispatch = nlmsvc_dispatch, + .vs_xdrsize = sizeof(union nlmsvc_xdrstore), +}; + +static DEFINE_PER_CPU_ALIGNED(unsigned long, + nlm3svc_call_counters[ARRAY_SIZE(nlmsvc_procedures)]); + +const struct svc_version nlmsvc_version3 = { + .vs_vers = 3, + .vs_nproc = ARRAY_SIZE(nlmsvc_procedures), + .vs_proc = nlmsvc_procedures, + .vs_count = nlm3svc_call_counters, + .vs_dispatch = nlmsvc_dispatch, + .vs_xdrsize = sizeof(union nlmsvc_xdrstore), +}; diff --git a/fs/lockd/svcshare.c b/fs/lockd/svcshare.c index 88c81ce1148d..53f5655c128c 100644 --- a/fs/lockd/svcshare.c +++ b/fs/lockd/svcshare.c @@ -14,8 +14,9 @@ #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/svc.h> -#include <linux/lockd/lockd.h> -#include <linux/lockd/share.h> + +#include "lockd.h" +#include "share.h" static inline int nlm_cmp_owner(struct nlm_share *share, struct xdr_netobj *oh) @@ -24,12 +25,21 @@ nlm_cmp_owner(struct nlm_share *share, struct xdr_netobj *oh) && !memcmp(share->s_owner.data, oh->data, oh->len); } +/** + * nlmsvc_share_file - create a share + * @host: Network client peer + * @file: File to be shared + * @oh: Share owner handle + * @access: Requested access mode + * @mode: Requested file sharing mode + * + * Returns an NLM status code. 
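The conflict test inside nlmsvc_share_file() below is symmetric: a request is denied when its access bits intersect an existing share's deny mode, or its deny mode intersects an existing share's access. A compact restatement with illustrative bit values (the real fsm_access/fsm_mode encodings come from the NLM protocol, not from this sketch):

#include <stdbool.h>
#include <stdio.h>

/* Illustrative access/deny bits in the style of DOS sharing modes. */
#define SHARE_READ  0x1
#define SHARE_WRITE 0x2

struct share {
        unsigned int access;        /* what the holder may do */
        unsigned int mode;          /* what the holder denies to others */
};

/* Deny the request if either side's access collides with the other's deny mode. */
static bool share_conflicts(const struct share *existing,
                            unsigned int access, unsigned int mode)
{
        return (access & existing->mode) || (mode & existing->access);
}

int main(void)
{
        struct share held = { .access = SHARE_READ | SHARE_WRITE,
                              .mode = SHARE_WRITE };

        /* A write open that denies nothing: blocked, the holder denies writes. */
        printf("write, deny none: %s\n",
               share_conflicts(&held, SHARE_WRITE, 0) ? "denied" : "granted");
        /* A read-only open that denies nothing: allowed. */
        printf("read,  deny none: %s\n",
               share_conflicts(&held, SHARE_READ, 0) ? "denied" : "granted");
        return 0;
}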
+ */ __be32 nlmsvc_share_file(struct nlm_host *host, struct nlm_file *file, - struct nlm_args *argp) + struct xdr_netobj *oh, u32 access, u32 mode) { struct nlm_share *share; - struct xdr_netobj *oh = &argp->lock.oh; u8 *ohdata; if (nlmsvc_file_cannot_lock(file)) @@ -38,13 +48,11 @@ nlmsvc_share_file(struct nlm_host *host, struct nlm_file *file, for (share = file->f_shares; share; share = share->s_next) { if (share->s_host == host && nlm_cmp_owner(share, oh)) goto update; - if ((argp->fsm_access & share->s_mode) - || (argp->fsm_mode & share->s_access )) + if ((access & share->s_mode) || (mode & share->s_access)) return nlm_lck_denied; } - share = kmalloc(sizeof(*share) + oh->len, - GFP_KERNEL); + share = kmalloc(sizeof(*share) + oh->len, GFP_KERNEL); if (share == NULL) return nlm_lck_denied_nolocks; @@ -60,20 +68,24 @@ nlmsvc_share_file(struct nlm_host *host, struct nlm_file *file, file->f_shares = share; update: - share->s_access = argp->fsm_access; - share->s_mode = argp->fsm_mode; + share->s_access = access; + share->s_mode = mode; return nlm_granted; } -/* - * Delete a share. +/** + * nlmsvc_unshare_file - delete a share + * @host: Network client peer + * @file: File to be unshared + * @oh: Share owner handle + * + * Returns an NLM status code. */ __be32 nlmsvc_unshare_file(struct nlm_host *host, struct nlm_file *file, - struct nlm_args *argp) + struct xdr_netobj *oh) { struct nlm_share *share, **shpp; - struct xdr_netobj *oh = &argp->lock.oh; if (nlmsvc_file_cannot_lock(file)) return nlm_lck_denied_nolocks; diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 79f3dd2fd366..344e6c187cde 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -15,12 +15,13 @@ #include <linux/mutex.h> #include <linux/sunrpc/svc.h> #include <linux/sunrpc/addr.h> -#include <linux/lockd/lockd.h> -#include <linux/lockd/share.h> #include <linux/module.h> #include <linux/mount.h> #include <uapi/linux/nfs2.h> +#include "lockd.h" +#include "share.h" + #define NLMDBG_FACILITY NLMDBG_SVCSUBS @@ -87,14 +88,29 @@ static __be32 nlm_do_fopen(struct svc_rqst *rqstp, struct nlm_file *file, int mode) { struct file **fp = &file->f_file[mode]; - __be32 nfserr; + __be32 nlmerr = nlm_granted; + int error; if (*fp) - return 0; - nfserr = nlmsvc_ops->fopen(rqstp, &file->f_handle, fp, mode); - if (nfserr) - dprintk("lockd: open failed (error %d)\n", nfserr); - return nfserr; + return nlmerr; + + error = nlmsvc_ops->fopen(rqstp, &file->f_handle, fp, mode); + if (error) { + dprintk("lockd: open failed (errno %d)\n", error); + switch (error) { + case -EWOULDBLOCK: + nlmerr = nlm__int__drop_reply; + break; + case -ESTALE: + nlmerr = nlm__int__stale_fh; + break; + default: + nlmerr = nlm__int__failed; + break; + } + } + + return nlmerr; } /* diff --git a/fs/lockd/trace.h b/fs/lockd/trace.h index 7461b13b6e74..7214d7e96a42 100644 --- a/fs/lockd/trace.h +++ b/fs/lockd/trace.h @@ -8,7 +8,8 @@ #include <linux/tracepoint.h> #include <linux/crc32.h> #include <linux/nfs.h> -#include <linux/lockd/lockd.h> + +#include "lockd.h" #ifdef CONFIG_LOCKD_V4 #define NLM_STATUS_LIST \ diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c index adfcce2bf11b..dfca8b8dab73 100644 --- a/fs/lockd/xdr.c +++ b/fs/lockd/xdr.c @@ -15,13 +15,13 @@ #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/svc.h> #include <linux/sunrpc/stats.h> -#include <linux/lockd/lockd.h> #include <uapi/linux/nfs2.h> +#include "lockd.h" +#include "share.h" #include "svcxdr.h" - static inline loff_t s32_to_loff_t(__s32 offset) { @@ -275,7 +275,7 @@ nlmsvc_decode_shareargs(struct 
svc_rqst *rqstp, struct xdr_stream *xdr) memset(lock, 0, sizeof(*lock)); locks_init_lock(&lock->fl); - lock->svid = ~(u32)0; + lock->svid = LOCKD_SHARE_SVID; if (!svcxdr_decode_cookie(xdr, &argp->cookie)) return false; diff --git a/include/linux/lockd/xdr.h b/fs/lockd/xdr.h index 17d53165d9f2..3c60817c4349 100644 --- a/include/linux/lockd/xdr.h +++ b/fs/lockd/xdr.h @@ -1,14 +1,12 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * linux/include/linux/lockd/xdr.h - * * XDR types for the NLM protocol * * Copyright (C) 1996 Olaf Kirch <okir@monad.swb.de> */ -#ifndef LOCKD_XDR_H -#define LOCKD_XDR_H +#ifndef _LOCKD_XDR_H +#define _LOCKD_XDR_H #include <linux/fs.h> #include <linux/filelock.h> @@ -33,8 +31,6 @@ struct svc_rqst; #define nlm_lck_blocked cpu_to_be32(NLM_LCK_BLOCKED) #define nlm_lck_denied_grace_period cpu_to_be32(NLM_LCK_DENIED_GRACE_PERIOD) -#define nlm_drop_reply cpu_to_be32(30000) - /* Lock info passed via NLM */ struct nlm_lock { char * caller; @@ -92,11 +88,6 @@ struct nlm_reboot { struct nsm_private priv; }; -/* - * Contents of statd callback when monitored host rebooted - */ -#define NLMSVC_XDRSIZE sizeof(struct nlm_args) - bool nlmsvc_decode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr); bool nlmsvc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); bool nlmsvc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); @@ -112,4 +103,4 @@ bool nlmsvc_encode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr); bool nlmsvc_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr); bool nlmsvc_encode_shareres(struct svc_rqst *rqstp, struct xdr_stream *xdr); -#endif /* LOCKD_XDR_H */ +#endif /* _LOCKD_XDR_H */ diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c deleted file mode 100644 index e343c820301f..000000000000 --- a/fs/lockd/xdr4.c +++ /dev/null @@ -1,347 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * linux/fs/lockd/xdr4.c - * - * XDR support for lockd and the lock client. - * - * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> - * Copyright (C) 1999, Trond Myklebust <trond.myklebust@fys.uio.no> - */ - -#include <linux/types.h> -#include <linux/sched.h> -#include <linux/nfs.h> - -#include <linux/sunrpc/xdr.h> -#include <linux/sunrpc/clnt.h> -#include <linux/sunrpc/svc.h> -#include <linux/sunrpc/stats.h> -#include <linux/lockd/lockd.h> - -#include "svcxdr.h" - -static inline s64 -loff_t_to_s64(loff_t offset) -{ - s64 res; - if (offset > NLM4_OFFSET_MAX) - res = NLM4_OFFSET_MAX; - else if (offset < -NLM4_OFFSET_MAX) - res = -NLM4_OFFSET_MAX; - else - res = offset; - return res; -} - -void nlm4svc_set_file_lock_range(struct file_lock *fl, u64 off, u64 len) -{ - s64 end = off + len - 1; - - fl->fl_start = off; - if (len == 0 || end < 0) - fl->fl_end = OFFSET_MAX; - else - fl->fl_end = end; -} - -/* - * NLM file handles are defined by specification to be a variable-length - * XDR opaque no longer than 1024 bytes. However, this implementation - * limits their length to the size of an NFSv3 file handle. 
- */ -static bool -svcxdr_decode_fhandle(struct xdr_stream *xdr, struct nfs_fh *fh) -{ - __be32 *p; - u32 len; - - if (xdr_stream_decode_u32(xdr, &len) < 0) - return false; - if (len > NFS_MAXFHSIZE) - return false; - - p = xdr_inline_decode(xdr, len); - if (!p) - return false; - fh->size = len; - memcpy(fh->data, p, len); - memset(fh->data + len, 0, sizeof(fh->data) - len); - - return true; -} - -static bool -svcxdr_decode_lock(struct xdr_stream *xdr, struct nlm_lock *lock) -{ - struct file_lock *fl = &lock->fl; - - if (!svcxdr_decode_string(xdr, &lock->caller, &lock->len)) - return false; - if (!svcxdr_decode_fhandle(xdr, &lock->fh)) - return false; - if (!svcxdr_decode_owner(xdr, &lock->oh)) - return false; - if (xdr_stream_decode_u32(xdr, &lock->svid) < 0) - return false; - if (xdr_stream_decode_u64(xdr, &lock->lock_start) < 0) - return false; - if (xdr_stream_decode_u64(xdr, &lock->lock_len) < 0) - return false; - - locks_init_lock(fl); - fl->c.flc_type = F_RDLCK; - nlm4svc_set_file_lock_range(fl, lock->lock_start, lock->lock_len); - return true; -} - -static bool -svcxdr_encode_holder(struct xdr_stream *xdr, const struct nlm_lock *lock) -{ - const struct file_lock *fl = &lock->fl; - s64 start, len; - - /* exclusive */ - if (xdr_stream_encode_bool(xdr, fl->c.flc_type != F_RDLCK) < 0) - return false; - if (xdr_stream_encode_u32(xdr, lock->svid) < 0) - return false; - if (!svcxdr_encode_owner(xdr, &lock->oh)) - return false; - start = loff_t_to_s64(fl->fl_start); - if (fl->fl_end == OFFSET_MAX) - len = 0; - else - len = loff_t_to_s64(fl->fl_end - fl->fl_start + 1); - if (xdr_stream_encode_u64(xdr, start) < 0) - return false; - if (xdr_stream_encode_u64(xdr, len) < 0) - return false; - - return true; -} - -static bool -svcxdr_encode_testrply(struct xdr_stream *xdr, const struct nlm_res *resp) -{ - if (!svcxdr_encode_stats(xdr, resp->status)) - return false; - switch (resp->status) { - case nlm_lck_denied: - if (!svcxdr_encode_holder(xdr, &resp->lock)) - return false; - } - - return true; -} - - -/* - * Decode Call arguments - */ - -bool -nlm4svc_decode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr) -{ - return true; -} - -bool -nlm4svc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) -{ - struct nlm_args *argp = rqstp->rq_argp; - u32 exclusive; - - if (!svcxdr_decode_cookie(xdr, &argp->cookie)) - return false; - if (xdr_stream_decode_bool(xdr, &exclusive) < 0) - return false; - if (!svcxdr_decode_lock(xdr, &argp->lock)) - return false; - if (exclusive) - argp->lock.fl.c.flc_type = F_WRLCK; - - return true; -} - -bool -nlm4svc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) -{ - struct nlm_args *argp = rqstp->rq_argp; - u32 exclusive; - - if (!svcxdr_decode_cookie(xdr, &argp->cookie)) - return false; - if (xdr_stream_decode_bool(xdr, &argp->block) < 0) - return false; - if (xdr_stream_decode_bool(xdr, &exclusive) < 0) - return false; - if (!svcxdr_decode_lock(xdr, &argp->lock)) - return false; - if (exclusive) - argp->lock.fl.c.flc_type = F_WRLCK; - if (xdr_stream_decode_bool(xdr, &argp->reclaim) < 0) - return false; - if (xdr_stream_decode_u32(xdr, &argp->state) < 0) - return false; - argp->monitor = 1; /* monitor client by default */ - - return true; -} - -bool -nlm4svc_decode_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) -{ - struct nlm_args *argp = rqstp->rq_argp; - u32 exclusive; - - if (!svcxdr_decode_cookie(xdr, &argp->cookie)) - return false; - if (xdr_stream_decode_bool(xdr, &argp->block) < 0) - return false; - if 
(xdr_stream_decode_bool(xdr, &exclusive) < 0) - return false; - if (!svcxdr_decode_lock(xdr, &argp->lock)) - return false; - if (exclusive) - argp->lock.fl.c.flc_type = F_WRLCK; - - return true; -} - -bool -nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) -{ - struct nlm_args *argp = rqstp->rq_argp; - - if (!svcxdr_decode_cookie(xdr, &argp->cookie)) - return false; - if (!svcxdr_decode_lock(xdr, &argp->lock)) - return false; - argp->lock.fl.c.flc_type = F_UNLCK; - - return true; -} - -bool -nlm4svc_decode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr) -{ - struct nlm_res *resp = rqstp->rq_argp; - - if (!svcxdr_decode_cookie(xdr, &resp->cookie)) - return false; - if (!svcxdr_decode_stats(xdr, &resp->status)) - return false; - - return true; -} - -bool -nlm4svc_decode_reboot(struct svc_rqst *rqstp, struct xdr_stream *xdr) -{ - struct nlm_reboot *argp = rqstp->rq_argp; - __be32 *p; - u32 len; - - if (xdr_stream_decode_u32(xdr, &len) < 0) - return false; - if (len > SM_MAXSTRLEN) - return false; - p = xdr_inline_decode(xdr, len); - if (!p) - return false; - argp->len = len; - argp->mon = (char *)p; - if (xdr_stream_decode_u32(xdr, &argp->state) < 0) - return false; - p = xdr_inline_decode(xdr, SM_PRIV_SIZE); - if (!p) - return false; - memcpy(&argp->priv.data, p, sizeof(argp->priv.data)); - - return true; -} - -bool -nlm4svc_decode_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) -{ - struct nlm_args *argp = rqstp->rq_argp; - struct nlm_lock *lock = &argp->lock; - - locks_init_lock(&lock->fl); - lock->svid = ~(u32)0; - - if (!svcxdr_decode_cookie(xdr, &argp->cookie)) - return false; - if (!svcxdr_decode_string(xdr, &lock->caller, &lock->len)) - return false; - if (!svcxdr_decode_fhandle(xdr, &lock->fh)) - return false; - if (!svcxdr_decode_owner(xdr, &lock->oh)) - return false; - /* XXX: Range checks are missing in the original code */ - if (xdr_stream_decode_u32(xdr, &argp->fsm_mode) < 0) - return false; - if (xdr_stream_decode_u32(xdr, &argp->fsm_access) < 0) - return false; - - return true; -} - -bool -nlm4svc_decode_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr) -{ - struct nlm_args *argp = rqstp->rq_argp; - struct nlm_lock *lock = &argp->lock; - - if (!svcxdr_decode_string(xdr, &lock->caller, &lock->len)) - return false; - if (xdr_stream_decode_u32(xdr, &argp->state) < 0) - return false; - - return true; -} - - -/* - * Encode Reply results - */ - -bool -nlm4svc_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr) -{ - return true; -} - -bool -nlm4svc_encode_testres(struct svc_rqst *rqstp, struct xdr_stream *xdr) -{ - struct nlm_res *resp = rqstp->rq_resp; - - return svcxdr_encode_cookie(xdr, &resp->cookie) && - svcxdr_encode_testrply(xdr, resp); -} - -bool -nlm4svc_encode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr) -{ - struct nlm_res *resp = rqstp->rq_resp; - - return svcxdr_encode_cookie(xdr, &resp->cookie) && - svcxdr_encode_stats(xdr, resp->status); -} - -bool -nlm4svc_encode_shareres(struct svc_rqst *rqstp, struct xdr_stream *xdr) -{ - struct nlm_res *resp = rqstp->rq_resp; - - if (!svcxdr_encode_cookie(xdr, &resp->cookie)) - return false; - if (!svcxdr_encode_stats(xdr, resp->status)) - return false; - /* sequence */ - if (xdr_stream_encode_u32(xdr, 0) < 0) - return false; - - return true; -} diff --git a/fs/locks.c b/fs/locks.c index d8b066fb4210..fead53474c30 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1534,6 +1534,7 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose) { struct 
file_lock_context *ctx = inode->i_flctx; struct file_lease *fl, *tmp; + bool remove; lockdep_assert_held(&ctx->flc_lock); @@ -1541,8 +1542,19 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose) trace_time_out_leases(inode, fl); if (past_time(fl->fl_downgrade_time)) lease_modify(fl, F_RDLCK, dispose); - if (past_time(fl->fl_break_time)) - lease_modify(fl, F_UNLCK, dispose); + + remove = true; + if (past_time(fl->fl_break_time)) { + /* + * Consult the lease manager when a lease break times + * out to determine whether the lease should be disposed + * of. + */ + if (fl->fl_lmops && fl->fl_lmops->lm_breaker_timedout) + remove = fl->fl_lmops->lm_breaker_timedout(fl); + if (remove) + lease_modify(fl, F_UNLCK, dispose); + } } } @@ -1670,9 +1682,13 @@ int __break_lease(struct inode *inode, unsigned int flags) restart: fl = list_first_entry(&ctx->flc_lease, struct file_lease, c.flc_list); break_time = fl->fl_break_time; - if (break_time != 0) - break_time -= jiffies; - if (break_time == 0) + if (break_time != 0) { + if (time_after(jiffies, break_time)) { + fl->fl_break_time = jiffies + lease_break_time * HZ; + break_time = lease_break_time * HZ; + } else + break_time -= jiffies; + } else break_time++; locks_insert_block(&fl->c, &new_fl->c, leases_conflict); trace_break_lease_block(inode, new_fl); diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 11f9f69cde61..d54a141a89b3 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -380,14 +380,13 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync) sector_t isect, extent_length = 0; struct parallel_io *par = NULL; loff_t offset = header->args.offset; - size_t count = header->args.count; struct page **pages = header->args.pages; int pg_index = header->args.pgbase >> PAGE_SHIFT; unsigned int pg_len; struct blk_plug plug; int i; - dprintk("%s enter, %zu@%lld\n", __func__, count, offset); + dprintk("%s enter, %u@%lld\n", __func__, header->args.count, offset); /* At this point, header->page_aray is a (sequential) list of nfs_pages. 
* We want to write each, and if there is an error set pnfs_error @@ -428,7 +427,6 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync) } offset += pg_len; - count -= pg_len; isect += (pg_len >> SECTOR_SHIFT); extent_length -= (pg_len >> SECTOR_SHIFT); } diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index be2aebf62056..95d7cd564b74 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -16,6 +16,7 @@ #include <linux/nfs3.h> #include <linux/nfs_fs.h> #include <linux/nfs_page.h> +#include <linux/filelock.h> #include <linux/lockd/bind.h> #include <linux/nfs_mount.h> #include <linux/freezer.h> diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 8c3d2efa2636..70795684b8e8 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -41,6 +41,7 @@ #include <linux/nfs2.h> #include <linux/nfs_fs.h> #include <linux/nfs_page.h> +#include <linux/filelock.h> #include <linux/lockd/bind.h> #include <linux/freezer.h> #include "internal.h" diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c index 1da4f707f9ef..3a197252a132 100644 --- a/fs/nfs/sysfs.c +++ b/fs/nfs/sysfs.c @@ -13,7 +13,7 @@ #include <linux/nfs_fs.h> #include <net/net_namespace.h> #include <linux/rcupdate.h> -#include <linux/lockd/lockd.h> +#include <linux/lockd/bind.h> #include "internal.h" #include "nfs4_fs.h" @@ -288,7 +288,7 @@ shutdown_store(struct kobject *kobj, struct kobj_attribute *attr, shutdown_client(server->client_acl); if (server->nlm_host) - shutdown_client(server->nlm_host->h_rpcclnt); + nlmclnt_shutdown_rpc_clnt(server->nlm_host); out: shutdown_nfs_client(server->nfs_client); return count; diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 4fd6e818565e..ffb76761d6a8 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -7,6 +7,7 @@ config NFSD select CRC32 select CRYPTO_LIB_MD5 if NFSD_LEGACY_CLIENT_TRACKING select CRYPTO_LIB_SHA256 if NFSD_V4 + select CRYPTO # required by RPCSEC_GSS_KRB5 and signed filehandles select LOCKD select SUNRPC select EXPORTFS @@ -78,7 +79,6 @@ config NFSD_V4 depends on NFSD && PROC_FS select FS_POSIX_ACL select RPCSEC_GSS_KRB5 - select CRYPTO # required by RPCSEC_GSS_KRB5 select GRACE_PERIOD select NFS_V4_2_SSC_HELPER if NFS_V4_2 help @@ -177,16 +177,6 @@ config NFSD_LEGACY_CLIENT_TRACKING and will be removed in the future. Say Y here if you need support for them in the interim. -config NFSD_V4_DELEG_TIMESTAMPS - bool "Support delegated timestamps" - depends on NFSD_V4 - default n - help - NFSD implements delegated timestamps according to - draft-ietf-nfsv4-delstid-08 "Extending the Opening of Files". This - is currently an experimental feature and is therefore left disabled - by default. - config NFSD_V4_POSIX_ACLS bool "Support NFSv4 POSIX draft ACLs" depends on NFSD_V4 diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index a7cfba29990e..9d829c84f374 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -273,6 +273,52 @@ const struct nfsd4_layout_ops bl_layout_ops = { #endif /* CONFIG_NFSD_BLOCKLAYOUT */ #ifdef CONFIG_NFSD_SCSILAYOUT + +#define NFSD_MDS_PR_FENCED XA_MARK_0 + +/* + * Clear the fence flag if the device already has an entry. This occurs + * when a client re-registers after a previous fence, allowing new + * layouts for this device. + * + * Insert only on first registration. This bounds cl_dev_fences to the + * count of devices this client has accessed, preventing unbounded growth. 
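+ *
+ * Returns 0 on success or a negative errno if the xarray entry cannot
+ * be allocated.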
+ */ +static inline int nfsd4_scsi_fence_insert(struct nfs4_client *clp, + dev_t device) +{ + struct xarray *xa = &clp->cl_dev_fences; + int ret; + + xa_lock(xa); + ret = __xa_insert(xa, device, XA_ZERO_ENTRY, GFP_KERNEL); + if (ret == -EBUSY) { + __xa_clear_mark(xa, device, NFSD_MDS_PR_FENCED); + ret = 0; + } + xa_unlock(xa); + clp->cl_fence_retry_warn = false; + return ret; +} + +static inline bool nfsd4_scsi_fence_set(struct nfs4_client *clp, dev_t device) +{ + struct xarray *xa = &clp->cl_dev_fences; + bool skip; + + xa_lock(xa); + skip = xa_get_mark(xa, device, NFSD_MDS_PR_FENCED); + if (!skip) + __xa_set_mark(xa, device, NFSD_MDS_PR_FENCED); + xa_unlock(xa); + return skip; +} + +static inline void nfsd4_scsi_fence_clear(struct nfs4_client *clp, dev_t device) +{ + xa_clear_mark(&clp->cl_dev_fences, device, NFSD_MDS_PR_FENCED); +} + #define NFSD_MDS_PR_KEY 0x0100000000000000ULL /* @@ -342,6 +388,10 @@ nfsd4_block_get_device_info_scsi(struct super_block *sb, goto out_free_dev; } + ret = nfsd4_scsi_fence_insert(clp, sb->s_bdev->bd_dev); + if (ret < 0) + goto out_free_dev; + ret = ops->pr_register(sb->s_bdev, 0, NFSD_MDS_PR_KEY, true); if (ret) { pr_err("pNFS: failed to register key for device %s.\n", @@ -394,17 +444,67 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode, struct svc_rqst *rqstp, return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps); } -static void +/* + * Perform the fence operation to prevent the client from accessing the + * block device. If a fence operation is already in progress, wait for + * it to complete before checking the NFSD_MDS_PR_FENCED flag. Once the + * operation is complete, check the flag. If NFSD_MDS_PR_FENCED is set, + * update the layout stateid by setting the ls_fenced flag to indicate + * that the client has been fenced. + * + * The cl_fence_mutex ensures that the fence operation has been fully + * completed, rather than just in progress, when returning from this + * function. + * + * Return true if client was fenced otherwise return false. + */ +static bool nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file) { struct nfs4_client *clp = ls->ls_stid.sc_client; struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb->s_bdev; int status; + bool ret; + + mutex_lock(&clp->cl_fence_mutex); + if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) { + mutex_unlock(&clp->cl_fence_mutex); + return true; + } status = bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY, nfsd4_scsi_pr_key(clp), PR_EXCLUSIVE_ACCESS_REG_ONLY, true); + /* + * Reset to allow retry only when the command could not have + * reached the device. Negative status means a local error + * (e.g., -ENOMEM) prevented the command from being sent. + * PR_STS_PATH_FAILED, PR_STS_PATH_FAST_FAILED, and + * PR_STS_RETRY_PATH_FAILURE indicate transport path failures + * before device delivery. + * + * For all other errors, the command may have reached the device + * and the preempt may have succeeded. Avoid resetting, since + * retrying a successful preempt returns PR_STS_IOERR or + * PR_STS_RESERVATION_CONFLICT, which would cause an infinite + * retry loop. 
+ */ + switch (status) { + case 0: + case PR_STS_IOERR: + case PR_STS_RESERVATION_CONFLICT: + ret = true; + break; + default: + /* retry-able and other errors */ + ret = false; + nfsd4_scsi_fence_clear(clp, bdev->bd_dev); + break; + } + mutex_unlock(&clp->cl_fence_mutex); + trace_nfsd_pnfs_fence(clp, bdev->bd_disk->disk_name, status); + return ret; } const struct nfsd4_layout_ops scsi_layout_ops = { diff --git a/fs/nfsd/debugfs.c b/fs/nfsd/debugfs.c index 7f44689e0a53..386fd1c54f52 100644 --- a/fs/nfsd/debugfs.c +++ b/fs/nfsd/debugfs.c @@ -140,4 +140,8 @@ void nfsd_debugfs_init(void) debugfs_create_file("io_cache_write", 0644, nfsd_top_dir, NULL, &nfsd_io_cache_write_fops); +#ifdef CONFIG_NFSD_V4 + debugfs_create_bool("delegated_timestamps", 0644, nfsd_top_dir, + &nfsd_delegts_enabled); +#endif } diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 1aadfa8e0406..665153f1720e 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1362,13 +1362,14 @@ static struct flags { { NFSEXP_ASYNC, {"async", "sync"}}, { NFSEXP_GATHERED_WRITES, {"wdelay", "no_wdelay"}}, { NFSEXP_NOREADDIRPLUS, {"nordirplus", ""}}, + { NFSEXP_SECURITY_LABEL, {"security_label", ""}}, + { NFSEXP_SIGN_FH, {"sign_fh", ""}}, { NFSEXP_NOHIDE, {"nohide", ""}}, - { NFSEXP_CROSSMOUNT, {"crossmnt", ""}}, { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}}, { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}}, + { NFSEXP_CROSSMOUNT, {"crossmnt", ""}}, { NFSEXP_V4ROOT, {"v4root", ""}}, { NFSEXP_PNFS, {"pnfs", ""}}, - { NFSEXP_SECURITY_LABEL, {"security_label", ""}}, { 0, {"", ""}} }; diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c index c774ce9aa296..6fe1325815e0 100644 --- a/fs/nfsd/lockd.c +++ b/fs/nfsd/lockd.c @@ -14,19 +14,20 @@ #define NFSDDBG_FACILITY NFSDDBG_LOCKD -#ifdef CONFIG_LOCKD_V4 -#define nlm_stale_fh nlm4_stale_fh -#define nlm_failed nlm4_failed -#else -#define nlm_stale_fh nlm_lck_denied_nolocks -#define nlm_failed nlm_lck_denied_nolocks -#endif -/* - * Note: we hold the dentry use count while the file is open. +/** + * nlm_fopen - Open an NFSD file + * @rqstp: NLM RPC procedure execution context + * @f: NFS file handle to be opened + * @filp: OUT: an opened struct file + * @flags: the POSIX open flags to use + * + * nlm_fopen() holds the dentry reference until nlm_fclose() releases it. + * + * Returns zero on success or a negative errno value if the file + * cannot be opened. */ -static __be32 -nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp, - int mode) +static int nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, + struct file **filp, int flags) { __be32 nfserr; int access; @@ -47,18 +48,17 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp, * if NFSEXP_NOAUTHNLM is set. Some older clients use AUTH_NULL * for NLM requests. */ - access = (mode == O_WRONLY) ? NFSD_MAY_WRITE : NFSD_MAY_READ; + access = (flags == O_WRONLY) ? NFSD_MAY_WRITE : NFSD_MAY_READ; access |= NFSD_MAY_NLM | NFSD_MAY_OWNER_OVERRIDE | NFSD_MAY_BYPASS_GSS; nfserr = nfsd_open(rqstp, &fh, S_IFREG, access, filp); fh_put(&fh); - /* We return nlm error codes as nlm doesn't know - * about nfsd, but nfsd does know about nlm.. - */ + switch (nfserr) { case nfs_ok: - return 0; + break; case nfserr_jukebox: - /* this error can indicate a presence of a conflicting + /* + * This error can indicate a presence of a conflicting * delegation to an NLM lock request. Options are: * (1) For now, drop this request and make the client * retry. 
When delegation is returned, client's lock retry @@ -66,19 +66,25 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp, * (2) NLM4_DENIED as per "spec" signals to the client * that the lock is unavailable now but client can retry. * Linux client implementation does not. It treats - * NLM4_DENIED same as NLM4_FAILED and errors the request. + * NLM4_DENIED same as NLM4_FAILED and fails the request. * (3) For the future, treat this as blocked lock and try * to callback when the delegation is returned but might * not have a proper lock request to block on. */ - return nlm_drop_reply; + return -EWOULDBLOCK; case nfserr_stale: - return nlm_stale_fh; + return -ESTALE; default: - return nlm_failed; + return -ENOLCK; } + + return 0; } +/** + * nlm_fclose - Close an NFSD file + * @filp: a struct file that was opened by nlm_fopen() + */ static void nlm_fclose(struct file *filp) { diff --git a/fs/nfsd/netlink.c b/fs/nfsd/netlink.c index 887525964451..81c943345d13 100644 --- a/fs/nfsd/netlink.c +++ b/fs/nfsd/netlink.c @@ -24,12 +24,13 @@ const struct nla_policy nfsd_version_nl_policy[NFSD_A_VERSION_ENABLED + 1] = { }; /* NFSD_CMD_THREADS_SET - do */ -static const struct nla_policy nfsd_threads_set_nl_policy[NFSD_A_SERVER_MIN_THREADS + 1] = { +static const struct nla_policy nfsd_threads_set_nl_policy[NFSD_A_SERVER_FH_KEY + 1] = { [NFSD_A_SERVER_THREADS] = { .type = NLA_U32, }, [NFSD_A_SERVER_GRACETIME] = { .type = NLA_U32, }, [NFSD_A_SERVER_LEASETIME] = { .type = NLA_U32, }, [NFSD_A_SERVER_SCOPE] = { .type = NLA_NUL_STRING, }, [NFSD_A_SERVER_MIN_THREADS] = { .type = NLA_U32, }, + [NFSD_A_SERVER_FH_KEY] = NLA_POLICY_EXACT_LEN(16), }; /* NFSD_CMD_VERSION_SET - do */ @@ -58,7 +59,7 @@ static const struct genl_split_ops nfsd_nl_ops[] = { .cmd = NFSD_CMD_THREADS_SET, .doit = nfsd_nl_threads_set_doit, .policy = nfsd_threads_set_nl_policy, - .maxattr = NFSD_A_SERVER_MIN_THREADS, + .maxattr = NFSD_A_SERVER_FH_KEY, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, { diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 9fa600602658..27da1a3edacb 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -25,6 +25,7 @@ #define SESSION_HASH_SIZE 512 struct cld_net; +struct nfsd_net_cb; struct nfsd4_client_tracking_ops; enum { @@ -99,6 +100,9 @@ struct nfsd_net { */ struct list_head client_lru; struct list_head close_lru; + + /* protects del_recall_lru and delegation hash/unhash */ + spinlock_t deleg_lock ____cacheline_aligned; struct list_head del_recall_lru; /* protected by blocked_locks_lock */ @@ -224,6 +228,9 @@ struct nfsd_net { spinlock_t local_clients_lock; struct list_head local_clients; #endif + siphash_key_t *fh_key; + + struct nfsd_net_cb *nfsd_cb; }; /* Simple check to find out if a given net was properly initialized */ diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index ef4971d71ac4..2ff9a991a8fb 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -1069,7 +1069,7 @@ svcxdr_encode_entry3_common(struct nfsd3_readdirres *resp, const char *name, * * Return values: * %0: Entry was successfully encoded. - * %-EINVAL: An encoding problem occured, secondary status code in resp->common.err + * %-EINVAL: An encoding problem occurred, secondary status code in resp->common.err * * On exit, the following fields are updated: * - resp->xdr @@ -1144,7 +1144,7 @@ out_noattrs: * * Return values: * %0: Entry was successfully encoded. 
- * %-EINVAL: An encoding problem occured, secondary status code in resp->common.err + * %-EINVAL: An encoding problem occurred, secondary status code in resp->common.err * * On exit, the following fields are updated: * - resp->xdr diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index aea8bdd2fdc4..50827405468d 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -1016,7 +1016,7 @@ static int nfs4_xdr_dec_cb_offload(struct rpc_rqst *rqstp, .p_decode = nfs4_xdr_dec_##restype, \ .p_arglen = NFS4_enc_##argtype##_sz, \ .p_replen = NFS4_dec_##restype##_sz, \ - .p_statidx = NFSPROC4_CB_##call, \ + .p_statidx = NFSPROC4_CLNT_##proc, \ .p_name = #proc, \ } @@ -1032,39 +1032,14 @@ static const struct rpc_procinfo nfs4_cb_procedures[] = { PROC(CB_GETATTR, COMPOUND, cb_getattr, cb_getattr), }; -static unsigned int nfs4_cb_counts[ARRAY_SIZE(nfs4_cb_procedures)]; -static const struct rpc_version nfs_cb_version4 = { -/* - * Note on the callback rpc program version number: despite language in rfc - * 5661 section 18.36.3 requiring servers to use 4 in this field, the - * official xdr descriptions for both 4.0 and 4.1 specify version 1, and - * in practice that appears to be what implementations use. The section - * 18.36.3 language is expected to be fixed in an erratum. - */ - .number = 1, - .nrprocs = ARRAY_SIZE(nfs4_cb_procedures), - .procs = nfs4_cb_procedures, - .counts = nfs4_cb_counts, -}; +#define NFS4_CB_PROGRAM 0x40000000 +#define NFS4_CB_VERSION 1 -static const struct rpc_version *nfs_cb_version[2] = { - [1] = &nfs_cb_version4, -}; - -static const struct rpc_program cb_program; - -static struct rpc_stat cb_stats = { - .program = &cb_program -}; - -#define NFS4_CALLBACK 0x40000000 -static const struct rpc_program cb_program = { - .name = "nfs4_cb", - .number = NFS4_CALLBACK, - .nrvers = ARRAY_SIZE(nfs_cb_version), - .version = nfs_cb_version, - .stats = &cb_stats, - .pipe_dir_name = "nfsd4_cb", +struct nfsd_net_cb { + struct rpc_version version4; + const struct rpc_version *versions[NFS4_CB_VERSION + 1]; + struct rpc_program program; + struct rpc_stat stat; }; static int max_cb_time(struct net *net) @@ -1140,6 +1115,7 @@ static const struct cred *get_backchannel_cred(struct nfs4_client *clp, struct r static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses) { + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); int maxtime = max_cb_time(clp->net); struct rpc_timeout timeparms = { .to_initval = maxtime, @@ -1152,14 +1128,14 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c .addrsize = conn->cb_addrlen, .saddress = (struct sockaddr *) &conn->cb_saddr, .timeout = &timeparms, - .program = &cb_program, - .version = 1, + .version = NFS4_CB_VERSION, .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), .cred = current_cred(), }; struct rpc_clnt *client; const struct cred *cred; + args.program = &nn->nfsd_cb->program; if (clp->cl_minorversion == 0) { if (!clp->cl_cred.cr_principal && (clp->cl_cred.cr_flavor >= RPC_AUTH_GSS_KRB5)) { @@ -1786,3 +1762,70 @@ bool nfsd4_run_cb(struct nfsd4_callback *cb) nfsd41_cb_inflight_end(clp); return queued; } + +/** + * nfsd_net_cb_shutdown - release per-netns callback RPC program resources + * @nn: NFS server network namespace + * + * Frees resources allocated by nfsd_net_cb_init(). 
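+ * Safe to call when nfsd_net_cb_init() failed or was never run: in that
+ * case @nn->nfsd_cb is NULL and the call is a no-op.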
+ */ +void nfsd_net_cb_shutdown(struct nfsd_net *nn) +{ + struct nfsd_net_cb *cb = nn->nfsd_cb; + + if (cb) { + kfree(cb->version4.counts); + kfree(cb); + nn->nfsd_cb = NULL; + } +} + +/** + * nfsd_net_cb_init - initialize per-netns callback RPC program + * @nn: NFS server network namespace + * + * Sets up the callback RPC program, version table, procedure + * counts, and statistics structure for @nn. Caller must release + * these resources using nfsd_net_cb_shutdown(). + * + * Return: 0 on success, or -ENOMEM if allocation fails. + */ +int nfsd_net_cb_init(struct nfsd_net *nn) +{ + struct nfsd_net_cb *cb; + + cb = kzalloc(sizeof(*cb), GFP_KERNEL); + if (!cb) + return -ENOMEM; + + cb->version4.counts = kzalloc_objs(unsigned int, + ARRAY_SIZE(nfs4_cb_procedures), GFP_KERNEL); + if (!cb->version4.counts) { + kfree(cb); + return -ENOMEM; + } + /* + * Note on the callback rpc program version number: despite language + * in rfc 5661 section 18.36.3 requiring servers to use 4 in this + * field, the official xdr descriptions for both 4.0 and 4.1 specify + * version 1, and in practice that appears to be what implementations + * use. The section 18.36.3 language is expected to be fixed in an + * erratum. + */ + cb->version4.number = NFS4_CB_VERSION; + cb->version4.nrprocs = ARRAY_SIZE(nfs4_cb_procedures); + cb->version4.procs = nfs4_cb_procedures; + cb->versions[NFS4_CB_VERSION] = &cb->version4; + + cb->program.name = "nfs4_cb"; + cb->program.number = NFS4_CB_PROGRAM; + cb->program.nrvers = ARRAY_SIZE(cb->versions); + cb->program.version = &cb->versions[0]; + cb->program.pipe_dir_name = "nfsd4_cb"; + cb->program.stats = &cb->stat; + cb->stat.program = &cb->program; + + nn->nfsd_cb = cb; + + return 0; +} diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index ad7af8cfcf1f..69e41105efdd 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -27,6 +27,8 @@ static struct kmem_cache *nfs4_layout_stateid_cache; static const struct nfsd4_callback_ops nfsd4_cb_layout_ops; static const struct lease_manager_operations nfsd4_layouts_lm_ops; +static void nfsd4_layout_fence_worker(struct work_struct *work); + const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = { #ifdef CONFIG_NFSD_FLEXFILELAYOUT [LAYOUT_FLEX_FILES] = &ff_layout_ops, @@ -177,6 +179,13 @@ nfsd4_free_layout_stateid(struct nfs4_stid *stid) trace_nfsd_layoutstate_free(&ls->ls_stid.sc_stateid); + spin_lock(&ls->ls_lock); + if (delayed_work_pending(&ls->ls_fence_work)) { + spin_unlock(&ls->ls_lock); + cancel_delayed_work_sync(&ls->ls_fence_work); + } else + spin_unlock(&ls->ls_lock); + spin_lock(&clp->cl_lock); list_del_init(&ls->ls_perclnt); spin_unlock(&clp->cl_lock); @@ -271,6 +280,10 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, list_add(&ls->ls_perfile, &fp->fi_lo_states); spin_unlock(&fp->fi_lock); + ls->ls_fenced = false; + ls->ls_fence_delay = 0; + INIT_DELAYED_WORK(&ls->ls_fence_work, nfsd4_layout_fence_worker); + trace_nfsd_layoutstate_alloc(&ls->ls_stid.sc_stateid); return ls; } @@ -747,11 +760,9 @@ static bool nfsd4_layout_lm_break(struct file_lease *fl) { /* - * We don't want the locks code to timeout the lease for us; - * we'll remove it ourself if a layout isn't returned - * in time: + * Enforce break lease timeout to prevent NFSD + * thread from hanging in __break_lease. 
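+ * When that timeout expires, nfsd4_layout_lm_breaker_timedout() decides
+ * whether to start fencing the client or let the lease be removed.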
*/ - fl->fl_break_time = 0; nfsd4_recall_file_layout(fl->c.flc_owner); return false; } @@ -782,10 +793,143 @@ nfsd4_layout_lm_open_conflict(struct file *filp, int arg) return 0; } +static void +nfsd4_layout_fence_worker(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct nfs4_layout_stateid *ls = container_of(dwork, + struct nfs4_layout_stateid, ls_fence_work); + struct nfsd_file *nf; + struct block_device *bdev; + struct nfs4_client *clp; + struct nfsd_net *nn; + + /* + * The workqueue clears WORK_STRUCT_PENDING before invoking + * this callback. Re-arm immediately so that + * delayed_work_pending() returns true while the fence + * operation is in progress, preventing + * lm_breaker_timedout() from taking a duplicate reference. + */ + mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0); + + spin_lock(&ls->ls_lock); + if (list_empty(&ls->ls_layouts)) { + spin_unlock(&ls->ls_lock); +dispose: + cancel_delayed_work(&ls->ls_fence_work); + /* unlock the lease so that tasks waiting on it can proceed */ + nfsd4_close_layout(ls); + + ls->ls_fenced = true; + nfs4_put_stid(&ls->ls_stid); + return; + } + spin_unlock(&ls->ls_lock); + + rcu_read_lock(); + nf = nfsd_file_get(ls->ls_file); + rcu_read_unlock(); + if (!nf) + goto dispose; + + clp = ls->ls_stid.sc_client; + nn = net_generic(clp->net, nfsd_net_id); + bdev = nf->nf_file->f_path.mnt->mnt_sb->s_bdev; + if (nfsd4_layout_ops[ls->ls_layout_type]->fence_client(ls, nf)) { + /* fenced ok */ + nfsd_file_put(nf); + pr_warn("%s: FENCED client[%pISpc] clid[%d] to device[%s]\n", + __func__, (struct sockaddr *)&clp->cl_addr, + clp->cl_clientid.cl_id - nn->clientid_base, + bdev->bd_disk->disk_name); + goto dispose; + } + /* fence failed */ + nfsd_file_put(nf); + + if (!clp->cl_fence_retry_warn) { + pr_warn("%s: FENCE failed client[%pISpc] clid[%d] device[%s]\n", + __func__, (struct sockaddr *)&clp->cl_addr, + clp->cl_clientid.cl_id - nn->clientid_base, + bdev->bd_disk->disk_name); + clp->cl_fence_retry_warn = true; + } + /* + * The fence worker retries the fencing operation indefinitely to + * prevent data corruption. The admin needs to take the following + * actions to restore access to the file for other clients: + * + * . shutdown or power off the client being fenced. + * . manually expire the client to release all its state on the server; + * echo 'expire' > /proc/fs/nfsd/clients/clid/ctl'. + * + * Where: + * + * clid: is the unique client identifier displayed in + * the warning message above. + */ + if (!ls->ls_fence_delay) + ls->ls_fence_delay = HZ; + else + ls->ls_fence_delay = min(ls->ls_fence_delay << 1, + MAX_FENCE_DELAY); + mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, ls->ls_fence_delay); +} + +/** + * nfsd4_layout_lm_breaker_timedout - The layout recall has timed out. + * @fl: file to check + * + * If the layout type supports a fence operation, schedule a worker to + * fence the client from accessing the block device. + * + * This function runs under the protection of the spin_lock flc_lock. + * At this time, the file_lease associated with the layout stateid is + * on the flc_list. A reference count is incremented on the layout + * stateid to prevent it from being freed while the fence worker is + * executing. Once the fence worker finishes its operation, it releases + * this reference. + * + * The fence worker continues to run until either the client has been + * fenced or the layout becomes invalid. 
The layout can become invalid + * as a result of a LAYOUTRETURN or when the CB_LAYOUT recall callback + * has completed. + * + * Return true if the file_lease should be disposed of by the caller; + * otherwise, return false. + */ +static bool +nfsd4_layout_lm_breaker_timedout(struct file_lease *fl) +{ + struct nfs4_layout_stateid *ls = fl->c.flc_owner; + + if ((!nfsd4_layout_ops[ls->ls_layout_type]->fence_client) || + ls->ls_fenced) + return true; + if (delayed_work_pending(&ls->ls_fence_work)) + return false; + /* + * Make sure layout has not been returned yet before + * taking a reference count on the layout stateid. + */ + spin_lock(&ls->ls_lock); + if (list_empty(&ls->ls_layouts) || + !refcount_inc_not_zero(&ls->ls_stid.sc_count)) { + spin_unlock(&ls->ls_lock); + return true; + } + spin_unlock(&ls->ls_lock); + + mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0); + return false; +} + static const struct lease_manager_operations nfsd4_layouts_lm_ops = { .lm_break = nfsd4_layout_lm_break, .lm_change = nfsd4_layout_lm_change, .lm_open_conflict = nfsd4_layout_lm_open_conflict, + .lm_breaker_timedout = nfsd4_layout_lm_breaker_timedout, }; int diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 6880c5c520e7..85e94c30285a 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -3043,6 +3043,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp) struct svc_fh *current_fh = &cstate->current_fh; struct svc_fh *save_fh = &cstate->save_fh; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + struct nfsd_thread_local_info *ntli = rqstp->rq_private; __be32 status; resp->xdr = &rqstp->rq_res_stream; @@ -3081,7 +3082,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp) } check_if_stalefh_allowed(args); - rqstp->rq_lease_breaker = (void **)&cstate->clp; + ntli->ntli_lease_breaker = &cstate->clp; trace_nfsd_compound(rqstp, args->tag, args->taglen, args->client_opcnt); while (!status && resp->opcnt < args->opcnt) { diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a569d89ac912..c2d13b26a687 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -76,6 +76,8 @@ static const stateid_t close_stateid = { static u64 current_sessionid = 1; +bool nfsd_delegts_enabled __read_mostly = true; + #define ZERO_STATEID(stateid) (!memcmp((stateid), &zero_stateid, sizeof(stateid_t))) #define ONE_STATEID(stateid) (!memcmp((stateid), &one_stateid, sizeof(stateid_t))) #define CURRENT_STATEID(stateid) (!memcmp((stateid), ¤tstateid, sizeof(stateid_t))) @@ -91,13 +93,6 @@ static void deleg_reaper(struct nfsd_net *nn); /* Locking: */ -/* - * Currently used for the del_recall_lru and file hash table. 
In an - * effort to decrease the scope of the client_mutex, this spinlock may - * eventually cover more: - */ -static DEFINE_SPINLOCK(state_lock); - enum nfsd4_st_mutex_lock_subclass { OPEN_STATEID_MUTEX = 0, LOCK_STATEID_MUTEX = 1, @@ -1293,8 +1288,9 @@ nfs4_delegation_exists(struct nfs4_client *clp, struct nfs4_file *fp) { struct nfs4_delegation *searchdp = NULL; struct nfs4_client *searchclp = NULL; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); - lockdep_assert_held(&state_lock); + lockdep_assert_held(&nn->deleg_lock); lockdep_assert_held(&fp->fi_lock); list_for_each_entry(searchdp, &fp->fi_delegations, dl_perfile) { @@ -1323,8 +1319,9 @@ static int hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp) { struct nfs4_client *clp = dp->dl_stid.sc_client; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); - lockdep_assert_held(&state_lock); + lockdep_assert_held(&nn->deleg_lock); lockdep_assert_held(&fp->fi_lock); lockdep_assert_held(&clp->cl_lock); @@ -1346,8 +1343,10 @@ static bool unhash_delegation_locked(struct nfs4_delegation *dp, unsigned short statusmask) { struct nfs4_file *fp = dp->dl_stid.sc_file; + struct nfsd_net *nn = net_generic(dp->dl_stid.sc_client->net, + nfsd_net_id); - lockdep_assert_held(&state_lock); + lockdep_assert_held(&nn->deleg_lock); if (!delegation_hashed(dp)) return false; @@ -1372,10 +1371,12 @@ unhash_delegation_locked(struct nfs4_delegation *dp, unsigned short statusmask) static void destroy_delegation(struct nfs4_delegation *dp) { bool unhashed; + struct nfsd_net *nn = net_generic(dp->dl_stid.sc_client->net, + nfsd_net_id); - spin_lock(&state_lock); + spin_lock(&nn->deleg_lock); unhashed = unhash_delegation_locked(dp, SC_STATUS_CLOSED); - spin_unlock(&state_lock); + spin_unlock(&nn->deleg_lock); if (unhashed) destroy_unhashed_deleg(dp); } @@ -1495,8 +1496,24 @@ release_all_access(struct nfs4_ol_stateid *stp) } } +/** + * nfs4_replay_free_cache - release dynamically allocated replay buffer + * @rp: replay cache to reset + * + * If @rp->rp_buf points to a kmalloc'd buffer, free it and reset + * rp_buf to the inline rp_ibuf. Always zeroes rp_buflen. 
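+ *
+ * Called when a stateowner is freed and from nfsd4_encode_operation()
+ * when a previously kmalloc'd replay buffer must be replaced.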
+ */ +void nfs4_replay_free_cache(struct nfs4_replay *rp) +{ + if (rp->rp_buf != rp->rp_ibuf) + kfree(rp->rp_buf); + rp->rp_buf = rp->rp_ibuf; + rp->rp_buflen = 0; +} + static inline void nfs4_free_stateowner(struct nfs4_stateowner *sop) { + nfs4_replay_free_cache(&sop->so_replay); kfree(sop->so_owner.data); sop->so_ops->so_free(sop); } @@ -1838,11 +1855,11 @@ void nfsd4_revoke_states(struct nfsd_net *nn, struct super_block *sb) case SC_TYPE_DELEG: refcount_inc(&stid->sc_count); dp = delegstateid(stid); - spin_lock(&state_lock); + spin_lock(&nn->deleg_lock); if (!unhash_delegation_locked( dp, SC_STATUS_ADMIN_REVOKED)) dp = NULL; - spin_unlock(&state_lock); + spin_unlock(&nn->deleg_lock); if (dp) revoke_delegation(dp); break; @@ -2382,6 +2399,10 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name, #ifdef CONFIG_NFSD_PNFS INIT_LIST_HEAD(&clp->cl_lo_states); #endif +#ifdef CONFIG_NFSD_SCSILAYOUT + xa_init(&clp->cl_dev_fences); + mutex_init(&clp->cl_fence_mutex); +#endif INIT_LIST_HEAD(&clp->async_copies); spin_lock_init(&clp->async_lock); spin_lock_init(&clp->cl_lock); @@ -2504,13 +2525,13 @@ __destroy_client(struct nfs4_client *clp) struct nfs4_delegation *dp; LIST_HEAD(reaplist); - spin_lock(&state_lock); + spin_lock(&nn->deleg_lock); while (!list_empty(&clp->cl_delegations)) { dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt); unhash_delegation_locked(dp, SC_STATUS_CLOSED); list_add(&dp->dl_recall_lru, &reaplist); } - spin_unlock(&state_lock); + spin_unlock(&nn->deleg_lock); while (!list_empty(&reaplist)) { dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru); list_del_init(&dp->dl_recall_lru); @@ -2543,6 +2564,9 @@ __destroy_client(struct nfs4_client *clp) svc_xprt_put(clp->cl_cb_conn.cb_xprt); atomic_add_unless(&nn->nfs4_client_count, -1, 0); nfsd4_dec_courtesy_client_count(nn, clp); +#ifdef CONFIG_NFSD_SCSILAYOUT + xa_destroy(&clp->cl_dev_fences); +#endif free_client(clp); wake_up_all(&expiry_wq); } @@ -5418,12 +5442,12 @@ static void nfsd4_cb_recall_prepare(struct nfsd4_callback *cb) * If the dl_time != 0, then we know that it has already been * queued for a lease break. Don't queue it again. */ - spin_lock(&state_lock); + spin_lock(&nn->deleg_lock); if (delegation_hashed(dp) && dp->dl_time == 0) { dp->dl_time = ktime_get_boottime_seconds(); list_add_tail(&dp->dl_recall_lru, &nn->del_recall_lru); } - spin_unlock(&state_lock); + spin_unlock(&nn->deleg_lock); } static int nfsd4_cb_recall_done(struct nfsd4_callback *cb, @@ -5535,13 +5559,15 @@ nfsd_break_deleg_cb(struct file_lease *fl) static bool nfsd_breaker_owns_lease(struct file_lease *fl) { struct nfs4_delegation *dl = fl->c.flc_owner; + struct nfsd_thread_local_info *ntli; struct svc_rqst *rqst; struct nfs4_client *clp; rqst = nfsd_current_rqst(); if (!nfsd_v4client(rqst)) return false; - clp = *(rqst->rq_lease_breaker); + ntli = rqst->rq_private; + clp = *ntli->ntli_lease_breaker; return dl->dl_stid.sc_client == clp; } @@ -6036,17 +6062,16 @@ nfsd4_verify_setuid_write(struct nfsd4_open *open, struct nfsd_file *nf) return 0; } -#ifdef CONFIG_NFSD_V4_DELEG_TIMESTAMPS +/* + * Timestamp delegation was introduced in RFC7862. Runtime switch for disabling + * this feature is /sys/kernel/debug/nfsd/delegated_timestamps. 
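+ * For example, "echo 0 > /sys/kernel/debug/nfsd/delegated_timestamps"
+ * turns the feature off at run time; the file is created by
+ * nfsd_debugfs_init().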
+ */ static bool nfsd4_want_deleg_timestamps(const struct nfsd4_open *open) { + if (!nfsd_delegts_enabled) + return false; return open->op_deleg_want & OPEN4_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS; } -#else /* CONFIG_NFSD_V4_DELEG_TIMESTAMPS */ -static bool nfsd4_want_deleg_timestamps(const struct nfsd4_open *open) -{ - return false; -} -#endif /* CONFIG NFSD_V4_DELEG_TIMESTAMPS */ static struct nfs4_delegation * nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, @@ -6054,6 +6079,7 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, { bool deleg_ts = nfsd4_want_deleg_timestamps(open); struct nfs4_client *clp = stp->st_stid.sc_client; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); struct nfs4_file *fp = stp->st_stid.sc_file; struct nfs4_clnt_odstate *odstate = stp->st_clnt_odstate; struct nfs4_delegation *dp; @@ -6113,7 +6139,7 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, return ERR_PTR(-EOPNOTSUPP); } - spin_lock(&state_lock); + spin_lock(&nn->deleg_lock); spin_lock(&fp->fi_lock); if (nfs4_delegation_exists(clp, fp)) status = -EAGAIN; @@ -6128,7 +6154,7 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, } else fp->fi_delegees++; spin_unlock(&fp->fi_lock); - spin_unlock(&state_lock); + spin_unlock(&nn->deleg_lock); if (nf) nfsd_file_put(nf); if (status) @@ -6172,13 +6198,13 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, if (fp->fi_had_conflict) goto out_unlock; - spin_lock(&state_lock); + spin_lock(&nn->deleg_lock); spin_lock(&clp->cl_lock); spin_lock(&fp->fi_lock); status = hash_delegation_locked(dp, fp); spin_unlock(&fp->fi_lock); spin_unlock(&clp->cl_lock); - spin_unlock(&state_lock); + spin_unlock(&nn->deleg_lock); if (status) goto out_unlock; @@ -6257,12 +6283,12 @@ nfsd4_add_rdaccess_to_wrdeleg(struct svc_rqst *rqstp, struct nfsd4_open *open, return (false); fp = stp->st_stid.sc_file; spin_lock(&fp->fi_lock); - __nfs4_file_get_access(fp, NFS4_SHARE_ACCESS_READ); if (!fp->fi_fds[O_RDONLY]) { + __nfs4_file_get_access(fp, NFS4_SHARE_ACCESS_READ); fp->fi_fds[O_RDONLY] = nf; + fp->fi_rdeleg_file = nfsd_file_get(fp->fi_fds[O_RDONLY]); nf = NULL; } - fp->fi_rdeleg_file = nfsd_file_get(fp->fi_fds[O_RDONLY]); spin_unlock(&fp->fi_lock); if (nf) nfsd_file_put(nf); @@ -6954,7 +6980,7 @@ nfs4_laundromat(struct nfsd_net *nn) nfs40_clean_admin_revoked(nn, <); - spin_lock(&state_lock); + spin_lock(&nn->deleg_lock); list_for_each_safe(pos, next, &nn->del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); if (!state_expired(<, dp->dl_time)) @@ -6963,7 +6989,7 @@ nfs4_laundromat(struct nfsd_net *nn) unhash_delegation_locked(dp, SC_STATUS_REVOKED); list_add(&dp->dl_recall_lru, &reaplist); } - spin_unlock(&state_lock); + spin_unlock(&nn->deleg_lock); while (!list_empty(&reaplist)) { dp = list_first_entry(&reaplist, struct nfs4_delegation, dl_recall_lru); @@ -8986,6 +9012,7 @@ static int nfs4_state_create_net(struct net *net) INIT_LIST_HEAD(&nn->client_lru); INIT_LIST_HEAD(&nn->close_lru); INIT_LIST_HEAD(&nn->del_recall_lru); + spin_lock_init(&nn->deleg_lock); spin_lock_init(&nn->client_lock); spin_lock_init(&nn->s2s_cp_lock); idr_init(&nn->s2s_cp_stateids); @@ -9117,13 +9144,13 @@ nfs4_state_shutdown_net(struct net *net) locks_end_grace(&nn->nfsd4_manager); INIT_LIST_HEAD(&reaplist); - spin_lock(&state_lock); + spin_lock(&nn->deleg_lock); list_for_each_safe(pos, next, &nn->del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, 
dl_recall_lru); unhash_delegation_locked(dp, SC_STATUS_CLOSED); list_add(&dp->dl_recall_lru, &reaplist); } - spin_unlock(&state_lock); + spin_unlock(&nn->deleg_lock); list_for_each_safe(pos, next, &reaplist) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); list_del_init(&dp->dl_recall_lru); @@ -9348,13 +9375,14 @@ __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_delegation **pdp) { - __be32 status; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + struct nfsd_thread_local_info *ntli = rqstp->rq_private; struct file_lock_context *ctx; struct nfs4_delegation *dp = NULL; struct file_lease *fl; struct nfs4_cb_fattr *ncf; struct inode *inode = d_inode(dentry); + __be32 status; ctx = locks_inode_context(inode); if (!ctx) @@ -9375,7 +9403,7 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct dentry *dentry, break; } if (dp == NULL || dp == NON_NFSD_LEASE || - dp->dl_recall.cb_clp == *(rqstp->rq_lease_breaker)) { + dp->dl_recall.cb_clp == *(ntli->ntli_lease_breaker)) { spin_unlock(&ctx->flc_lock); if (dp == NON_NFSD_LEASE) { status = nfserrno(nfsd_open_break_lease(inode, @@ -9445,6 +9473,7 @@ nfsd_get_dir_deleg(struct nfsd4_compound_state *cstate, struct nfsd_file *nf) { struct nfs4_client *clp = cstate->clp; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); struct nfs4_delegation *dp; struct file_lease *fl; struct nfs4_file *fp, *rfp; @@ -9468,7 +9497,7 @@ nfsd_get_dir_deleg(struct nfsd4_compound_state *cstate, } /* if this client already has one, return that it's unavailable */ - spin_lock(&state_lock); + spin_lock(&nn->deleg_lock); spin_lock(&fp->fi_lock); /* existing delegation? */ if (nfs4_delegation_exists(clp, fp)) { @@ -9480,7 +9509,7 @@ nfsd_get_dir_deleg(struct nfsd4_compound_state *cstate, ++fp->fi_delegees; } spin_unlock(&fp->fi_lock); - spin_unlock(&state_lock); + spin_unlock(&nn->deleg_lock); if (status) { put_nfs4_file(fp); @@ -9509,13 +9538,13 @@ nfsd_get_dir_deleg(struct nfsd4_compound_state *cstate, * trying to set a delegation on the same file. If that happens, * then just say UNAVAIL. */ - spin_lock(&state_lock); + spin_lock(&nn->deleg_lock); spin_lock(&clp->cl_lock); spin_lock(&fp->fi_lock); status = hash_delegation_locked(dp, fp); spin_unlock(&fp->fi_lock); spin_unlock(&clp->cl_lock); - spin_unlock(&state_lock); + spin_unlock(&nn->deleg_lock); if (!status) { put_nfs4_file(fp); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 9d234913100b..2a0946c630e1 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2598,6 +2598,7 @@ nfsd4_opnum_in_range(struct nfsd4_compoundargs *argp, struct nfsd4_op *op) static bool nfsd4_decode_compound(struct nfsd4_compoundargs *argp) { + struct nfsd_thread_local_info *ntli = argp->rqstp->rq_private; struct nfsd4_op *op; bool cachethis = false; int auth_slack= argp->rqstp->rq_auth_slack; @@ -2690,7 +2691,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) if (argp->minorversion) cachethis = false; svc_reserve_auth(argp->rqstp, max_reply + readbytes); - argp->rqstp->rq_cachetype = cachethis ? RC_REPLBUFF : RC_NOCACHE; + ntli->ntli_cachetype = cachethis ? 
RC_REPLBUFF : RC_NOCACHE; argp->splice_ok = nfsd_read_splice_ok(argp->rqstp); if (readcount > 1 || max_reply > PAGE_SIZE - auth_slack) @@ -6281,14 +6282,23 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) int len = xdr->buf->len - (op_status_offset + XDR_UNIT); so->so_replay.rp_status = op->status; - if (len <= NFSD4_REPLAY_ISIZE) { - so->so_replay.rp_buflen = len; - read_bytes_from_xdr_buf(xdr->buf, - op_status_offset + XDR_UNIT, - so->so_replay.rp_buf, len); - } else { - so->so_replay.rp_buflen = 0; + if (len > NFSD4_REPLAY_ISIZE) { + char *buf = kmalloc(len, GFP_KERNEL); + + nfs4_replay_free_cache(&so->so_replay); + if (buf) { + so->so_replay.rp_buf = buf; + } else { + /* rp_buflen already zeroed; skip caching */ + goto status; + } + } else if (so->so_replay.rp_buf != so->so_replay.rp_ibuf) { + nfs4_replay_free_cache(&so->so_replay); } + so->so_replay.rp_buflen = len; + read_bytes_from_xdr_buf(xdr->buf, + op_status_offset + XDR_UNIT, + so->so_replay.rp_buf, len); } status: op->status = nfsd4_map_status(op->status, diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index ab13ee9c7fd8..154468ceccdc 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -467,10 +467,11 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, unsigned int len, struct nfsd_cacherep **cacherep) { struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + struct nfsd_thread_local_info *ntli = rqstp->rq_private; struct nfsd_cacherep *rp, *found; __wsum csum; struct nfsd_drc_bucket *b; - int type = rqstp->rq_cachetype; + int type = ntli->ntli_cachetype; LIST_HEAD(dispose); int rtn = RC_DOIT; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 71aabdaa1d15..39e7012a60d8 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -11,7 +11,7 @@ #include <linux/fs_context.h> #include <linux/sunrpc/svcsock.h> -#include <linux/lockd/lockd.h> +#include <linux/lockd/bind.h> #include <linux/sunrpc/addr.h> #include <linux/sunrpc/gss_api.h> #include <linux/sunrpc/rpc_pipe_fs.h> @@ -1582,6 +1582,32 @@ out_unlock: } /** + * nfsd_nl_fh_key_set - helper to copy fh_key from userspace + * @attr: nlattr NFSD_A_SERVER_FH_KEY + * @nn: nfsd_net + * + * Callers should hold nfsd_mutex, returns 0 on success or negative errno. + * Callers must ensure the server is shut down (sv_nrthreads == 0), + * userspace documentation asserts the key may only be set when the server + * is not running. 
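+ *
+ * The NFSD_A_SERVER_FH_KEY attribute carries exactly 16 octets of key
+ * material, which are copied into the per-net siphash_key_t.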
+ */ +static int nfsd_nl_fh_key_set(const struct nlattr *attr, struct nfsd_net *nn) +{ + siphash_key_t *fh_key = nn->fh_key; + + if (!fh_key) { + fh_key = kmalloc(sizeof(siphash_key_t), GFP_KERNEL); + if (!fh_key) + return -ENOMEM; + nn->fh_key = fh_key; + } + + fh_key->key[0] = get_unaligned_le64(nla_data(attr)); + fh_key->key[1] = get_unaligned_le64(nla_data(attr) + 8); + return 0; +} + +/** * nfsd_nl_threads_set_doit - set the number of running threads * @skb: reply buffer * @info: netlink metadata and command arguments @@ -1622,7 +1648,8 @@ int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NFSD_A_SERVER_GRACETIME] || info->attrs[NFSD_A_SERVER_LEASETIME] || - info->attrs[NFSD_A_SERVER_SCOPE]) { + info->attrs[NFSD_A_SERVER_SCOPE] || + info->attrs[NFSD_A_SERVER_FH_KEY]) { ret = -EBUSY; if (nn->nfsd_serv && nn->nfsd_serv->sv_nrthreads) goto out_unlock; @@ -1651,6 +1678,14 @@ int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info) attr = info->attrs[NFSD_A_SERVER_SCOPE]; if (attr) scope = nla_data(attr); + + attr = info->attrs[NFSD_A_SERVER_FH_KEY]; + if (attr) { + ret = nfsd_nl_fh_key_set(attr, nn); + trace_nfsd_ctl_fh_key_set((const char *)nn->fh_key, ret); + if (ret) + goto out_unlock; + } } attr = info->attrs[NFSD_A_SERVER_MIN_THREADS]; @@ -2168,6 +2203,9 @@ static __net_init int nfsd_net_init(struct net *net) int retval; int i; + retval = nfsd_net_cb_init(nn); + if (retval) + return retval; retval = nfsd_export_init(net); if (retval) goto out_export_error; @@ -2208,6 +2246,7 @@ out_repcache_error: out_idmap_error: nfsd_export_shutdown(net); out_export_error: + nfsd_net_cb_shutdown(nn); return retval; } @@ -2237,6 +2276,8 @@ static __net_exit void nfsd_net_exit(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); + kfree_sensitive(nn->fh_key); + nfsd_net_cb_shutdown(nn); nfsd_proc_stat_shutdown(net); percpu_counter_destroy_many(nn->counter, NFSD_STATS_COUNTERS_NUM); nfsd_idmap_shutdown(net); diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index a01d70953358..7c009f07c90b 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -82,6 +82,11 @@ extern atomic_t nfsd_th_cnt; /* number of available threads */ extern const struct seq_operations nfs_exports_op; +struct nfsd_thread_local_info { + struct nfs4_client **ntli_lease_breaker; + int ntli_cachetype; +}; + /* * Common void argument and result helpers */ @@ -155,6 +160,7 @@ static inline void nfsd_debugfs_exit(void) {} #endif extern bool nfsd_disable_splice_read __read_mostly; +extern bool nfsd_delegts_enabled __read_mostly; enum { /* Any new NFSD_IO enum value must be added at the end */ diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index ee72c9565e4f..429ca5c6ec08 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -11,6 +11,7 @@ #include <linux/exportfs.h> #include <linux/sunrpc/svcauth_gss.h> +#include <crypto/utils.h> #include "nfsd.h" #include "vfs.h" #include "auth.h" @@ -105,9 +106,12 @@ static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp, { /* Check if the request originated from a secure port. 
*/ if (rqstp && !nfsd_originating_port_ok(rqstp, cred, exp)) { - RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); - dprintk("nfsd: request from insecure port %s!\n", - svc_print_addr(rqstp, buf, sizeof(buf))); + if (IS_ENABLED(CONFIG_SUNRPC_DEBUG)) { + char buf[RPC_MAX_ADDRBUFLEN]; + + dprintk("nfsd: request from insecure port %s!\n", + svc_print_addr(rqstp, buf, sizeof(buf))); + } return nfserr_perm; } @@ -137,6 +141,57 @@ static inline __be32 check_pseudo_root(struct dentry *dentry, return nfs_ok; } +/* Size of a file handle MAC, in 4-octet words */ +#define FH_MAC_WORDS (sizeof(__le64) / 4) + +static bool fh_append_mac(struct svc_fh *fhp, struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct knfsd_fh *fh = &fhp->fh_handle; + siphash_key_t *fh_key = nn->fh_key; + __le64 hash; + + if (!fh_key) + goto out_no_key; + if (fh->fh_size + sizeof(hash) > fhp->fh_maxsize) + goto out_no_space; + + hash = cpu_to_le64(siphash(&fh->fh_raw, fh->fh_size, fh_key)); + memcpy(&fh->fh_raw[fh->fh_size], &hash, sizeof(hash)); + fh->fh_size += sizeof(hash); + return true; + +out_no_key: + pr_warn_ratelimited("NFSD: unable to sign filehandles, fh_key not set.\n"); + return false; + +out_no_space: + pr_warn_ratelimited("NFSD: unable to sign filehandles, fh_size %zu would be greater than fh_maxsize %d.\n", + fh->fh_size + sizeof(hash), fhp->fh_maxsize); + return false; +} + +/* + * Verify that the filehandle's MAC was hashed from this filehandle + * given the server's fh_key: + */ +static bool fh_verify_mac(struct svc_fh *fhp, struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct knfsd_fh *fh = &fhp->fh_handle; + siphash_key_t *fh_key = nn->fh_key; + __le64 hash; + + if (!fh_key) { + pr_warn_ratelimited("NFSD: unable to verify signed filehandles, fh_key not set.\n"); + return false; + } + + hash = cpu_to_le64(siphash(&fh->fh_raw, fh->fh_size - sizeof(hash), fh_key)); + return crypto_memneq(&fh->fh_raw[fh->fh_size - sizeof(hash)], + &hash, sizeof(hash)) == 0; +} + /* * Use the given filehandle to look up the corresponding export and * dentry. On success, the results are used to set fh_export and @@ -233,13 +288,21 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct net *net, /* * Look up the dentry using the NFS file handle. */ - error = nfserr_badhandle; - fileid_type = fh->fh_fileid_type; + error = nfserr_stale; - if (fileid_type == FILEID_ROOT) + if (fileid_type == FILEID_ROOT) { + /* We don't sign or verify the root, no per-file identity */ dentry = dget(exp->ex_path.dentry); - else { + } else { + if (exp->ex_flags & NFSEXP_SIGN_FH) { + if (!fh_verify_mac(fhp, net)) { + trace_nfsd_set_fh_dentry_badmac(rqstp, fhp, -ESTALE); + goto out; + } + data_left -= FH_MAC_WORDS; + } + dentry = exportfs_decode_fh_raw(exp->ex_path.mnt, fid, data_left, fileid_type, 0, nfsd_acceptable, exp); @@ -255,6 +318,8 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct net *net, } } } + + error = nfserr_badhandle; if (dentry == NULL) goto out; if (IS_ERR(dentry)) { @@ -495,6 +560,10 @@ static void _fh_update(struct svc_fh *fhp, struct svc_export *exp, fhp->fh_handle.fh_fileid_type = fileid_type > 0 ? 
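/*
 * Illustrative sketch (not part of the applied patch): the flow of
 * fh_append_mac()/fh_verify_mac() is "hash the handle with a secret key,
 * append the 8-byte result, later recompute and compare in constant
 * time".  toy_mac() below is a stand-in mixer, not SipHash, and
 * ct_memneq() is a simple constant-time comparison; both are assumptions
 * made only so the example is self-contained and runnable.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint64_t toy_mac(const unsigned char *data, size_t len, uint64_t key)
{
	uint64_t h = key ^ 0x9e3779b97f4a7c15ULL;

	for (size_t i = 0; i < len; i++) {
		h ^= data[i];
		h *= 0x100000001b3ULL;	/* FNV-style mixing, illustration only */
	}
	return h;
}

static int ct_memneq(const void *a, const void *b, size_t len)
{
	const unsigned char *pa = a, *pb = b;
	unsigned char diff = 0;

	for (size_t i = 0; i < len; i++)
		diff |= pa[i] ^ pb[i];
	return diff != 0;
}

int main(void)
{
	unsigned char fh[64] = "example-filehandle-bytes";
	size_t fh_size = 24;
	uint64_t key = 0x0123456789abcdefULL;
	uint64_t mac, check;

	/* "Sign": append the MAC of the current handle contents. */
	mac = toy_mac(fh, fh_size, key);
	memcpy(fh + fh_size, &mac, sizeof(mac));
	fh_size += sizeof(mac);

	/* "Verify": recompute over everything but the trailing MAC. */
	check = toy_mac(fh, fh_size - sizeof(check), key);
	printf("handle %s\n",
	       ct_memneq(fh + fh_size - sizeof(check), &check,
			 sizeof(check)) ? "stale" : "accepted");
	return 0;
}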
fileid_type : FILEID_INVALID; fhp->fh_handle.fh_size += maxsize * 4; + + if (exp->ex_flags & NFSEXP_SIGN_FH) + if (!fh_append_mac(fhp, exp->cd->net)) + fhp->fh_handle.fh_fileid_type = FILEID_INVALID; } else { fhp->fh_handle.fh_fileid_type = FILEID_ROOT; } diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 4a04208393b8..4f1ab3222a4d 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -887,6 +887,7 @@ nfsd(void *vrqstp) struct svc_xprt *perm_sock = list_entry(rqstp->rq_server->sv_permsocks.next, typeof(struct svc_xprt), xpt_list); struct net *net = perm_sock->xpt_net; struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct nfsd_thread_local_info ntli = { }; bool have_mutex = false; /* At this point, the thread shares current->fs @@ -901,6 +902,10 @@ nfsd(void *vrqstp) set_freezable(); + /* use dynamic allocation if ntli should ever become large */ + static_assert(sizeof(struct nfsd_thread_local_info) < 256); + rqstp->rq_private = &ntli; + /* * The main request loop */ @@ -967,6 +972,7 @@ nfsd(void *vrqstp) */ int nfsd_dispatch(struct svc_rqst *rqstp) { + struct nfsd_thread_local_info *ntli = rqstp->rq_private; const struct svc_procedure *proc = rqstp->rq_procinfo; __be32 *statp = rqstp->rq_accept_statp; struct nfsd_cacherep *rp; @@ -977,7 +983,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp) * Give the xdr decoder a chance to change this if it wants * (necessary in the NFSv4.0 compound case) */ - rqstp->rq_cachetype = proc->pc_cachetype; + ntli->ntli_cachetype = proc->pc_cachetype; /* * ->pc_decode advances the argument stream past the NFS @@ -1022,7 +1028,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp) */ smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter + 1); - nfsd_cache_update(rqstp, rp, rqstp->rq_cachetype, nfs_reply); + nfsd_cache_update(rqstp, rp, ntli->ntli_cachetype, nfs_reply); out_cached_reply: return 1; diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index fc262ceafca9..ae71e0621317 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -605,7 +605,7 @@ svcxdr_encode_entry_common(struct nfsd_readdirres *resp, const char *name, * * Return values: * %0: Entry was successfully encoded. - * %-EINVAL: An encoding problem occured, secondary status code in resp->common.err + * %-EINVAL: An encoding problem occurred, secondary status code in resp->common.err * * On exit, the following fields are updated: * - resp->xdr diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h index db9af780438b..f7bee4dc5d3d 100644 --- a/fs/nfsd/pnfs.h +++ b/fs/nfsd/pnfs.h @@ -11,6 +11,9 @@ struct xdr_stream; +/* Cap exponential backoff between fence retries at 3 minutes */ +#define MAX_FENCE_DELAY ((unsigned int)(3 * 60 * HZ)) + struct nfsd4_deviceid_map { struct list_head hash; u64 idx; @@ -38,7 +41,7 @@ struct nfsd4_layout_ops { struct svc_rqst *rqstp, struct nfsd4_layoutcommit *lcp); - void (*fence_client)(struct nfs4_layout_stateid *ls, + bool (*fence_client)(struct nfs4_layout_stateid *ls, struct nfsd_file *file); }; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index c0ca115c3b74..953675eba5c3 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -123,7 +123,7 @@ struct nfs4_stid { #define SC_TYPE_LAYOUT BIT(3) unsigned short sc_type; -/* state_lock protects sc_status for delegation stateids. +/* nn->deleg_lock protects sc_status for delegation stateids. * ->cl_lock protects sc_status for open and lock stateids. * ->st_mutex also protect sc_status for open stateids. * ->ls_lock protects sc_status for layout stateids. 
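/*
 * Illustrative sketch (not part of the applied patch): MAX_FENCE_DELAY
 * above caps the backoff between fence retries at three minutes of
 * jiffies.  The exact retry policy lives elsewhere in the series; this
 * standalone loop only demonstrates doubling-with-cap arithmetic,
 * assuming HZ=100 and a one-second initial delay for illustration.
 */
#include <stdio.h>

#define HZ 100u
#define MAX_FENCE_DELAY (3u * 60u * HZ)

int main(void)
{
	unsigned int delay = HZ;	/* assumed initial retry interval */

	for (int attempt = 1; attempt <= 12; attempt++) {
		printf("attempt %2d: retry in %u jiffies\n", attempt, delay);
		delay *= 2;
		if (delay > MAX_FENCE_DELAY)
			delay = MAX_FENCE_DELAY;
	}
	return 0;
}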
@@ -456,6 +456,7 @@ struct nfs4_client { struct list_head cl_lru; /* tail queue */ #ifdef CONFIG_NFSD_PNFS struct list_head cl_lo_states; /* outstanding layout states */ + bool cl_fence_retry_warn; #endif struct xdr_netobj cl_name; /* id generated by client */ nfs4_verifier cl_verifier; /* generated by client */ @@ -527,6 +528,10 @@ struct nfs4_client { struct nfsd4_cb_recall_any *cl_ra; time64_t cl_ra_time; +#ifdef CONFIG_NFSD_SCSILAYOUT + struct xarray cl_dev_fences; + struct mutex cl_fence_mutex; +#endif }; /* struct nfs4_client_reset @@ -549,10 +554,10 @@ struct nfs4_client_reclaim { * ~32(deleg. ace) = 112 bytes * * Some responses can exceed this. A LOCK denial includes the conflicting - * lock owner, which can be up to 1024 bytes (NFS4_OPAQUE_LIMIT). Responses - * larger than REPLAY_ISIZE are not cached in rp_ibuf; only rp_status is - * saved. Enlarging this constant increases the size of every - * nfs4_stateowner. + * lock owner, which can be up to 1024 bytes (NFS4_OPAQUE_LIMIT). When a + * response exceeds REPLAY_ISIZE, a buffer is dynamically allocated. If + * that allocation fails, only rp_status is saved. Enlarging this constant + * increases the size of every nfs4_stateowner. */ #define NFSD4_REPLAY_ISIZE 112 @@ -564,12 +569,14 @@ struct nfs4_client_reclaim { struct nfs4_replay { __be32 rp_status; unsigned int rp_buflen; - char *rp_buf; + char *rp_buf; /* rp_ibuf or kmalloc'd */ struct knfsd_fh rp_openfh; int rp_locked; char rp_ibuf[NFSD4_REPLAY_ISIZE]; }; +extern void nfs4_replay_free_cache(struct nfs4_replay *rp); + struct nfs4_stateowner; struct nfs4_stateowner_operations { @@ -742,6 +749,10 @@ struct nfs4_layout_stateid { stateid_t ls_recall_sid; bool ls_recalled; struct mutex ls_mutex; + + struct delayed_work ls_fence_work; + unsigned int ls_fence_delay; + bool ls_fenced; }; static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s) @@ -851,6 +862,8 @@ struct nfsd_file *find_any_file(struct nfs4_file *f); #ifdef CONFIG_NFSD_V4 void nfsd4_revoke_states(struct nfsd_net *nn, struct super_block *sb); void nfsd4_cancel_copy_by_sb(struct net *net, struct super_block *sb); +int nfsd_net_cb_init(struct nfsd_net *nn); +void nfsd_net_cb_shutdown(struct nfsd_net *nn); #else static inline void nfsd4_revoke_states(struct nfsd_net *nn, struct super_block *sb) { @@ -858,6 +871,13 @@ static inline void nfsd4_revoke_states(struct nfsd_net *nn, struct super_block * static inline void nfsd4_cancel_copy_by_sb(struct net *net, struct super_block *sb) { } +static inline int nfsd_net_cb_init(struct nfsd_net *nn) +{ + return 0; +} +static inline void nfsd_net_cb_shutdown(struct nfsd_net *nn) +{ +} #endif /* grace period management */ diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index d1d0b0dd0545..5ad38f50836d 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -373,6 +373,7 @@ DEFINE_EVENT_CONDITION(nfsd_fh_err_class, nfsd_##name, \ DEFINE_NFSD_FH_ERR_EVENT(set_fh_dentry_badexport); DEFINE_NFSD_FH_ERR_EVENT(set_fh_dentry_badhandle); +DEFINE_NFSD_FH_ERR_EVENT(set_fh_dentry_badmac); TRACE_EVENT(nfsd_exp_find_key, TP_PROTO(const struct svc_expkey *key, @@ -2240,6 +2241,28 @@ TRACE_EVENT(nfsd_end_grace, ) ); +TRACE_EVENT(nfsd_ctl_fh_key_set, + TP_PROTO( + const char *key, + int result + ), + TP_ARGS(key, result), + TP_STRUCT__entry( + __field(u32, key_hash) + __field(int, result) + ), + TP_fast_assign( + if (key) + __entry->key_hash = ~crc32_le(0xFFFFFFFF, key, 16); + else + __entry->key_hash = 0; + __entry->result = result; + ), + TP_printk("key=0x%08x result=%d", + 
__entry->key_hash, __entry->result + ) +); + DECLARE_EVENT_CLASS(nfsd_copy_class, TP_PROTO( const struct nfsd4_copy *copy diff --git a/include/linux/filelock.h b/include/linux/filelock.h index d2c9740e26a8..5f0a2fb31450 100644 --- a/include/linux/filelock.h +++ b/include/linux/filelock.h @@ -50,6 +50,7 @@ struct lease_manager_operations { void (*lm_setup)(struct file_lease *, void **); bool (*lm_breaker_owns_lease)(struct file_lease *); int (*lm_open_conflict)(struct file *, int); + bool (*lm_breaker_timedout)(struct file_lease *fl); }; struct lock_manager { diff --git a/include/linux/lockd/bind.h b/include/linux/lockd/bind.h index c53c81242e72..b614e0deea72 100644 --- a/include/linux/lockd/bind.h +++ b/include/linux/lockd/bind.h @@ -10,27 +10,20 @@ #ifndef LINUX_LOCKD_BIND_H #define LINUX_LOCKD_BIND_H -#include <linux/lockd/nlm.h> -/* need xdr-encoded error codes too, so... */ -#include <linux/lockd/xdr.h> -#ifdef CONFIG_LOCKD_V4 -#include <linux/lockd/xdr4.h> -#endif - -/* Dummy declarations */ +struct file_lock; +struct nfs_fh; struct svc_rqst; struct rpc_task; struct rpc_clnt; +struct super_block; /* * This is the set of functions for lockd->nfsd communication */ struct nlmsvc_binding { - __be32 (*fopen)(struct svc_rqst *, - struct nfs_fh *, - struct file **, - int mode); - void (*fclose)(struct file *); + int (*fopen)(struct svc_rqst *rqstp, struct nfs_fh *f, + struct file **filp, int flags); + void (*fclose)(struct file *filp); }; extern const struct nlmsvc_binding *nlmsvc_ops; @@ -58,6 +51,7 @@ struct nlmclnt_initdata { extern struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init); extern void nlmclnt_done(struct nlm_host *host); extern struct rpc_clnt *nlmclnt_rpc_clnt(struct nlm_host *host); +extern void nlmclnt_shutdown_rpc_clnt(struct nlm_host *host); /* * NLM client operations provide a means to modify RPC processing of NLM @@ -82,4 +76,10 @@ extern int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl, vo extern int lockd_up(struct net *net, const struct cred *cred); extern void lockd_down(struct net *net); +/* + * Cluster failover support + */ +int nlmsvc_unlock_all_by_sb(struct super_block *sb); +int nlmsvc_unlock_all_by_ip(struct sockaddr *server_addr); + #endif /* LINUX_LOCKD_BIND_H */ diff --git a/include/linux/lockd/debug.h b/include/linux/lockd/debug.h deleted file mode 100644 index eede2ab5246f..000000000000 --- a/include/linux/lockd/debug.h +++ /dev/null @@ -1,40 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * linux/include/linux/lockd/debug.h - * - * Debugging stuff. - * - * Copyright (C) 1996 Olaf Kirch <okir@monad.swb.de> - */ - -#ifndef LINUX_LOCKD_DEBUG_H -#define LINUX_LOCKD_DEBUG_H - -#include <linux/sunrpc/debug.h> - -/* - * Enable lockd debugging. - * Requires RPC_DEBUG. 
- */ -#undef ifdebug -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define ifdebug(flag) if (unlikely(nlm_debug & NLMDBG_##flag)) -#else -# define ifdebug(flag) if (0) -#endif - -/* - * Debug flags - */ -#define NLMDBG_SVC 0x0001 -#define NLMDBG_CLIENT 0x0002 -#define NLMDBG_CLNTLOCK 0x0004 -#define NLMDBG_SVCLOCK 0x0008 -#define NLMDBG_MONITOR 0x0010 -#define NLMDBG_CLNTSUBS 0x0020 -#define NLMDBG_SVCSUBS 0x0040 -#define NLMDBG_HOSTCACHE 0x0080 -#define NLMDBG_XDR 0x0100 -#define NLMDBG_ALL 0x7fff - -#endif /* LINUX_LOCKD_DEBUG_H */ diff --git a/include/linux/lockd/xdr4.h b/include/linux/lockd/xdr4.h deleted file mode 100644 index 72831e35dca3..000000000000 --- a/include/linux/lockd/xdr4.h +++ /dev/null @@ -1,43 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * linux/include/linux/lockd/xdr4.h - * - * XDR types for the NLM protocol - * - * Copyright (C) 1996 Olaf Kirch <okir@monad.swb.de> - */ - -#ifndef LOCKD_XDR4_H -#define LOCKD_XDR4_H - -#include <linux/fs.h> -#include <linux/nfs.h> -#include <linux/sunrpc/xdr.h> -#include <linux/lockd/xdr.h> - -/* error codes new to NLMv4 */ -#define nlm4_deadlock cpu_to_be32(NLM_DEADLCK) -#define nlm4_rofs cpu_to_be32(NLM_ROFS) -#define nlm4_stale_fh cpu_to_be32(NLM_STALE_FH) -#define nlm4_fbig cpu_to_be32(NLM_FBIG) -#define nlm4_failed cpu_to_be32(NLM_FAILED) - -void nlm4svc_set_file_lock_range(struct file_lock *fl, u64 off, u64 len); -bool nlm4svc_decode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_decode_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_decode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_decode_reboot(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_decode_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_decode_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr); - -bool nlm4svc_encode_testres(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_encode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr); -bool nlm4svc_encode_shareres(struct svc_rqst *rqstp, struct xdr_stream *xdr); - -extern const struct rpc_version nlm_version4; - -#endif /* LOCKD_XDR4_H */ diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index e783132e481f..b1e595c2615b 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -16,6 +16,7 @@ #include <linux/atomic.h> #include <linux/kstrtox.h> #include <linux/proc_fs.h> +#include <linux/wait.h> /* * Each cache requires: @@ -112,7 +113,11 @@ struct cache_detail { int entries; /* fields for communication over channel */ - struct list_head queue; + struct list_head requests; + struct list_head readers; + spinlock_t queue_lock; + wait_queue_head_t queue_wait; + u64 next_seqno; atomic_t writers; /* how many time is /channel open */ time64_t last_close; /* if no writers, when did last close */ diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h index eb4bd62df319..ab61bed2f7af 100644 --- a/include/linux/sunrpc/debug.h +++ b/include/linux/sunrpc/debug.h @@ -38,6 +38,8 @@ extern unsigned int nlm_debug; do { \ ifdebug(fac) \ __sunrpc_printk(fmt, ##__VA_ARGS__); \ + else \ + no_printk(fmt, ##__VA_ARGS__); \ } while (0) # define 
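/*
 * Illustrative sketch (not part of the applied patch): routing the
 * disabled branch through no_printk() keeps the compiler's format-string
 * checking alive even when debugging is compiled out.  A userspace
 * analogue of that pattern, with dbg_enabled and dbg() invented for the
 * example (GCC/Clang extensions assumed for ##__VA_ARGS__):
 */
#include <stdio.h>

/* Never prints, but still type-checks its arguments like printf(). */
__attribute__((format(printf, 1, 2)))
static int no_printk(const char *fmt, ...)
{
	(void)fmt;
	return 0;
}

static int dbg_enabled;

#define dbg(fmt, ...)						\
	do {							\
		if (dbg_enabled)				\
			printf(fmt, ##__VA_ARGS__);		\
		else						\
			no_printk(fmt, ##__VA_ARGS__);		\
	} while (0)

int main(void)
{
	dbg_enabled = 1;
	dbg("value is %d\n", 42);
	dbg_enabled = 0;
	/* Still compiles with checking; would warn if %d were given a string. */
	dbg("value is %d\n", 7);
	return 0;
}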
dfprintk_rcu(fac, fmt, ...) \ @@ -46,15 +48,15 @@ do { \ rcu_read_lock(); \ __sunrpc_printk(fmt, ##__VA_ARGS__); \ rcu_read_unlock(); \ + } else { \ + no_printk(fmt, ##__VA_ARGS__); \ } \ } while (0) -# define RPC_IFDEBUG(x) x #else # define ifdebug(fac) if (0) -# define dfprintk(fac, fmt, ...) do {} while (0) -# define dfprintk_rcu(fac, fmt, ...) do {} while (0) -# define RPC_IFDEBUG(x) +# define dfprintk(fac, fmt, ...) no_printk(fmt, ##__VA_ARGS__) +# define dfprintk_rcu(fac, fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif /* diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index ccba79ebf893..0dbdf3722537 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -95,10 +95,7 @@ struct rpc_task { int tk_rpc_status; /* Result of last RPC operation */ unsigned short tk_flags; /* misc flags */ unsigned short tk_timeouts; /* maj timeouts */ - -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) || IS_ENABLED(CONFIG_TRACEPOINTS) unsigned short tk_pid; /* debugging aid */ -#endif unsigned char tk_priority : 2,/* Task priority */ tk_garb_retry : 2, tk_cred_retry : 2; diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index a11acf5cd63b..4be6204f6630 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -134,25 +134,37 @@ enum { extern u32 svc_max_payload(const struct svc_rqst *rqstp); /* - * RPC Requests and replies are stored in one or more pages. - * We maintain an array of pages for each server thread. - * Requests are copied into these pages as they arrive. Remaining - * pages are available to write the reply into. + * RPC Call and Reply messages each have their own page array. + * rq_pages holds the incoming Call message; rq_respages holds + * the outgoing Reply message. Both arrays are sized to + * svc_serv_maxpages() entries and are allocated dynamically. * - * Pages are sent using ->sendmsg with MSG_SPLICE_PAGES so each server thread - * needs to allocate more to replace those used in sending. To help keep track - * of these pages we have a receive list where all pages initialy live, and a - * send list where pages are moved to when there are to be part of a reply. + * Pages are sent using ->sendmsg with MSG_SPLICE_PAGES so each + * server thread needs to allocate more to replace those used in + * sending. * - * We use xdr_buf for holding responses as it fits well with NFS - * read responses (that have a header, and some data pages, and possibly - * a tail) and means we can share some client side routines. + * rq_pages request page contract: * - * The xdr_buf.head kvec always points to the first page in the rq_*pages - * list. The xdr_buf.pages pointer points to the second page on that - * list. xdr_buf.tail points to the end of the first page. - * This assumes that the non-page part of an rpc reply will fit - * in a page - NFSd ensures this. lockd also has no trouble. + * Transport receive paths that move request data pages out of + * rq_pages -- TCP multi-fragment reassembly (svc_tcp_save_pages) + * and RDMA Read I/O (svc_rdma_clear_rqst_pages) -- NULL those + * entries to prevent svc_rqst_release_pages() from freeing pages + * still in transport use, and set rq_pages_nfree to the count. + * svc_alloc_arg() refills only that many rq_pages entries. + * + * For rq_respages, svc_rqst_release_pages() NULLs entries in + * [rq_respages, rq_next_page) after each RPC. svc_alloc_arg() + * refills only that range. 
+ * + * xdr_buf holds responses; the structure fits NFS read responses + * (header, data pages, optional tail) and enables sharing of + * client-side routines. + * + * The xdr_buf.head kvec always points to the first page in the + * rq_*pages list. The xdr_buf.pages pointer points to the second + * page on that list. xdr_buf.tail points to the end of the first + * page. This assumes that the non-page part of an rpc reply will + * fit in a page - NFSd ensures this. lockd also has no trouble. */ /** @@ -162,10 +174,10 @@ extern u32 svc_max_payload(const struct svc_rqst *rqstp); * Returns a count of pages or vectors that can hold the maximum * size RPC message for @serv. * - * Each request/reply pair can have at most one "payload", plus two - * pages, one for the request, and one for the reply. - * nfsd_splice_actor() might need an extra page when a READ payload - * is not page-aligned. + * Each page array can hold at most one payload plus two + * overhead pages (one for the RPC header, one for tail data). + * nfsd_splice_actor() might need an extra page when a READ + * payload is not page-aligned. */ static inline unsigned long svc_serv_maxpages(const struct svc_serv *serv) { @@ -175,6 +187,9 @@ static inline unsigned long svc_serv_maxpages(const struct svc_serv *serv) /* * The context of a single thread, including the request currently being * processed. + * + * RPC programs are free to use rq_private to stash thread-local information. + * The sunrpc layer will not access it. */ struct svc_rqst { struct list_head rq_all; /* all threads list */ @@ -201,11 +216,12 @@ struct svc_rqst { struct xdr_stream rq_res_stream; struct folio *rq_scratch_folio; struct xdr_buf rq_res; - unsigned long rq_maxpages; /* num of entries in rq_pages */ - struct page * *rq_pages; - struct page * *rq_respages; /* points into rq_pages */ + unsigned long rq_maxpages; /* entries per page array */ + unsigned long rq_pages_nfree; /* rq_pages entries NULLed by transport */ + struct page * *rq_pages; /* Call buffer pages */ + struct page * *rq_respages; /* Reply buffer pages */ struct page * *rq_next_page; /* next reply page to use */ - struct page * *rq_page_end; /* one past the last page */ + struct page * *rq_page_end; /* one past the last reply page */ struct folio_batch rq_fbatch; struct bio_vec *rq_bvec; @@ -215,7 +231,6 @@ struct svc_rqst { u32 rq_vers; /* program version */ u32 rq_proc; /* procedure number */ u32 rq_prot; /* IP protocol */ - int rq_cachetype; /* catering to nfsd */ unsigned long rq_flags; /* flags field */ ktime_t rq_qtime; /* enqueue time */ @@ -251,7 +266,7 @@ struct svc_rqst { unsigned long bc_to_initval; unsigned int bc_to_retries; unsigned int rq_status_counter; /* RPC processing counter */ - void **rq_lease_breaker; /* The v4 client breaking a lease */ + void *rq_private; /* For use by the service thread */ }; /* bits for rq_flags */ @@ -483,6 +498,21 @@ int svc_generic_rpcbind_set(struct net *net, #define RPC_MAX_ADDRBUFLEN (63U) +/** + * svc_rqst_page_release - release a page associated with an RPC transaction + * @rqstp: RPC transaction context + * @page: page to release + * + * Released pages are batched and freed together, reducing + * allocator pressure under heavy RPC workloads. 
+ */ +static inline void svc_rqst_page_release(struct svc_rqst *rqstp, + struct page *page) +{ + if (!folio_batch_add(&rqstp->rq_fbatch, page_folio(page))) + __folio_batch_release(&rqstp->rq_fbatch); +} + /* * When we want to reduce the size of the reserved space in the response * buffer, we need to take into account the size of any checksum data that diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 57f4fd94166a..df6e08aaad57 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -84,6 +84,9 @@ struct svcxprt_rdma { atomic_t sc_sq_avail; /* SQEs ready to be consumed */ unsigned int sc_sq_depth; /* Depth of SQ */ + atomic_t sc_sq_ticket_head; /* Next ticket to issue */ + atomic_t sc_sq_ticket_tail; /* Ticket currently serving */ + wait_queue_head_t sc_sq_ticket_wait; /* Ticket ordering waitlist */ __be32 sc_fc_credits; /* Forward credits */ u32 sc_max_requests; /* Max requests */ u32 sc_max_bc_requests;/* Backward credits */ @@ -213,6 +216,7 @@ struct svc_rdma_recv_ctxt { */ struct svc_rdma_write_info { struct svcxprt_rdma *wi_rdma; + struct list_head wi_list; const struct svc_rdma_chunk *wi_chunk; @@ -241,7 +245,10 @@ struct svc_rdma_send_ctxt { struct ib_cqe sc_cqe; struct xdr_buf sc_hdrbuf; struct xdr_stream sc_stream; + + struct list_head sc_write_info_list; struct svc_rdma_write_info sc_reply_info; + void *sc_xprt_buf; int sc_page_count; int sc_cur_sge_no; @@ -274,11 +281,14 @@ extern void svc_rdma_cc_init(struct svcxprt_rdma *rdma, extern void svc_rdma_cc_release(struct svcxprt_rdma *rdma, struct svc_rdma_chunk_ctxt *cc, enum dma_data_direction dir); +extern void svc_rdma_write_chunk_release(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt); extern void svc_rdma_reply_chunk_release(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt); -extern int svc_rdma_send_write_list(struct svcxprt_rdma *rdma, - const struct svc_rdma_recv_ctxt *rctxt, - const struct xdr_buf *xdr); +extern int svc_rdma_prepare_write_list(struct svcxprt_rdma *rdma, + const struct svc_rdma_recv_ctxt *rctxt, + struct svc_rdma_send_ctxt *sctxt, + const struct xdr_buf *xdr); extern int svc_rdma_prepare_reply_chunk(struct svcxprt_rdma *rdma, const struct svc_rdma_pcl *write_pcl, const struct svc_rdma_pcl *reply_pcl, @@ -306,6 +316,13 @@ extern void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, struct svc_rdma_recv_ctxt *rctxt, int status); extern void svc_rdma_wake_send_waiters(struct svcxprt_rdma *rdma, int avail); +extern int svc_rdma_sq_wait(struct svcxprt_rdma *rdma, + const struct rpc_rdma_cid *cid, int sqecount); +extern int svc_rdma_post_send_err(struct svcxprt_rdma *rdma, + const struct rpc_rdma_cid *cid, + const struct ib_send_wr *bad_wr, + const struct ib_send_wr *first_wr, + int sqecount, int ret); extern int svc_rdma_sendto(struct svc_rqst *); extern int svc_rdma_result_payload(struct svc_rqst *rqstp, unsigned int offset, unsigned int length); diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 152597750f55..b639a6fafcbc 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -290,7 +290,7 @@ xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buflen) /** * xdr_set_scratch_folio - Attach a scratch buffer for decoding data * @xdr: pointer to xdr_stream struct - * @page: an anonymous folio + * @folio: an anonymous folio * * See xdr_set_scratch_buffer(). 
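/*
 * Illustrative sketch (not part of the applied patch): the new
 * sc_sq_ticket_head/sc_sq_ticket_tail counters read like a take-a-ticket
 * ordering scheme for Send Queue posters.  This single-threaded
 * walk-through shows only the counter arithmetic; the waitqueue and SQ
 * accounting in the real code are omitted, and the helper names are
 * invented for the example.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint ticket_head;	/* next ticket to issue */
static atomic_uint ticket_tail;	/* ticket currently being served */

static unsigned int ticket_take(void)
{
	return atomic_fetch_add(&ticket_head, 1);
}

static int ticket_is_turn(unsigned int ticket)
{
	return atomic_load(&ticket_tail) == ticket;
}

static void ticket_done(void)
{
	atomic_fetch_add(&ticket_tail, 1);
}

int main(void)
{
	unsigned int a = ticket_take();
	unsigned int b = ticket_take();

	printf("a ready? %d  b ready? %d\n", ticket_is_turn(a), ticket_is_turn(b));
	ticket_done();		/* a finishes, so b's turn comes up */
	printf("b ready? %d\n", ticket_is_turn(b));
	return 0;
}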
*/ @@ -330,7 +330,7 @@ static inline void xdr_commit_encode(struct xdr_stream *xdr) * xdr_stream_remaining - Return the number of bytes remaining in the stream * @xdr: pointer to struct xdr_stream * - * Return value: + * Returns: * Number of bytes remaining in @xdr before xdr->end */ static inline size_t @@ -350,7 +350,7 @@ ssize_t xdr_stream_encode_opaque_auth(struct xdr_stream *xdr, u32 flavor, * xdr_align_size - Calculate padded size of an object * @n: Size of an object being XDR encoded (in bytes) * - * Return value: + * Returns: * Size (in bytes) of the object including xdr padding */ static inline size_t @@ -368,7 +368,7 @@ xdr_align_size(size_t n) * This implementation avoids the need for conditional * branches or modulo division. * - * Return value: + * Returns: * Size (in bytes) of the needed XDR pad */ static inline size_t xdr_pad_size(size_t n) @@ -380,7 +380,7 @@ static inline size_t xdr_pad_size(size_t n) * xdr_stream_encode_item_present - Encode a "present" list item * @xdr: pointer to xdr_stream * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -399,7 +399,7 @@ static inline ssize_t xdr_stream_encode_item_present(struct xdr_stream *xdr) * xdr_stream_encode_item_absent - Encode a "not present" list item * @xdr: pointer to xdr_stream * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -419,7 +419,7 @@ static inline int xdr_stream_encode_item_absent(struct xdr_stream *xdr) * @p: address in a buffer into which to encode * @n: boolean value to encode * - * Return value: + * Returns: * Address of item following the encoded boolean */ static inline __be32 *xdr_encode_bool(__be32 *p, u32 n) @@ -433,7 +433,7 @@ static inline __be32 *xdr_encode_bool(__be32 *p, u32 n) * @xdr: pointer to xdr_stream * @n: boolean value to encode * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -453,7 +453,7 @@ static inline int xdr_stream_encode_bool(struct xdr_stream *xdr, __u32 n) * @xdr: pointer to xdr_stream * @n: integer to encode * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -474,7 +474,7 @@ xdr_stream_encode_u32(struct xdr_stream *xdr, __u32 n) * @xdr: pointer to xdr_stream * @n: integer to encode * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -495,7 +495,7 @@ xdr_stream_encode_be32(struct xdr_stream *xdr, __be32 n) * @xdr: pointer to xdr_stream * @n: 64-bit integer to encode * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -517,7 +517,7 @@ xdr_stream_encode_u64(struct xdr_stream *xdr, __u64 n) * @ptr: pointer to void pointer * @len: size of object * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -542,7 +542,7 @@ xdr_stream_encode_opaque_inline(struct xdr_stream *xdr, void **ptr, size_t len) * @ptr: pointer to opaque data object * @len: size of object pointed to by @ptr * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -563,7 +563,7 @@ xdr_stream_encode_opaque_fixed(struct xdr_stream *xdr, const void *ptr, 
size_t l * @ptr: pointer to opaque data object * @len: size of object pointed to by @ptr * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -585,7 +585,7 @@ xdr_stream_encode_opaque(struct xdr_stream *xdr, const void *ptr, size_t len) * @array: array of integers * @array_size: number of elements in @array * - * Return values: + * Returns: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ @@ -608,7 +608,7 @@ xdr_stream_encode_uint32_array(struct xdr_stream *xdr, * xdr_item_is_absent - symbolically handle XDR discriminators * @p: pointer to undecoded discriminator * - * Return values: + * Returns: * %true if the following XDR item is absent * %false if the following XDR item is present */ @@ -621,7 +621,7 @@ static inline bool xdr_item_is_absent(const __be32 *p) * xdr_item_is_present - symbolically handle XDR discriminators * @p: pointer to undecoded discriminator * - * Return values: + * Returns: * %true if the following XDR item is present * %false if the following XDR item is absent */ @@ -635,7 +635,7 @@ static inline bool xdr_item_is_present(const __be32 *p) * @xdr: pointer to xdr_stream * @ptr: pointer to a u32 in which to store the result * - * Return values: + * Returns: * %0 on success * %-EBADMSG on XDR buffer overflow */ @@ -656,7 +656,7 @@ xdr_stream_decode_bool(struct xdr_stream *xdr, __u32 *ptr) * @xdr: pointer to xdr_stream * @ptr: location to store integer * - * Return values: + * Returns: * %0 on success * %-EBADMSG on XDR buffer overflow */ @@ -677,7 +677,7 @@ xdr_stream_decode_u32(struct xdr_stream *xdr, __u32 *ptr) * @xdr: pointer to xdr_stream * @ptr: location to store integer * - * Return values: + * Returns: * %0 on success * %-EBADMSG on XDR buffer overflow */ @@ -698,7 +698,7 @@ xdr_stream_decode_be32(struct xdr_stream *xdr, __be32 *ptr) * @xdr: pointer to xdr_stream * @ptr: location to store 64-bit integer * - * Return values: + * Returns: * %0 on success * %-EBADMSG on XDR buffer overflow */ @@ -720,7 +720,7 @@ xdr_stream_decode_u64(struct xdr_stream *xdr, __u64 *ptr) * @ptr: location to store data * @len: size of buffer pointed to by @ptr * - * Return values: + * Returns: * %0 on success * %-EBADMSG on XDR buffer overflow */ @@ -746,7 +746,7 @@ xdr_stream_decode_opaque_fixed(struct xdr_stream *xdr, void *ptr, size_t len) * on @xdr. It is therefore expected that the object it points to should * be processed immediately. * - * Return values: + * Returns: * On success, returns size of object stored in *@ptr * %-EBADMSG on XDR buffer overflow * %-EMSGSIZE if the size of the object would exceed @maxlen @@ -777,7 +777,7 @@ xdr_stream_decode_opaque_inline(struct xdr_stream *xdr, void **ptr, size_t maxle * @array: location to store the integer array or NULL * @array_size: number of elements to store * - * Return values: + * Returns: * On success, returns number of elements stored in @array * %-EBADMSG on XDR buffer overflow * %-EMSGSIZE if the size of the array exceeds @array_size diff --git a/include/linux/sunrpc/xdrgen/nlm4.h b/include/linux/sunrpc/xdrgen/nlm4.h new file mode 100644 index 000000000000..e95e8f105624 --- /dev/null +++ b/include/linux/sunrpc/xdrgen/nlm4.h @@ -0,0 +1,233 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Generated by xdrgen. Manual edits will be lost. 
*/ +/* XDR specification file: ../../Documentation/sunrpc/xdr/nlm4.x */ +/* XDR specification modification time: Thu Dec 25 13:10:19 2025 */ + +#ifndef _LINUX_XDRGEN_NLM4_DEF_H +#define _LINUX_XDRGEN_NLM4_DEF_H + +#include <linux/types.h> +#include <linux/sunrpc/xdrgen/_defs.h> + +enum { LM_MAXSTRLEN = 1024 }; + +enum { LM_MAXNAMELEN = 1025 }; + +enum { MAXNETOBJ_SZ = 1024 }; + +typedef opaque netobj; + +enum fsh4_mode { + fsm_DN = 0, + fsm_DR = 1, + fsm_DW = 2, + fsm_DRW = 3, +}; + +typedef enum fsh4_mode fsh4_mode; + +enum fsh4_access { + fsa_NONE = 0, + fsa_R = 1, + fsa_W = 2, + fsa_RW = 3, +}; + +typedef enum fsh4_access fsh4_access; + +enum { SM_MAXSTRLEN = 1024 }; + +typedef u64 uint64; + +typedef s64 int64; + +typedef u32 uint32; + +typedef s32 int32; + +enum nlm4_stats { + NLM4_GRANTED = 0, + NLM4_DENIED = 1, + NLM4_DENIED_NOLOCKS = 2, + NLM4_BLOCKED = 3, + NLM4_DENIED_GRACE_PERIOD = 4, + NLM4_DEADLCK = 5, + NLM4_ROFS = 6, + NLM4_STALE_FH = 7, + NLM4_FBIG = 8, + NLM4_FAILED = 9, +}; + +typedef __be32 nlm4_stats; + +struct nlm4_holder { + bool exclusive; + int32 svid; + netobj oh; + uint64 l_offset; + uint64 l_len; +}; + +struct nlm4_testrply { + nlm4_stats stat; + union { + struct nlm4_holder holder; + } u; +}; + +struct nlm4_stat { + nlm4_stats stat; +}; + +struct nlm4_res { + netobj cookie; + struct nlm4_stat stat; +}; + +struct nlm4_testres { + netobj cookie; + struct nlm4_testrply stat; +}; + +struct nlm4_lock { + string caller_name; + netobj fh; + netobj oh; + int32 svid; + uint64 l_offset; + uint64 l_len; +}; + +struct nlm4_lockargs { + netobj cookie; + bool block; + bool exclusive; + struct nlm4_lock alock; + bool reclaim; + int32 state; +}; + +struct nlm4_cancargs { + netobj cookie; + bool block; + bool exclusive; + struct nlm4_lock alock; +}; + +struct nlm4_testargs { + netobj cookie; + bool exclusive; + struct nlm4_lock alock; +}; + +struct nlm4_unlockargs { + netobj cookie; + struct nlm4_lock alock; +}; + +struct nlm4_share { + string caller_name; + netobj fh; + netobj oh; + fsh4_mode mode; + fsh4_access access; +}; + +struct nlm4_shareargs { + netobj cookie; + struct nlm4_share share; + bool reclaim; +}; + +struct nlm4_shareres { + netobj cookie; + nlm4_stats stat; + int32 sequence; +}; + +struct nlm4_notify { + string name; + int32 state; +}; + +enum { SM_PRIV_SIZE = 16 }; + +struct nlm4_notifyargs { + struct nlm4_notify notify; + u8 private[SM_PRIV_SIZE]; +}; + +enum { + NLMPROC4_NULL = 0, + NLMPROC4_TEST = 1, + NLMPROC4_LOCK = 2, + NLMPROC4_CANCEL = 3, + NLMPROC4_UNLOCK = 4, + NLMPROC4_GRANTED = 5, + NLMPROC4_TEST_MSG = 6, + NLMPROC4_LOCK_MSG = 7, + NLMPROC4_CANCEL_MSG = 8, + NLMPROC4_UNLOCK_MSG = 9, + NLMPROC4_GRANTED_MSG = 10, + NLMPROC4_TEST_RES = 11, + NLMPROC4_LOCK_RES = 12, + NLMPROC4_CANCEL_RES = 13, + NLMPROC4_UNLOCK_RES = 14, + NLMPROC4_GRANTED_RES = 15, + NLMPROC4_SM_NOTIFY = 16, + NLMPROC4_SHARE = 20, + NLMPROC4_UNSHARE = 21, + NLMPROC4_NM_LOCK = 22, + NLMPROC4_FREE_ALL = 23, +}; + +#ifndef NLM4_PROG +#define NLM4_PROG (100021) +#endif + +#define NLM4_netobj_sz (XDR_unsigned_int + XDR_QUADLEN(MAXNETOBJ_SZ)) +#define NLM4_fsh4_mode_sz (XDR_int) +#define NLM4_fsh4_access_sz (XDR_int) +#define NLM4_uint64_sz \ + (XDR_unsigned_hyper) +#define NLM4_int64_sz \ + (XDR_hyper) +#define NLM4_uint32_sz \ + (XDR_unsigned_long) +#define NLM4_int32_sz \ + (XDR_long) +#define NLM4_nlm4_stats_sz (XDR_int) +#define NLM4_nlm4_holder_sz \ + (XDR_bool + NLM4_int32_sz + NLM4_netobj_sz + NLM4_uint64_sz + NLM4_uint64_sz) +#define NLM4_nlm4_testrply_sz \ + (NLM4_nlm4_stats_sz + 
NLM4_nlm4_holder_sz) +#define NLM4_nlm4_stat_sz \ + (NLM4_nlm4_stats_sz) +#define NLM4_nlm4_res_sz \ + (NLM4_netobj_sz + NLM4_nlm4_stat_sz) +#define NLM4_nlm4_testres_sz \ + (NLM4_netobj_sz + NLM4_nlm4_testrply_sz) +#define NLM4_nlm4_lock_sz \ + (XDR_unsigned_int + XDR_QUADLEN(LM_MAXSTRLEN) + NLM4_netobj_sz + NLM4_netobj_sz + NLM4_int32_sz + NLM4_uint64_sz + NLM4_uint64_sz) +#define NLM4_nlm4_lockargs_sz \ + (NLM4_netobj_sz + XDR_bool + XDR_bool + NLM4_nlm4_lock_sz + XDR_bool + NLM4_int32_sz) +#define NLM4_nlm4_cancargs_sz \ + (NLM4_netobj_sz + XDR_bool + XDR_bool + NLM4_nlm4_lock_sz) +#define NLM4_nlm4_testargs_sz \ + (NLM4_netobj_sz + XDR_bool + NLM4_nlm4_lock_sz) +#define NLM4_nlm4_unlockargs_sz \ + (NLM4_netobj_sz + NLM4_nlm4_lock_sz) +#define NLM4_nlm4_share_sz \ + (XDR_unsigned_int + XDR_QUADLEN(LM_MAXSTRLEN) + NLM4_netobj_sz + NLM4_netobj_sz + NLM4_fsh4_mode_sz + NLM4_fsh4_access_sz) +#define NLM4_nlm4_shareargs_sz \ + (NLM4_netobj_sz + NLM4_nlm4_share_sz + XDR_bool) +#define NLM4_nlm4_shareres_sz \ + (NLM4_netobj_sz + NLM4_nlm4_stats_sz + NLM4_int32_sz) +#define NLM4_nlm4_notify_sz \ + (XDR_unsigned_int + XDR_QUADLEN(LM_MAXNAMELEN) + NLM4_int32_sz) +#define NLM4_nlm4_notifyargs_sz \ + (NLM4_nlm4_notify_sz + XDR_QUADLEN(SM_PRIV_SIZE)) +#define NLM4_MAX_ARGS_SZ \ + (NLM4_nlm4_lockargs_sz) + +#endif /* _LINUX_XDRGEN_NLM4_DEF_H */ diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 750ecce56930..ff855197880d 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -1933,7 +1933,9 @@ TRACE_EVENT(svc_stats_latency, svc_xprt_flag(CONG_CTRL) \ svc_xprt_flag(HANDSHAKE) \ svc_xprt_flag(TLS_SESSION) \ - svc_xprt_flag_end(PEER_AUTH) + svc_xprt_flag(PEER_AUTH) \ + svc_xprt_flag(PEER_VALID) \ + svc_xprt_flag_end(RPCB_UNREG) #undef svc_xprt_flag #undef svc_xprt_flag_end diff --git a/include/uapi/linux/nfsd/export.h b/include/uapi/linux/nfsd/export.h index a73ca3703abb..de647cf166c3 100644 --- a/include/uapi/linux/nfsd/export.h +++ b/include/uapi/linux/nfsd/export.h @@ -34,7 +34,7 @@ #define NFSEXP_GATHERED_WRITES 0x0020 #define NFSEXP_NOREADDIRPLUS 0x0040 #define NFSEXP_SECURITY_LABEL 0x0080 -/* 0x100 currently unused */ +#define NFSEXP_SIGN_FH 0x0100 #define NFSEXP_NOHIDE 0x0200 #define NFSEXP_NOSUBTREECHECK 0x0400 #define NFSEXP_NOAUTHNLM 0x0800 /* Don't authenticate NLM requests - just trust */ @@ -55,7 +55,7 @@ #define NFSEXP_PNFS 0x20000 /* All flags that we claim to support. (Note we don't support NOACL.) 
*/ -#define NFSEXP_ALLFLAGS 0x3FEFF +#define NFSEXP_ALLFLAGS 0x3FFFF /* The flags that may vary depending on security flavor: */ #define NFSEXP_SECINFO_FLAGS (NFSEXP_READONLY | NFSEXP_ROOTSQUASH \ diff --git a/include/uapi/linux/nfsd_netlink.h b/include/uapi/linux/nfsd_netlink.h index e9efbc9e63d8..97c7447f4d14 100644 --- a/include/uapi/linux/nfsd_netlink.h +++ b/include/uapi/linux/nfsd_netlink.h @@ -36,6 +36,7 @@ enum { NFSD_A_SERVER_LEASETIME, NFSD_A_SERVER_SCOPE, NFSD_A_SERVER_MIN_THREADS, + NFSD_A_SERVER_FH_KEY, __NFSD_A_SERVER_MAX, NFSD_A_SERVER_MAX = (__NFSD_A_SERVER_MAX - 1) diff --git a/net/sunrpc/auth_gss/gss_krb5_test.c b/net/sunrpc/auth_gss/gss_krb5_test.c index a5bff02cd7ba..dde1ee934d0d 100644 --- a/net/sunrpc/auth_gss/gss_krb5_test.c +++ b/net/sunrpc/auth_gss/gss_krb5_test.c @@ -63,10 +63,11 @@ static void kdf_case(struct kunit *test) KUNIT_ASSERT_EQ(test, err, 0); /* Assert */ - KUNIT_EXPECT_EQ_MSG(test, - memcmp(param->expected_result->data, - derivedkey.data, derivedkey.len), 0, - "key mismatch"); + KUNIT_EXPECT_MEMEQ_MSG(test, + param->expected_result->data, + derivedkey.data, + derivedkey.len, + "key mismatch"); } static void checksum_case(struct kunit *test) @@ -111,10 +112,11 @@ static void checksum_case(struct kunit *test) KUNIT_ASSERT_EQ(test, err, 0); /* Assert */ - KUNIT_EXPECT_EQ_MSG(test, - memcmp(param->expected_result->data, - checksum.data, checksum.len), 0, - "checksum mismatch"); + KUNIT_EXPECT_MEMEQ_MSG(test, + param->expected_result->data, + checksum.data, + checksum.len, + "checksum mismatch"); crypto_free_ahash(tfm); } @@ -314,10 +316,11 @@ static void rfc3961_nfold_case(struct kunit *test) param->expected_result->len * 8, result); /* Assert */ - KUNIT_EXPECT_EQ_MSG(test, - memcmp(param->expected_result->data, - result, param->expected_result->len), 0, - "result mismatch"); + KUNIT_EXPECT_MEMEQ_MSG(test, + param->expected_result->data, + result, + param->expected_result->len, + "result mismatch"); } static struct kunit_case rfc3961_test_cases[] = { @@ -569,14 +572,16 @@ static void rfc3962_encrypt_case(struct kunit *test) KUNIT_EXPECT_EQ_MSG(test, param->expected_result->len, buf.len, "ciphertext length mismatch"); - KUNIT_EXPECT_EQ_MSG(test, - memcmp(param->expected_result->data, - text, param->expected_result->len), 0, - "ciphertext mismatch"); - KUNIT_EXPECT_EQ_MSG(test, - memcmp(param->next_iv->data, iv, - param->next_iv->len), 0, - "IV mismatch"); + KUNIT_EXPECT_MEMEQ_MSG(test, + param->expected_result->data, + text, + param->expected_result->len, + "ciphertext mismatch"); + KUNIT_EXPECT_MEMEQ_MSG(test, + param->next_iv->data, + iv, + param->next_iv->len, + "IV mismatch"); crypto_free_sync_skcipher(cts_tfm); crypto_free_sync_skcipher(cbc_tfm); @@ -1194,15 +1199,17 @@ static void rfc6803_encrypt_case(struct kunit *test) KUNIT_EXPECT_EQ_MSG(test, param->expected_result->len, buf.len + checksum.len, "ciphertext length mismatch"); - KUNIT_EXPECT_EQ_MSG(test, - memcmp(param->expected_result->data, - buf.head[0].iov_base, buf.len), 0, - "encrypted result mismatch"); - KUNIT_EXPECT_EQ_MSG(test, - memcmp(param->expected_result->data + - (param->expected_result->len - checksum.len), - checksum.data, checksum.len), 0, - "HMAC mismatch"); + KUNIT_EXPECT_MEMEQ_MSG(test, + param->expected_result->data, + buf.head[0].iov_base, + buf.len, + "encrypted result mismatch"); + KUNIT_EXPECT_MEMEQ_MSG(test, + param->expected_result->data + + (param->expected_result->len - checksum.len), + checksum.data, + checksum.len, + "HMAC mismatch"); crypto_free_ahash(ahash_tfm); 
crypto_free_sync_skcipher(cts_tfm); @@ -1687,15 +1694,16 @@ static void rfc8009_encrypt_case(struct kunit *test) KUNIT_EXPECT_EQ_MSG(test, param->expected_result->len, buf.len, "ciphertext length mismatch"); - KUNIT_EXPECT_EQ_MSG(test, - memcmp(param->expected_result->data, - buf.head[0].iov_base, - param->expected_result->len), 0, - "ciphertext mismatch"); - KUNIT_EXPECT_EQ_MSG(test, memcmp(param->expected_hmac->data, - checksum.data, - checksum.len), 0, - "HMAC mismatch"); + KUNIT_EXPECT_MEMEQ_MSG(test, + param->expected_result->data, + buf.head[0].iov_base, + param->expected_result->len, + "ciphertext mismatch"); + KUNIT_EXPECT_MEMEQ_MSG(test, + param->expected_hmac->data, + checksum.data, + checksum.len, + "HMAC mismatch"); crypto_free_ahash(ahash_tfm); crypto_free_sync_skcipher(cts_tfm); @@ -1826,10 +1834,11 @@ static void encrypt_selftest_case(struct kunit *test) KUNIT_EXPECT_EQ_MSG(test, param->plaintext->len, buf.len, "length mismatch"); - KUNIT_EXPECT_EQ_MSG(test, - memcmp(param->plaintext->data, - buf.head[0].iov_base, buf.len), 0, - "plaintext mismatch"); + KUNIT_EXPECT_MEMEQ_MSG(test, + param->plaintext->data, + buf.head[0].iov_base, + buf.len, + "plaintext mismatch"); crypto_free_sync_skcipher(cts_tfm); crypto_free_sync_skcipher(cbc_tfm); diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index ef8b7e8b1e9c..7081c1214e6c 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -134,11 +134,11 @@ static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail, return tmp; } + cache_get(new); hlist_add_head_rcu(&new->cache_list, head); detail->entries++; if (detail->nextcheck > new->expiry_time) detail->nextcheck = new->expiry_time + 1; - cache_get(new); spin_unlock(&detail->hash_lock); if (freeme) @@ -233,9 +233,9 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, spin_lock(&detail->hash_lock); cache_entry_update(detail, tmp, new); - hlist_add_head(&tmp->cache_list, &detail->hash_table[hash]); - detail->entries++; cache_get(tmp); + hlist_add_head_rcu(&tmp->cache_list, &detail->hash_table[hash]); + detail->entries++; cache_fresh_locked(tmp, new->expiry_time, detail); cache_fresh_locked(old, 0, detail); spin_unlock(&detail->hash_lock); @@ -399,7 +399,11 @@ static struct delayed_work cache_cleaner; void sunrpc_init_cache_detail(struct cache_detail *cd) { spin_lock_init(&cd->hash_lock); - INIT_LIST_HEAD(&cd->queue); + INIT_LIST_HEAD(&cd->requests); + INIT_LIST_HEAD(&cd->readers); + spin_lock_init(&cd->queue_lock); + init_waitqueue_head(&cd->queue_wait); + cd->next_seqno = 0; spin_lock(&cache_list_lock); cd->nextcheck = 0; cd->entries = 0; @@ -794,31 +798,20 @@ void cache_clean_deferred(void *owner) * On read, you get a full request, or block. * On write, an update request is processed. * Poll works if anything to read, and always allows write. - * - * Implemented by linked list of requests. Each open file has - * a ->private that also exists in this list. New requests are added - * to the end and may wakeup and preceding readers. - * New readers are added to the head. If, on read, an item is found with - * CACHE_UPCALLING clear, we free it from the list. 
- * */ -static DEFINE_SPINLOCK(queue_lock); - -struct cache_queue { - struct list_head list; - int reader; /* if 0, then request */ -}; struct cache_request { - struct cache_queue q; + struct list_head list; struct cache_head *item; - char * buf; + char *buf; int len; int readers; + u64 seqno; }; struct cache_reader { - struct cache_queue q; + struct list_head list; int offset; /* if non-0, we have a refcnt on next request */ + u64 next_seqno; }; static int cache_request(struct cache_detail *detail, @@ -833,6 +826,17 @@ static int cache_request(struct cache_detail *detail, return PAGE_SIZE - len; } +static struct cache_request * +cache_next_request(struct cache_detail *cd, u64 seqno) +{ + struct cache_request *rq; + + list_for_each_entry(rq, &cd->requests, list) + if (rq->seqno >= seqno) + return rq; + return NULL; +} + static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos, struct cache_detail *cd) { @@ -847,25 +851,18 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, inode_lock(inode); /* protect against multiple concurrent * readers on this file */ again: - spin_lock(&queue_lock); + spin_lock(&cd->queue_lock); /* need to find next request */ - while (rp->q.list.next != &cd->queue && - list_entry(rp->q.list.next, struct cache_queue, list) - ->reader) { - struct list_head *next = rp->q.list.next; - list_move(&rp->q.list, next); - } - if (rp->q.list.next == &cd->queue) { - spin_unlock(&queue_lock); + rq = cache_next_request(cd, rp->next_seqno); + if (!rq) { + spin_unlock(&cd->queue_lock); inode_unlock(inode); WARN_ON_ONCE(rp->offset); return 0; } - rq = container_of(rp->q.list.next, struct cache_request, q.list); - WARN_ON_ONCE(rq->q.reader); if (rp->offset == 0) rq->readers++; - spin_unlock(&queue_lock); + spin_unlock(&cd->queue_lock); if (rq->len == 0) { err = cache_request(cd, rq); @@ -876,9 +873,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, if (rp->offset == 0 && !test_bit(CACHE_PENDING, &rq->item->flags)) { err = -EAGAIN; - spin_lock(&queue_lock); - list_move(&rp->q.list, &rq->q.list); - spin_unlock(&queue_lock); + rp->next_seqno = rq->seqno + 1; } else { if (rp->offset + count > rq->len) count = rq->len - rp->offset; @@ -888,26 +883,24 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, rp->offset += count; if (rp->offset >= rq->len) { rp->offset = 0; - spin_lock(&queue_lock); - list_move(&rp->q.list, &rq->q.list); - spin_unlock(&queue_lock); + rp->next_seqno = rq->seqno + 1; } err = 0; } out: if (rp->offset == 0) { /* need to release rq */ - spin_lock(&queue_lock); + spin_lock(&cd->queue_lock); rq->readers--; if (rq->readers == 0 && !test_bit(CACHE_PENDING, &rq->item->flags)) { - list_del(&rq->q.list); - spin_unlock(&queue_lock); + list_del(&rq->list); + spin_unlock(&cd->queue_lock); cache_put(rq->item, cd); kfree(rq->buf); kfree(rq); } else - spin_unlock(&queue_lock); + spin_unlock(&cd->queue_lock); } if (err == -EAGAIN) goto again; @@ -971,16 +964,13 @@ out: return ret; } -static DECLARE_WAIT_QUEUE_HEAD(queue_wait); - static __poll_t cache_poll(struct file *filp, poll_table *wait, struct cache_detail *cd) { __poll_t mask; struct cache_reader *rp = filp->private_data; - struct cache_queue *cq; - poll_wait(filp, &queue_wait, wait); + poll_wait(filp, &cd->queue_wait, wait); /* alway allow write */ mask = EPOLLOUT | EPOLLWRNORM; @@ -988,15 +978,11 @@ static __poll_t cache_poll(struct file *filp, poll_table *wait, if (!rp) return mask; - 
spin_lock(&queue_lock); + spin_lock(&cd->queue_lock); - for (cq= &rp->q; &cq->list != &cd->queue; - cq = list_entry(cq->list.next, struct cache_queue, list)) - if (!cq->reader) { - mask |= EPOLLIN | EPOLLRDNORM; - break; - } - spin_unlock(&queue_lock); + if (cache_next_request(cd, rp->next_seqno)) + mask |= EPOLLIN | EPOLLRDNORM; + spin_unlock(&cd->queue_lock); return mask; } @@ -1006,25 +992,20 @@ static int cache_ioctl(struct inode *ino, struct file *filp, { int len = 0; struct cache_reader *rp = filp->private_data; - struct cache_queue *cq; + struct cache_request *rq; if (cmd != FIONREAD || !rp) return -EINVAL; - spin_lock(&queue_lock); + spin_lock(&cd->queue_lock); /* only find the length remaining in current request, * or the length of the next request */ - for (cq= &rp->q; &cq->list != &cd->queue; - cq = list_entry(cq->list.next, struct cache_queue, list)) - if (!cq->reader) { - struct cache_request *cr = - container_of(cq, struct cache_request, q); - len = cr->len - rp->offset; - break; - } - spin_unlock(&queue_lock); + rq = cache_next_request(cd, rp->next_seqno); + if (rq) + len = rq->len - rp->offset; + spin_unlock(&cd->queue_lock); return put_user(len, (int __user *)arg); } @@ -1044,11 +1025,11 @@ static int cache_open(struct inode *inode, struct file *filp, return -ENOMEM; } rp->offset = 0; - rp->q.reader = 1; + rp->next_seqno = 0; - spin_lock(&queue_lock); - list_add(&rp->q.list, &cd->queue); - spin_unlock(&queue_lock); + spin_lock(&cd->queue_lock); + list_add(&rp->list, &cd->readers); + spin_unlock(&cd->queue_lock); } if (filp->f_mode & FMODE_WRITE) atomic_inc(&cd->writers); @@ -1064,29 +1045,24 @@ static int cache_release(struct inode *inode, struct file *filp, if (rp) { struct cache_request *rq = NULL; - spin_lock(&queue_lock); + spin_lock(&cd->queue_lock); if (rp->offset) { - struct cache_queue *cq; - for (cq = &rp->q; &cq->list != &cd->queue; - cq = list_entry(cq->list.next, - struct cache_queue, list)) - if (!cq->reader) { - struct cache_request *cr = - container_of(cq, - struct cache_request, q); - cr->readers--; - if (cr->readers == 0 && - !test_bit(CACHE_PENDING, - &cr->item->flags)) { - list_del(&cr->q.list); - rq = cr; - } - break; + struct cache_request *cr; + + cr = cache_next_request(cd, rp->next_seqno); + if (cr) { + cr->readers--; + if (cr->readers == 0 && + !test_bit(CACHE_PENDING, + &cr->item->flags)) { + list_del(&cr->list); + rq = cr; } + } rp->offset = 0; } - list_del(&rp->q.list); - spin_unlock(&queue_lock); + list_del(&rp->list); + spin_unlock(&cd->queue_lock); if (rq) { cache_put(rq->item, cd); @@ -1109,27 +1085,24 @@ static int cache_release(struct inode *inode, struct file *filp, static void cache_dequeue(struct cache_detail *detail, struct cache_head *ch) { - struct cache_queue *cq, *tmp; - struct cache_request *cr; + struct cache_request *cr, *tmp; LIST_HEAD(dequeued); - spin_lock(&queue_lock); - list_for_each_entry_safe(cq, tmp, &detail->queue, list) - if (!cq->reader) { - cr = container_of(cq, struct cache_request, q); - if (cr->item != ch) - continue; - if (test_bit(CACHE_PENDING, &ch->flags)) - /* Lost a race and it is pending again */ - break; - if (cr->readers != 0) - continue; - list_move(&cr->q.list, &dequeued); - } - spin_unlock(&queue_lock); + spin_lock(&detail->queue_lock); + list_for_each_entry_safe(cr, tmp, &detail->requests, list) { + if (cr->item != ch) + continue; + if (test_bit(CACHE_PENDING, &ch->flags)) + /* Lost a race and it is pending again */ + break; + if (cr->readers != 0) + continue; + list_move(&cr->list, &dequeued); + } + 
spin_unlock(&detail->queue_lock); while (!list_empty(&dequeued)) { - cr = list_entry(dequeued.next, struct cache_request, q.list); - list_del(&cr->q.list); + cr = list_entry(dequeued.next, struct cache_request, list); + list_del(&cr->list); cache_put(cr->item, detail); kfree(cr->buf); kfree(cr); @@ -1247,20 +1220,20 @@ static int cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h) return -EAGAIN; } - crq->q.reader = 0; crq->buf = buf; crq->len = 0; crq->readers = 0; - spin_lock(&queue_lock); + spin_lock(&detail->queue_lock); if (test_bit(CACHE_PENDING, &h->flags)) { crq->item = cache_get(h); - list_add_tail(&crq->q.list, &detail->queue); + crq->seqno = detail->next_seqno++; + list_add_tail(&crq->list, &detail->requests); trace_cache_entry_upcall(detail, h); } else /* Lost a race, no longer PENDING, so don't enqueue */ ret = -EAGAIN; - spin_unlock(&queue_lock); - wake_up(&queue_wait); + spin_unlock(&detail->queue_lock); + wake_up(&detail->queue_wait); if (ret == -EAGAIN) { kfree(buf); kfree(crq); @@ -1378,18 +1351,14 @@ static void *__cache_seq_start(struct seq_file *m, loff_t *pos) hlist_for_each_entry_rcu(ch, &cd->hash_table[hash], cache_list) if (!entry--) return ch; - n &= ~((1LL<<32) - 1); - do { - hash++; - n += 1LL<<32; - } while(hash < cd->hash_size && - hlist_empty(&cd->hash_table[hash])); - if (hash >= cd->hash_size) - return NULL; - *pos = n+1; - return hlist_entry_safe(rcu_dereference_raw( + ch = NULL; + while (!ch && ++hash < cd->hash_size) + ch = hlist_entry_safe(rcu_dereference( hlist_first_rcu(&cd->hash_table[hash])), struct cache_head, cache_list); + + *pos = ((long long)hash << 32) + 1; + return ch; } static void *cache_seq_next(struct seq_file *m, void *p, loff_t *pos) @@ -1398,29 +1367,29 @@ static void *cache_seq_next(struct seq_file *m, void *p, loff_t *pos) int hash = (*pos >> 32); struct cache_detail *cd = m->private; - if (p == SEQ_START_TOKEN) + if (p == SEQ_START_TOKEN) { hash = 0; - else if (ch->cache_list.next == NULL) { - hash++; - *pos += 1LL<<32; - } else { - ++*pos; - return hlist_entry_safe(rcu_dereference_raw( - hlist_next_rcu(&ch->cache_list)), - struct cache_head, cache_list); + ch = NULL; } - *pos &= ~((1LL<<32) - 1); - while (hash < cd->hash_size && - hlist_empty(&cd->hash_table[hash])) { + while (hash < cd->hash_size) { + if (ch) + ch = hlist_entry_safe( + rcu_dereference( + hlist_next_rcu(&ch->cache_list)), + struct cache_head, cache_list); + else + ch = hlist_entry_safe( + rcu_dereference( + hlist_first_rcu(&cd->hash_table[hash])), + struct cache_head, cache_list); + if (ch) { + ++*pos; + return ch; + } hash++; - *pos += 1LL<<32; + *pos = (long long)hash << 32; } - if (hash >= cd->hash_size) - return NULL; - ++*pos; - return hlist_entry_safe(rcu_dereference_raw( - hlist_first_rcu(&cd->hash_table[hash])), - struct cache_head, cache_list); + return NULL; } void *cache_seq_start_rcu(struct seq_file *m, loff_t *pos) diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index d8ccb8e4b5c2..576fa42e7abf 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -638,13 +638,25 @@ svc_init_buffer(struct svc_rqst *rqstp, const struct svc_serv *serv, int node) { rqstp->rq_maxpages = svc_serv_maxpages(serv); - /* rq_pages' last entry is NULL for historical reasons. 
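/*
 * Illustrative sketch (not part of the applied patch): the reworked
 * upcall queue stamps each request with a monotonically increasing
 * seqno, and each reader simply remembers the next seqno it wants.  The
 * array below stands in for the request list; find_next() mirrors the
 * cache_next_request() scan without the locking.
 */
#include <stdio.h>

struct request {
	unsigned long long seqno;
	const char *payload;
};

static const struct request queue[] = {
	{ 1, "upcall A" },
	{ 2, "upcall B" },
	{ 5, "upcall C" },	/* seqnos 3 and 4 were already dequeued */
};

static const struct request *find_next(unsigned long long next_seqno)
{
	for (size_t i = 0; i < sizeof(queue) / sizeof(queue[0]); i++)
		if (queue[i].seqno >= next_seqno)
			return &queue[i];
	return NULL;
}

int main(void)
{
	unsigned long long next_seqno = 0;
	const struct request *rq;

	while ((rq = find_next(next_seqno)) != NULL) {
		printf("read %s (seqno %llu)\n", rq->payload, rq->seqno);
		next_seqno = rq->seqno + 1;
	}
	return 0;
}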
*/ + /* +1 for a NULL sentinel readable by nfsd_splice_actor() */ rqstp->rq_pages = kcalloc_node(rqstp->rq_maxpages + 1, sizeof(struct page *), GFP_KERNEL, node); if (!rqstp->rq_pages) return false; + /* +1 for a NULL sentinel at rq_page_end (see svc_rqst_replace_page) */ + rqstp->rq_respages = kcalloc_node(rqstp->rq_maxpages + 1, + sizeof(struct page *), + GFP_KERNEL, node); + if (!rqstp->rq_respages) { + kfree(rqstp->rq_pages); + rqstp->rq_pages = NULL; + return false; + } + + rqstp->rq_pages_nfree = rqstp->rq_maxpages; + rqstp->rq_next_page = rqstp->rq_respages + rqstp->rq_maxpages; return true; } @@ -656,10 +668,19 @@ svc_release_buffer(struct svc_rqst *rqstp) { unsigned long i; - for (i = 0; i < rqstp->rq_maxpages; i++) - if (rqstp->rq_pages[i]) - put_page(rqstp->rq_pages[i]); - kfree(rqstp->rq_pages); + if (rqstp->rq_pages) { + for (i = 0; i < rqstp->rq_maxpages; i++) + if (rqstp->rq_pages[i]) + put_page(rqstp->rq_pages[i]); + kfree(rqstp->rq_pages); + } + + if (rqstp->rq_respages) { + for (i = 0; i < rqstp->rq_maxpages; i++) + if (rqstp->rq_respages[i]) + put_page(rqstp->rq_respages[i]); + kfree(rqstp->rq_respages); + } } static void @@ -934,11 +955,11 @@ svc_set_num_threads(struct svc_serv *serv, unsigned int min_threads, EXPORT_SYMBOL_GPL(svc_set_num_threads); /** - * svc_rqst_replace_page - Replace one page in rq_pages[] + * svc_rqst_replace_page - Replace one page in rq_respages[] * @rqstp: svc_rqst with pages to replace * @page: replacement page * - * When replacing a page in rq_pages, batch the release of the + * When replacing a page in rq_respages, batch the release of the * replaced pages to avoid hammering the page allocator. * * Return values: @@ -947,19 +968,16 @@ EXPORT_SYMBOL_GPL(svc_set_num_threads); */ bool svc_rqst_replace_page(struct svc_rqst *rqstp, struct page *page) { - struct page **begin = rqstp->rq_pages; - struct page **end = &rqstp->rq_pages[rqstp->rq_maxpages]; + struct page **begin = rqstp->rq_respages; + struct page **end = rqstp->rq_page_end; if (unlikely(rqstp->rq_next_page < begin || rqstp->rq_next_page > end)) { trace_svc_replace_page_err(rqstp); return false; } - if (*rqstp->rq_next_page) { - if (!folio_batch_add(&rqstp->rq_fbatch, - page_folio(*rqstp->rq_next_page))) - __folio_batch_release(&rqstp->rq_fbatch); - } + if (*rqstp->rq_next_page) + svc_rqst_page_release(rqstp, *rqstp->rq_next_page); get_page(page); *(rqstp->rq_next_page++) = page; @@ -971,18 +989,24 @@ EXPORT_SYMBOL_GPL(svc_rqst_replace_page); * svc_rqst_release_pages - Release Reply buffer pages * @rqstp: RPC transaction context * - * Release response pages that might still be in flight after - * svc_send, and any spliced filesystem-owned pages. + * Release response pages in the range [rq_respages, rq_next_page). + * NULL entries in this range are skipped, allowing transports to + * transfer pages to a send context before this function runs. 
*/ void svc_rqst_release_pages(struct svc_rqst *rqstp) { - int i, count = rqstp->rq_next_page - rqstp->rq_respages; - - if (count) { - release_pages(rqstp->rq_respages, count); - for (i = 0; i < count; i++) - rqstp->rq_respages[i] = NULL; + struct page **pp; + + for (pp = rqstp->rq_respages; pp < rqstp->rq_next_page; pp++) { + if (*pp) { + if (!folio_batch_add(&rqstp->rq_fbatch, + page_folio(*pp))) + __folio_batch_release(&rqstp->rq_fbatch); + *pp = NULL; + } } + if (rqstp->rq_fbatch.nr) + __folio_batch_release(&rqstp->rq_fbatch); } /** diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 56a663b8939f..b16e710926c1 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -650,14 +650,13 @@ static void svc_check_conn_limits(struct svc_serv *serv) } } -static bool svc_alloc_arg(struct svc_rqst *rqstp) +static bool svc_fill_pages(struct svc_rqst *rqstp, struct page **pages, + unsigned long npages) { - struct xdr_buf *arg = &rqstp->rq_arg; - unsigned long pages, filled, ret; + unsigned long filled, ret; - pages = rqstp->rq_maxpages; - for (filled = 0; filled < pages; filled = ret) { - ret = alloc_pages_bulk(GFP_KERNEL, pages, rqstp->rq_pages); + for (filled = 0; filled < npages; filled = ret) { + ret = alloc_pages_bulk(GFP_KERNEL, npages, pages); if (ret > filled) /* Made progress, don't sleep yet */ continue; @@ -667,11 +666,40 @@ static bool svc_alloc_arg(struct svc_rqst *rqstp) set_current_state(TASK_RUNNING); return false; } - trace_svc_alloc_arg_err(pages, ret); + trace_svc_alloc_arg_err(npages, ret); memalloc_retry_wait(GFP_KERNEL); } - rqstp->rq_page_end = &rqstp->rq_pages[pages]; - rqstp->rq_pages[pages] = NULL; /* this might be seen in nfsd_splice_actor() */ + return true; +} + +static bool svc_alloc_arg(struct svc_rqst *rqstp) +{ + struct xdr_buf *arg = &rqstp->rq_arg; + unsigned long pages, nfree; + + pages = rqstp->rq_maxpages; + + nfree = rqstp->rq_pages_nfree; + if (nfree) { + if (!svc_fill_pages(rqstp, rqstp->rq_pages, nfree)) + return false; + rqstp->rq_pages_nfree = 0; + } + + if (WARN_ON_ONCE(rqstp->rq_next_page < rqstp->rq_respages)) + return false; + nfree = rqstp->rq_next_page - rqstp->rq_respages; + if (nfree) { + if (!svc_fill_pages(rqstp, rqstp->rq_respages, nfree)) + return false; + } + + rqstp->rq_next_page = rqstp->rq_respages; + rqstp->rq_page_end = &rqstp->rq_respages[pages]; + /* svc_rqst_replace_page() dereferences *rq_next_page even + * at rq_page_end; NULL prevents releasing a garbage page. 
+ */ + rqstp->rq_page_end[0] = NULL; /* Make arg->head point to first page and arg->pages point to rest */ arg->head[0].iov_base = page_address(rqstp->rq_pages[0]); @@ -1277,7 +1305,6 @@ static noinline int svc_deferred_recv(struct svc_rqst *rqstp) rqstp->rq_addrlen = dr->addrlen; /* Save off transport header len in case we get deferred again */ rqstp->rq_daddr = dr->daddr; - rqstp->rq_respages = rqstp->rq_pages; rqstp->rq_xprt_ctxt = dr->xprt_ctxt; dr->xprt_ctxt = NULL; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index f28c6076f7e8..7be3de1a1aed 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -351,8 +351,6 @@ static ssize_t svc_tcp_read_msg(struct svc_rqst *rqstp, size_t buflen, for (i = 0, t = 0; t < buflen; i++, t += PAGE_SIZE) bvec_set_page(&bvec[i], rqstp->rq_pages[i], PAGE_SIZE, 0); - rqstp->rq_respages = &rqstp->rq_pages[i]; - rqstp->rq_next_page = rqstp->rq_respages + 1; iov_iter_bvec(&msg.msg_iter, ITER_DEST, bvec, i, buflen); if (seek) { @@ -677,13 +675,9 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) if (len <= rqstp->rq_arg.head[0].iov_len) { rqstp->rq_arg.head[0].iov_len = len; rqstp->rq_arg.page_len = 0; - rqstp->rq_respages = rqstp->rq_pages+1; } else { rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len; - rqstp->rq_respages = rqstp->rq_pages + 1 + - DIV_ROUND_UP(rqstp->rq_arg.page_len, PAGE_SIZE); } - rqstp->rq_next_page = rqstp->rq_respages+1; if (serv->sv_stats) serv->sv_stats->netudpcnt++; @@ -994,7 +988,7 @@ static size_t svc_tcp_restore_pages(struct svc_sock *svsk, npages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; for (i = 0; i < npages; i++) { if (rqstp->rq_pages[i] != NULL) - put_page(rqstp->rq_pages[i]); + svc_rqst_page_release(rqstp, rqstp->rq_pages[i]); BUG_ON(svsk->sk_pages[i] == NULL); rqstp->rq_pages[i] = svsk->sk_pages[i]; svsk->sk_pages[i] = NULL; @@ -1015,6 +1009,7 @@ static void svc_tcp_save_pages(struct svc_sock *svsk, struct svc_rqst *rqstp) svsk->sk_pages[i] = rqstp->rq_pages[i]; rqstp->rq_pages[i] = NULL; } + rqstp->rq_pages_nfree = npages; } static void svc_tcp_clear_pages(struct svc_sock *svsk) diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index e7e4a39ca6c6..f8a0638eb095 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -118,7 +118,8 @@ svc_rdma_next_recv_ctxt(struct list_head *list) static struct svc_rdma_recv_ctxt * svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma) { - int node = ibdev_to_node(rdma->sc_cm_id->device); + struct ib_device *device = rdma->sc_cm_id->device; + int node = ibdev_to_node(device); struct svc_rdma_recv_ctxt *ctxt; unsigned long pages; dma_addr_t addr; @@ -133,9 +134,9 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma) buffer = kmalloc_node(rdma->sc_max_req_size, GFP_KERNEL, node); if (!buffer) goto fail1; - addr = ib_dma_map_single(rdma->sc_pd->device, buffer, - rdma->sc_max_req_size, DMA_FROM_DEVICE); - if (ib_dma_mapping_error(rdma->sc_pd->device, addr)) + addr = ib_dma_map_single(device, buffer, rdma->sc_max_req_size, + DMA_FROM_DEVICE); + if (ib_dma_mapping_error(device, addr)) goto fail2; svc_rdma_recv_cid_init(rdma, &ctxt->rc_cid); @@ -167,7 +168,7 @@ fail0: static void svc_rdma_recv_ctxt_destroy(struct svcxprt_rdma *rdma, struct svc_rdma_recv_ctxt *ctxt) { - ib_dma_unmap_single(rdma->sc_pd->device, ctxt->rc_recv_sge.addr, + ib_dma_unmap_single(rdma->sc_cm_id->device, ctxt->rc_recv_sge.addr, ctxt->rc_recv_sge.length, DMA_FROM_DEVICE); kfree(ctxt->rc_recv_buf); 
kfree(ctxt); @@ -861,18 +862,12 @@ static noinline void svc_rdma_read_complete(struct svc_rqst *rqstp, unsigned int i; /* Transfer the Read chunk pages into @rqstp.rq_pages, replacing - * the rq_pages that were already allocated for this rqstp. + * the receive buffer pages already allocated for this rqstp. */ - release_pages(rqstp->rq_respages, ctxt->rc_page_count); + release_pages(rqstp->rq_pages, ctxt->rc_page_count); for (i = 0; i < ctxt->rc_page_count; i++) rqstp->rq_pages[i] = ctxt->rc_pages[i]; - /* Update @rqstp's result send buffer to start after the - * last page in the RDMA Read payload. - */ - rqstp->rq_respages = &rqstp->rq_pages[ctxt->rc_page_count]; - rqstp->rq_next_page = rqstp->rq_respages + 1; - /* Prevent svc_rdma_recv_ctxt_put() from releasing the * pages in ctxt::rc_pages a second time. */ @@ -931,10 +926,9 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) struct svc_rdma_recv_ctxt *ctxt; int ret; - /* Prevent svc_xprt_release() from releasing pages in rq_pages - * when returning 0 or an error. + /* Precaution: a zero page count on error return causes + * svc_rqst_release_pages() to release nothing. */ - rqstp->rq_respages = rqstp->rq_pages; rqstp->rq_next_page = rqstp->rq_respages; rqstp->rq_xprt_ctxt = NULL; @@ -962,7 +956,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) return 0; percpu_counter_inc(&svcrdma_stat_recv); - ib_dma_sync_single_for_cpu(rdma_xprt->sc_pd->device, + ib_dma_sync_single_for_cpu(rdma_xprt->sc_cm_id->device, ctxt->rc_recv_sge.addr, ctxt->rc_byte_len, DMA_FROM_DEVICE); svc_rdma_build_arg_xdr(rqstp, ctxt); diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 4ec2f9ae06aa..402e2ceca4ff 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -252,6 +252,28 @@ static void svc_rdma_write_info_free(struct svc_rdma_write_info *info) } /** + * svc_rdma_write_chunk_release - Release Write chunk I/O resources + * @rdma: controlling transport + * @ctxt: Send context that is being released + * + * Write chunk resources remain live until Send completion because + * Write WRs are chained to the Send WR. This function releases all + * write_info structures accumulated on @ctxt->sc_write_info_list. 
+ */ +void svc_rdma_write_chunk_release(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt) +{ + struct svc_rdma_write_info *info; + + while (!list_empty(&ctxt->sc_write_info_list)) { + info = list_first_entry(&ctxt->sc_write_info_list, + struct svc_rdma_write_info, wi_list); + list_del(&info->wi_list); + svc_rdma_write_info_free(info); + } +} + +/** * svc_rdma_reply_chunk_release - Release Reply chunk I/O resources * @rdma: controlling transport * @ctxt: Send context that is being released @@ -307,13 +329,11 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) struct ib_cqe *cqe = wc->wr_cqe; struct svc_rdma_chunk_ctxt *cc = container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe); - struct svc_rdma_write_info *info = - container_of(cc, struct svc_rdma_write_info, wi_cc); switch (wc->status) { case IB_WC_SUCCESS: trace_svcrdma_wc_write(&cc->cc_cid); - break; + return; case IB_WC_WR_FLUSH_ERR: trace_svcrdma_wc_write_flush(wc, &cc->cc_cid); break; @@ -321,12 +341,11 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) trace_svcrdma_wc_write_err(wc, &cc->cc_cid); } - svc_rdma_wake_send_waiters(rdma, cc->cc_sqecount); - - if (unlikely(wc->status != IB_WC_SUCCESS)) - svc_xprt_deferred_close(&rdma->sc_xprt); - - svc_rdma_write_info_free(info); + /* The RDMA Write has flushed, so the client won't get + * some of the outgoing RPC message. Signal the loss + * to the client by closing the connection. + */ + svc_xprt_deferred_close(&rdma->sc_xprt); } /** @@ -405,34 +424,17 @@ static int svc_rdma_post_chunk_ctxt(struct svcxprt_rdma *rdma, cqe = NULL; } - do { - if (atomic_sub_return(cc->cc_sqecount, - &rdma->sc_sq_avail) > 0) { - cc->cc_posttime = ktime_get(); - ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr); - if (ret) - break; - return 0; - } - - percpu_counter_inc(&svcrdma_stat_sq_starve); - trace_svcrdma_sq_full(rdma, &cc->cc_cid); - atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); - wait_event(rdma->sc_send_wait, - atomic_read(&rdma->sc_sq_avail) > cc->cc_sqecount); - trace_svcrdma_sq_retry(rdma, &cc->cc_cid); - } while (1); - - trace_svcrdma_sq_post_err(rdma, &cc->cc_cid, ret); - svc_xprt_deferred_close(&rdma->sc_xprt); - - /* If even one was posted, there will be a completion. */ - if (bad_wr != first_wr) - return 0; + ret = svc_rdma_sq_wait(rdma, &cc->cc_cid, cc->cc_sqecount); + if (ret < 0) + return ret; - atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); - wake_up(&rdma->sc_send_wait); - return -ENOTCONN; + cc->cc_posttime = ktime_get(); + ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr); + if (ret) + return svc_rdma_post_send_err(rdma, &cc->cc_cid, bad_wr, + first_wr, cc->cc_sqecount, + ret); + return 0; } /* Build a bvec that covers one kvec in an xdr_buf. @@ -617,9 +619,37 @@ static int svc_rdma_xb_write(const struct xdr_buf *xdr, void *data) return xdr->len; } -static int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, - const struct svc_rdma_chunk *chunk, - const struct xdr_buf *xdr) +/* Link chunk WRs onto @sctxt's WR chain. Completion is requested + * for the tail WR, which is posted first. 
+ */ +static void svc_rdma_cc_link_wrs(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *sctxt, + struct svc_rdma_chunk_ctxt *cc) +{ + struct ib_send_wr *first_wr; + struct list_head *pos; + struct ib_cqe *cqe; + + first_wr = sctxt->sc_wr_chain; + cqe = &cc->cc_cqe; + list_for_each(pos, &cc->cc_rwctxts) { + struct svc_rdma_rw_ctxt *rwc; + + rwc = list_entry(pos, struct svc_rdma_rw_ctxt, rw_list); + first_wr = rdma_rw_ctx_wrs(&rwc->rw_ctx, rdma->sc_qp, + rdma->sc_port_num, cqe, first_wr); + cqe = NULL; + } + sctxt->sc_wr_chain = first_wr; + sctxt->sc_sqecount += cc->cc_sqecount; +} + +/* Link Write WRs for @chunk onto @sctxt's WR chain. + */ +static int svc_rdma_prepare_write_chunk(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *sctxt, + const struct svc_rdma_chunk *chunk, + const struct xdr_buf *xdr) { struct svc_rdma_write_info *info; struct svc_rdma_chunk_ctxt *cc; @@ -639,10 +669,14 @@ static int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, if (ret != payload.len) goto out_err; - trace_svcrdma_post_write_chunk(&cc->cc_cid, cc->cc_sqecount); - ret = svc_rdma_post_chunk_ctxt(rdma, cc); - if (ret < 0) + ret = -EINVAL; + if (unlikely(sctxt->sc_sqecount + cc->cc_sqecount > rdma->sc_sq_depth)) goto out_err; + + svc_rdma_cc_link_wrs(rdma, sctxt, cc); + list_add(&info->wi_list, &sctxt->sc_write_info_list); + + trace_svcrdma_post_write_chunk(&cc->cc_cid, cc->cc_sqecount); return 0; out_err: @@ -651,17 +685,19 @@ out_err: } /** - * svc_rdma_send_write_list - Send all chunks on the Write list + * svc_rdma_prepare_write_list - Construct WR chain for sending Write list * @rdma: controlling RDMA transport * @rctxt: Write list provisioned by the client + * @sctxt: Send WR resources * @xdr: xdr_buf containing an RPC Reply message * - * Returns zero on success, or a negative errno if one or more - * Write chunks could not be sent. + * Returns zero on success, or a negative errno if WR chain + * construction fails for one or more Write chunks. 
*/ -int svc_rdma_send_write_list(struct svcxprt_rdma *rdma, - const struct svc_rdma_recv_ctxt *rctxt, - const struct xdr_buf *xdr) +int svc_rdma_prepare_write_list(struct svcxprt_rdma *rdma, + const struct svc_rdma_recv_ctxt *rctxt, + struct svc_rdma_send_ctxt *sctxt, + const struct xdr_buf *xdr) { struct svc_rdma_chunk *chunk; int ret; @@ -669,7 +705,7 @@ int svc_rdma_send_write_list(struct svcxprt_rdma *rdma, pcl_for_each_chunk(chunk, &rctxt->rc_write_pcl) { if (!chunk->ch_payload_length) break; - ret = svc_rdma_send_write_chunk(rdma, chunk, xdr); + ret = svc_rdma_prepare_write_chunk(rdma, sctxt, chunk, xdr); if (ret < 0) return ret; } @@ -699,9 +735,6 @@ int svc_rdma_prepare_reply_chunk(struct svcxprt_rdma *rdma, { struct svc_rdma_write_info *info = &sctxt->sc_reply_info; struct svc_rdma_chunk_ctxt *cc = &info->wi_cc; - struct ib_send_wr *first_wr; - struct list_head *pos; - struct ib_cqe *cqe; int ret; info->wi_rdma = rdma; @@ -715,23 +748,222 @@ int svc_rdma_prepare_reply_chunk(struct svcxprt_rdma *rdma, if (ret < 0) return ret; - first_wr = sctxt->sc_wr_chain; - cqe = &cc->cc_cqe; - list_for_each(pos, &cc->cc_rwctxts) { - struct svc_rdma_rw_ctxt *rwc; - - rwc = list_entry(pos, struct svc_rdma_rw_ctxt, rw_list); - first_wr = rdma_rw_ctx_wrs(&rwc->rw_ctx, rdma->sc_qp, - rdma->sc_port_num, cqe, first_wr); - cqe = NULL; - } - sctxt->sc_wr_chain = first_wr; - sctxt->sc_sqecount += cc->cc_sqecount; + svc_rdma_cc_link_wrs(rdma, sctxt, cc); trace_svcrdma_post_reply_chunk(&cc->cc_cid, cc->cc_sqecount); return xdr->len; } +/* + * Cap contiguous RDMA Read sink allocations at order-4. + * Higher orders risk allocation failure under + * __GFP_NORETRY, which would negate the benefit of the + * contiguous fast path. + */ +#define SVC_RDMA_CONTIG_MAX_ORDER 4 + +/** + * svc_rdma_alloc_read_pages - Allocate physically contiguous pages + * @nr_pages: number of pages needed + * @order: on success, set to the allocation order + * + * Attempts a higher-order allocation, falling back to smaller orders. + * The returned pages are split immediately so each sub-page has its + * own refcount and can be freed independently. + * + * Returns a pointer to the first page on success, or NULL if even + * order-1 allocation fails. + */ +static struct page * +svc_rdma_alloc_read_pages(unsigned int nr_pages, unsigned int *order) +{ + unsigned int o; + struct page *page; + + o = min(get_order(nr_pages << PAGE_SHIFT), + SVC_RDMA_CONTIG_MAX_ORDER); + + while (o >= 1) { + page = alloc_pages(GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN, + o); + if (page) { + split_page(page, o); + *order = o; + return page; + } + o--; + } + return NULL; +} + +/* + * svc_rdma_fill_contig_bvec - Replace rq_pages with a contiguous allocation + * @rqstp: RPC transaction context + * @head: context for ongoing I/O + * @bv: bvec entry to fill + * @pages_left: number of data pages remaining in the segment + * @len_left: bytes remaining in the segment + * + * On success, fills @bv with a bvec spanning the contiguous range and + * advances rc_curpage/rc_page_count. Returns the byte length covered, + * or zero if the allocation failed or would overrun rq_maxpages. 
+ */ +static unsigned int +svc_rdma_fill_contig_bvec(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *head, + struct bio_vec *bv, unsigned int pages_left, + unsigned int len_left) +{ + unsigned int order, npages, chunk_pages, chunk_len, i; + struct page *page; + + page = svc_rdma_alloc_read_pages(pages_left, &order); + if (!page) + return 0; + npages = 1 << order; + + if (head->rc_curpage + npages > rqstp->rq_maxpages) { + for (i = 0; i < npages; i++) + __free_page(page + i); + return 0; + } + + /* + * Replace rq_pages[] entries with pages from the contiguous + * allocation. If npages exceeds chunk_pages, the extra pages + * stay in rq_pages[] for later reuse or normal rqst teardown. + */ + for (i = 0; i < npages; i++) { + svc_rqst_page_release(rqstp, + rqstp->rq_pages[head->rc_curpage + i]); + rqstp->rq_pages[head->rc_curpage + i] = page + i; + } + + chunk_pages = min(npages, pages_left); + chunk_len = min_t(unsigned int, chunk_pages << PAGE_SHIFT, len_left); + bvec_set_page(bv, page, chunk_len, 0); + head->rc_page_count += chunk_pages; + head->rc_curpage += chunk_pages; + return chunk_len; +} + +/* + * svc_rdma_fill_page_bvec - Add a single rq_page to the bvec array + * @head: context for ongoing I/O + * @ctxt: R/W context whose bvec array is being filled + * @cur: page to add + * @bvec_idx: pointer to current bvec index, not advanced on merge + * @len_left: bytes remaining in the segment + * + * If @cur is physically contiguous with the preceding bvec, it is + * merged by extending that bvec's length. Otherwise a new bvec + * entry is created. Returns the byte length covered. + */ +static unsigned int +svc_rdma_fill_page_bvec(struct svc_rdma_recv_ctxt *head, + struct svc_rdma_rw_ctxt *ctxt, struct page *cur, + unsigned int *bvec_idx, unsigned int len_left) +{ + unsigned int chunk_len = min_t(unsigned int, PAGE_SIZE, len_left); + + head->rc_page_count++; + head->rc_curpage++; + + if (*bvec_idx > 0) { + struct bio_vec *prev = &ctxt->rw_bvec[*bvec_idx - 1]; + + if (page_to_phys(prev->bv_page) + prev->bv_offset + + prev->bv_len == page_to_phys(cur)) { + prev->bv_len += chunk_len; + return chunk_len; + } + } + + bvec_set_page(&ctxt->rw_bvec[*bvec_idx], cur, chunk_len, 0); + (*bvec_idx)++; + return chunk_len; +} + +/** + * svc_rdma_build_read_segment_contig - Build RDMA Read WR with contiguous pages + * @rqstp: RPC transaction context + * @head: context for ongoing I/O + * @segment: co-ordinates of remote memory to be read + * + * Greedily allocates higher-order pages to cover the segment, + * building one bvec per contiguous chunk. Each allocation is + * split so sub-pages have independent refcounts. When a + * higher-order allocation fails, remaining pages are covered + * individually, merging adjacent pages into the preceding bvec + * when they are physically contiguous. The split sub-pages + * replace entries in rq_pages[] so downstream cleanup is + * unchanged. 
+ * + * Returns: + * %0: the Read WR was constructed successfully + * %-ENOMEM: allocation failed + * %-EIO: a DMA mapping error occurred + */ +static int svc_rdma_build_read_segment_contig(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *head, + const struct svc_rdma_segment *segment) +{ + struct svcxprt_rdma *rdma = svc_rdma_rqst_rdma(rqstp); + struct svc_rdma_chunk_ctxt *cc = &head->rc_cc; + unsigned int nr_data_pages, bvec_idx; + struct svc_rdma_rw_ctxt *ctxt; + unsigned int len_left; + int ret; + + nr_data_pages = PAGE_ALIGN(segment->rs_length) >> PAGE_SHIFT; + if (head->rc_curpage + nr_data_pages > rqstp->rq_maxpages) + return -ENOMEM; + + ctxt = svc_rdma_get_rw_ctxt(rdma, nr_data_pages); + if (!ctxt) + return -ENOMEM; + + bvec_idx = 0; + len_left = segment->rs_length; + while (len_left) { + unsigned int pages_left = PAGE_ALIGN(len_left) >> PAGE_SHIFT; + unsigned int chunk_len = 0; + + if (pages_left >= 2) + chunk_len = svc_rdma_fill_contig_bvec(rqstp, head, + &ctxt->rw_bvec[bvec_idx], + pages_left, len_left); + if (chunk_len) { + bvec_idx++; + } else { + struct page *cur = + rqstp->rq_pages[head->rc_curpage]; + chunk_len = svc_rdma_fill_page_bvec(head, ctxt, cur, + &bvec_idx, + len_left); + } + + len_left -= chunk_len; + } + + ctxt->rw_nents = bvec_idx; + + head->rc_pageoff = offset_in_page(segment->rs_length); + if (head->rc_pageoff) + head->rc_curpage--; + + ret = svc_rdma_rw_ctx_init(rdma, ctxt, segment->rs_offset, + segment->rs_handle, segment->rs_length, + DMA_FROM_DEVICE); + if (ret < 0) + return -EIO; + percpu_counter_inc(&svcrdma_stat_read); + + list_add(&ctxt->rw_list, &cc->cc_rwctxts); + cc->cc_sqecount += ret; + return 0; +} + /** * svc_rdma_build_read_segment - Build RDMA Read WQEs to pull one RDMA segment * @rqstp: RPC transaction context @@ -758,6 +990,14 @@ static int svc_rdma_build_read_segment(struct svc_rqst *rqstp, if (check_add_overflow(head->rc_pageoff, len, &total)) return -EINVAL; nr_bvec = PAGE_ALIGN(total) >> PAGE_SHIFT; + + if (head->rc_pageoff == 0 && nr_bvec >= 2) { + ret = svc_rdma_build_read_segment_contig(rqstp, head, + segment); + if (ret != -ENOMEM) + return ret; + } + ctxt = svc_rdma_get_rw_ctxt(rdma, nr_bvec); if (!ctxt) return -ENOMEM; @@ -1103,10 +1343,16 @@ static void svc_rdma_clear_rqst_pages(struct svc_rqst *rqstp, { unsigned int i; + /* + * Move only pages containing RPC data into rc_pages[]. Pages + * from a contiguous allocation that were not used for the + * payload remain in rq_pages[] for subsequent reuse. 
+ */ for (i = 0; i < head->rc_page_count; i++) { head->rc_pages[i] = rqstp->rq_pages[i]; rqstp->rq_pages[i] = NULL; } + rqstp->rq_pages_nfree = head->rc_page_count; } /** diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 914cd263c2f1..8b3f0c8c14b2 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -116,7 +116,8 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc); static struct svc_rdma_send_ctxt * svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) { - int node = ibdev_to_node(rdma->sc_cm_id->device); + struct ib_device *device = rdma->sc_cm_id->device; + int node = ibdev_to_node(device); struct svc_rdma_send_ctxt *ctxt; unsigned long pages; dma_addr_t addr; @@ -136,9 +137,9 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) buffer = kmalloc_node(rdma->sc_max_req_size, GFP_KERNEL, node); if (!buffer) goto fail2; - addr = ib_dma_map_single(rdma->sc_pd->device, buffer, - rdma->sc_max_req_size, DMA_TO_DEVICE); - if (ib_dma_mapping_error(rdma->sc_pd->device, addr)) + addr = ib_dma_map_single(device, buffer, rdma->sc_max_req_size, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(device, addr)) goto fail3; svc_rdma_send_cid_init(rdma, &ctxt->sc_cid); @@ -149,6 +150,7 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) ctxt->sc_send_wr.sg_list = ctxt->sc_sges; ctxt->sc_send_wr.send_flags = IB_SEND_SIGNALED; ctxt->sc_cqe.done = svc_rdma_wc_send; + INIT_LIST_HEAD(&ctxt->sc_write_info_list); ctxt->sc_xprt_buf = buffer; xdr_buf_init(&ctxt->sc_hdrbuf, ctxt->sc_xprt_buf, rdma->sc_max_req_size); @@ -175,15 +177,14 @@ fail0: */ void svc_rdma_send_ctxts_destroy(struct svcxprt_rdma *rdma) { + struct ib_device *device = rdma->sc_cm_id->device; struct svc_rdma_send_ctxt *ctxt; struct llist_node *node; while ((node = llist_del_first(&rdma->sc_send_ctxts)) != NULL) { ctxt = llist_entry(node, struct svc_rdma_send_ctxt, sc_node); - ib_dma_unmap_single(rdma->sc_pd->device, - ctxt->sc_sges[0].addr, - rdma->sc_max_req_size, - DMA_TO_DEVICE); + ib_dma_unmap_single(device, ctxt->sc_sges[0].addr, + rdma->sc_max_req_size, DMA_TO_DEVICE); kfree(ctxt->sc_xprt_buf); kfree(ctxt->sc_pages); kfree(ctxt); @@ -237,6 +238,7 @@ static void svc_rdma_send_ctxt_release(struct svcxprt_rdma *rdma, struct ib_device *device = rdma->sc_cm_id->device; unsigned int i; + svc_rdma_write_chunk_release(rdma, ctxt); svc_rdma_reply_chunk_release(rdma, ctxt); if (ctxt->sc_page_count) @@ -295,6 +297,117 @@ void svc_rdma_wake_send_waiters(struct svcxprt_rdma *rdma, int avail) } /** + * svc_rdma_sq_wait - Wait for SQ slots using fair queuing + * @rdma: controlling transport + * @cid: completion ID for tracing + * @sqecount: number of SQ entries needed + * + * A ticket-based system ensures fair ordering when multiple threads + * wait for Send Queue capacity. Each waiter takes a ticket and is + * served in order, preventing starvation. + * + * Protocol invariant: every ticket holder must increment + * sc_sq_ticket_tail exactly once, whether the reservation + * succeeds or the connection closes. Failing to advance the + * tail stalls all subsequent waiters. + * + * The ticket counters are signed 32-bit atomics. After + * wrapping through INT_MAX, the equality check + * (tail == ticket) remains correct because both counters + * advance monotonically and the comparison uses exact + * equality rather than relational operators. 
+ * + * Return values: + * %0: SQ slots were reserved successfully + * %-ENOTCONN: The connection was lost + */ +int svc_rdma_sq_wait(struct svcxprt_rdma *rdma, + const struct rpc_rdma_cid *cid, int sqecount) +{ + int ticket; + + /* Fast path: try to reserve SQ slots without waiting. + * + * A failed reservation temporarily understates sc_sq_avail + * until the compensating atomic_add restores it. A Send + * completion arriving in that window sees a lower count + * than reality, but the value self-corrects once the add + * completes. No ordering guarantee is needed here because + * the slow path serializes all contended waiters. + */ + if (likely(atomic_sub_return(sqecount, &rdma->sc_sq_avail) >= 0)) + return 0; + atomic_add(sqecount, &rdma->sc_sq_avail); + + /* Slow path: take a ticket and wait in line */ + ticket = atomic_fetch_inc(&rdma->sc_sq_ticket_head); + + percpu_counter_inc(&svcrdma_stat_sq_starve); + trace_svcrdma_sq_full(rdma, cid); + + /* Wait until all earlier tickets have been served */ + wait_event(rdma->sc_sq_ticket_wait, + test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags) || + atomic_read(&rdma->sc_sq_ticket_tail) == ticket); + if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) + goto out_close; + + /* It's our turn. Wait for enough SQ slots to be available. */ + while (atomic_sub_return(sqecount, &rdma->sc_sq_avail) < 0) { + atomic_add(sqecount, &rdma->sc_sq_avail); + + wait_event(rdma->sc_send_wait, + test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags) || + atomic_read(&rdma->sc_sq_avail) >= sqecount); + if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) + goto out_close; + } + + /* Slots reserved successfully. Let the next waiter proceed. */ + atomic_inc(&rdma->sc_sq_ticket_tail); + wake_up(&rdma->sc_sq_ticket_wait); + trace_svcrdma_sq_retry(rdma, cid); + return 0; + +out_close: + atomic_inc(&rdma->sc_sq_ticket_tail); + wake_up(&rdma->sc_sq_ticket_wait); + return -ENOTCONN; +} + +/** + * svc_rdma_post_send_err - Handle ib_post_send failure + * @rdma: controlling transport + * @cid: completion ID for tracing + * @bad_wr: first WR that was not posted + * @first_wr: first WR in the chain + * @sqecount: number of SQ entries that were reserved + * @ret: error code from ib_post_send + * + * Return values: + * %0: At least one WR was posted; a completion handles cleanup + * %-ENOTCONN: No WRs were posted; SQ slots are released + */ +int svc_rdma_post_send_err(struct svcxprt_rdma *rdma, + const struct rpc_rdma_cid *cid, + const struct ib_send_wr *bad_wr, + const struct ib_send_wr *first_wr, + int sqecount, int ret) +{ + trace_svcrdma_sq_post_err(rdma, cid, ret); + svc_xprt_deferred_close(&rdma->sc_xprt); + + /* If even one WR was posted, a Send completion will + * return the reserved SQ slots. + */ + if (bad_wr != first_wr) + return 0; + + svc_rdma_wake_send_waiters(rdma, sqecount); + return -ENOTCONN; +} + +/** * svc_rdma_wc_send - Invoked by RDMA provider for each polled Send WC * @cq: Completion Queue context * @wc: Work Completion object @@ -336,11 +449,6 @@ flushed: * that these values remain available after the ib_post_send() call. * In some error flow cases, svc_rdma_wc_send() releases @ctxt. * - * Note there is potential for starvation when the Send Queue is - * full because there is no order to when waiting threads are - * awoken. The transport is typically provisioned with a deep - * enough Send Queue that SQ exhaustion should be a rare event. 
- * * Return values: * %0: @ctxt's WR chain was posted successfully * %-ENOTCONN: The connection was lost @@ -357,47 +465,21 @@ int svc_rdma_post_send(struct svcxprt_rdma *rdma, might_sleep(); /* Sync the transport header buffer */ - ib_dma_sync_single_for_device(rdma->sc_pd->device, + ib_dma_sync_single_for_device(rdma->sc_cm_id->device, send_wr->sg_list[0].addr, send_wr->sg_list[0].length, DMA_TO_DEVICE); - /* If the SQ is full, wait until an SQ entry is available */ - while (!test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) { - if (atomic_sub_return(sqecount, &rdma->sc_sq_avail) < 0) { - svc_rdma_wake_send_waiters(rdma, sqecount); - - /* When the transport is torn down, assume - * ib_drain_sq() will trigger enough Send - * completions to wake us. The XPT_CLOSE test - * above should then cause the while loop to - * exit. - */ - percpu_counter_inc(&svcrdma_stat_sq_starve); - trace_svcrdma_sq_full(rdma, &cid); - wait_event(rdma->sc_send_wait, - atomic_read(&rdma->sc_sq_avail) > 0); - trace_svcrdma_sq_retry(rdma, &cid); - continue; - } - - trace_svcrdma_post_send(ctxt); - ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr); - if (ret) { - trace_svcrdma_sq_post_err(rdma, &cid, ret); - svc_xprt_deferred_close(&rdma->sc_xprt); - - /* If even one WR was posted, there will be a - * Send completion that bumps sc_sq_avail. - */ - if (bad_wr == first_wr) { - svc_rdma_wake_send_waiters(rdma, sqecount); - break; - } - } - return 0; - } - return -ENOTCONN; + ret = svc_rdma_sq_wait(rdma, &cid, sqecount); + if (ret < 0) + return ret; + + trace_svcrdma_post_send(ctxt); + ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr); + if (ret) + return svc_rdma_post_send_err(rdma, &cid, bad_wr, + first_wr, sqecount, ret); + return 0; } /** @@ -858,7 +940,8 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, /* The svc_rqst and all resources it owns are released as soon as * svc_rdma_sendto returns. Transfer pages under I/O to the ctxt - * so they are released by the Send completion handler. + * so they are released only after Send completion, and not by + * svc_rqst_release_pages(). */ static void svc_rdma_save_io_pages(struct svc_rqst *rqstp, struct svc_rdma_send_ctxt *ctxt) @@ -870,9 +953,6 @@ static void svc_rdma_save_io_pages(struct svc_rqst *rqstp, ctxt->sc_pages[i] = rqstp->rq_respages[i]; rqstp->rq_respages[i] = NULL; } - - /* Prevent svc_xprt_release from releasing pages in rq_pages */ - rqstp->rq_next_page = rqstp->rq_respages; } /* Prepare the portion of the RPC Reply that will be transmitted @@ -976,6 +1056,12 @@ void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, sctxt->sc_send_wr.num_sge = 1; sctxt->sc_send_wr.opcode = IB_WR_SEND; sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len; + + /* Ensure only the error message is posted, not any previously + * prepared Write chunk WRs. 
+ */ + sctxt->sc_wr_chain = &sctxt->sc_send_wr; + sctxt->sc_sqecount = 1; if (svc_rdma_post_send(rdma, sctxt)) goto put_ctxt; return; @@ -1023,7 +1109,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) if (!p) goto put_ctxt; - ret = svc_rdma_send_write_list(rdma, rctxt, &rqstp->rq_res); + ret = svc_rdma_prepare_write_list(rdma, rctxt, sctxt, &rqstp->rq_res); if (ret < 0) goto put_ctxt; diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 9b623849723e..f18bc60d9f4f 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -179,6 +179,7 @@ static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv, init_llist_head(&cma_xprt->sc_recv_ctxts); init_llist_head(&cma_xprt->sc_rw_ctxts); init_waitqueue_head(&cma_xprt->sc_send_wait); + init_waitqueue_head(&cma_xprt->sc_sq_ticket_wait); spin_lock_init(&cma_xprt->sc_lock); spin_lock_init(&cma_xprt->sc_rq_dto_lock); @@ -414,7 +415,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) struct ib_qp_init_attr qp_attr; struct ib_device *dev; int ret = 0; - RPC_IFDEBUG(struct sockaddr *sap); listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt); clear_bit(XPT_CONN, &xprt->xpt_flags); @@ -478,6 +478,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) if (newxprt->sc_sq_depth > dev->attrs.max_qp_wr) newxprt->sc_sq_depth = dev->attrs.max_qp_wr; atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth); + atomic_set(&newxprt->sc_sq_ticket_head, 0); + atomic_set(&newxprt->sc_sq_ticket_tail, 0); newxprt->sc_pd = ib_alloc_pd(dev, 0); if (IS_ERR(newxprt->sc_pd)) { @@ -560,18 +562,20 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) goto errout; } -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) - dprintk("svcrdma: new connection accepted on device %s:\n", dev->name); - sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr; - dprintk(" local address : %pIS:%u\n", sap, rpc_get_port(sap)); - sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr; - dprintk(" remote address : %pIS:%u\n", sap, rpc_get_port(sap)); - dprintk(" max_sge : %d\n", newxprt->sc_max_send_sges); - dprintk(" sq_depth : %d\n", newxprt->sc_sq_depth); - dprintk(" rdma_rw_ctxs : %d\n", ctxts); - dprintk(" max_requests : %d\n", newxprt->sc_max_requests); - dprintk(" ord : %d\n", conn_param.initiator_depth); -#endif + if (IS_ENABLED(CONFIG_SUNRPC_DEBUG)) { + struct sockaddr *sap; + + dprintk("svcrdma: new connection accepted on device %s:\n", dev->name); + sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr; + dprintk(" local address : %pIS:%u\n", sap, rpc_get_port(sap)); + sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr; + dprintk(" remote address : %pIS:%u\n", sap, rpc_get_port(sap)); + dprintk(" max_sge : %d\n", newxprt->sc_max_send_sges); + dprintk(" sq_depth : %d\n", newxprt->sc_sq_depth); + dprintk(" rdma_rw_ctxs : %d\n", ctxts); + dprintk(" max_requests : %d\n", newxprt->sc_max_requests); + dprintk(" ord : %d\n", conn_param.initiator_depth); + } return &newxprt->sc_xprt; @@ -648,7 +652,8 @@ static int svc_rdma_has_wspace(struct svc_xprt *xprt) * If there are already waiters on the SQ, * return false. */ - if (waitqueue_active(&rdma->sc_send_wait)) + if (waitqueue_active(&rdma->sc_send_wait) || + waitqueue_active(&rdma->sc_sq_ticket_wait)) return 0; /* Otherwise return true. */ |
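The hunks above lend themselves to a few illustrative sketches; none of the code below is part of the patch.

The rewritten __cache_seq_start() and cache_seq_next() encode the iterator position as a single 64-bit value: the hash bucket in the upper 32 bits and a 1-based entry counter in the lower 32 bits, so a restarted read can seek directly to the correct chain. A sketch of that encoding, with helper names invented for illustration:

    #include <linux/types.h>

    /* Pack a bucket index and a 1-based entry counter into a seq_file
     * position, matching the layout used by the cache iterator.
     */
    static inline loff_t cache_pos_make(unsigned int bucket, unsigned int entry)
    {
        return ((loff_t)bucket << 32) | entry;
    }

    static inline unsigned int cache_pos_bucket(loff_t pos)
    {
        return pos >> 32;
    }

    static inline unsigned int cache_pos_entry(loff_t pos)
    {
        return pos & 0xffffffff;
    }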
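svc_rqst_release_pages() now routes every released response page through rq_fbatch so the page allocator sees batched frees instead of one put per page. A hypothetical stand-alone helper showing the same folio_batch pattern with a local batch (not a drop-in replacement for the patch's code):

    #include <linux/mm.h>
    #include <linux/pagevec.h>

    /* Release an array of pages through a folio batch; NULL entries are
     * skipped, and the array is cleared as it is drained.
     */
    static void release_page_array(struct page **pages, unsigned int count)
    {
        struct folio_batch fbatch;
        unsigned int i;

        folio_batch_init(&fbatch);
        for (i = 0; i < count; i++) {
            if (!pages[i])
                continue;
            /* folio_batch_add() returns 0 when the batch is full */
            if (!folio_batch_add(&fbatch, page_folio(pages[i])))
                __folio_batch_release(&fbatch);
            pages[i] = NULL;
        }
        if (folio_batch_count(&fbatch))
            __folio_batch_release(&fbatch);
    }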
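svc_rdma_alloc_read_pages() depends on split_page() semantics: after the split, each constituent page of a higher-order allocation carries its own reference count, which is why the sub-pages can replace individual rq_pages[] entries and be released through the normal per-page teardown. A kernel-style sketch of the allocate-and-split pattern (demo_split_alloc() is a hypothetical name):

    #include <linux/gfp.h>
    #include <linux/mm.h>
    #include <linux/slab.h>

    /* Allocate 2^order physically contiguous pages and split them so
     * each page can be tracked and freed independently.
     */
    static struct page **demo_split_alloc(unsigned int order)
    {
        unsigned int i, n = 1u << order;
        struct page *page, **array;

        page = alloc_pages(GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN, order);
        if (!page)
            return NULL;
        split_page(page, order);    /* one refcount per sub-page */

        array = kcalloc(n, sizeof(*array), GFP_KERNEL);
        if (!array) {
            for (i = 0; i < n; i++)
                __free_page(page + i);
            return NULL;
        }
        for (i = 0; i < n; i++)
            array[i] = page + i;    /* entries are physically contiguous */
        return array;
    }

Without the split, only the head of the allocation would carry a reference count and the block could only be freed as a whole with __free_pages().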
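svc_rdma_fill_page_bvec() merges a page into the preceding bio_vec whenever the two are physically adjacent, keeping rw_nents small for the R/W context. The adjacency test, isolated as a hypothetical predicate:

    #include <linux/bvec.h>
    #include <linux/io.h>
    #include <linux/mm.h>

    /* A page can extend the previous bio_vec only if it starts at the
     * physical address where that vector ends.
     */
    static bool bvec_is_adjacent(const struct bio_vec *prev, struct page *next)
    {
        phys_addr_t end = page_to_phys(prev->bv_page) +
                          prev->bv_offset + prev->bv_len;

        return end == page_to_phys(next);
    }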
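The kernel-doc for svc_rdma_sq_wait() describes a ticket protocol: an uncontended fast path, a slow path served strictly in ticket order, and the invariant that every ticket holder advances the tail exactly once. A minimal userspace analogue, assuming pthreads and C11 atomics in place of wait_event() and the svcxprt_rdma counters; all names are invented and struct initialisation (avail = SQ depth, mutex/cond init) is omitted:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>

    struct sq {
        atomic_int avail;          /* free send-queue slots */
        atomic_int ticket_head;    /* next ticket to hand out */
        atomic_int ticket_tail;    /* ticket currently being served */
        bool closing;              /* analogue of XPT_CLOSE */
        pthread_mutex_t lock;
        pthread_cond_t cond;       /* stands in for both wait queues */
    };

    /* Return freed slots and wake waiters (the Send completion path). */
    static void sq_release(struct sq *sq, int n)
    {
        pthread_mutex_lock(&sq->lock);
        atomic_fetch_add(&sq->avail, n);
        pthread_cond_broadcast(&sq->cond);
        pthread_mutex_unlock(&sq->lock);
    }

    /* Mark the transport closed and wake every waiter. */
    static void sq_close(struct sq *sq)
    {
        pthread_mutex_lock(&sq->lock);
        sq->closing = true;
        pthread_cond_broadcast(&sq->cond);
        pthread_mutex_unlock(&sq->lock);
    }

    /* Reserve @n slots; false means the transport is closing. */
    static bool sq_reserve(struct sq *sq, int n)
    {
        bool ok = false;
        int ticket;

        /* Fast path: uncontended reservation, no ticket taken. */
        if (atomic_fetch_sub(&sq->avail, n) >= n)
            return true;
        atomic_fetch_add(&sq->avail, n);

        /* Slow path: take a ticket and wait to be served in order. */
        ticket = atomic_fetch_add(&sq->ticket_head, 1);
        pthread_mutex_lock(&sq->lock);
        while (!sq->closing && atomic_load(&sq->ticket_tail) != ticket)
            pthread_cond_wait(&sq->cond, &sq->lock);

        /* Our turn: retry until the slots are ours or we shut down. */
        while (!sq->closing) {
            if (atomic_fetch_sub(&sq->avail, n) >= n) {
                ok = true;
                break;
            }
            atomic_fetch_add(&sq->avail, n);
            pthread_cond_wait(&sq->cond, &sq->lock);
        }

        /* Advance the tail exactly once so the next ticket is served,
         * whether or not the reservation succeeded.
         */
        atomic_fetch_add(&sq->ticket_tail, 1);
        pthread_cond_broadcast(&sq->cond);
        pthread_mutex_unlock(&sq->lock);
        return ok;
    }

Serving waiters in ticket order is what removes the starvation case mentioned in the comment this patch deletes from svc_rdma_post_send(): a woken thread can no longer lose the freed slots to a thread that arrived later.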
