From 3aa63a569c64e708df547a8913c84e64a06e7853 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Fri, 17 May 2024 20:08:40 -0400
Subject: fs: switch timespec64 fields in inode to discrete integers

Adjacent struct timespec64's pack poorly. Switch them to discrete
integer fields for the seconds and nanoseconds. This shaves 8 bytes off
struct inode with a garden-variety Fedora Kconfig on x86_64, but that
also moves the i_lock into the previous cacheline, away from the fields
it protects.

To remedy that, move i_generation above the i_lock, which moves the new
4-byte hole to just after the i_fsnotify_mask in my setup. Amir has
plans to use that to expand the i_fsnotify_mask, so add a comment to
that effect as well.

Cc: Amir Goldstein <amir73il@gmail.com>
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://lore.kernel.org/r/20240517-amtime-v1-1-7b804ca4be8f@kernel.org
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 48 ++++++++++++++++++++++++++++++++----------------
 1 file changed, 32 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 0283cf366c2a..639885621608 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -660,9 +660,13 @@ struct inode {
 	};
 	dev_t			i_rdev;
 	loff_t			i_size;
-	struct timespec64	__i_atime;
-	struct timespec64	__i_mtime;
-	struct timespec64	__i_ctime; /* use inode_*_ctime accessors! */
+	time64_t		i_atime_sec;
+	time64_t		i_mtime_sec;
+	time64_t		i_ctime_sec;
+	u32			i_atime_nsec;
+	u32			i_mtime_nsec;
+	u32			i_ctime_nsec;
+	u32			i_generation;
 	spinlock_t		i_lock;	/* i_blocks, i_bytes, maybe i_size */
 	unsigned short          i_bytes;
 	u8			i_blkbits;
@@ -719,10 +723,10 @@ struct inode {
 		unsigned		i_dir_seq;
 	};
 
-	__u32			i_generation;
 
 #ifdef CONFIG_FSNOTIFY
 	__u32			i_fsnotify_mask; /* all events this inode cares about */
+	/* 32-bit hole reserved for expanding i_fsnotify_mask */
 	struct fsnotify_mark_connector __rcu	*i_fsnotify_marks;
 #endif
 
@@ -1538,23 +1542,27 @@ struct timespec64 inode_set_ctime_current(struct inode *inode);
 
 static inline time64_t inode_get_atime_sec(const struct inode *inode)
 {
-	return inode->__i_atime.tv_sec;
+	return inode->i_atime_sec;
 }
 
 static inline long inode_get_atime_nsec(const struct inode *inode)
 {
-	return inode->__i_atime.tv_nsec;
+	return inode->i_atime_nsec;
 }
 
 static inline struct timespec64 inode_get_atime(const struct inode *inode)
 {
-	return inode->__i_atime;
+	struct timespec64 ts = { .tv_sec  = inode_get_atime_sec(inode),
+				 .tv_nsec = inode_get_atime_nsec(inode) };
+
+	return ts;
 }
 
 static inline struct timespec64 inode_set_atime_to_ts(struct inode *inode,
 						      struct timespec64 ts)
 {
-	inode->__i_atime = ts;
+	inode->i_atime_sec = ts.tv_sec;
+	inode->i_atime_nsec = ts.tv_nsec;
 	return ts;
 }
 
@@ -1563,28 +1571,32 @@ static inline struct timespec64 inode_set_atime(struct inode *inode,
 {
 	struct timespec64 ts = { .tv_sec  = sec,
 				 .tv_nsec = nsec };
+
 	return inode_set_atime_to_ts(inode, ts);
 }
 
 static inline time64_t inode_get_mtime_sec(const struct inode *inode)
 {
-	return inode->__i_mtime.tv_sec;
+	return inode->i_mtime_sec;
 }
 
 static inline long inode_get_mtime_nsec(const struct inode *inode)
 {
-	return inode->__i_mtime.tv_nsec;
+	return inode->i_mtime_nsec;
 }
 
 static inline struct timespec64 inode_get_mtime(const struct inode *inode)
 {
-	return inode->__i_mtime;
+	struct timespec64 ts = { .tv_sec  = inode_get_mtime_sec(inode),
+				 .tv_nsec = inode_get_mtime_nsec(inode) };
+	return ts;
 }
 
 static inline struct timespec64 inode_set_mtime_to_ts(struct inode *inode,
 						      struct timespec64 ts)
 {
-	inode->__i_mtime = ts;
+	inode->i_mtime_sec = ts.tv_sec;
+	inode->i_mtime_nsec = ts.tv_nsec;
 	return ts;
 }
 
@@ -1598,23 +1610,27 @@ static inline struct timespec64 inode_set_mtime(struct inode *inode,
 
 static inline time64_t inode_get_ctime_sec(const struct inode *inode)
 {
-	return inode->__i_ctime.tv_sec;
+	return inode->i_ctime_sec;
 }
 
 static inline long inode_get_ctime_nsec(const struct inode *inode)
 {
-	return inode->__i_ctime.tv_nsec;
+	return inode->i_ctime_nsec;
 }
 
 static inline struct timespec64 inode_get_ctime(const struct inode *inode)
 {
-	return inode->__i_ctime;
+	struct timespec64 ts = { .tv_sec  = inode_get_ctime_sec(inode),
+				 .tv_nsec = inode_get_ctime_nsec(inode) };
+
+	return ts;
 }
 
 static inline struct timespec64 inode_set_ctime_to_ts(struct inode *inode,
 						      struct timespec64 ts)
 {
-	inode->__i_ctime = ts;
+	inode->i_ctime_sec = ts.tv_sec;
+	inode->i_ctime_nsec = ts.tv_nsec;
 	return ts;
 }
 
-- 
cgit v1.2.3


From 620c266f394932e5decc4b34683a75dfc59dc2f4 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 24 May 2024 12:19:39 +0200
Subject: fhandle: relax open_by_handle_at() permission checks

A current limitation of open_by_handle_at() is that it's currently not possible
to use it from within containers at all because we require CAP_DAC_READ_SEARCH
in the initial namespace. That's unfortunate because there are scenarios where
using open_by_handle_at() from within containers.

Two examples:

(1) cgroupfs allows to encode cgroups to file handles and reopen them with
    open_by_handle_at().
(2) Fanotify allows placing filesystem watches they currently aren't usable in
    containers because the returned file handles cannot be used.

Here's a proposal for relaxing the permission check for open_by_handle_at().

(1) Opening file handles when the caller has privileges over the filesystem
    (1.1) The caller has an unobstructed view of the filesystem.
    (1.2) The caller has permissions to follow a path to the file handle.

This doesn't address the problem of opening a file handle when only a portion
of a filesystem is exposed as is common in containers by e.g., bind-mounting a
subtree. The proposal to solve this use-case is:

(2) Opening file handles when the caller has privileges over a subtree
    (2.1) The caller is able to reach the file from the provided mount fd.
    (2.2) The caller has permissions to construct an unobstructed path to the
          file handle.
    (2.3) The caller has permissions to follow a path to the file handle.

The relaxed permission checks are currently restricted to directory file
handles which are what both cgroupfs and fanotify need. Handling disconnected
non-directory file handles would lead to a potentially non-deterministic api.
If a disconnected non-directory file handle is provided we may fail to decode
a valid path that we could use for permission checking. That in itself isn't a
problem as we would just return EACCES in that case. However, confusion may
arise if a non-disconnected dentry ends up in the cache later and those opening
the file handle would suddenly succeed.

* It's potentially possible to use timing information (side-channel) to infer
  whether a given inode exists. I don't think that's particularly
  problematic. Thanks to Jann for bringing this to my attention.

* An unrelated note (IOW, these are thoughts that apply to
  open_by_handle_at() generically and are unrelated to the changes here):
  Jann pointed out that we should verify whether deleted files could
  potentially be reopened through open_by_handle_at(). I don't think that's
  possible though.

  Another potential thing to check is whether open_by_handle_at() could be
  abused to open internal stuff like memfds or gpu stuff. I don't think so
  but I haven't had the time to completely verify this.

This dates back to discussions Amir and I had quite some time ago and thanks to
him for providing a lot of details around the export code and related patches!

Link: https://lore.kernel.org/r/20240524-vfs-open_by_handle_at-v1-1-3d4b7d22736b@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/exportfs/expfs.c      |   9 ++-
 fs/fhandle.c             | 178 +++++++++++++++++++++++++++++++++++++----------
 fs/mount.h               |   1 +
 fs/namespace.c           |   2 +-
 fs/nfsd/nfsfh.c          |   2 +-
 include/linux/exportfs.h |   2 +
 6 files changed, 152 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 07ea3d62b298..4f2dd4ab4486 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -427,7 +427,7 @@ EXPORT_SYMBOL_GPL(exportfs_encode_fh);
 
 struct dentry *
 exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len,
-		       int fileid_type,
+		       int fileid_type, unsigned int flags,
 		       int (*acceptable)(void *, struct dentry *),
 		       void *context)
 {
@@ -445,6 +445,11 @@ exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len,
 	if (IS_ERR_OR_NULL(result))
 		return result;
 
+	if ((flags & EXPORT_FH_DIR_ONLY) && !d_is_dir(result)) {
+		err = -ENOTDIR;
+		goto err_result;
+	}
+
 	/*
 	 * If no acceptance criteria was specified by caller, a disconnected
 	 * dentry is also accepatable. Callers may use this mode to query if
@@ -581,7 +586,7 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
 {
 	struct dentry *ret;
 
-	ret = exportfs_decode_fh_raw(mnt, fid, fh_len, fileid_type,
+	ret = exportfs_decode_fh_raw(mnt, fid, fh_len, fileid_type, 0,
 				     acceptable, context);
 	if (IS_ERR_OR_NULL(ret)) {
 		if (ret == ERR_PTR(-ENOMEM))
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 8a7f86c2139a..6e8cea16790e 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -115,88 +115,188 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
 	return err;
 }
 
-static struct vfsmount *get_vfsmount_from_fd(int fd)
+static int get_path_from_fd(int fd, struct path *root)
 {
-	struct vfsmount *mnt;
-
 	if (fd == AT_FDCWD) {
 		struct fs_struct *fs = current->fs;
 		spin_lock(&fs->lock);
-		mnt = mntget(fs->pwd.mnt);
+		*root = fs->pwd;
+		path_get(root);
 		spin_unlock(&fs->lock);
 	} else {
 		struct fd f = fdget(fd);
 		if (!f.file)
-			return ERR_PTR(-EBADF);
-		mnt = mntget(f.file->f_path.mnt);
+			return -EBADF;
+		*root = f.file->f_path;
+		path_get(root);
 		fdput(f);
 	}
-	return mnt;
+
+	return 0;
 }
 
+enum handle_to_path_flags {
+	HANDLE_CHECK_PERMS   = (1 << 0),
+	HANDLE_CHECK_SUBTREE = (1 << 1),
+};
+
+struct handle_to_path_ctx {
+	struct path root;
+	enum handle_to_path_flags flags;
+	unsigned int fh_flags;
+};
+
 static int vfs_dentry_acceptable(void *context, struct dentry *dentry)
 {
-	return 1;
+	struct handle_to_path_ctx *ctx = context;
+	struct user_namespace *user_ns = current_user_ns();
+	struct dentry *d, *root = ctx->root.dentry;
+	struct mnt_idmap *idmap = mnt_idmap(ctx->root.mnt);
+	int retval = 0;
+
+	if (!root)
+		return 1;
+
+	/* Old permission model with global CAP_DAC_READ_SEARCH. */
+	if (!ctx->flags)
+		return 1;
+
+	/*
+	 * It's racy as we're not taking rename_lock but we're able to ignore
+	 * permissions and we just need an approximation whether we were able
+	 * to follow a path to the file.
+	 *
+	 * It's also potentially expensive on some filesystems especially if
+	 * there is a deep path.
+	 */
+	d = dget(dentry);
+	while (d != root && !IS_ROOT(d)) {
+		struct dentry *parent = dget_parent(d);
+
+		/*
+		 * We know that we have the ability to override DAC permissions
+		 * as we've verified this earlier via CAP_DAC_READ_SEARCH. But
+		 * we also need to make sure that there aren't any unmapped
+		 * inodes in the path that would prevent us from reaching the
+		 * file.
+		 */
+		if (!privileged_wrt_inode_uidgid(user_ns, idmap,
+						 d_inode(parent))) {
+			dput(d);
+			dput(parent);
+			return retval;
+		}
+
+		dput(d);
+		d = parent;
+	}
+
+	if (!(ctx->flags & HANDLE_CHECK_SUBTREE) || d == root)
+		retval = 1;
+	WARN_ON_ONCE(d != root && d != root->d_sb->s_root);
+	dput(d);
+	return retval;
 }
 
-static int do_handle_to_path(int mountdirfd, struct file_handle *handle,
-			     struct path *path)
+static int do_handle_to_path(struct file_handle *handle, struct path *path,
+			     struct handle_to_path_ctx *ctx)
 {
-	int retval = 0;
 	int handle_dwords;
+	struct vfsmount *mnt = ctx->root.mnt;
 
-	path->mnt = get_vfsmount_from_fd(mountdirfd);
-	if (IS_ERR(path->mnt)) {
-		retval = PTR_ERR(path->mnt);
-		goto out_err;
-	}
 	/* change the handle size to multiple of sizeof(u32) */
 	handle_dwords = handle->handle_bytes >> 2;
-	path->dentry = exportfs_decode_fh(path->mnt,
+	path->dentry = exportfs_decode_fh_raw(mnt,
 					  (struct fid *)handle->f_handle,
 					  handle_dwords, handle->handle_type,
-					  vfs_dentry_acceptable, NULL);
-	if (IS_ERR(path->dentry)) {
-		retval = PTR_ERR(path->dentry);
-		goto out_mnt;
+					  ctx->fh_flags,
+					  vfs_dentry_acceptable, ctx);
+	if (IS_ERR_OR_NULL(path->dentry)) {
+		if (path->dentry == ERR_PTR(-ENOMEM))
+			return -ENOMEM;
+		return -ESTALE;
 	}
+	path->mnt = mntget(mnt);
 	return 0;
-out_mnt:
-	mntput(path->mnt);
-out_err:
-	return retval;
+}
+
+/*
+ * Allow relaxed permissions of file handles if the caller has the
+ * ability to mount the filesystem or create a bind-mount of the
+ * provided @mountdirfd.
+ *
+ * In both cases the caller may be able to get an unobstructed way to
+ * the encoded file handle. If the caller is only able to create a
+ * bind-mount we need to verify that there are no locked mounts on top
+ * of it that could prevent us from getting to the encoded file.
+ *
+ * In principle, locked mounts can prevent the caller from mounting the
+ * filesystem but that only applies to procfs and sysfs neither of which
+ * support decoding file handles.
+ */
+static inline bool may_decode_fh(struct handle_to_path_ctx *ctx,
+				 unsigned int o_flags)
+{
+	struct path *root = &ctx->root;
+
+	/*
+	 * Restrict to O_DIRECTORY to provide a deterministic API that avoids a
+	 * confusing api in the face of disconnected non-dir dentries.
+	 *
+	 * There's only one dentry for each directory inode (VFS rule)...
+	 */
+	if (!(o_flags & O_DIRECTORY))
+		return false;
+
+	if (ns_capable(root->mnt->mnt_sb->s_user_ns, CAP_SYS_ADMIN))
+		ctx->flags = HANDLE_CHECK_PERMS;
+	else if (is_mounted(root->mnt) &&
+		 ns_capable(real_mount(root->mnt)->mnt_ns->user_ns,
+			    CAP_SYS_ADMIN) &&
+		 !has_locked_children(real_mount(root->mnt), root->dentry))
+		ctx->flags = HANDLE_CHECK_PERMS | HANDLE_CHECK_SUBTREE;
+	else
+		return false;
+
+	/* Are we able to override DAC permissions? */
+	if (!ns_capable(current_user_ns(), CAP_DAC_READ_SEARCH))
+		return false;
+
+	ctx->fh_flags = EXPORT_FH_DIR_ONLY;
+	return true;
 }
 
 static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
-		   struct path *path)
+		   struct path *path, unsigned int o_flags)
 {
 	int retval = 0;
 	struct file_handle f_handle;
 	struct file_handle *handle = NULL;
+	struct handle_to_path_ctx ctx = {};
 
-	/*
-	 * With handle we don't look at the execute bit on the
-	 * directory. Ideally we would like CAP_DAC_SEARCH.
-	 * But we don't have that
-	 */
-	if (!capable(CAP_DAC_READ_SEARCH)) {
-		retval = -EPERM;
+	retval = get_path_from_fd(mountdirfd, &ctx.root);
+	if (retval)
 		goto out_err;
+
+	if (!capable(CAP_DAC_READ_SEARCH) && !may_decode_fh(&ctx, o_flags)) {
+		retval = -EPERM;
+		goto out_path;
 	}
+
 	if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) {
 		retval = -EFAULT;
-		goto out_err;
+		goto out_path;
 	}
 	if ((f_handle.handle_bytes > MAX_HANDLE_SZ) ||
 	    (f_handle.handle_bytes == 0)) {
 		retval = -EINVAL;
-		goto out_err;
+		goto out_path;
 	}
 	handle = kmalloc(struct_size(handle, f_handle, f_handle.handle_bytes),
 			 GFP_KERNEL);
 	if (!handle) {
 		retval = -ENOMEM;
-		goto out_err;
+		goto out_path;
 	}
 	/* copy the full handle */
 	*handle = f_handle;
@@ -207,10 +307,12 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
 		goto out_handle;
 	}
 
-	retval = do_handle_to_path(mountdirfd, handle, path);
+	retval = do_handle_to_path(handle, path, &ctx);
 
 out_handle:
 	kfree(handle);
+out_path:
+	path_put(&ctx.root);
 out_err:
 	return retval;
 }
@@ -223,7 +325,7 @@ static long do_handle_open(int mountdirfd, struct file_handle __user *ufh,
 	struct file *file;
 	int fd;
 
-	retval = handle_to_path(mountdirfd, ufh, &path);
+	retval = handle_to_path(mountdirfd, ufh, &path, open_flag);
 	if (retval)
 		return retval;
 
diff --git a/fs/mount.h b/fs/mount.h
index 4a42fc68f4cc..4adce73211ae 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -152,3 +152,4 @@ static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list)
 }
 
 extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor);
+bool has_locked_children(struct mount *mnt, struct dentry *dentry);
diff --git a/fs/namespace.c b/fs/namespace.c
index 5a51315c6678..4386787210c7 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2078,7 +2078,7 @@ void drop_collected_mounts(struct vfsmount *mnt)
 	namespace_unlock();
 }
 
-static bool has_locked_children(struct mount *mnt, struct dentry *dentry)
+bool has_locked_children(struct mount *mnt, struct dentry *dentry)
 {
 	struct mount *child;
 
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 0b75305fb5f5..dd4e11a703aa 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -247,7 +247,7 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
 		dentry = dget(exp->ex_path.dentry);
 	else {
 		dentry = exportfs_decode_fh_raw(exp->ex_path.mnt, fid,
-						data_left, fileid_type,
+						data_left, fileid_type, 0,
 						nfsd_acceptable, exp);
 		if (IS_ERR_OR_NULL(dentry)) {
 			trace_nfsd_set_fh_dentry_badhandle(rqstp, fhp,
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index bb37ad5cc954..893a1d21dc1c 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -158,6 +158,7 @@ struct fid {
 
 #define EXPORT_FH_CONNECTABLE	0x1 /* Encode file handle with parent */
 #define EXPORT_FH_FID		0x2 /* File handle may be non-decodeable */
+#define EXPORT_FH_DIR_ONLY	0x4 /* Only decode file handle for a directory */
 
 /**
  * struct export_operations - for nfsd to communicate with file systems
@@ -305,6 +306,7 @@ static inline int exportfs_encode_fid(struct inode *inode, struct fid *fid,
 extern struct dentry *exportfs_decode_fh_raw(struct vfsmount *mnt,
 					     struct fid *fid, int fh_len,
 					     int fileid_type,
+					     unsigned int flags,
 					     int (*acceptable)(void *, struct dentry *),
 					     void *context);
 extern struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
-- 
cgit v1.2.3


From dff60734fc7606fabde668ab6a26feacec8787cc Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mjguzik@gmail.com>
Date: Tue, 4 Jun 2024 17:52:56 +0200
Subject: vfs: retire user_path_at_empty and drop empty arg from getname_flags

No users after do_readlinkat started doing the job on its own.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://lore.kernel.org/r/20240604155257.109500-3-mjguzik@gmail.com
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/fsopen.c           |  2 +-
 fs/namei.c            | 16 +++++++---------
 fs/stat.c             |  6 +++---
 include/linux/fs.h    |  2 +-
 include/linux/namei.h |  8 +-------
 io_uring/statx.c      |  3 +--
 io_uring/xattr.c      |  4 ++--
 7 files changed, 16 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fsopen.c b/fs/fsopen.c
index 18fe979da7e2..ed2dd000622e 100644
--- a/fs/fsopen.c
+++ b/fs/fsopen.c
@@ -448,7 +448,7 @@ SYSCALL_DEFINE5(fsconfig,
 		fallthrough;
 	case FSCONFIG_SET_PATH:
 		param.type = fs_value_is_filename;
-		param.name = getname_flags(_value, lookup_flags, NULL);
+		param.name = getname_flags(_value, lookup_flags);
 		if (IS_ERR(param.name)) {
 			ret = PTR_ERR(param.name);
 			goto out_key;
diff --git a/fs/namei.c b/fs/namei.c
index 37fb0a8aa09a..950ad6bdd9fe 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -126,7 +126,7 @@
 #define EMBEDDED_NAME_MAX	(PATH_MAX - offsetof(struct filename, iname))
 
 struct filename *
-getname_flags(const char __user *filename, int flags, int *empty)
+getname_flags(const char __user *filename, int flags)
 {
 	struct filename *result;
 	char *kname;
@@ -190,8 +190,6 @@ getname_flags(const char __user *filename, int flags, int *empty)
 	atomic_set(&result->refcnt, 1);
 	/* The empty path is special. */
 	if (unlikely(!len)) {
-		if (empty)
-			*empty = 1;
 		if (!(flags & LOOKUP_EMPTY)) {
 			putname(result);
 			return ERR_PTR(-ENOENT);
@@ -209,13 +207,13 @@ getname_uflags(const char __user *filename, int uflags)
 {
 	int flags = (uflags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
 
-	return getname_flags(filename, flags, NULL);
+	return getname_flags(filename, flags);
 }
 
 struct filename *
 getname(const char __user * filename)
 {
-	return getname_flags(filename, 0, NULL);
+	return getname_flags(filename, 0);
 }
 
 struct filename *
@@ -2922,16 +2920,16 @@ int path_pts(struct path *path)
 }
 #endif
 
-int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
-		 struct path *path, int *empty)
+int user_path_at(int dfd, const char __user *name, unsigned flags,
+		 struct path *path)
 {
-	struct filename *filename = getname_flags(name, flags, empty);
+	struct filename *filename = getname_flags(name, flags);
 	int ret = filename_lookup(dfd, filename, flags, path, NULL);
 
 	putname(filename);
 	return ret;
 }
-EXPORT_SYMBOL(user_path_at_empty);
+EXPORT_SYMBOL(user_path_at);
 
 int __check_sticky(struct mnt_idmap *idmap, struct inode *dir,
 		   struct inode *inode)
diff --git a/fs/stat.c b/fs/stat.c
index 7f7861544500..16aa1f5ceec4 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -300,7 +300,7 @@ int vfs_fstatat(int dfd, const char __user *filename,
 			return vfs_fstat(dfd, stat);
 	}
 
-	name = getname_flags(filename, getname_statx_lookup_flags(statx_flags), NULL);
+	name = getname_flags(filename, getname_statx_lookup_flags(statx_flags));
 	ret = vfs_statx(dfd, name, statx_flags, stat, STATX_BASIC_STATS);
 	putname(name);
 
@@ -496,7 +496,7 @@ static int do_readlinkat(int dfd, const char __user *pathname,
 		return -EINVAL;
 
 retry:
-	name = getname_flags(pathname, lookup_flags, NULL);
+	name = getname_flags(pathname, lookup_flags);
 	error = filename_lookup(dfd, name, lookup_flags, &path, NULL);
 	if (unlikely(error)) {
 		putname(name);
@@ -710,7 +710,7 @@ SYSCALL_DEFINE5(statx,
 	int ret;
 	struct filename *name;
 
-	name = getname_flags(filename, getname_statx_lookup_flags(flags), NULL);
+	name = getname_flags(filename, getname_statx_lookup_flags(flags));
 	ret = do_statx(dfd, name, flags, mask, buffer);
 	putname(name);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 639885621608..bfc1e6407bf6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2701,7 +2701,7 @@ static inline struct file *file_clone_open(struct file *file)
 }
 extern int filp_close(struct file *, fl_owner_t id);
 
-extern struct filename *getname_flags(const char __user *, int, int *);
+extern struct filename *getname_flags(const char __user *, int);
 extern struct filename *getname_uflags(const char __user *, int);
 extern struct filename *getname(const char __user *);
 extern struct filename *getname_kernel(const char *);
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 967aa9ea9f96..8ec8fed3bce8 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -50,13 +50,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT};
 
 extern int path_pts(struct path *path);
 
-extern int user_path_at_empty(int, const char __user *, unsigned, struct path *, int *empty);
-
-static inline int user_path_at(int dfd, const char __user *name, unsigned flags,
-		 struct path *path)
-{
-	return user_path_at_empty(dfd, name, flags, path, NULL);
-}
+extern int user_path_at(int, const char __user *, unsigned, struct path *);
 
 struct dentry *lookup_one_qstr_excl(const struct qstr *name,
 				    struct dentry *base,
diff --git a/io_uring/statx.c b/io_uring/statx.c
index abb874209caa..f7f9b202eec0 100644
--- a/io_uring/statx.c
+++ b/io_uring/statx.c
@@ -37,8 +37,7 @@ int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	sx->flags = READ_ONCE(sqe->statx_flags);
 
 	sx->filename = getname_flags(path,
-				     getname_statx_lookup_flags(sx->flags),
-				     NULL);
+				     getname_statx_lookup_flags(sx->flags));
 
 	if (IS_ERR(sx->filename)) {
 		int ret = PTR_ERR(sx->filename);
diff --git a/io_uring/xattr.c b/io_uring/xattr.c
index 44905b82eea8..6cf41c3bc369 100644
--- a/io_uring/xattr.c
+++ b/io_uring/xattr.c
@@ -96,7 +96,7 @@ int io_getxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 	path = u64_to_user_ptr(READ_ONCE(sqe->addr3));
 
-	ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL);
+	ix->filename = getname_flags(path, LOOKUP_FOLLOW);
 	if (IS_ERR(ix->filename)) {
 		ret = PTR_ERR(ix->filename);
 		ix->filename = NULL;
@@ -189,7 +189,7 @@ int io_setxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 	path = u64_to_user_ptr(READ_ONCE(sqe->addr3));
 
-	ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL);
+	ix->filename = getname_flags(path, LOOKUP_FOLLOW);
 	if (IS_ERR(ix->filename)) {
 		ret = PTR_ERR(ix->filename);
 		ix->filename = NULL;
-- 
cgit v1.2.3


From 9b6a14f08b4875aa22ea0b5bc35042e2580b311b Mon Sep 17 00:00:00 2001
From: Youling Tang <tangyouling@kylinos.cn>
Date: Thu, 20 Jun 2024 11:23:33 +0800
Subject: fs: Export in_group_or_capable()

Export in_group_or_capable() as a VFS helper function.

Signed-off-by: Youling Tang <tangyouling@kylinos.cn>
Link: https://lore.kernel.org/r/20240620032335.147136-1-youling.tang@linux.dev
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/attr.c          | 2 --
 fs/inode.c         | 1 +
 include/linux/fs.h | 2 ++
 3 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/attr.c b/fs/attr.c
index 960a310581eb..825007d5cda4 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -17,8 +17,6 @@
 #include <linux/filelock.h>
 #include <linux/security.h>
 
-#include "internal.h"
-
 /**
  * setattr_should_drop_sgid - determine whether the setgid bit needs to be
  *                            removed
diff --git a/fs/inode.c b/fs/inode.c
index 3a41f83a4ba5..e0815acc5abb 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2538,6 +2538,7 @@ bool in_group_or_capable(struct mnt_idmap *idmap,
 		return true;
 	return false;
 }
+EXPORT_SYMBOL(in_group_or_capable);
 
 /**
  * mode_strip_sgid - handle the sgid bit for non-directories
diff --git a/include/linux/fs.h b/include/linux/fs.h
index bfc1e6407bf6..942cb11dba96 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1942,6 +1942,8 @@ void inode_init_owner(struct mnt_idmap *idmap, struct inode *inode,
 extern bool may_open_dev(const struct path *path);
 umode_t mode_strip_sgid(struct mnt_idmap *idmap,
 			const struct inode *dir, umode_t mode);
+bool in_group_or_capable(struct mnt_idmap *idmap,
+			 const struct inode *inode, vfsgid_t vfsgid);
 
 /*
  * This is the "filldir" function type, used by readdir() to let
-- 
cgit v1.2.3


From 1bc6d4452d5c91beb09e37a98a590808e1997b79 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 30 Apr 2024 13:54:46 +0200
Subject: fs: new helper vfs_empty_path()

Make it possible to quickly check whether AT_EMPTY_PATH is valid.
Note, after some discussion we decided to also allow NULL to be passed
instead of requiring the empty string.

Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 942cb11dba96..5ff362277834 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3631,4 +3631,21 @@ extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len,
 extern int generic_fadvise(struct file *file, loff_t offset, loff_t len,
 			   int advice);
 
+static inline bool vfs_empty_path(int dfd, const char __user *path)
+{
+	char c;
+
+	if (dfd < 0)
+		return false;
+
+	/* We now allow NULL to be used for empty path. */
+	if (!path)
+		return true;
+
+	if (unlikely(get_user(c, path)))
+		return false;
+
+	return !c;
+}
+
 #endif /* _LINUX_FS_H */
-- 
cgit v1.2.3


From f378ec4eec8b7e607dbc0112913fd3d5d84eb1b8 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mjguzik@gmail.com>
Date: Thu, 27 Jun 2024 18:11:52 +0200
Subject: vfs: rename parent_ino to d_parent_ino and make it use RCU

The routine is used by procfs through dir_emit_dots.

The combined RCU and lock fallback implementation is too big for an
inline. Given that the routine takes a dentry argument fs/dcache.c seems
like the place to put it in.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://lore.kernel.org/r/20240627161152.802567-1-mjguzik@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/dcache.c            | 28 ++++++++++++++++++++++++++++
 fs/f2fs/file.c         |  2 +-
 fs/hfsplus/ioctl.c     |  4 ++--
 include/linux/dcache.h |  2 ++
 include/linux/fs.h     | 16 +---------------
 5 files changed, 34 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dcache.c b/fs/dcache.c
index 58b89c9e9b0c..68e843e78337 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -3099,6 +3099,34 @@ void d_tmpfile(struct file *file, struct inode *inode)
 }
 EXPORT_SYMBOL(d_tmpfile);
 
+/*
+ * Obtain inode number of the parent dentry.
+ */
+ino_t d_parent_ino(struct dentry *dentry)
+{
+	struct dentry *parent;
+	struct inode *iparent;
+	unsigned seq;
+	ino_t ret;
+
+	scoped_guard(rcu) {
+		seq = raw_seqcount_begin(&dentry->d_seq);
+		parent = READ_ONCE(dentry->d_parent);
+		iparent = d_inode_rcu(parent);
+		if (likely(iparent)) {
+			ret = iparent->i_ino;
+			if (!read_seqcount_retry(&dentry->d_seq, seq))
+				return ret;
+		}
+	}
+
+	spin_lock(&dentry->d_lock);
+	ret = dentry->d_parent->d_inode->i_ino;
+	spin_unlock(&dentry->d_lock);
+	return ret;
+}
+EXPORT_SYMBOL(d_parent_ino);
+
 static __initdata unsigned long dhash_entries;
 static int __init set_dhash_entries(char *str)
 {
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 7a23434963d1..c1ad9b278c47 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -185,7 +185,7 @@ static int get_parent_ino(struct inode *inode, nid_t *pino)
 	if (!dentry)
 		return 0;
 
-	*pino = parent_ino(dentry);
+	*pino = d_parent_ino(dentry);
 	dput(dentry);
 	return 1;
 }
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index 5661a2e24d03..40d04dba13ac 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -40,7 +40,7 @@ static int hfsplus_ioctl_bless(struct file *file, int __user *user_flags)
 
 	/* Directory containing the bootable system */
 	vh->finder_info[0] = bvh->finder_info[0] =
-		cpu_to_be32(parent_ino(dentry));
+		cpu_to_be32(d_parent_ino(dentry));
 
 	/*
 	 * Bootloader. Just using the inode here breaks in the case of
@@ -51,7 +51,7 @@ static int hfsplus_ioctl_bless(struct file *file, int __user *user_flags)
 
 	/* Per spec, the OS X system folder - same as finder_info[0] here */
 	vh->finder_info[5] = bvh->finder_info[5] =
-		cpu_to_be32(parent_ino(dentry));
+		cpu_to_be32(d_parent_ino(dentry));
 
 	mutex_unlock(&sbi->vh_mutex);
 	return 0;
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index bf53e3894aae..ea58843942b9 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -278,6 +278,8 @@ static inline unsigned d_count(const struct dentry *dentry)
 	return dentry->d_lockref.count;
 }
 
+ino_t d_parent_ino(struct dentry *dentry);
+
 /*
  * helper function for dentry_operations.d_dname() members
  */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5ff362277834..2fa06a4d197a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3454,20 +3454,6 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
 	return 0;
 }
 
-static inline ino_t parent_ino(struct dentry *dentry)
-{
-	ino_t res;
-
-	/*
-	 * Don't strictly need d_lock here? If the parent ino could change
-	 * then surely we'd have a deeper race in the caller?
-	 */
-	spin_lock(&dentry->d_lock);
-	res = dentry->d_parent->d_inode->i_ino;
-	spin_unlock(&dentry->d_lock);
-	return res;
-}
-
 /* Transaction based IO helpers */
 
 /*
@@ -3592,7 +3578,7 @@ static inline bool dir_emit_dot(struct file *file, struct dir_context *ctx)
 static inline bool dir_emit_dotdot(struct file *file, struct dir_context *ctx)
 {
 	return ctx->actor(ctx, "..", 2, ctx->pos,
-			  parent_ino(file->f_path.dentry), DT_DIR);
+			  d_parent_ino(file->f_path.dentry), DT_DIR);
 }
 static inline bool dir_emit_dots(struct file *file, struct dir_context *ctx)
 {
-- 
cgit v1.2.3