From 0e5032237ee5530147fbdf33134297e1490d5ec3 Mon Sep 17 00:00:00 2001
From: Bhavik Sachdev <b.sachdev1904@gmail.com>
Date: Sat, 29 Nov 2025 14:41:21 +0530
Subject: statmount: accept fd as a parameter

Extend `struct mnt_id_req` to take in a fd and introduce STATMOUNT_BY_FD
flag. When a valid fd is provided and STATMOUNT_BY_FD is set, statmount
will return mountinfo about the mount the fd is on.

This even works for "unmounted" mounts (mounts that have been umounted
using umount2(mnt, MNT_DETACH)), if you have access to a file descriptor
on that mount. These "umounted" mounts will have no mountpoint and no
valid mount namespace. Hence, we unset the STATMOUNT_MNT_POINT and
STATMOUNT_MNT_NS_ID in statmount.mask for "unmounted" mounts.

In case of STATMOUNT_BY_FD, given that we already have access to an fd
on the mount, accessing mount information without a capability check
seems fine because of the following reasons:

- All fs related information is available via fstatfs() without any
  capability check.
- Mount information is also available via /proc/pid/mountinfo (without
  any capability check).
- Given that we have access to a fd on the mount which tells us that we
  had access to the mount at some point (or someone that had access gave
  us the fd). So, we should be able to access mount info.

Co-developed-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
Signed-off-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
Signed-off-by: Bhavik Sachdev <b.sachdev1904@gmail.com>
Link: https://patch.msgid.link/20251129091455.757724-3-b.sachdev1904@gmail.com
Acked-by: Andrei Vagin <avagin@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/uapi/linux/mount.h | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
index 5d3f8c9e3a62..18c624405268 100644
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -197,7 +197,10 @@ struct statmount {
  */
 struct mnt_id_req {
 	__u32 size;
-	__u32 mnt_ns_fd;
+	union {
+		__u32 mnt_ns_fd;
+		__u32 mnt_fd;
+	};
 	__u64 mnt_id;
 	__u64 param;
 	__u64 mnt_ns_id;
@@ -232,4 +235,9 @@ struct mnt_id_req {
 #define LSMT_ROOT		0xffffffffffffffff	/* root mount */
 #define LISTMOUNT_REVERSE	(1 << 0) /* List later mounts first */
 
+/*
+ * @flag bits for statmount(2)
+ */
+#define STATMOUNT_BY_FD		0x00000001U	/* want mountinfo for given fd */
+
 #endif /* _UAPI_LINUX_MOUNT_H */
-- 
cgit v1.2.3


From 51a146e0595c638c58097a1660ff6b6e7d3b72f3 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Fri, 12 Dec 2025 11:44:03 -0600
Subject: fs: Remove internal old mount API code

Now that the last in-tree filesystem has been converted to the new mount
API, remove all legacy mount API code designed to handle un-converted
filesystems, and remove associated documentation as well.

(The code to handle the legacy mount(2) syscall from userspace is still
in place, of course.)

Tested with an allmodconfig build on x86_64, and a sanity check of an
old mount(2) syscall mount.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Link: https://patch.msgid.link/20251212174403.2882183-1-sandeen@redhat.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 Documentation/filesystems/locking.rst   |   8 --
 Documentation/filesystems/mount_api.rst |   2 -
 Documentation/filesystems/porting.rst   |   7 +-
 Documentation/filesystems/vfs.rst       |  58 +--------
 fs/fs_context.c                         | 208 +-------------------------------
 fs/fsopen.c                             |  10 --
 fs/internal.h                           |   1 -
 include/linux/fs.h                      |   2 -
 include/linux/fs/super_types.h          |   1 -
 9 files changed, 8 insertions(+), 289 deletions(-)

(limited to 'include')

diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
index 77704fde9845..cc832074c8bb 100644
--- a/Documentation/filesystems/locking.rst
+++ b/Documentation/filesystems/locking.rst
@@ -177,7 +177,6 @@ prototypes::
 	int (*freeze_fs) (struct super_block *);
 	int (*unfreeze_fs) (struct super_block *);
 	int (*statfs) (struct dentry *, struct kstatfs *);
-	int (*remount_fs) (struct super_block *, int *, char *);
 	void (*umount_begin) (struct super_block *);
 	int (*show_options)(struct seq_file *, struct dentry *);
 	ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
@@ -201,7 +200,6 @@ sync_fs:		read
 freeze_fs:		write
 unfreeze_fs:		write
 statfs:			maybe(read)	(see below)
-remount_fs:		write
 umount_begin:		no
 show_options:		no		(namespace_sem)
 quota_read:		no		(see below)
@@ -226,8 +224,6 @@ file_system_type
 
 prototypes::
 
-	struct dentry *(*mount) (struct file_system_type *, int,
-		       const char *, void *);
 	void (*kill_sb) (struct super_block *);
 
 locking rules:
@@ -235,13 +231,9 @@ locking rules:
 =======		=========
 ops		may block
 =======		=========
-mount		yes
 kill_sb		yes
 =======		=========
 
-->mount() returns ERR_PTR or the root dentry; its superblock should be locked
-on return.
-
 ->kill_sb() takes a write-locked superblock, does all shutdown work on it,
 unlocks and drops the reference.
 
diff --git a/Documentation/filesystems/mount_api.rst b/Documentation/filesystems/mount_api.rst
index c99ab1f7fea4..a064234fed5b 100644
--- a/Documentation/filesystems/mount_api.rst
+++ b/Documentation/filesystems/mount_api.rst
@@ -299,8 +299,6 @@ manage the filesystem context.  They are as follows:
      On success it should return 0.  In the case of an error, it should return
      a negative error code.
 
-     .. Note:: reconfigure is intended as a replacement for remount_fs.
-
 
 Filesystem context Security
 ===========================
diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst
index 3397937ed838..631eee9bdc33 100644
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@ -448,11 +448,8 @@ a file off.
 
 **mandatory**
 
-->get_sb() is gone.  Switch to use of ->mount().  Typically it's just
-a matter of switching from calling ``get_sb_``... to ``mount_``... and changing
-the function type.  If you were doing it manually, just switch from setting
-->mnt_root to some pointer to returning that pointer.  On errors return
-ERR_PTR(...).
+->get_sb() and ->mount() are gone. Switch to using the new mount API. See
+Documentation/filesystems/mount_api.rst for more details.
 
 ---
 
diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst
index 670ba66b60e4..90c357b263fe 100644
--- a/Documentation/filesystems/vfs.rst
+++ b/Documentation/filesystems/vfs.rst
@@ -94,11 +94,9 @@ functions:
 
 The passed struct file_system_type describes your filesystem.  When a
 request is made to mount a filesystem onto a directory in your
-namespace, the VFS will call the appropriate mount() method for the
-specific filesystem.  New vfsmount referring to the tree returned by
-->mount() will be attached to the mountpoint, so that when pathname
-resolution reaches the mountpoint it will jump into the root of that
-vfsmount.
+namespace, the VFS will call the appropriate get_tree() method for the
+specific filesystem.  See Documentation/filesystems/mount_api.rst
+for more details.
 
 You can see all filesystems that are registered to the kernel in the
 file /proc/filesystems.
@@ -117,8 +115,6 @@ members are defined:
 		int fs_flags;
 		int (*init_fs_context)(struct fs_context *);
 		const struct fs_parameter_spec *parameters;
-		struct dentry *(*mount) (struct file_system_type *, int,
-			const char *, void *);
 		void (*kill_sb) (struct super_block *);
 		struct module *owner;
 		struct file_system_type * next;
@@ -151,10 +147,6 @@ members are defined:
 	'struct fs_parameter_spec'.
 	More info in Documentation/filesystems/mount_api.rst.
 
-``mount``
-	the method to call when a new instance of this filesystem should
-	be mounted
-
 ``kill_sb``
 	the method to call when an instance of this filesystem should be
 	shut down
@@ -173,45 +165,6 @@ members are defined:
   s_lock_key, s_umount_key, s_vfs_rename_key, s_writers_key,
   i_lock_key, i_mutex_key, invalidate_lock_key, i_mutex_dir_key: lockdep-specific
 
-The mount() method has the following arguments:
-
-``struct file_system_type *fs_type``
-	describes the filesystem, partly initialized by the specific
-	filesystem code
-
-``int flags``
-	mount flags
-
-``const char *dev_name``
-	the device name we are mounting.
-
-``void *data``
-	arbitrary mount options, usually comes as an ASCII string (see
-	"Mount Options" section)
-
-The mount() method must return the root dentry of the tree requested by
-caller.  An active reference to its superblock must be grabbed and the
-superblock must be locked.  On failure it should return ERR_PTR(error).
-
-The arguments match those of mount(2) and their interpretation depends
-on filesystem type.  E.g. for block filesystems, dev_name is interpreted
-as block device name, that device is opened and if it contains a
-suitable filesystem image the method creates and initializes struct
-super_block accordingly, returning its root dentry to caller.
-
-->mount() may choose to return a subtree of existing filesystem - it
-doesn't have to create a new one.  The main result from the caller's
-point of view is a reference to dentry at the root of (sub)tree to be
-attached; creation of new superblock is a common side effect.
-
-The most interesting member of the superblock structure that the mount()
-method fills in is the "s_op" field.  This is a pointer to a "struct
-super_operations" which describes the next level of the filesystem
-implementation.
-
-For more information on mounting (and the new mount API), see
-Documentation/filesystems/mount_api.rst.
-
 The Superblock Object
 =====================
 
@@ -244,7 +197,6 @@ filesystem.  The following members are defined:
 					enum freeze_wholder who);
 		int (*unfreeze_fs) (struct super_block *);
 		int (*statfs) (struct dentry *, struct kstatfs *);
-		int (*remount_fs) (struct super_block *, int *, char *);
 		void (*umount_begin) (struct super_block *);
 
 		int (*show_options)(struct seq_file *, struct dentry *);
@@ -351,10 +303,6 @@ or bottom half).
 ``statfs``
 	called when the VFS needs to get filesystem statistics.
 
-``remount_fs``
-	called when the filesystem is remounted.  This is called with
-	the kernel lock held
-
 ``umount_begin``
 	called when the VFS is unmounting a filesystem.
 
diff --git a/fs/fs_context.c b/fs/fs_context.c
index 93b7ebf8d927..81ed94f46cac 100644
--- a/fs/fs_context.c
+++ b/fs/fs_context.c
@@ -24,20 +24,6 @@
 #include "mount.h"
 #include "internal.h"
 
-enum legacy_fs_param {
-	LEGACY_FS_UNSET_PARAMS,
-	LEGACY_FS_MONOLITHIC_PARAMS,
-	LEGACY_FS_INDIVIDUAL_PARAMS,
-};
-
-struct legacy_fs_context {
-	char			*legacy_data;	/* Data page for legacy filesystems */
-	size_t			data_size;
-	enum legacy_fs_param	param_type;
-};
-
-static int legacy_init_fs_context(struct fs_context *fc);
-
 static const struct constant_table common_set_sb_flag[] = {
 	{ "dirsync",	SB_DIRSYNC },
 	{ "lazytime",	SB_LAZYTIME },
@@ -275,7 +261,6 @@ static struct fs_context *alloc_fs_context(struct file_system_type *fs_type,
 				      unsigned int sb_flags_mask,
 				      enum fs_context_purpose purpose)
 {
-	int (*init_fs_context)(struct fs_context *);
 	struct fs_context *fc;
 	int ret = -ENOMEM;
 
@@ -307,12 +292,7 @@ static struct fs_context *alloc_fs_context(struct file_system_type *fs_type,
 		break;
 	}
 
-	/* TODO: Make all filesystems support this unconditionally */
-	init_fs_context = fc->fs_type->init_fs_context;
-	if (!init_fs_context)
-		init_fs_context = legacy_init_fs_context;
-
-	ret = init_fs_context(fc);
+	ret = fc->fs_type->init_fs_context(fc);
 	if (ret < 0)
 		goto err_fc;
 	fc->need_free = true;
@@ -376,8 +356,6 @@ void fc_drop_locked(struct fs_context *fc)
 	deactivate_locked_super(sb);
 }
 
-static void legacy_fs_context_free(struct fs_context *fc);
-
 /**
  * vfs_dup_fs_context - Duplicate a filesystem context.
  * @src_fc: The context to copy.
@@ -531,184 +509,6 @@ void put_fs_context(struct fs_context *fc)
 }
 EXPORT_SYMBOL(put_fs_context);
 
-/*
- * Free the config for a filesystem that doesn't support fs_context.
- */
-static void legacy_fs_context_free(struct fs_context *fc)
-{
-	struct legacy_fs_context *ctx = fc->fs_private;
-
-	if (ctx) {
-		if (ctx->param_type == LEGACY_FS_INDIVIDUAL_PARAMS)
-			kfree(ctx->legacy_data);
-		kfree(ctx);
-	}
-}
-
-/*
- * Duplicate a legacy config.
- */
-static int legacy_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc)
-{
-	struct legacy_fs_context *ctx;
-	struct legacy_fs_context *src_ctx = src_fc->fs_private;
-
-	ctx = kmemdup(src_ctx, sizeof(*src_ctx), GFP_KERNEL);
-	if (!ctx)
-		return -ENOMEM;
-
-	if (ctx->param_type == LEGACY_FS_INDIVIDUAL_PARAMS) {
-		ctx->legacy_data = kmemdup(src_ctx->legacy_data,
-					   src_ctx->data_size, GFP_KERNEL);
-		if (!ctx->legacy_data) {
-			kfree(ctx);
-			return -ENOMEM;
-		}
-	}
-
-	fc->fs_private = ctx;
-	return 0;
-}
-
-/*
- * Add a parameter to a legacy config.  We build up a comma-separated list of
- * options.
- */
-static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param)
-{
-	struct legacy_fs_context *ctx = fc->fs_private;
-	unsigned int size = ctx->data_size;
-	size_t len = 0;
-	int ret;
-
-	ret = vfs_parse_fs_param_source(fc, param);
-	if (ret != -ENOPARAM)
-		return ret;
-
-	if (ctx->param_type == LEGACY_FS_MONOLITHIC_PARAMS)
-		return invalf(fc, "VFS: Legacy: Can't mix monolithic and individual options");
-
-	switch (param->type) {
-	case fs_value_is_string:
-		len = 1 + param->size;
-		fallthrough;
-	case fs_value_is_flag:
-		len += strlen(param->key);
-		break;
-	default:
-		return invalf(fc, "VFS: Legacy: Parameter type for '%s' not supported",
-			      param->key);
-	}
-
-	if (size + len + 2 > PAGE_SIZE)
-		return invalf(fc, "VFS: Legacy: Cumulative options too large");
-	if (strchr(param->key, ',') ||
-	    (param->type == fs_value_is_string &&
-	     memchr(param->string, ',', param->size)))
-		return invalf(fc, "VFS: Legacy: Option '%s' contained comma",
-			      param->key);
-	if (!ctx->legacy_data) {
-		ctx->legacy_data = kmalloc(PAGE_SIZE, GFP_KERNEL);
-		if (!ctx->legacy_data)
-			return -ENOMEM;
-	}
-
-	if (size)
-		ctx->legacy_data[size++] = ',';
-	len = strlen(param->key);
-	memcpy(ctx->legacy_data + size, param->key, len);
-	size += len;
-	if (param->type == fs_value_is_string) {
-		ctx->legacy_data[size++] = '=';
-		memcpy(ctx->legacy_data + size, param->string, param->size);
-		size += param->size;
-	}
-	ctx->legacy_data[size] = '\0';
-	ctx->data_size = size;
-	ctx->param_type = LEGACY_FS_INDIVIDUAL_PARAMS;
-	return 0;
-}
-
-/*
- * Add monolithic mount data.
- */
-static int legacy_parse_monolithic(struct fs_context *fc, void *data)
-{
-	struct legacy_fs_context *ctx = fc->fs_private;
-
-	if (ctx->param_type != LEGACY_FS_UNSET_PARAMS) {
-		pr_warn("VFS: Can't mix monolithic and individual options\n");
-		return -EINVAL;
-	}
-
-	ctx->legacy_data = data;
-	ctx->param_type = LEGACY_FS_MONOLITHIC_PARAMS;
-	if (!ctx->legacy_data)
-		return 0;
-
-	if (fc->fs_type->fs_flags & FS_BINARY_MOUNTDATA)
-		return 0;
-	return security_sb_eat_lsm_opts(ctx->legacy_data, &fc->security);
-}
-
-/*
- * Get a mountable root with the legacy mount command.
- */
-static int legacy_get_tree(struct fs_context *fc)
-{
-	struct legacy_fs_context *ctx = fc->fs_private;
-	struct super_block *sb;
-	struct dentry *root;
-
-	root = fc->fs_type->mount(fc->fs_type, fc->sb_flags,
-				      fc->source, ctx->legacy_data);
-	if (IS_ERR(root))
-		return PTR_ERR(root);
-
-	sb = root->d_sb;
-	BUG_ON(!sb);
-
-	fc->root = root;
-	return 0;
-}
-
-/*
- * Handle remount.
- */
-static int legacy_reconfigure(struct fs_context *fc)
-{
-	struct legacy_fs_context *ctx = fc->fs_private;
-	struct super_block *sb = fc->root->d_sb;
-
-	if (!sb->s_op->remount_fs)
-		return 0;
-
-	return sb->s_op->remount_fs(sb, &fc->sb_flags,
-				    ctx ? ctx->legacy_data : NULL);
-}
-
-const struct fs_context_operations legacy_fs_context_ops = {
-	.free			= legacy_fs_context_free,
-	.dup			= legacy_fs_context_dup,
-	.parse_param		= legacy_parse_param,
-	.parse_monolithic	= legacy_parse_monolithic,
-	.get_tree		= legacy_get_tree,
-	.reconfigure		= legacy_reconfigure,
-};
-
-/*
- * Initialise a legacy context for a filesystem that doesn't support
- * fs_context.
- */
-static int legacy_init_fs_context(struct fs_context *fc)
-{
-	fc->fs_private = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL_ACCOUNT);
-	if (!fc->fs_private)
-		return -ENOMEM;
-	fc->ops = &legacy_fs_context_ops;
-	return 0;
-}
-
 int parse_monolithic_mount_data(struct fs_context *fc, void *data)
 {
 	int (*monolithic_mount_data)(struct fs_context *, void *);
@@ -757,10 +557,8 @@ int finish_clean_context(struct fs_context *fc)
 	if (fc->phase != FS_CONTEXT_AWAITING_RECONF)
 		return 0;
 
-	if (fc->fs_type->init_fs_context)
-		error = fc->fs_type->init_fs_context(fc);
-	else
-		error = legacy_init_fs_context(fc);
+	error = fc->fs_type->init_fs_context(fc);
+
 	if (unlikely(error)) {
 		fc->phase = FS_CONTEXT_FAILED;
 		return error;
diff --git a/fs/fsopen.c b/fs/fsopen.c
index f645c99204eb..622ee3926cd5 100644
--- a/fs/fsopen.c
+++ b/fs/fsopen.c
@@ -404,16 +404,6 @@ SYSCALL_DEFINE5(fsconfig,
 		return -EINVAL;
 
 	fc = fd_file(f)->private_data;
-	if (fc->ops == &legacy_fs_context_ops) {
-		switch (cmd) {
-		case FSCONFIG_SET_BINARY:
-		case FSCONFIG_SET_PATH:
-		case FSCONFIG_SET_PATH_EMPTY:
-		case FSCONFIG_SET_FD:
-		case FSCONFIG_CMD_CREATE_EXCL:
-			return -EOPNOTSUPP;
-		}
-	}
 
 	if (_key) {
 		param.key = strndup_user(_key, 256);
diff --git a/fs/internal.h b/fs/internal.h
index ab638d41ab81..e333b105337a 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -44,7 +44,6 @@ extern void __init chrdev_init(void);
 /*
  * fs_context.c
  */
-extern const struct fs_context_operations legacy_fs_context_ops;
 extern int parse_monolithic_mount_data(struct fs_context *, void *);
 extern void vfs_clean_context(struct fs_context *fc);
 extern int finish_clean_context(struct fs_context *fc);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 04ceeca12a0d..9949d253e5aa 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2274,8 +2274,6 @@ struct file_system_type {
 #define FS_RENAME_DOES_D_MOVE	32768	/* FS will handle d_move() during rename() internally. */
 	int (*init_fs_context)(struct fs_context *);
 	const struct fs_parameter_spec *parameters;
-	struct dentry *(*mount) (struct file_system_type *, int,
-		       const char *, void *);
 	void (*kill_sb) (struct super_block *);
 	struct module *owner;
 	struct file_system_type * next;
diff --git a/include/linux/fs/super_types.h b/include/linux/fs/super_types.h
index 6bd3009e09b3..4bb9981af6ac 100644
--- a/include/linux/fs/super_types.h
+++ b/include/linux/fs/super_types.h
@@ -96,7 +96,6 @@ struct super_operations {
 			  const void *owner);
 	int (*unfreeze_fs)(struct super_block *sb);
 	int (*statfs)(struct dentry *dentry, struct kstatfs *kstatfs);
-	int (*remount_fs) (struct super_block *, int *, char *);
 	void (*umount_begin)(struct super_block *sb);
 
 	int (*show_options)(struct seq_file *seq, struct dentry *dentry);
-- 
cgit v1.2.3


From 9b8a0ba68246a61d903ce62c35c303b1501df28b Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 29 Dec 2025 14:03:24 +0100
Subject: mount: add OPEN_TREE_NAMESPACE

When creating containers the setup usually involves using CLONE_NEWNS
via clone3() or unshare(). This copies the caller's complete mount
namespace. The runtime will also assemble a new rootfs and then use
pivot_root() to switch the old mount tree with the new rootfs. Afterward
it will recursively umount the old mount tree thereby getting rid of all
mounts.

On a basic system here where the mount table isn't particularly large
this still copies about 30 mounts. Copying all of these mounts only to
get rid of them later is pretty wasteful.

This is exacerbated if intermediary mount namespaces are used that only
exist for a very short amount of time and are immediately destroyed
again causing a ton of mounts to be copied and destroyed needlessly.

With a large mount table and a system where thousands or ten-thousands
of containers are spawned in parallel this quickly becomes a bottleneck
increasing contention on the semaphore.

Extend open_tree() with a new OPEN_TREE_NAMESPACE flag. Similar to
OPEN_TREE_CLONE only the indicated mount tree is copied. Instead of
returning a file descriptor referring to that mount tree
OPEN_TREE_NAMESPACE will cause open_tree() to return a file descriptor
to a new mount namespace. In that new mount namespace the copied mount
tree has been mounted on top of a copy of the real rootfs.

The caller can setns() into that mount namespace and perform any
additionally required setup such as move_mount() detached mounts in
there.

This allows OPEN_TREE_NAMESPACE to function as a combined
unshare(CLONE_NEWNS) and pivot_root().

A caller may for example choose to create an extremely minimal rootfs:

fd_mntns = open_tree(-EBADF, "/var/lib/containers/wootwoot", OPEN_TREE_NAMESPACE);

This will create a mount namespace where "wootwoot" has become the
rootfs mounted on top of the real rootfs. The caller can now setns()
into this new mount namespace and assemble additional mounts.

This also works with user namespaces:

unshare(CLONE_NEWUSER);
fd_mntns = open_tree(-EBADF, "/var/lib/containers/wootwoot", OPEN_TREE_NAMESPACE);

which creates a new mount namespace owned by the earlier created user
namespace with "wootwoot" as the rootfs mounted on top of the real
rootfs.

Link: https://patch.msgid.link/20251229-work-empty-namespace-v1-1-bfb24c7b061f@kernel.org
Tested-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Aleksa Sarai <cyphar@cyphar.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Suggested-by: Christian Brauner <brauner@kernel.org>
Suggested-by: Aleksa Sarai <cyphar@cyphar.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/internal.h              |   1 +
 fs/namespace.c             | 163 ++++++++++++++++++++++++++++++++++++++++-----
 fs/nsfs.c                  |  13 ++++
 include/uapi/linux/mount.h |   3 +-
 4 files changed, 163 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/fs/internal.h b/fs/internal.h
index e333b105337a..f6932e639f36 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -246,6 +246,7 @@ extern void mnt_pin_kill(struct mount *m);
  */
 extern const struct dentry_operations ns_dentry_operations;
 int open_namespace(struct ns_common *ns);
+struct file *open_namespace_file(struct ns_common *ns);
 
 /*
  * fs/stat.c:
diff --git a/fs/namespace.c b/fs/namespace.c
index ec3b16fedd9f..59557019e422 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2796,6 +2796,9 @@ static inline void unlock_mount(struct pinned_mountpoint *m)
 		__unlock_mount(m);
 }
 
+static void lock_mount_exact(const struct path *path,
+			     struct pinned_mountpoint *mp);
+
 #define LOCK_MOUNT_MAYBE_BENEATH(mp, path, beneath) \
 	struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \
 	do_lock_mount((path), &mp, (beneath))
@@ -2946,10 +2949,11 @@ static inline bool may_copy_tree(const struct path *path)
 	return check_anonymous_mnt(mnt);
 }
 
-
-static struct mount *__do_loopback(const struct path *old_path, int recurse)
+static struct mount *__do_loopback(const struct path *old_path,
+				   unsigned int flags, unsigned int copy_flags)
 {
 	struct mount *old = real_mount(old_path->mnt);
+	bool recurse = flags & AT_RECURSIVE;
 
 	if (IS_MNT_UNBINDABLE(old))
 		return ERR_PTR(-EINVAL);
@@ -2960,10 +2964,22 @@ static struct mount *__do_loopback(const struct path *old_path, int recurse)
 	if (!recurse && __has_locked_children(old, old_path->dentry))
 		return ERR_PTR(-EINVAL);
 
+	/*
+	 * When creating a new mount namespace we don't want to copy over
+	 * mounts of mount namespaces to avoid the risk of cycles and also to
+	 * minimize the default complex interdependencies between mount
+	 * namespaces.
+	 *
+	 * We could ofc just check whether all mount namespace files aren't
+	 * creating cycles but really let's keep this simple.
+	 */
+	if (!(flags & OPEN_TREE_NAMESPACE))
+		copy_flags |= CL_COPY_MNT_NS_FILE;
+
 	if (recurse)
-		return copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE);
-	else
-		return clone_mnt(old, old_path->dentry, 0);
+		return copy_tree(old, old_path->dentry, copy_flags);
+
+	return clone_mnt(old, old_path->dentry, copy_flags);
 }
 
 /*
@@ -2974,7 +2990,9 @@ static int do_loopback(const struct path *path, const char *old_name,
 {
 	struct path old_path __free(path_put) = {};
 	struct mount *mnt = NULL;
+	unsigned int flags = recurse ? AT_RECURSIVE : 0;
 	int err;
+
 	if (!old_name || !*old_name)
 		return -EINVAL;
 	err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path);
@@ -2991,7 +3009,7 @@ static int do_loopback(const struct path *path, const char *old_name,
 	if (!check_mnt(mp.parent))
 		return -EINVAL;
 
-	mnt = __do_loopback(&old_path, recurse);
+	mnt = __do_loopback(&old_path, flags, 0);
 	if (IS_ERR(mnt))
 		return PTR_ERR(mnt);
 
@@ -3004,7 +3022,7 @@ static int do_loopback(const struct path *path, const char *old_name,
 	return err;
 }
 
-static struct mnt_namespace *get_detached_copy(const struct path *path, bool recursive)
+static struct mnt_namespace *get_detached_copy(const struct path *path, unsigned int flags)
 {
 	struct mnt_namespace *ns, *mnt_ns = current->nsproxy->mnt_ns, *src_mnt_ns;
 	struct user_namespace *user_ns = mnt_ns->user_ns;
@@ -3029,7 +3047,7 @@ static struct mnt_namespace *get_detached_copy(const struct path *path, bool rec
 			ns->seq_origin = src_mnt_ns->ns.ns_id;
 	}
 
-	mnt = __do_loopback(path, recursive);
+	mnt = __do_loopback(path, flags, 0);
 	if (IS_ERR(mnt)) {
 		emptied_ns = ns;
 		return ERR_CAST(mnt);
@@ -3043,9 +3061,9 @@ static struct mnt_namespace *get_detached_copy(const struct path *path, bool rec
 	return ns;
 }
 
-static struct file *open_detached_copy(struct path *path, bool recursive)
+static struct file *open_detached_copy(struct path *path, unsigned int flags)
 {
-	struct mnt_namespace *ns = get_detached_copy(path, recursive);
+	struct mnt_namespace *ns = get_detached_copy(path, flags);
 	struct file *file;
 
 	if (IS_ERR(ns))
@@ -3061,21 +3079,122 @@ static struct file *open_detached_copy(struct path *path, bool recursive)
 	return file;
 }
 
+DEFINE_FREE(put_empty_mnt_ns, struct mnt_namespace *,
+	    if (!IS_ERR_OR_NULL(_T)) free_mnt_ns(_T))
+
+static struct mnt_namespace *create_new_namespace(struct path *path, unsigned int flags)
+{
+	struct mnt_namespace *new_ns __free(put_empty_mnt_ns) = NULL;
+	struct path to_path __free(path_put) = {};
+	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
+	struct user_namespace *user_ns = current_user_ns();
+	struct mount *new_ns_root;
+	struct mount *mnt;
+	unsigned int copy_flags = 0;
+	bool locked = false;
+
+	if (user_ns != ns->user_ns)
+		copy_flags |= CL_SLAVE;
+
+	new_ns = alloc_mnt_ns(user_ns, false);
+	if (IS_ERR(new_ns))
+		return ERR_CAST(new_ns);
+
+	scoped_guard(namespace_excl) {
+		new_ns_root = clone_mnt(ns->root, ns->root->mnt.mnt_root, copy_flags);
+		if (IS_ERR(new_ns_root))
+			return ERR_CAST(new_ns_root);
+
+		/*
+		 * If the real rootfs had a locked mount on top of it somewhere
+		 * in the stack, lock the new mount tree as well so it can't be
+		 * exposed.
+		 */
+		mnt = ns->root;
+		while (mnt->overmount) {
+			mnt = mnt->overmount;
+			if (mnt->mnt.mnt_flags & MNT_LOCKED)
+				locked = true;
+		}
+	}
+
+	/*
+	 * We dropped the namespace semaphore so we can actually lock
+	 * the copy for mounting. The copied mount isn't attached to any
+	 * mount namespace and it is thus excluded from any propagation.
+	 * So realistically we're isolated and the mount can't be
+	 * overmounted.
+	 */
+
+	/* Borrow the reference from clone_mnt(). */
+	to_path.mnt = &new_ns_root->mnt;
+	to_path.dentry = dget(new_ns_root->mnt.mnt_root);
+
+	/* Now lock for actual mounting. */
+	LOCK_MOUNT_EXACT(mp, &to_path);
+	if (unlikely(IS_ERR(mp.parent)))
+		return ERR_CAST(mp.parent);
+
+	/*
+	 * We don't emulate unshare()ing a mount namespace. We stick to the
+	 * restrictions of creating detached bind-mounts. It has a lot
+	 * saner and simpler semantics.
+	 */
+	mnt = __do_loopback(path, flags, copy_flags);
+	if (IS_ERR(mnt))
+		return ERR_CAST(mnt);
+
+	scoped_guard(mount_writer) {
+		if (locked)
+			mnt->mnt.mnt_flags |= MNT_LOCKED;
+		/*
+		 * Now mount the detached tree on top of the copy of the
+		 * real rootfs we created.
+		 */
+		attach_mnt(mnt, new_ns_root, mp.mp);
+		if (user_ns != ns->user_ns)
+			lock_mnt_tree(new_ns_root);
+	}
+
+	/* Add all mounts to the new namespace. */
+	for (struct mount *p = new_ns_root; p; p = next_mnt(p, new_ns_root)) {
+		mnt_add_to_ns(new_ns, p);
+		new_ns->nr_mounts++;
+	}
+
+	new_ns->root = real_mount(no_free_ptr(to_path.mnt));
+	ns_tree_add_raw(new_ns);
+	return no_free_ptr(new_ns);
+}
+
+static struct file *open_new_namespace(struct path *path, unsigned int flags)
+{
+	struct mnt_namespace *new_ns;
+
+	new_ns = create_new_namespace(path, flags);
+	if (IS_ERR(new_ns))
+		return ERR_CAST(new_ns);
+	return open_namespace_file(to_ns_common(new_ns));
+}
+
 static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned int flags)
 {
 	int ret;
 	struct path path __free(path_put) = {};
 	int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
-	bool detached = flags & OPEN_TREE_CLONE;
 
 	BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);
 
 	if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE |
 		      AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE |
-		      OPEN_TREE_CLOEXEC))
+		      OPEN_TREE_CLOEXEC | OPEN_TREE_NAMESPACE))
 		return ERR_PTR(-EINVAL);
 
-	if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE)
+	if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE | OPEN_TREE_NAMESPACE)) ==
+	    AT_RECURSIVE)
+		return ERR_PTR(-EINVAL);
+
+	if (hweight32(flags & (OPEN_TREE_CLONE | OPEN_TREE_NAMESPACE)) > 1)
 		return ERR_PTR(-EINVAL);
 
 	if (flags & AT_NO_AUTOMOUNT)
@@ -3085,15 +3204,27 @@ static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned
 	if (flags & AT_EMPTY_PATH)
 		lookup_flags |= LOOKUP_EMPTY;
 
-	if (detached && !may_mount())
+	/*
+	 * If we create a new mount namespace with the cloned mount tree we
+	 * just care about being privileged over our current user namespace.
+	 * The new mount namespace will be owned by it.
+	 */
+	if ((flags & OPEN_TREE_NAMESPACE) &&
+	    !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+
+	if ((flags & OPEN_TREE_CLONE) && !may_mount())
 		return ERR_PTR(-EPERM);
 
 	ret = user_path_at(dfd, filename, lookup_flags, &path);
 	if (unlikely(ret))
 		return ERR_PTR(ret);
 
-	if (detached)
-		return open_detached_copy(&path, flags & AT_RECURSIVE);
+	if (flags & OPEN_TREE_NAMESPACE)
+		return open_new_namespace(&path, flags);
+
+	if (flags & OPEN_TREE_CLONE)
+		return open_detached_copy(&path, flags);
 
 	return dentry_open(&path, O_PATH, current_cred());
 }
diff --git a/fs/nsfs.c b/fs/nsfs.c
index bf27d5da91f1..db91de208645 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -99,6 +99,19 @@ int ns_get_path(struct path *path, struct task_struct *task,
 	return ns_get_path_cb(path, ns_get_path_task, &args);
 }
 
+struct file *open_namespace_file(struct ns_common *ns)
+{
+	struct path path __free(path_put) = {};
+	int err;
+
+	/* call first to consume reference */
+	err = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path);
+	if (err < 0)
+		return ERR_PTR(err);
+
+	return dentry_open(&path, O_RDONLY, current_cred());
+}
+
 /**
  * open_namespace - open a namespace
  * @ns: the namespace to open
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
index 18c624405268..d9d86598d100 100644
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -61,7 +61,8 @@
 /*
  * open_tree() flags.
  */
-#define OPEN_TREE_CLONE		1		/* Clone the target tree and attach the clone */
+#define OPEN_TREE_CLONE		(1 << 0)	/* Clone the target tree and attach the clone */
+#define OPEN_TREE_NAMESPACE	(1 << 1)	/* Clone the target tree into a new mount namespace */
 #define OPEN_TREE_CLOEXEC	O_CLOEXEC	/* Close the file on execve() */
 
 /*
-- 
cgit v1.2.3