27 files changed, 3813 insertions, 194 deletions
diff --git a/fs/namespace.c b/fs/namespace.c
index 854f4fc66469..fe919abd2f01 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2646,6 +2646,19 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 
 			if (unlikely(shorter) && child != source_mnt)
 				mp = shorter;
+			/*
+			 * If @q was locked it was meant to hide
+			 * whatever was under it. Let @child take over
+			 * that job and lock it, then we can unlock @q.
+			 * That'll allow another namespace to shed @q
+			 * and reveal @child. Clearly, that mounter
+			 * consented to this by not severing the mount
+			 * relationship. Otherwise, what's the point.
+			 */
+			if (IS_MNT_LOCKED(q)) {
+				child->mnt.mnt_flags |= MNT_LOCKED;
+				q->mnt.mnt_flags &= ~MNT_LOCKED;
+			}
 			mnt_change_mountpoint(r, mp, q);
 		}
 	}
@@ -2722,7 +2735,7 @@ static inline struct mount *where_to_mount(const struct path *path,
  * In all cases the location must not have been unmounted and the
  * chosen mountpoint must be allowed to be mounted on.  For "beneath"
  * case we also require the location to be at the root of a mount
- * that has a parent (i.e. is not a root of some namespace).
+ * that has something mounted on top of it (i.e. has an overmount).
  */
 static void do_lock_mount(const struct path *path,
 			  struct pinned_mountpoint *res,
@@ -2958,10 +2971,9 @@ static inline bool may_copy_tree(const struct path *path)
 }
 
 static struct mount *__do_loopback(const struct path *old_path,
-				   unsigned int flags, unsigned int copy_flags)
+				   bool recurse, unsigned int copy_flags)
 {
 	struct mount *old = real_mount(old_path->mnt);
-	bool recurse = flags & AT_RECURSIVE;
 
 	if (IS_MNT_UNBINDABLE(old))
 		return ERR_PTR(-EINVAL);
@@ -2972,18 +2984,6 @@ static struct mount *__do_loopback(const struct path *old_path,
 	if (!recurse && __has_locked_children(old, old_path->dentry))
 		return ERR_PTR(-EINVAL);
 
-	/*
-	 * When creating a new mount namespace we don't want to copy over
-	 * mounts of mount namespaces to avoid the risk of cycles and also to
-	 * minimize the default complex interdependencies between mount
-	 * namespaces.
-	 *
-	 * We could ofc just check whether all mount namespace files aren't
-	 * creating cycles but really let's keep this simple.
-	 */
-	if (!(flags & OPEN_TREE_NAMESPACE))
-		copy_flags |= CL_COPY_MNT_NS_FILE;
-
 	if (recurse)
 		return copy_tree(old, old_path->dentry, copy_flags);
 
@@ -2998,7 +2998,6 @@ static int do_loopback(const struct path *path, const char *old_name,
 {
 	struct path old_path __free(path_put) = {};
 	struct mount *mnt = NULL;
-	unsigned int flags = recurse ? AT_RECURSIVE : 0;
 	int err;
 
 	if (!old_name || !*old_name)
@@ -3017,7 +3016,7 @@ static int do_loopback(const struct path *path, const char *old_name,
 	if (!check_mnt(mp.parent))
 		return -EINVAL;
 
-	mnt = __do_loopback(&old_path, flags, 0);
+	mnt = __do_loopback(&old_path, recurse, CL_COPY_MNT_NS_FILE);
 	if (IS_ERR(mnt))
 		return PTR_ERR(mnt);
 
@@ -3055,7 +3054,7 @@ static struct mnt_namespace *get_detached_copy(const struct path *path, unsigned
 			ns->seq_origin = src_mnt_ns->ns.ns_id;
 	}
 
-	mnt = __do_loopback(path, flags, 0);
+	mnt = __do_loopback(path, (flags & AT_RECURSIVE), CL_COPY_MNT_NS_FILE);
 	if (IS_ERR(mnt)) {
 		emptied_ns = ns;
 		return ERR_CAST(mnt);
@@ -3087,7 +3086,13 @@ static struct file *open_detached_copy(struct path *path, unsigned int flags)
 	return file;
 }
 
-static struct mnt_namespace *create_new_namespace(struct path *path, unsigned int flags)
+enum mount_copy_flags_t {
+	MOUNT_COPY_RECURSIVE    = (1 << 0),
+	MOUNT_COPY_NEW		= (1 << 1),
+};
+
+static struct mnt_namespace *create_new_namespace(struct path *path,
+						  enum mount_copy_flags_t flags)
 {
 	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
 	struct user_namespace *user_ns = current_user_ns();
@@ -3096,7 +3101,7 @@ static struct mnt_namespace *create_new_namespace(struct path *path, unsigned in
 	struct path to_path;
 	struct mount *mnt;
 	unsigned int copy_flags = 0;
-	bool locked = false;
+	bool locked = false, recurse = flags & MOUNT_COPY_RECURSIVE;
 
 	if (user_ns != ns->user_ns)
 		copy_flags |= CL_SLAVE;
@@ -3131,11 +3136,14 @@ static struct mnt_namespace *create_new_namespace(struct path *path, unsigned in
 	}
 
 	/*
-	 * We don't emulate unshare()ing a mount namespace. We stick
-	 * to the restrictions of creating detached bind-mounts. It
-	 * has a lot saner and simpler semantics.
+	 * We don't emulate unshare()ing a mount namespace. We stick to
+	 * the restrictions of creating detached bind-mounts. It has a
+	 * lot saner and simpler semantics.
 	 */
-	mnt = __do_loopback(path, flags, copy_flags);
+	if (flags & MOUNT_COPY_NEW)
+		mnt = clone_mnt(real_mount(path->mnt), path->dentry, copy_flags);
+	else
+		mnt = __do_loopback(path, recurse, copy_flags);
 	scoped_guard(mount_writer) {
 		if (IS_ERR(mnt)) {
 			emptied_ns = new_ns;
@@ -3164,7 +3172,8 @@ static struct mnt_namespace *create_new_namespace(struct path *path, unsigned in
 	return new_ns;
 }
 
-static struct file *open_new_namespace(struct path *path, unsigned int flags)
+static struct file *open_new_namespace(struct path *path,
+				       enum mount_copy_flags_t flags)
 {
 	struct mnt_namespace *new_ns;
 
@@ -3217,7 +3226,7 @@ static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned
 		return ERR_PTR(ret);
 
 	if (flags & OPEN_TREE_NAMESPACE)
-		return open_new_namespace(&path, flags);
+		return open_new_namespace(&path, (flags & AT_RECURSIVE) ? MOUNT_COPY_RECURSIVE : 0);
 
 	if (flags & OPEN_TREE_CLONE)
 		return open_detached_copy(&path, flags);
@@ -3513,8 +3522,6 @@ static bool mount_is_ancestor(const struct mount *p1, const struct mount *p2)
  * @mnt_to:   mount under which to mount
  * @mp:   mountpoint of @mnt_to
  *
- * - Make sure that nothing can be mounted beneath the caller's current
- *   root or the rootfs of the namespace.
  * - Make sure that the caller can unmount the topmost mount ensuring
  *   that the caller could reveal the underlying mountpoint.
  * - Ensure that nothing has been mounted on top of @mnt_from before we
@@ -3528,26 +3535,14 @@ static bool mount_is_ancestor(const struct mount *p1, const struct mount *p2)
  */
 static int can_move_mount_beneath(const struct mount *mnt_from,
 				  const struct mount *mnt_to,
-				  const struct mountpoint *mp)
+				  struct pinned_mountpoint *mp)
 {
 	struct mount *parent_mnt_to = mnt_to->mnt_parent;
 
-	if (IS_MNT_LOCKED(mnt_to))
-		return -EINVAL;
-
 	/* Avoid creating shadow mounts during mount propagation. */
 	if (mnt_from->overmount)
 		return -EINVAL;
 
-	/*
-	 * Mounting beneath the rootfs only makes sense when the
-	 * semantics of pivot_root(".", ".") are used.
-	 */
-	if (&mnt_to->mnt == current->fs->root.mnt)
-		return -EINVAL;
-	if (parent_mnt_to == current->nsproxy->mnt_ns->root)
-		return -EINVAL;
-
 	if (mount_is_ancestor(mnt_to, mnt_from))
 		return -EINVAL;
 
@@ -3557,7 +3552,7 @@ static int can_move_mount_beneath(const struct mount *mnt_from,
 	 * propagating a copy @c of @mnt_from on top of @mnt_to. This
 	 * defeats the whole purpose of mounting beneath another mount.
 	 */
-	if (propagation_would_overmount(parent_mnt_to, mnt_to, mp))
+	if (propagation_would_overmount(parent_mnt_to, mnt_to, mp->mp))
 		return -EINVAL;
 
 	/*
@@ -3573,7 +3568,7 @@ static int can_move_mount_beneath(const struct mount *mnt_from,
 	 * @mnt_from beneath @mnt_to.
 	 */
 	if (check_mnt(mnt_from) &&
-	    propagation_would_overmount(parent_mnt_to, mnt_from, mp))
+	    propagation_would_overmount(parent_mnt_to, mnt_from, mp->mp))
 		return -EINVAL;
 
 	return 0;
@@ -3682,7 +3677,7 @@ static int do_move_mount(const struct path *old_path,
 
 		if (mp.parent != over->mnt_parent)
 			over = mp.parent->overmount;
-		err = can_move_mount_beneath(old, over, mp.mp);
+		err = can_move_mount_beneath(old, over, &mp);
 		if (err)
 			return err;
 	}
@@ -4231,8 +4226,8 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
 		struct user_namespace *user_ns, struct fs_struct *new_fs)
 {
 	struct mnt_namespace *new_ns;
-	struct vfsmount *rootmnt __free(mntput) = NULL;
-	struct vfsmount *pwdmnt __free(mntput) = NULL;
+	struct path old_root __free(path_put) = {};
+	struct path old_pwd __free(path_put) = {};
 	struct mount *p, *q;
 	struct mount *old;
 	struct mount *new;
@@ -4252,11 +4247,18 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
 		return new_ns;
 
 	guard(namespace_excl)();
-	/* First pass: copy the tree topology */
-	copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
+
+	if (flags & CLONE_EMPTY_MNTNS)
+		copy_flags = 0;
+	else
+		copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
 	if (user_ns != ns->user_ns)
 		copy_flags |= CL_SLAVE;
-	new = copy_tree(old, old->mnt.mnt_root, copy_flags);
+
+	if (flags & CLONE_EMPTY_MNTNS)
+		new = clone_mnt(old, old->mnt.mnt_root, copy_flags);
+	else
+		new = copy_tree(old, old->mnt.mnt_root, copy_flags);
 	if (IS_ERR(new)) {
 		emptied_ns = new_ns;
 		return ERR_CAST(new);
@@ -4267,33 +4269,53 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
 	}
 	new_ns->root = new;
 
-	/*
-	 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
-	 * as belonging to new namespace.  We have already acquired a private
-	 * fs_struct, so tsk->fs->lock is not needed.
-	 */
-	p = old;
-	q = new;
-	while (p) {
-		mnt_add_to_ns(new_ns, q);
-		new_ns->nr_mounts++;
+	if (flags & CLONE_EMPTY_MNTNS) {
+		/*
+		 * Empty mount namespace: only the root mount exists.
+		 * Reset root and pwd to the cloned mount's root dentry.
+		 */
 		if (new_fs) {
-			if (&p->mnt == new_fs->root.mnt) {
-				new_fs->root.mnt = mntget(&q->mnt);
-				rootmnt = &p->mnt;
-			}
-			if (&p->mnt == new_fs->pwd.mnt) {
-				new_fs->pwd.mnt = mntget(&q->mnt);
-				pwdmnt = &p->mnt;
+			old_root = new_fs->root;
+			old_pwd = new_fs->pwd;
+
+			new_fs->root.mnt = mntget(&new->mnt);
+			new_fs->root.dentry = dget(new->mnt.mnt_root);
+
+			new_fs->pwd.mnt = mntget(&new->mnt);
+			new_fs->pwd.dentry = dget(new->mnt.mnt_root);
+		}
+		mnt_add_to_ns(new_ns, new);
+		new_ns->nr_mounts++;
+	} else {
+		/*
+		 * Full copy: walk old and new trees in parallel, switching
+		 * the tsk->fs->* elements and marking new vfsmounts as
+		 * belonging to new namespace.  We have already acquired a
+		 * private fs_struct, so tsk->fs->lock is not needed.
+		 */
+		p = old;
+		q = new;
+		while (p) {
+			mnt_add_to_ns(new_ns, q);
+			new_ns->nr_mounts++;
+			if (new_fs) {
+				if (&p->mnt == new_fs->root.mnt) {
+					old_root.mnt = new_fs->root.mnt;
+					new_fs->root.mnt = mntget(&q->mnt);
+				}
+				if (&p->mnt == new_fs->pwd.mnt) {
+					old_pwd.mnt = new_fs->pwd.mnt;
+					new_fs->pwd.mnt = mntget(&q->mnt);
+				}
 			}
+			p = next_mnt(p, old);
+			q = next_mnt(q, new);
+			if (!q)
+				break;
+			// an mntns binding we'd skipped?
+			while (p->mnt.mnt_root != q->mnt.mnt_root)
+				p = next_mnt(skip_mnt_tree(p), old);
 		}
-		p = next_mnt(p, old);
-		q = next_mnt(q, new);
-		if (!q)
-			break;
-		// an mntns binding we'd skipped?
-		while (p->mnt.mnt_root != q->mnt.mnt_root)
-			p = next_mnt(skip_mnt_tree(p), old);
 	}
 	ns_tree_add_raw(new_ns);
 	return new_ns;
@@ -4414,11 +4436,15 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
 	unsigned int mnt_flags = 0;
 	long ret;
 
-	if (!may_mount())
+	if ((flags & ~(FSMOUNT_CLOEXEC | FSMOUNT_NAMESPACE)) != 0)
+		return -EINVAL;
+
+	if ((flags & FSMOUNT_NAMESPACE) &&
+	    !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
 		return -EPERM;
 
-	if ((flags & ~(FSMOUNT_CLOEXEC)) != 0)
-		return -EINVAL;
+	if (!(flags & FSMOUNT_NAMESPACE) && !may_mount())
+		return -EPERM;
 
 	if (attr_flags & ~FSMOUNT_VALID_FLAGS)
 		return -EINVAL;
@@ -4485,6 +4511,10 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
 	 */
 	vfs_clean_context(fc);
 
+	if (flags & FSMOUNT_NAMESPACE)
+		return FD_ADD((flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0,
+			      open_new_namespace(&new_path, MOUNT_COPY_NEW));
+
 	ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true);
 	if (IS_ERR(ns))
 		return PTR_ERR(ns);
@@ -5649,14 +5679,14 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root)
 	if (mnt_ns_empty(ns))
 		return -ENOENT;
 
-	first = child = ns->root;
-	for (;;) {
-		child = listmnt_next(child, false);
-		if (!child)
-			return -ENOENT;
-		if (child->mnt_parent == first)
+	first = ns->root;
+	for (child = node_to_mount(ns->mnt_first_node); child;
+	     child = listmnt_next(child, false)) {
+		if (child != first && child->mnt_parent == first)
 			break;
 	}
+	if (!child)
+		return -ENOENT;
 
 	root->mnt = mntget(&child->mnt);
 	root->dentry = dget(root->mnt->mnt_root);
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
index d9d86598d100..2204708dbf7a 100644
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -110,6 +110,7 @@ enum fsconfig_command {
  * fsmount() flags.
  */
 #define FSMOUNT_CLOEXEC		0x00000001
+#define FSMOUNT_NAMESPACE	0x00000002	/* Create the mount in a new mount namespace */
 
 /*
  * Mount attributes.
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 01e09609b605..33a4624285cd 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -39,6 +39,7 @@
 #define CLONE_AUTOREAP		(1ULL << 34) /* Auto-reap child on exit. */
 #define CLONE_NNP		(1ULL << 35) /* Set no_new_privs on child. */
 #define CLONE_PIDFD_AUTOKILL	(1ULL << 36) /* Kill child when clone pidfd closes. */
+#define CLONE_EMPTY_MNTNS	(1ULL << 37) /* Create an empty mount namespace. */
 
 /*
  * cloning flags intersect with CSIGNAL so can be used with unshare and clone3
@@ -46,6 +47,12 @@
  */
 #define CLONE_NEWTIME	0x00000080	/* New time namespace */
 
+/*
+ * unshare flags share the bit space with clone flags but only apply to the
+ * unshare syscall (UNSHARE_EMPTY_MNTNS reuses the CLONE_PARENT_SETTID bit):
+ */
+#define UNSHARE_EMPTY_MNTNS 0x00100000 /* Unshare an empty mount namespace. */
+
 #ifndef __ASSEMBLY__
 /**
  * struct clone_args - arguments for the clone3 syscall
diff --git a/kernel/fork.c b/kernel/fork.c
index 131ae7bbb0de..9c194fc58736 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2667,6 +2667,16 @@ pid_t kernel_clone(struct kernel_clone_args *args)
 	pid_t nr;
 
 	/*
+	 * Creating an empty mount namespace implies creating a new mount
+	 * namespace.  Set this before copy_process() so that the
+	 * CLONE_NEWNS|CLONE_FS mutual exclusion check works correctly.
+	 */
+	if (clone_flags & CLONE_EMPTY_MNTNS) {
+		clone_flags |= CLONE_NEWNS;
+		args->flags = clone_flags;
+	}
+
+	/*
 	 * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
 	 * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are
 	 * mutually exclusive. With clone3() CLONE_PIDFD has grown a separate
@@ -2944,8 +2954,9 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs)
 {
 	/* Verify that no unknown flags are passed along. */
 	if (kargs->flags &
-	    ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP |
-	      CLONE_AUTOREAP | CLONE_NNP | CLONE_PIDFD_AUTOKILL))
+	    ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND |
+	      CLONE_INTO_CGROUP | CLONE_AUTOREAP | CLONE_NNP |
+	      CLONE_PIDFD_AUTOKILL | CLONE_EMPTY_MNTNS))
 		return false;
 
 	/*
@@ -3096,7 +3107,7 @@ static int check_unshare_flags(unsigned long unshare_flags)
 {
 	if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_SIGHAND|
 				CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
-				CLONE_NS_ALL))
+				CLONE_NS_ALL | UNSHARE_EMPTY_MNTNS))
 		return -EINVAL;
 	/*
 	 * Not implemented, but pretend it works if there is nothing
@@ -3195,6 +3206,8 @@ int ksys_unshare(unsigned long unshare_flags)
 	/*
 	 * If unsharing namespace, must also unshare filesystem information.
 	 */
+	if (unshare_flags & UNSHARE_EMPTY_MNTNS)
+		unshare_flags |= CLONE_NEWNS;
 	if (unshare_flags & CLONE_NEWNS)
 		unshare_flags |= CLONE_FS;
 
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 63b44ee79847..d9d3d5973bf5 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -96,7 +96,8 @@ static struct nsproxy *create_new_namespaces(u64 flags,
 	if (!new_nsp)
 		return ERR_PTR(-ENOMEM);
 
-	new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs);
+	new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns,
+				      user_ns, new_fs);
 	if (IS_ERR(new_nsp->mnt_ns)) {
 		err = PTR_ERR(new_nsp->mnt_ns);
 		goto out_ns;
@@ -211,16 +212,26 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
 	struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs)
 {
 	struct user_namespace *user_ns;
+	u64 flags = unshare_flags;
 	int err = 0;
 
-	if (!(unshare_flags & (CLONE_NS_ALL & ~CLONE_NEWUSER)))
+	if (!(flags & (CLONE_NS_ALL & ~CLONE_NEWUSER)))
 		return 0;
 
 	user_ns = new_cred ? new_cred->user_ns : current_user_ns();
 	if (!ns_capable(user_ns, CAP_SYS_ADMIN))
 		return -EPERM;
 
-	*new_nsp = create_new_namespaces(unshare_flags, current, user_ns,
+	/*
+	 * Convert the 32-bit UNSHARE_EMPTY_MNTNS (which aliases
+	 * CLONE_PARENT_SETTID) to the unique 64-bit CLONE_EMPTY_MNTNS.
+	 */
+	if (flags & UNSHARE_EMPTY_MNTNS) {
+		flags &= ~(u64)UNSHARE_EMPTY_MNTNS;
+		flags |= CLONE_EMPTY_MNTNS;
+	}
+
+	*new_nsp = create_new_namespaces(flags, current, user_ns,
 					 new_fs ? new_fs : current->fs);
 	if (IS_ERR(*new_nsp)) {
 		err = PTR_ERR(*new_nsp);
diff --git a/tools/include/uapi/linux/mount.h b/tools/include/uapi/linux/mount.h
index 7fa67c2031a5..2204708dbf7a 100644
--- a/tools/include/uapi/linux/mount.h
+++ b/tools/include/uapi/linux/mount.h
@@ -61,7 +61,8 @@
 /*
  * open_tree() flags.
  */
-#define OPEN_TREE_CLONE		1		/* Clone the target tree and attach the clone */
+#define OPEN_TREE_CLONE		(1 << 0)	/* Clone the target tree and attach the clone */
+#define OPEN_TREE_NAMESPACE	(1 << 1)	/* Clone the target tree into a new mount namespace */
 #define OPEN_TREE_CLOEXEC	O_CLOEXEC	/* Close the file on execve() */
 
 /*
@@ -109,6 +110,7 @@ enum fsconfig_command {
  * fsmount() flags.
  */
 #define FSMOUNT_CLOEXEC		0x00000001
+#define FSMOUNT_NAMESPACE	0x00000002	/* Create the mount in a new mount namespace */
 
 /*
  * Mount attributes.
@@ -197,7 +199,10 @@ struct statmount {
  */
 struct mnt_id_req {
 	__u32 size;
-	__u32 spare;
+	union {
+		__u32 mnt_ns_fd;
+		__u32 mnt_fd;
+	};
 	__u64 mnt_id;
 	__u64 param;
 	__u64 mnt_ns_id;
@@ -232,4 +237,9 @@ struct mnt_id_req {
 #define LSMT_ROOT		0xffffffffffffffff	/* root mount */
 #define LISTMOUNT_REVERSE	(1 << 0) /* List later mounts first */
 
+/*
+ * @flag bits for statmount(2)
+ */
+#define STATMOUNT_BY_FD		0x00000001U	/* want mountinfo for given fd */
+
 #endif /* _UAPI_LINUX_MOUNT_H */
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 1db72e6b05b8..984abb6d42ab 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -38,6 +38,9 @@ TARGETS += filesystems/overlayfs
 TARGETS += filesystems/statmount
 TARGETS += filesystems/mount-notify
 TARGETS += filesystems/fuse
+TARGETS += filesystems/move_mount
+TARGETS += filesystems/empty_mntns
+TARGETS += filesystems/fsmount_ns
 TARGETS += firmware
 TARGETS += fpu
 TARGETS += ftrace
diff --git a/tools/testing/selftests/filesystems/empty_mntns/.gitignore b/tools/testing/selftests/filesystems/empty_mntns/.gitignore
new file mode 100644
index 000000000000..99f89d329db2
--- /dev/null
+++ b/tools/testing/selftests/filesystems/empty_mntns/.gitignore
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+clone3_empty_mntns_test
+empty_mntns_test
+overmount_chroot_test
diff --git a/tools/testing/selftests/filesystems/empty_mntns/Makefile b/tools/testing/selftests/filesystems/empty_mntns/Makefile
new file mode 100644
index 000000000000..22e3fb915e81
--- /dev/null
+++ b/tools/testing/selftests/filesystems/empty_mntns/Makefile
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+CFLAGS += -Wall -O2 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES)
+LDLIBS += -lcap
+
+TEST_GEN_PROGS := empty_mntns_test overmount_chroot_test clone3_empty_mntns_test
+
+include ../../lib.mk
+
+$(OUTPUT)/empty_mntns_test: ../utils.c
+$(OUTPUT)/overmount_chroot_test: ../utils.c
+$(OUTPUT)/clone3_empty_mntns_test: ../utils.c
diff --git a/tools/testing/selftests/filesystems/empty_mntns/clone3_empty_mntns_test.c b/tools/testing/selftests/filesystems/empty_mntns/clone3_empty_mntns_test.c
new file mode 100644
index 000000000000..6370086f886d
--- /dev/null
+++ b/tools/testing/selftests/filesystems/empty_mntns/clone3_empty_mntns_test.c
@@ -0,0 +1,938 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Tests for empty mount namespace creation via clone3() CLONE_EMPTY_MNTNS
+ *
+ * These tests exercise the clone3() code path for creating empty mount
+ * namespaces, which is distinct from the unshare() path tested in
+ * empty_mntns_test.c.  With clone3(), CLONE_EMPTY_MNTNS (0x2000000000ULL)
+ * is a 64-bit flag that implies CLONE_NEWNS.  The implication happens in
+ * kernel_clone() before copy_process(), unlike unshare() where it goes
+ * through UNSHARE_EMPTY_MNTNS -> CLONE_EMPTY_MNTNS conversion in
+ * unshare_nsproxy_namespaces().
+ *
+ * Copyright (c) 2024 Christian Brauner <brauner@kernel.org>
+ */
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <linux/mount.h>
+#include <linux/stat.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "../utils.h"
+#include "../wrappers.h"
+#include "clone3/clone3_selftests.h"
+#include "empty_mntns.h"
+#include "kselftest_harness.h"
+
+static pid_t clone3_empty_mntns(uint64_t extra_flags)
+{
+	struct __clone_args args = {
+		.flags		= CLONE_EMPTY_MNTNS | extra_flags,
+		.exit_signal	= SIGCHLD,
+	};
+
+	return sys_clone3(&args, sizeof(args));
+}
+
+static bool clone3_empty_mntns_supported(void)
+{
+	pid_t pid;
+	int status;
+
+	pid = fork();
+	if (pid < 0)
+		return false;
+
+	if (pid == 0) {
+		if (enter_userns())
+			_exit(1);
+
+		pid = clone3_empty_mntns(0);
+		if (pid < 0)
+			_exit(1);
+
+		if (pid == 0)
+			_exit(0);
+
+		_exit(wait_for_pid(pid) != 0);
+	}
+
+	if (waitpid(pid, &status, 0) != pid)
+		return false;
+
+	if (!WIFEXITED(status))
+		return false;
+
+	return WEXITSTATUS(status) == 0;
+}
+
+FIXTURE(clone3_empty_mntns) {};
+
+FIXTURE_SETUP(clone3_empty_mntns)
+{
+	if (!clone3_empty_mntns_supported())
+		SKIP(return, "CLONE_EMPTY_MNTNS via clone3 not supported");
+}
+
+FIXTURE_TEARDOWN(clone3_empty_mntns) {}
+
+/*
+ * Basic clone3() with CLONE_EMPTY_MNTNS: child gets empty mount namespace
+ * with exactly 1 mount and root == cwd.
+ */
+TEST_F(clone3_empty_mntns, basic)
+{
+	pid_t pid, inner;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		if (enter_userns())
+			_exit(1);
+
+		inner = clone3_empty_mntns(0);
+		if (inner < 0)
+			_exit(2);
+
+		if (inner == 0) {
+			uint64_t root_id, cwd_id;
+
+			if (count_mounts() != 1)
+				_exit(3);
+
+			root_id = get_unique_mnt_id("/");
+			cwd_id = get_unique_mnt_id(".");
+			if (root_id == 0 || cwd_id == 0)
+				_exit(4);
+
+			if (root_id != cwd_id)
+				_exit(5);
+
+			_exit(0);
+		}
+
+		_exit(wait_for_pid(inner));
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * CLONE_EMPTY_MNTNS implies CLONE_NEWNS.  Verify that it works without
+ * explicitly setting CLONE_NEWNS (the flag is added in kernel_clone()).
+ */
+TEST_F(clone3_empty_mntns, implies_newns)
+{
+	pid_t pid, inner;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		ssize_t parent_mounts;
+
+		if (enter_userns())
+			_exit(1);
+
+		/* Verify we have mounts in our current namespace. */
+		parent_mounts = count_mounts();
+		if (parent_mounts < 1)
+			_exit(2);
+
+		/* Only CLONE_EMPTY_MNTNS, no explicit CLONE_NEWNS. */
+		inner = clone3_empty_mntns(0);
+		if (inner < 0)
+			_exit(3);
+
+		if (inner == 0) {
+			if (count_mounts() != 1)
+				_exit(4);
+
+			_exit(0);
+		}
+
+		/* Parent still has its mounts. */
+		if (count_mounts() != parent_mounts)
+			_exit(5);
+
+		_exit(wait_for_pid(inner));
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * Helper macro: generate a test that clones with CLONE_EMPTY_MNTNS |
+ * @extra_flags and verifies the child has exactly one mount.
+ */
+#define TEST_CLONE3_FLAGS(test_name, extra_flags)			\
+TEST_F(clone3_empty_mntns, test_name)					\
+{									\
+	pid_t pid, inner;						\
+									\
+	pid = fork();							\
+	ASSERT_GE(pid, 0);						\
+									\
+	if (pid == 0) {							\
+		if (enter_userns())					\
+			_exit(1);					\
+									\
+		inner = clone3_empty_mntns(extra_flags);		\
+		if (inner < 0)						\
+			_exit(2);					\
+									\
+		if (inner == 0) {					\
+			if (count_mounts() != 1)			\
+				_exit(3);				\
+			_exit(0);					\
+		}							\
+									\
+		_exit(wait_for_pid(inner));				\
+	}								\
+									\
+	ASSERT_EQ(wait_for_pid(pid), 0);				\
+}
+
+/* Redundant CLONE_NEWNS | CLONE_EMPTY_MNTNS should succeed. */
+TEST_CLONE3_FLAGS(with_explicit_newns, CLONE_NEWNS)
+
+/* CLONE_EMPTY_MNTNS combined with CLONE_NEWUSER. */
+TEST_CLONE3_FLAGS(with_newuser, CLONE_NEWUSER)
+
+/* CLONE_EMPTY_MNTNS combined with other namespace flags. */
+TEST_CLONE3_FLAGS(with_other_ns_flags, CLONE_NEWUTS | CLONE_NEWIPC)
+
+/*
+ * CLONE_EMPTY_MNTNS combined with CLONE_NEWPID.
+ */
+TEST_F(clone3_empty_mntns, with_newpid)
+{
+	pid_t pid, inner;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		if (enter_userns())
+			_exit(1);
+
+		inner = clone3_empty_mntns(CLONE_NEWPID);
+		if (inner < 0)
+			_exit(2);
+
+		if (inner == 0) {
+			if (count_mounts() != 1)
+				_exit(3);
+
+			/* In a new PID namespace, getpid() returns 1. */
+			if (getpid() != 1)
+				_exit(4);
+
+			_exit(0);
+		}
+
+		_exit(wait_for_pid(inner));
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * CLONE_EMPTY_MNTNS | CLONE_FS must fail because the implied CLONE_NEWNS
+ * and CLONE_FS are mutually exclusive (checked in copy_process()).
+ */
+TEST_F(clone3_empty_mntns, with_clone_fs_fails)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		struct __clone_args args = {
+			.flags		= CLONE_EMPTY_MNTNS | CLONE_FS,
+			.exit_signal	= SIGCHLD,
+		};
+		pid_t ret;
+
+		if (enter_userns())
+			_exit(1);
+
+		ret = sys_clone3(&args, sizeof(args));
+		if (ret >= 0) {
+			if (ret == 0)
+				_exit(0);	/* unexpected clone child: just exit */
+			wait_for_pid(ret);
+			_exit(2);	/* clone3 succeeded but should have failed */
+		}
+
+		if (errno != EINVAL)
+			_exit(3);	/* failed, but with the wrong errno */
+
+		_exit(0);
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * CLONE_EMPTY_MNTNS combined with CLONE_PIDFD returns a valid pidfd.
+ */
+TEST_F(clone3_empty_mntns, with_pidfd)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		struct __clone_args args = {
+			.flags		= CLONE_EMPTY_MNTNS | CLONE_PIDFD,
+			.exit_signal	= SIGCHLD,
+		};
+		int pidfd = -1;
+		pid_t inner;
+
+		if (enter_userns())
+			_exit(1);
+
+		args.pidfd = (uintptr_t)&pidfd;
+
+		inner = sys_clone3(&args, sizeof(args));
+		if (inner < 0)
+			_exit(2);
+
+		if (inner == 0) {
+			if (count_mounts() != 1)
+				_exit(3);
+
+			_exit(0);
+		}
+
+		/* Verify we got a valid pidfd. */
+		if (pidfd < 0)
+			_exit(4);
+
+		close(pidfd);
+		_exit(wait_for_pid(inner));
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * clone3 without CAP_SYS_ADMIN must fail with EPERM.
+ */
+TEST_F(clone3_empty_mntns, eperm_without_caps)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		pid_t ret;
+
+		/* Running as root: nothing to check, report success. */
+		if (getuid() == 0)
+			_exit(0);
+
+		ret = clone3_empty_mntns(0);
+		if (ret >= 0) {
+			if (ret == 0)
+				_exit(0);	/* unexpected clone child: just exit */
+			wait_for_pid(ret);
+			_exit(1);	/* clone3 succeeded but should have failed */
+		}
+
+		if (errno != EPERM)
+			_exit(2);	/* failed, but with the wrong errno */
+
+		_exit(0);
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * Parent's mount namespace is unaffected after clone3 with CLONE_EMPTY_MNTNS.
+ */
+TEST_F(clone3_empty_mntns, parent_unchanged)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		ssize_t nr_before, nr_after;
+		pid_t inner;
+
+		if (enter_userns())
+			_exit(1);
+
+		nr_before = count_mounts();
+		if (nr_before < 1)
+			_exit(2);
+
+		inner = clone3_empty_mntns(0);
+		if (inner < 0)
+			_exit(3);
+
+		if (inner == 0)
+			_exit(0);
+
+		if (wait_for_pid(inner) != 0)
+			_exit(4);
+
+		nr_after = count_mounts();
+		if (nr_after != nr_before)
+			_exit(5);
+
+		_exit(0);
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * Parent with a tmpfs and five bind mounts: child still gets exactly 1 mount.
+ */
+TEST_F(clone3_empty_mntns, many_parent_mounts)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		char tmpdir[] = "/tmp/clone3_mntns_test.XXXXXX";
+		pid_t inner;
+		int i;
+
+		if (enter_userns())
+			_exit(1);
+
+		if (unshare(CLONE_NEWNS))
+			_exit(2);
+
+		if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL))
+			_exit(3);
+
+		if (!mkdtemp(tmpdir))	/* NOTE(review): tmpdir is never removed */
+			_exit(4);
+
+		if (mount("tmpfs", tmpdir, "tmpfs", 0, "size=1M"))
+			_exit(5);
+
+		for (i = 0; i < 5; i++) {
+			char subdir[256];
+
+			snprintf(subdir, sizeof(subdir), "%s/sub%d", tmpdir, i);
+			if (mkdir(subdir, 0755) && errno != EEXIST)
+				_exit(6);
+			if (mount(subdir, subdir, NULL, MS_BIND, NULL))
+				_exit(7);
+		}
+
+		if (count_mounts() < 5)	/* sanity check: lower bound only */
+			_exit(8);
+
+		inner = clone3_empty_mntns(0);
+		if (inner < 0)
+			_exit(9);
+
+		if (inner == 0) {
+			if (count_mounts() != 1)
+				_exit(10);
+
+			_exit(0);
+		}
+
+		_exit(wait_for_pid(inner));
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * Verify the child's root mount is nullfs with expected statmount properties.
+ */
+TEST_F(clone3_empty_mntns, mount_properties)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		pid_t inner;
+
+		if (enter_userns())
+			_exit(1);
+
+		inner = clone3_empty_mntns(0);
+		if (inner < 0)
+			_exit(2);
+
+		if (inner == 0) {
+			struct statmount *sm;
+			uint64_t root_id;
+
+			root_id = get_unique_mnt_id("/");
+			if (!root_id)
+				_exit(3);
+
+			sm = statmount_alloc(root_id, 0,
+					     STATMOUNT_MNT_BASIC |
+					     STATMOUNT_MNT_POINT |
+					     STATMOUNT_FS_TYPE, 0);
+			if (!sm)
+				_exit(4);
+
+			/* Root mount point is "/". */
+			if (!(sm->mask & STATMOUNT_MNT_POINT))
+				_exit(5);
+			if (strcmp(sm->str + sm->mnt_point, "/") != 0)
+				_exit(6);
+
+			/* Filesystem type is nullfs. */
+			if (!(sm->mask & STATMOUNT_FS_TYPE))
+				_exit(7);
+			if (strcmp(sm->str + sm->fs_type, "nullfs") != 0)
+				_exit(8);
+
+			/* Root mount is its own parent. */
+			if (!(sm->mask & STATMOUNT_MNT_BASIC))
+				_exit(9);
+			if (sm->mnt_parent_id != sm->mnt_id)
+				_exit(10);
+
+			free(sm);
+			_exit(0);
+		}
+
+		_exit(wait_for_pid(inner));
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * Listmount returns only the root mount in the child's empty namespace.
+ */
+TEST_F(clone3_empty_mntns, listmount_single_entry)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		pid_t inner;
+
+		if (enter_userns())
+			_exit(1);
+
+		inner = clone3_empty_mntns(0);
+		if (inner < 0)
+			_exit(2);
+
+		if (inner == 0) {
+			uint64_t list[16];
+			ssize_t nr_mounts;
+			uint64_t root_id;
+
+			nr_mounts = listmount(LSMT_ROOT, 0, 0, list, 16, 0);
+			if (nr_mounts != 1)
+				_exit(3);
+
+			root_id = get_unique_mnt_id("/");
+			if (!root_id)
+				_exit(4);
+
+			if (list[0] != root_id)
+				_exit(5);
+
+			_exit(0);
+		}
+
+		_exit(wait_for_pid(inner));
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * Child can mount tmpfs over nullfs root (the primary container use case).
+ *
+ * Uses the new mount API (fsopen/fsmount/move_mount) because resolving
+ * "/" returns the process root directly without following overmounts.
+ * The mount fd from fsmount lets us fchdir + chroot into the new tmpfs.
+ */
+TEST_F(clone3_empty_mntns, child_overmount_tmpfs)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		pid_t inner;
+
+		if (enter_userns())
+			_exit(1);
+
+		inner = clone3_empty_mntns(0);
+		if (inner < 0)
+			_exit(2);
+
+		if (inner == 0) {
+			struct statmount *sm;
+			uint64_t root_id;
+			int fd, fsfd, mntfd;
+
+			/* Before anything is attached only one mount is listed. */
+			if (count_mounts() != 1)
+				_exit(3);
+
+			/* Verify root is nullfs. */
+			root_id = get_unique_mnt_id("/");
+			if (!root_id)
+				_exit(4);
+
+			sm = statmount_alloc(root_id, 0, STATMOUNT_FS_TYPE, 0);
+			if (!sm)
+				_exit(5);
+			/* Only trust sm->fs_type if the mask says it was filled in. */
+			if (!(sm->mask & STATMOUNT_FS_TYPE))
+				_exit(6);
+			if (strcmp(sm->str + sm->fs_type, "nullfs") != 0)
+				_exit(7);
+			free(sm);
+
+			/* Create tmpfs via the new mount API. */
+			fsfd = sys_fsopen("tmpfs", 0);
+			if (fsfd < 0)
+				_exit(8);
+
+			if (sys_fsconfig(fsfd, FSCONFIG_SET_STRING,
+					 "size", "1M", 0)) {
+				close(fsfd);
+				_exit(9);
+			}
+
+			if (sys_fsconfig(fsfd, FSCONFIG_CMD_CREATE,
+					 NULL, NULL, 0)) {
+				close(fsfd);
+				_exit(10);
+			}
+
+			/* The context fd is closed whether or not fsmount worked. */
+			mntfd = sys_fsmount(fsfd, 0, 0);
+			close(fsfd);
+			if (mntfd < 0)
+				_exit(11);
+
+			/* Attach tmpfs to "/". */
+			if (sys_move_mount(mntfd, "", AT_FDCWD, "/",
+					   MOVE_MOUNT_F_EMPTY_PATH)) {
+				close(mntfd);
+				_exit(12);
+			}
+
+			/* The original root plus the tmpfs just attached. */
+			if (count_mounts() != 2) {
+				close(mntfd);
+				_exit(13);
+			}
+
+			/* Enter the tmpfs. */
+			if (fchdir(mntfd)) {
+				close(mntfd);
+				_exit(14);
+			}
+
+			/* cwd is the tmpfs now, so "." makes it the process root. */
+			if (chroot(".")) {
+				close(mntfd);
+				_exit(15);
+			}
+
+			close(mntfd);
+
+			/* Verify "/" is now tmpfs. */
+			root_id = get_unique_mnt_id("/");
+			if (!root_id)
+				_exit(16);
+
+			sm = statmount_alloc(root_id, 0, STATMOUNT_FS_TYPE, 0);
+			if (!sm)
+				_exit(17);
+			if (!(sm->mask & STATMOUNT_FS_TYPE))
+				_exit(18);
+			if (strcmp(sm->str + sm->fs_type, "tmpfs") != 0)
+				_exit(19);
+			free(sm);
+
+			/* Verify tmpfs is writable. */
+			fd = open("/testfile", O_CREAT | O_RDWR, 0644);
+			if (fd < 0)
+				_exit(20);
+
+			if (write(fd, "test", 4) != 4) {
+				close(fd);
+				_exit(21);
+			}
+			close(fd);
+
+			/* The file must be reachable by path after creation. */
+			if (access("/testfile", F_OK))
+				_exit(22);
+
+			_exit(0);
+		}
+
+		/* Intermediate process: exit with what wait_for_pid() reports. */
+		_exit(wait_for_pid(inner));
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * Multiple clone3 calls with CLONE_EMPTY_MNTNS produce children with
+ * distinct mount namespace root mount IDs.
+ */
+TEST_F(clone3_empty_mntns, repeated)
+{
+	pid_t child = fork();
+
+	ASSERT_GE(child, 0);
+
+	if (child == 0) {
+		uint64_t first_id = 0, second_id = 0;
+		int first_pipe[2], second_pipe[2];
+		pid_t first, second;
+
+		if (enter_userns() != 0)
+			_exit(1);
+
+		/* One pipe per grandchild to carry its root mount ID back. */
+		if (pipe(first_pipe) != 0 || pipe(second_pipe) != 0)
+			_exit(2);
+
+		first = clone3_empty_mntns(0);
+		if (first < 0)
+			_exit(3);
+
+		if (first == 0) {
+			uint64_t id;
+
+			close(first_pipe[0]);
+			id = get_unique_mnt_id("/");
+			if (write(first_pipe[1], &id, sizeof(id)) != sizeof(id))
+				_exit(1);
+			close(first_pipe[1]);
+			_exit(0);
+		}
+
+		second = clone3_empty_mntns(0);
+		if (second < 0)
+			_exit(4);
+
+		if (second == 0) {
+			uint64_t id;
+
+			close(second_pipe[0]);
+			id = get_unique_mnt_id("/");
+			if (write(second_pipe[1], &id, sizeof(id)) != sizeof(id))
+				_exit(1);
+			close(second_pipe[1]);
+			_exit(0);
+		}
+
+		/* Only the read ends are needed from here on. */
+		close(first_pipe[1]);
+		close(second_pipe[1]);
+
+		if (read(first_pipe[0], &first_id, sizeof(first_id)) != sizeof(first_id))
+			_exit(5);
+		if (read(second_pipe[0], &second_id, sizeof(second_id)) != sizeof(second_id))
+			_exit(6);
+
+		close(first_pipe[0]);
+		close(second_pipe[0]);
+
+		/* Both grandchildren have to report success. */
+		if (wait_for_pid(first) != 0 || wait_for_pid(second) != 0)
+			_exit(7);
+
+		/* A zero ID means the lookup in the grandchild failed. */
+		if (!first_id || !second_id)
+			_exit(8);
+
+		/* Each child must have a distinct root mount ID. */
+		_exit(first_id == second_id ? 9 : 0);
+	}
+
+	ASSERT_EQ(wait_for_pid(child), 0);
+}
+
+/*
+ * Verify setns() into a child's empty mount namespace works.
+ */
+TEST_F(clone3_empty_mntns, setns_into_child_mntns)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		/*
+		 * ready_pipe: child -> parent, "my mount namespace exists".
+		 * done_pipe:  parent -> child, closed by the parent once it
+		 *             no longer needs the child to stay alive.
+		 *
+		 * Two pipes are required: a pipe is unidirectional, and a
+		 * read() on its write end fails with EBADF right away
+		 * instead of blocking.
+		 */
+		int ready_pipe[2], done_pipe[2];
+		char path[64];
+		int mntns_fd;
+		pid_t inner;
+		char c;
+
+		if (enter_userns())
+			_exit(1);
+
+		if (pipe(ready_pipe) || pipe(done_pipe))
+			_exit(2);
+
+		inner = clone3_empty_mntns(0);
+		if (inner < 0)
+			_exit(3);
+
+		if (inner == 0) {
+			/* Keep only the ends this side actually uses. */
+			close(ready_pipe[0]);
+			close(done_pipe[1]);
+
+			/* Signal parent we're ready. */
+			if (write(ready_pipe[1], "r", 1) != 1)
+				_exit(1);
+			close(ready_pipe[1]);
+
+			/*
+			 * Wait for parent to finish.  The parent never
+			 * writes to done_pipe; read() returns 0 (EOF) once
+			 * the parent closes its write end or exits, which
+			 * keeps /proc/<pid>/ns/mnt valid until then.
+			 */
+			if (read(done_pipe[0], &c, 1) < 0)
+				_exit(2);
+			close(done_pipe[0]);
+			_exit(0);
+		}
+
+		close(ready_pipe[1]);
+		close(done_pipe[0]);
+
+		/* Wait for child to be ready. */
+		if (read(ready_pipe[0], &c, 1) != 1)
+			_exit(4);
+		close(ready_pipe[0]);
+
+		/* Open child's mount namespace while the child is still alive. */
+		snprintf(path, sizeof(path), "/proc/%d/ns/mnt", inner);
+		mntns_fd = open(path, O_RDONLY);
+		if (mntns_fd < 0)
+			_exit(5);
+
+		if (setns(mntns_fd, CLONE_NEWNS)) {
+			close(mntns_fd);
+			_exit(6);
+		}
+		close(mntns_fd);
+
+		/* Now we should be in the child's empty mntns. */
+		if (count_mounts() != 1)
+			_exit(7);
+
+		/* Release the child (EOF on done_pipe), then reap it. */
+		close(done_pipe[1]);
+		_exit(wait_for_pid(inner));
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * Tests below do not require CLONE_EMPTY_MNTNS support.
+ */
+
+/*
+ * Unknown 64-bit flags beyond the known set are rejected.
+ */
+TEST(unknown_flags_rejected)
+{
+	pid_t child = fork();
+
+	ASSERT_GE(child, 0);
+
+	if (child == 0) {
+		struct __clone_args args = {
+			.flags		= 0x800000000ULL,
+			.exit_signal	= SIGCHLD,
+		};
+		pid_t spawned = sys_clone3(&args, sizeof(args));
+
+		/* The call unexpectedly worked and we are the new process. */
+		if (spawned == 0)
+			_exit(0);
+
+		/* The call unexpectedly worked: reap the process and fail. */
+		if (spawned > 0) {
+			wait_for_pid(spawned);
+			_exit(1);
+		}
+
+		/* The call failed as it should; only EINVAL is acceptable. */
+		_exit(errno == EINVAL ? 0 : 2);
+	}
+
+	ASSERT_EQ(wait_for_pid(child), 0);
+}
+
+/*
+ * Regular clone3 with CLONE_NEWNS (without CLONE_EMPTY_MNTNS) still
+ * copies the full mount tree.
+ */
+TEST(clone3_newns_full_copy)
+{
+	pid_t child = fork();
+
+	ASSERT_GE(child, 0);
+
+	if (child == 0) {
+		struct __clone_args args = {
+			.flags		= CLONE_NEWNS,
+			.exit_signal	= SIGCHLD,
+		};
+		ssize_t nr_before;
+		pid_t grandchild;
+
+		if (enter_userns() != 0)
+			_exit(1);
+
+		/* Baseline: number of mounts listed before cloning. */
+		nr_before = count_mounts();
+		if (nr_before < 1)
+			_exit(2);
+
+		grandchild = sys_clone3(&args, sizeof(args));
+		if (grandchild < 0)
+			_exit(3);
+
+		/* Intermediate process: exit with what wait_for_pid() reports. */
+		if (grandchild > 0)
+			_exit(wait_for_pid(grandchild));
+
+		/* Full copy should have at least as many mounts. */
+		_exit(count_mounts() < nr_before ? 1 : 0);
+	}
+
+	ASSERT_EQ(wait_for_pid(child), 0);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/filesystems/empty_mntns/empty_mntns.h b/tools/testing/selftests/filesystems/empty_mntns/empty_mntns.h
new file mode 100644
index 000000000000..3d9c6b14bbef
--- /dev/null
+++ b/tools/testing/selftests/filesystems/empty_mntns/empty_mntns.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef EMPTY_MNTNS_H
+#define EMPTY_MNTNS_H
+
+#include <errno.h>
+#include <stdlib.h>
+
+#include "../statmount/statmount.h"
+
+#ifndef UNSHARE_EMPTY_MNTNS
+#define UNSHARE_EMPTY_MNTNS	0x00100000
+#endif
+
+#ifndef CLONE_EMPTY_MNTNS
+#define CLONE_EMPTY_MNTNS	(1ULL << 37)
+#endif
+
+/*
+ * Return what listmount() reports for LSMT_ROOT, using a scratch buffer
+ * with room for 4096 mount IDs.  The IDs themselves are discarded.
+ */
+static inline ssize_t count_mounts(void)
+{
+	uint64_t mnt_ids[4096];
+	const size_t capacity = sizeof(mnt_ids) / sizeof(*mnt_ids);
+
+	return listmount(LSMT_ROOT, 0, 0, mnt_ids, capacity, 0);
+}
+
+#endif /* EMPTY_MNTNS_H */
diff --git a/tools/testing/selftests/filesystems/empty_mntns/empty_mntns_test.c b/tools/testing/selftests/filesystems/empty_mntns/empty_mntns_test.c
new file mode 100644
index 000000000000..43e296b97d84
--- /dev/null
+++ b/tools/testing/selftests/filesystems/empty_mntns/empty_mntns_test.c
@@ -0,0 +1,725 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Tests for empty mount namespace creation via UNSHARE_EMPTY_MNTNS
+ *
+ * Copyright (c) 2024 Christian Brauner <brauner@kernel.org>
+ */
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <linux/mount.h>
+#include <linux/stat.h>
+#include <sched.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "../utils.h"
+#include "../wrappers.h"
+#include "empty_mntns.h"
+#include "kselftest_harness.h"
+
+/*
+ * Probe for UNSHARE_EMPTY_MNTNS in a throwaway child process.
+ *
+ * Returns true only if the child exits normally with status 0.  The child
+ * exits non-zero when enter_userns() fails or when unshare() fails with
+ * EINVAL; any other unshare() outcome is reported as "supported".
+ */
+static bool unshare_empty_mntns_supported(void)
+{
+	int wstatus;
+	pid_t probe = fork();
+
+	if (probe < 0)
+		return false;
+
+	if (probe == 0) {
+		if (enter_userns() != 0)
+			_exit(1);
+
+		if (unshare(UNSHARE_EMPTY_MNTNS) == 0 || errno != EINVAL)
+			_exit(0);
+		_exit(1);
+	}
+
+	return waitpid(probe, &wstatus, 0) == probe &&
+	       WIFEXITED(wstatus) &&
+	       WEXITSTATUS(wstatus) == 0;
+}
+
+
+/* The fixture carries no per-test state. */
+FIXTURE(empty_mntns) {};
+
+/* Skip each test of this fixture when the support probe fails. */
+FIXTURE_SETUP(empty_mntns)
+{
+	if (!unshare_empty_mntns_supported())
+		SKIP(return, "UNSHARE_EMPTY_MNTNS not supported");
+}
+
+/* Nothing to release: setup allocates nothing. */
+FIXTURE_TEARDOWN(empty_mntns) {}
+
+/* Verify unshare succeeds, produces exactly 1 mount, and root == cwd */
+TEST_F(empty_mntns, basic)
+{
+	pid_t child = fork();
+
+	ASSERT_GE(child, 0);
+
+	if (child == 0) {
+		uint64_t root_mnt, cwd_mnt;
+
+		if (enter_userns() != 0)
+			_exit(1);
+
+		if (unshare(UNSHARE_EMPTY_MNTNS) != 0)
+			_exit(2);
+
+		/* The new namespace must list exactly one mount. */
+		if (count_mounts() != 1)
+			_exit(3);
+
+		root_mnt = get_unique_mnt_id("/");
+		cwd_mnt = get_unique_mnt_id(".");
+		if (!root_mnt || !cwd_mnt)
+			_exit(4);
+
+		/* Root and cwd have to resolve to the same mount. */
+		_exit(root_mnt == cwd_mnt ? 0 : 5);
+	}
+
+	ASSERT_EQ(wait_for_pid(child), 0);
+}
+
+/*
+ * UNSHARE_EMPTY_MNTNS combined with CLONE_NEWUSER.
+ *
+ * The user namespace must be created first so /proc is still accessible
+ * for writing uid_map/gid_map.  The empty mount namespace is created
+ * afterwards.
+ */
+TEST_F(empty_mntns, with_clone_newuser)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		/*
+		 * Sample the IDs before creating the user namespace; they
+		 * are the outside IDs written to the maps below.
+		 */
+		uid_t uid = getuid();
+		gid_t gid = getgid();
+		char map[100];
+
+		if (unshare(CLONE_NEWUSER))
+			_exit(1);
+
+		/*
+		 * uid_t/gid_t are unsigned: print them with %u so that IDs
+		 * above INT_MAX do not end up as negative numbers in the map.
+		 */
+		snprintf(map, sizeof(map), "0 %u 1", (unsigned int)uid);
+		if (write_file("/proc/self/uid_map", map))
+			_exit(2);
+
+		/* Written before gid_map, as in the original sequence. */
+		if (write_file("/proc/self/setgroups", "deny"))
+			_exit(3);
+
+		snprintf(map, sizeof(map), "0 %u 1", (unsigned int)gid);
+		if (write_file("/proc/self/gid_map", map))
+			_exit(4);
+
+		/* /proc is no longer needed: now drop all mounts. */
+		if (unshare(UNSHARE_EMPTY_MNTNS))
+			_exit(5);
+
+		if (count_mounts() != 1)
+			_exit(6);
+
+		_exit(0);
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/* UNSHARE_EMPTY_MNTNS combined with other namespace flags */
+TEST_F(empty_mntns, with_other_ns_flags)
+{
+	pid_t child = fork();
+
+	ASSERT_GE(child, 0);
+
+	if (child == 0) {
+		const int ns_flags = UNSHARE_EMPTY_MNTNS | CLONE_NEWUTS | CLONE_NEWIPC;
+
+		if (enter_userns() != 0)
+			_exit(1);
+
+		/* Request the empty mount namespace together with UTS and IPC. */
+		if (unshare(ns_flags) != 0)
+			_exit(2);
+
+		/* The extra flags must not change the single-mount result. */
+		_exit(count_mounts() == 1 ? 0 : 3);
+	}
+
+	ASSERT_EQ(wait_for_pid(child), 0);
+}
+
+/* EPERM without proper capabilities */
+TEST_F(empty_mntns, eperm_without_caps)
+{
+	pid_t child = fork();
+
+	ASSERT_GE(child, 0);
+
+	if (child == 0) {
+		/* Running as root: nothing to check, report success. */
+		if (getuid() == 0)
+			_exit(0);
+
+		/* No user namespace was entered, so the call must fail ... */
+		if (unshare(UNSHARE_EMPTY_MNTNS) == 0)
+			_exit(1);
+
+		/* ... and the failure has to be EPERM. */
+		_exit(errno == EPERM ? 0 : 2);
+	}
+
+	ASSERT_EQ(wait_for_pid(child), 0);
+}
+
+/* Many source mounts still result in exactly 1 mount */
+TEST_F(empty_mntns, many_source_mounts)
+{
+	pid_t child = fork();
+
+	ASSERT_GE(child, 0);
+
+	if (child == 0) {
+		char base[] = "/tmp/empty_mntns_test.XXXXXX";
+		int n;
+
+		if (enter_userns() != 0)
+			_exit(1);
+
+		if (unshare(CLONE_NEWNS) != 0)
+			_exit(2);
+
+		/* Make the mount tree private. */
+		if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) != 0)
+			_exit(3);
+
+		if (mkdtemp(base) == NULL)
+			_exit(4);
+
+		if (mount("tmpfs", base, "tmpfs", 0, "size=1M") != 0)
+			_exit(5);
+
+		/* Populate the source namespace with five bind mounts. */
+		for (n = 0; n < 5; n++) {
+			char dir[256];
+
+			snprintf(dir, sizeof(dir), "%s/sub%d", base, n);
+			if (mkdir(dir, 0755) != 0 && errno != EEXIST)
+				_exit(6);
+			if (mount(dir, dir, NULL, MS_BIND, NULL) != 0)
+				_exit(7);
+		}
+
+		if (count_mounts() < 5)
+			_exit(8);
+
+		if (unshare(UNSHARE_EMPTY_MNTNS) != 0)
+			_exit(9);
+
+		/* None of the source mounts may show up in the new namespace. */
+		_exit(count_mounts() == 1 ? 0 : 10);
+	}
+
+	ASSERT_EQ(wait_for_pid(child), 0);
+}
+
+/* CWD on a different mount gets reset to root */
+TEST_F(empty_mntns, cwd_reset)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		char tmpdir[] = "/tmp/empty_mntns_cwd.XXXXXX";
+		uint64_t root_id, cwd_id;
+		struct statmount *sm;
+
+		if (enter_userns())
+			_exit(1);
+
+		if (unshare(CLONE_NEWNS))
+			_exit(2);
+
+		/* Make the mount tree private. */
+		if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL))
+			_exit(3);
+
+		if (!mkdtemp(tmpdir))
+			_exit(4);
+
+		if (mount("tmpfs", tmpdir, "tmpfs", 0, "size=1M"))
+			_exit(5);
+
+		/* Put the cwd on a mount other than the root mount. */
+		if (chdir(tmpdir))
+			_exit(6);
+
+		if (unshare(UNSHARE_EMPTY_MNTNS))
+			_exit(7);
+
+		root_id = get_unique_mnt_id("/");
+		cwd_id = get_unique_mnt_id(".");
+		if (root_id == 0 || cwd_id == 0)
+			_exit(8);
+
+		/* The cwd must now be on the same mount as the root. */
+		if (root_id != cwd_id)
+			_exit(9);
+
+		sm = statmount_alloc(root_id, 0, STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT, 0);
+		if (!sm)
+			_exit(10);
+
+		/* sm->mnt_point is only meaningful if the mask says it was set. */
+		if (!(sm->mask & STATMOUNT_MNT_POINT))
+			_exit(11);
+
+		if (strcmp(sm->str + sm->mnt_point, "/") != 0)
+			_exit(12);
+
+		free(sm);
+		_exit(0);
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/* Verify statmount properties of the root mount */
+TEST_F(empty_mntns, mount_properties)
+{
+	pid_t child = fork();
+
+	ASSERT_GE(child, 0);
+
+	if (child == 0) {
+		const uint64_t wanted = STATMOUNT_MNT_BASIC | STATMOUNT_MNT_ROOT |
+					STATMOUNT_MNT_POINT | STATMOUNT_FS_TYPE;
+		struct statmount *info;
+		uint64_t root_mnt;
+
+		if (enter_userns() != 0)
+			_exit(1);
+
+		if (unshare(UNSHARE_EMPTY_MNTNS) != 0)
+			_exit(2);
+
+		root_mnt = get_unique_mnt_id("/");
+		if (root_mnt == 0)
+			_exit(3);
+
+		info = statmount_alloc(root_mnt, 0, wanted, 0);
+		if (info == NULL)
+			_exit(4);
+
+		/* The mount point must be reported, and it must be "/". */
+		if ((info->mask & STATMOUNT_MNT_POINT) == 0)
+			_exit(5);
+
+		if (strcmp(info->str + info->mnt_point, "/") != 0)
+			_exit(6);
+
+		/* The basic block must be reported and carry the queried ID. */
+		if ((info->mask & STATMOUNT_MNT_BASIC) == 0)
+			_exit(7);
+
+		if (info->mnt_id != root_mnt)
+			_exit(8);
+
+		free(info);
+		_exit(0);
+	}
+
+	ASSERT_EQ(wait_for_pid(child), 0);
+}
+
+/* Consecutive UNSHARE_EMPTY_MNTNS calls produce new namespaces */
+TEST_F(empty_mntns, repeated_unshare)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		uint64_t first_root_id, second_root_id;
+
+		if (enter_userns())
+			_exit(1);
+
+		if (unshare(UNSHARE_EMPTY_MNTNS))
+			_exit(2);
+
+		if (count_mounts() != 1)
+			_exit(3);
+
+		/*
+		 * A zero ID means the lookup failed.  Without this check a
+		 * single failed lookup would make the two IDs "differ" and
+		 * let the test pass for the wrong reason.
+		 */
+		first_root_id = get_unique_mnt_id("/");
+		if (!first_root_id)
+			_exit(4);
+
+		if (unshare(UNSHARE_EMPTY_MNTNS))
+			_exit(5);
+
+		if (count_mounts() != 1)
+			_exit(6);
+
+		second_root_id = get_unique_mnt_id("/");
+		if (!second_root_id)
+			_exit(7);
+
+		/* The second unshare must have produced a different root mount. */
+		if (first_root_id == second_root_id)
+			_exit(8);
+
+		_exit(0);
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/* Root mount's parent is itself */
+TEST_F(empty_mntns, root_is_own_parent)
+{
+	pid_t child = fork();
+
+	ASSERT_GE(child, 0);
+
+	if (child == 0) {
+		struct statmount info;
+		uint64_t root_mnt;
+		int rc;
+
+		if (enter_userns() != 0)
+			_exit(1);
+
+		if (unshare(UNSHARE_EMPTY_MNTNS) != 0)
+			_exit(2);
+
+		root_mnt = get_unique_mnt_id("/");
+		if (root_mnt == 0)
+			_exit(3);
+
+		/* Only the basic fields are requested, so a bare struct is passed. */
+		rc = statmount(root_mnt, 0, 0, STATMOUNT_MNT_BASIC, &info, sizeof(info), 0);
+		if (rc < 0)
+			_exit(4);
+
+		if ((info.mask & STATMOUNT_MNT_BASIC) == 0)
+			_exit(5);
+
+		/* The reported parent ID must equal the mount's own ID. */
+		_exit(info.mnt_parent_id == info.mnt_id ? 0 : 6);
+	}
+
+	ASSERT_EQ(wait_for_pid(child), 0);
+}
+
+/* Listmount returns only the root mount */
+TEST_F(empty_mntns, listmount_single_entry)
+{
+	pid_t child = fork();
+
+	ASSERT_GE(child, 0);
+
+	if (child == 0) {
+		uint64_t mnt_ids[16];
+		uint64_t root_mnt_id;
+
+		if (enter_userns() != 0)
+			_exit(1);
+
+		if (unshare(UNSHARE_EMPTY_MNTNS) != 0)
+			_exit(2);
+
+		/* Exactly one mount may be listed ... */
+		if (listmount(LSMT_ROOT, 0, 0, mnt_ids, 16, 0) != 1)
+			_exit(3);
+
+		/* ... and it has to be the mount that "/" resolves to. */
+		root_mnt_id = get_unique_mnt_id("/");
+		if (root_mnt_id == 0)
+			_exit(4);
+
+		_exit(mnt_ids[0] == root_mnt_id ? 0 : 5);
+	}
+
+	ASSERT_EQ(wait_for_pid(child), 0);
+}
+
+/*
+ * Mount tmpfs over nullfs root to build a writable filesystem from scratch.
+ * This exercises the intended usage pattern: create an empty mount namespace
+ * (which has a nullfs root), then mount a real filesystem over it.
+ *
+ * Because resolving "/" returns the process root directly (via nd_jump_root)
+ * without following overmounts, we use the new mount API (fsopen/fsmount)
+ * to obtain a mount fd, then fchdir + chroot to enter the new filesystem.
+ */
+TEST_F(empty_mntns, overmount_tmpfs)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		struct statmount *sm;
+		uint64_t root_id, cwd_id;
+		int fd, fsfd, mntfd;
+
+		if (enter_userns())
+			_exit(1);
+
+		if (unshare(UNSHARE_EMPTY_MNTNS))
+			_exit(2);
+
+		/* Before anything is attached only one mount is listed. */
+		if (count_mounts() != 1)
+			_exit(3);
+
+		root_id = get_unique_mnt_id("/");
+		if (!root_id)
+			_exit(4);
+
+		/* Verify root is nullfs */
+		sm = statmount_alloc(root_id, 0, STATMOUNT_FS_TYPE, 0);
+		if (!sm)
+			_exit(5);
+
+		/* Only trust sm->fs_type if the mask says it was filled in. */
+		if (!(sm->mask & STATMOUNT_FS_TYPE))
+			_exit(6);
+
+		if (strcmp(sm->str + sm->fs_type, "nullfs") != 0)
+			_exit(7);
+
+		free(sm);
+
+		/* The cwd must be on the same mount as the root. */
+		cwd_id = get_unique_mnt_id(".");
+		if (!cwd_id || root_id != cwd_id)
+			_exit(8);
+
+		/*
+		 * nullfs root is immutable.  open(O_CREAT) returns ENOENT
+		 * because empty_dir_lookup() returns -ENOENT before the
+		 * IS_IMMUTABLE permission check in may_o_create() is reached.
+		 */
+		fd = open("/test", O_CREAT | O_RDWR, 0644);
+		if (fd >= 0) {
+			close(fd);
+			_exit(9);
+		}
+		if (errno != ENOENT)
+			_exit(10);
+
+		/*
+		 * Use the new mount API to create tmpfs and get a mount fd.
+		 * We need the fd because after attaching the tmpfs on top of
+		 * "/", path resolution of "/" still returns the process root
+		 * (nullfs) without following the overmount.  The mount fd
+		 * lets us fchdir + chroot into the tmpfs.
+		 */
+		fsfd = sys_fsopen("tmpfs", 0);
+		if (fsfd < 0)
+			_exit(11);
+
+		if (sys_fsconfig(fsfd, FSCONFIG_SET_STRING, "size", "1M", 0)) {
+			close(fsfd);
+			_exit(12);
+		}
+
+		if (sys_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)) {
+			close(fsfd);
+			_exit(13);
+		}
+
+		/* The context fd is closed whether or not fsmount worked. */
+		mntfd = sys_fsmount(fsfd, 0, 0);
+		close(fsfd);
+		if (mntfd < 0)
+			_exit(14);
+
+		/* Attach the detached tmpfs mount on top of "/". */
+		if (sys_move_mount(mntfd, "", AT_FDCWD, "/",
+				   MOVE_MOUNT_F_EMPTY_PATH)) {
+			close(mntfd);
+			_exit(15);
+		}
+
+		/* The original root plus the tmpfs just attached. */
+		if (count_mounts() != 2) {
+			close(mntfd);
+			_exit(16);
+		}
+
+		/* Enter the tmpfs via the mount fd */
+		if (fchdir(mntfd)) {
+			close(mntfd);
+			_exit(17);
+		}
+
+		/* cwd is the tmpfs now, so "." makes it the process root. */
+		if (chroot(".")) {
+			close(mntfd);
+			_exit(18);
+		}
+
+		close(mntfd);
+
+		/* Verify "/" now resolves to tmpfs */
+		root_id = get_unique_mnt_id("/");
+		if (!root_id)
+			_exit(19);
+
+		sm = statmount_alloc(root_id, 0, STATMOUNT_FS_TYPE, 0);
+		if (!sm)
+			_exit(20);
+
+		if (!(sm->mask & STATMOUNT_FS_TYPE))
+			_exit(21);
+
+		if (strcmp(sm->str + sm->fs_type, "tmpfs") != 0)
+			_exit(22);
+
+		free(sm);
+
+		/* Verify tmpfs is writable */
+		fd = open("/testfile", O_CREAT | O_RDWR, 0644);
+		if (fd < 0)
+			_exit(23);
+
+		if (write(fd, "test", 4) != 4) {
+			close(fd);
+			_exit(24);
+		}
+
+		close(fd);
+
+		/* The file must be reachable by path after creation. */
+		if (access("/testfile", F_OK))
+			_exit(25);
+
+		_exit(0);
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+/*
+ * Tests below do not require UNSHARE_EMPTY_MNTNS support.
+ */
+
+/* Invalid unshare flags return EINVAL */
+TEST(invalid_flags)
+{
+	pid_t child = fork();
+
+	ASSERT_GE(child, 0);
+
+	if (child == 0) {
+		if (enter_userns() != 0)
+			_exit(1);
+
+		/* The call is expected to be refused ... */
+		if (unshare(0x80000000) == 0)
+			_exit(2);
+
+		/* ... and the refusal has to be EINVAL. */
+		_exit(errno == EINVAL ? 0 : 3);
+	}
+
+	ASSERT_EQ(wait_for_pid(child), 0);
+}
+
+/* Regular CLONE_NEWNS still copies the full mount tree */
+TEST(clone_newns_full_copy)
+{
+	pid_t child = fork();
+
+	ASSERT_GE(child, 0);
+
+	if (child == 0) {
+		char base[] = "/tmp/empty_mntns_regr.XXXXXX";
+		ssize_t before, after;
+		int n;
+
+		if (enter_userns() != 0)
+			_exit(1);
+
+		if (unshare(CLONE_NEWNS) != 0)
+			_exit(2);
+
+		/* Make the mount tree private. */
+		if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) != 0)
+			_exit(3);
+
+		if (mkdtemp(base) == NULL)
+			_exit(4);
+
+		if (mount("tmpfs", base, "tmpfs", 0, "size=1M") != 0)
+			_exit(5);
+
+		/* Add three bind mounts so there is something to copy. */
+		for (n = 0; n < 3; n++) {
+			char dir[256];
+
+			snprintf(dir, sizeof(dir), "%s/sub%d", base, n);
+			if (mkdir(dir, 0755) != 0 && errno != EEXIST)
+				_exit(6);
+			if (mount(dir, dir, NULL, MS_BIND, NULL) != 0)
+				_exit(7);
+		}
+
+		before = count_mounts();
+		if (before < 3)
+			_exit(8);
+
+		if (unshare(CLONE_NEWNS) != 0)
+			_exit(9);
+
+		/* A plain CLONE_NEWNS must not lose any mounts. */
+		after = count_mounts();
+		_exit(after < before ? 10 : 0);
+	}
+
+	ASSERT_EQ(wait_for_pid(child), 0);
+}
+
+/* Other namespace unshares are unaffected */
+TEST(other_ns_unaffected)
+{
+	pid_t child = fork();
+
+	ASSERT_GE(child, 0);
+
+	if (child == 0) {
+		static const char wanted[] = "test-empty-mntns";
+		char current[256];
+
+		if (enter_userns() != 0)
+			_exit(1);
+
+		if (unshare(CLONE_NEWUTS) != 0)
+			_exit(2);
+
+		/* Length excludes the terminating NUL: 16 bytes. */
+		if (sethostname(wanted, sizeof(wanted) - 1) != 0)
+			_exit(3);
+
+		if (gethostname(current, sizeof(current)) != 0)
+			_exit(4);
+
+		/* The name read back must be the one just set. */
+		_exit(strcmp(current, wanted) == 0 ? 0 : 5);
+	}
+
+	ASSERT_EQ(wait_for_pid(child), 0);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/filesystems/empty_mntns/overmount_chroot_test.c b/tools/testing/selftests/filesystems/empty_mntns/overmount_chroot_test.c
new file mode 100644
index 000000000000..6e21c58258c3
--- /dev/null
+++ b/tools/testing/selftests/filesystems/empty_mntns/overmount_chroot_test.c
@@ -0,0 +1,225 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Test: rootfs overmounted multiple times with chroot into topmost
+ *
+ * This test creates a scenario where:
+ * 1. A new mount namespace is created with a tmpfs root (via pivot_root)
+ * 2. A mountpoint is created and overmounted multiple times
+ * 3. The caller chroots into the topmost mount layer
+ *
+ * The test verifies that:
+ * - Multiple overmounts create separate mount layers
+ * - Each layer's files are isolated
+ * - chroot correctly sets the process's root to the topmost layer
+ * - After chroot, only the topmost layer's files are visible
+ *
+ * Copyright (c) 2024 Christian Brauner <brauner@kernel.org>
+ */
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <linux/mount.h>
+#include <linux/stat.h>
+#include <sched.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "../utils.h"
+#include "empty_mntns.h"
+#include "kselftest_harness.h"
+
+#define NR_OVERMOUNTS 5
+
+/*
+ * Setup a proper root filesystem using pivot_root.
+ * This ensures we own the root directory in our user namespace.
+ */
+static int setup_root(void)
+{
+	char new_root[] = "/tmp/overmount_test.XXXXXX";
+	char put_old[256];
+
+	/* Fresh directory with a tmpfs on it: this becomes the new root. */
+	if (!mkdtemp(new_root) ||
+	    mount("tmpfs", new_root, "tmpfs", 0, "size=10M"))
+		return -1;
+
+	/* pivot_root needs a directory below the new root for the old one. */
+	snprintf(put_old, sizeof(put_old), "%s/oldroot", new_root);
+	if (mkdir(put_old, 0755) != 0)
+		return -1;
+
+	if (syscall(SYS_pivot_root, new_root, put_old) != 0)
+		return -1;
+
+	/*
+	 * Move into the new root, detach the old root and remove the now
+	 * empty directory it was parked on.  The calls run in this order
+	 * and the first failure aborts the sequence.
+	 */
+	if (chdir("/") || umount2("/oldroot", MNT_DETACH) || rmdir("/oldroot"))
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Test scenario:
+ * 1. Enter a user namespace to gain CAP_SYS_ADMIN
+ * 2. Create a new mount namespace
+ * 3. Setup a tmpfs root via pivot_root
+ * 4. Create a mountpoint /newroot and overmount it multiple times
+ * 5. Create a marker file in each layer
+ * 6. Chroot into /newroot (the topmost overmount)
+ * 7. Verify we're in the topmost layer (only topmost marker visible)
+ */
+TEST(overmount_chroot)
+{
+	pid_t pid;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		ssize_t nr_mounts;
+		/* Slot 0 is the base mount, slots 1..NR_OVERMOUNTS the layers. */
+		uint64_t mnt_ids[NR_OVERMOUNTS + 1];
+		uint64_t root_id_before, root_id_after;
+		struct statmount *sm;
+		char marker[64];
+		int fd, i;
+
+		/* Step 1: Enter user namespace for privileges */
+		if (enter_userns())
+			_exit(1);
+
+		/* Step 2: Create a new mount namespace */
+		if (unshare(CLONE_NEWNS))
+			_exit(2);
+
+		/* Make the mount tree private */
+		if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL))
+			_exit(3);
+
+		/* Step 3: Setup a proper tmpfs root via pivot_root */
+		if (setup_root())
+			_exit(4);
+
+		/* Step 4: Create /newroot and overmount it multiple times */
+		if (mkdir("/newroot", 0755))
+			_exit(5);
+
+		/* Mount base tmpfs on /newroot */
+		if (mount("tmpfs", "/newroot", "tmpfs", 0, "size=1M"))
+			_exit(6);
+
+		/* Record base mount ID */
+		mnt_ids[0] = get_unique_mnt_id("/newroot");
+		if (!mnt_ids[0])
+			_exit(7);
+
+		/* Step 5 (base layer): Create marker in base layer */
+		fd = open("/newroot/layer_0", O_CREAT | O_RDWR, 0644);
+		if (fd < 0)
+			_exit(8);
+		if (write(fd, "layer_0", 7) != 7) {
+			close(fd);
+			_exit(9);
+		}
+		close(fd);
+
+		/* Steps 4 and 5, repeated: overmount, then drop a marker */
+		for (i = 0; i < NR_OVERMOUNTS; i++) {
+			if (mount("tmpfs", "/newroot", "tmpfs", 0, "size=1M"))
+				_exit(10);
+
+			/* Record mount ID for this layer */
+			mnt_ids[i + 1] = get_unique_mnt_id("/newroot");
+			if (!mnt_ids[i + 1])
+				_exit(11);
+
+			/* Create a marker file in each layer */
+			snprintf(marker, sizeof(marker), "/newroot/layer_%d", i + 1);
+			fd = open(marker, O_CREAT | O_RDWR, 0644);
+			if (fd < 0)
+				_exit(12);
+
+			if (write(fd, marker, strlen(marker)) != (ssize_t)strlen(marker)) {
+				close(fd);
+				_exit(13);
+			}
+			close(fd);
+		}
+
+		/* Root + base mount + NR_OVERMOUNTS layers must be listed */
+		nr_mounts = count_mounts();
+		if (nr_mounts < NR_OVERMOUNTS + 2)
+			_exit(14);
+
+		/* Record root mount ID before chroot */
+		root_id_before = get_unique_mnt_id("/newroot");
+
+		/* Verify this is the topmost layer's mount */
+		if (root_id_before != mnt_ids[NR_OVERMOUNTS])
+			_exit(15);
+
+		/* Step 6: Chroot into /newroot (the topmost overmount) */
+		if (chroot("/newroot"))
+			_exit(16);
+
+		/* Change to root directory within the chroot */
+		if (chdir("/"))
+			_exit(17);
+
+		/* Step 7: Verify we're in the topmost layer */
+		root_id_after = get_unique_mnt_id("/");
+
+		/* The mount ID should be the same as the topmost layer */
+		if (root_id_after != mnt_ids[NR_OVERMOUNTS])
+			_exit(18);
+
+		/* Verify the topmost layer's marker file exists */
+		snprintf(marker, sizeof(marker), "/layer_%d", NR_OVERMOUNTS);
+		if (access(marker, F_OK))
+			_exit(19);
+
+		/* Verify we cannot see markers from lower layers (they're hidden) */
+		for (i = 0; i < NR_OVERMOUNTS; i++) {
+			snprintf(marker, sizeof(marker), "/layer_%d", i);
+			if (access(marker, F_OK) == 0)
+				_exit(20);
+		}
+
+		/* Verify the root mount is tmpfs */
+		sm = statmount_alloc(root_id_after, 0,
+				     STATMOUNT_MNT_BASIC | STATMOUNT_MNT_ROOT |
+				     STATMOUNT_MNT_POINT | STATMOUNT_FS_TYPE, 0);
+		if (!sm)
+			_exit(21);
+
+		/*
+		 * A reply without the filesystem type is a failure: otherwise
+		 * the tmpfs verification would be skipped silently.
+		 */
+		if (!(sm->mask & STATMOUNT_FS_TYPE)) {
+			free(sm);
+			_exit(22);
+		}
+
+		if (strcmp(sm->str + sm->fs_type, "tmpfs") != 0) {
+			free(sm);
+			_exit(23);
+		}
+
+		free(sm);
+		_exit(0);
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/filesystems/fsmount_ns/.gitignore b/tools/testing/selftests/filesystems/fsmount_ns/.gitignore
new file mode 100644
index 000000000000..f1ecf6c6e37b
--- /dev/null
+++ b/tools/testing/selftests/filesystems/fsmount_ns/.gitignore
@@ -0,0 +1 @@
+fsmount_ns_test
diff --git a/tools/testing/selftests/filesystems/fsmount_ns/Makefile b/tools/testing/selftests/filesystems/fsmount_ns/Makefile
new file mode 100644
index 000000000000..d9647efc0739
--- /dev/null
+++ b/tools/testing/selftests/filesystems/fsmount_ns/Makefile
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0
+TEST_GEN_PROGS := fsmount_ns_test
+
+CFLAGS += -Wall -O0 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES)
+LDLIBS := -lcap
+
+include ../../lib.mk
+
+$(OUTPUT)/fsmount_ns_test: fsmount_ns_test.c ../utils.c
+	$(CC) $(CFLAGS) -o $@ $^ $(LDLIBS)
diff --git a/tools/testing/selftests/filesystems/fsmount_ns/fsmount_ns_test.c b/tools/testing/selftests/filesystems/fsmount_ns/fsmount_ns_test.c
new file mode 100644
index 000000000000..b70b3051eed4
--- /dev/null
+++ b/tools/testing/selftests/filesystems/fsmount_ns/fsmount_ns_test.c
@@ -0,0 +1,1135 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2026 Christian Brauner <brauner@kernel.org>
+ *
+ * Test for FSMOUNT_NAMESPACE flag.
+ *
+ * Test that fsmount() with FSMOUNT_NAMESPACE creates a new mount
+ * namespace containing the specified mount.
+ */
+#define _GNU_SOURCE
+
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <linux/nsfs.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "../wrappers.h"
+#include "../statmount/statmount.h"
+#include "../utils.h"
+#include "../../kselftest_harness.h"
+
+#ifndef FSMOUNT_NAMESPACE
+#define FSMOUNT_NAMESPACE	0x00000002
+#endif
+
+#ifndef FSMOUNT_CLOEXEC
+#define FSMOUNT_CLOEXEC		0x00000001
+#endif
+
+#ifndef FSCONFIG_CMD_CREATE
+#define FSCONFIG_CMD_CREATE	6
+#endif
+
+/* Query the mount namespace id through the NS_GET_MNTNS_ID ioctl on @fd. */
+static int get_mnt_ns_id(int fd, uint64_t *mnt_ns_id)
+{
+	/* Yields 0 on success; a failed ioctl is reported as -errno. */
+	return ioctl(fd, NS_GET_MNTNS_ID, mnt_ns_id) < 0 ? -errno : 0;
+}
+
+/* Open @path read-only and run get_mnt_ns_id() on it; returns 0 or -errno. */
+static int get_mnt_ns_id_from_path(const char *path, uint64_t *mnt_ns_id)
+{
+	int nsfd = open(path, O_RDONLY);
+	int err;
+
+	if (nsfd < 0)
+		return -errno;
+	err = get_mnt_ns_id(nsfd, mnt_ns_id);
+	close(nsfd);
+	return err;
+}
+
+/*
+ * Log the ids and string fields of one statmount() result. A string
+ * whose flag is not set in sm->mask is logged as an empty string.
+ */
+static void log_mount(struct __test_metadata *_metadata, struct statmount *sm)
+{
+	const char *type = (sm->mask & STATMOUNT_FS_TYPE) ?
+			   sm->str + sm->fs_type : "";
+	const char *root = (sm->mask & STATMOUNT_MNT_ROOT) ?
+			   sm->str + sm->mnt_root : "";
+	const char *where = (sm->mask & STATMOUNT_MNT_POINT) ?
+			    sm->str + sm->mnt_point : "";
+
+	TH_LOG("  mnt_id: %llu, parent_id: %llu, fs_type: %s, root: %s, point: %s",
+	       (unsigned long long)sm->mnt_id,
+	       (unsigned long long)sm->mnt_parent_id,
+	       type, root, where);
+}
+
+/* Log every mount that listmount() returns for namespace @mnt_ns_id. */
+static void dump_mounts(struct __test_metadata *_metadata, uint64_t mnt_ns_id)
+{
+	const uint64_t mask = STATMOUNT_MNT_BASIC | STATMOUNT_FS_TYPE |
+			      STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT;
+	uint64_t ids[256];
+	ssize_t count, idx;
+
+	count = listmount(LSMT_ROOT, mnt_ns_id, 0, ids, 256, 0);
+	if (count < 0) {
+		TH_LOG("listmount failed: %s", strerror(errno));
+		return;
+	}
+
+	TH_LOG("Mount namespace %llu contains %zd mount(s):",
+	       (unsigned long long)mnt_ns_id, count);
+
+	for (idx = 0; idx < count; idx++) {
+		struct statmount *info;
+
+		info = statmount_alloc(ids[idx], mnt_ns_id, mask, 0);
+		if (info) {
+			log_mount(_metadata, info);
+			free(info);
+			continue;
+		}
+
+		/* Report the failure and carry on with the remaining ids. */
+		TH_LOG("  [%zd] mnt_id %llu: statmount failed: %s",
+		       idx, (unsigned long long)ids[idx], strerror(errno));
+	}
+}
+
+/* Open a tmpfs context and create it; returns the fd or -errno on failure. */
+static int create_tmpfs_fd(void)
+{
+	int fs_fd, ret;
+
+	fs_fd = sys_fsopen("tmpfs", FSOPEN_CLOEXEC);
+	if (fs_fd < 0)
+		return -errno;
+
+	if (sys_fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) {
+		ret = -errno;	/* capture before close() can overwrite errno */
+		close(fs_fd);
+		return ret;
+	}
+	return fs_fd;
+}
+
+FIXTURE(fsmount_ns)
+{
+	int fd;			/* sys_fsmount() result; -1 until a test sets it */
+	int fs_fd;		/* create_tmpfs_fd() result; -1 until a test sets it */
+	uint64_t current_ns_id;	/* id read from /proc/self/ns/mnt in setup */
+};
+
+FIXTURE_VARIANT(fsmount_ns)
+{
+	const char *fstype;		/* "tmpfs" in every variant defined here */
+	unsigned int flags;		/* flags argument handed to sys_fsmount() */
+	bool expect_success;		/* false: create_namespace expects fsmount to fail */
+	bool expect_different_ns;	/* true: new ns id must differ from the caller's */
+	int min_mounts;			/* lower bound on listmount()'s mount count */
+};
+
+FIXTURE_VARIANT_ADD(fsmount_ns, basic_tmpfs)
+{
+	.fstype = "tmpfs",
+	.flags = FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC,
+	.expect_success = true,
+	.expect_different_ns = true,
+	.min_mounts = 1,
+};
+
+FIXTURE_VARIANT_ADD(fsmount_ns, cloexec_only)
+{
+	.fstype = "tmpfs",
+	.flags = FSMOUNT_CLOEXEC,
+	.expect_success = true,
+	.expect_different_ns = false,
+	.min_mounts = 1,
+};
+
+FIXTURE_VARIANT_ADD(fsmount_ns, namespace_only)
+{
+	.fstype = "tmpfs",
+	.flags = FSMOUNT_NAMESPACE,
+	.expect_success = true,
+	.expect_different_ns = true,
+	.min_mounts = 1,
+};
+
+FIXTURE_SETUP(fsmount_ns)
+{
+	int ret;
+
+	self->fd = -1;
+	self->fs_fd = -1;
+
+	/* Probe fsopen(): skip only on ENOSYS; close the fd if one came back */
+	ret = sys_fsopen("tmpfs", 0);
+	if (ret == -1 && errno == ENOSYS)
+		SKIP(return, "fsopen() syscall not supported");
+	if (ret >= 0)
+		close(ret);
+
+	/* Probe statmount() with zeroed arguments; listmount() is not probed */
+	ret = statmount(0, 0, 0, 0, NULL, 0, 0);
+	if (ret == -1 && errno == ENOSYS)
+		SKIP(return, "statmount() syscall not supported");
+
+	/* Get current mount namespace ID for comparison */
+	ret = get_mnt_ns_id_from_path("/proc/self/ns/mnt", &self->current_ns_id);
+	if (ret < 0)
+		SKIP(return, "Failed to get current mount namespace ID");
+}
+
+FIXTURE_TEARDOWN(fsmount_ns)
+{
+	if (self->fd >= 0)
+		close(self->fd);
+	if (self->fs_fd >= 0)
+		close(self->fs_fd);
+}
+
+TEST_F(fsmount_ns, create_namespace)
+{
+	uint64_t new_ns_id;
+	uint64_t list[256];
+	ssize_t nr_mounts;
+	int ret;
+
+	self->fs_fd = create_tmpfs_fd();
+	ASSERT_GE(self->fs_fd, 0);
+
+	self->fd = sys_fsmount(self->fs_fd, variant->flags, 0);
+
+	if (!variant->expect_success) {
+		ASSERT_LT(self->fd, 0);
+		return;
+	}
+
+	if (self->fd < 0 && errno == EINVAL)
+		SKIP(return, "FSMOUNT_NAMESPACE not supported");
+
+	ASSERT_GE(self->fd, 0);
+
+	if (variant->expect_different_ns) {
+		/* Verify we can get the namespace ID from the fd */
+		ret = get_mnt_ns_id(self->fd, &new_ns_id);
+		ASSERT_EQ(ret, 0);
+
+		/* Verify it's a different namespace */
+		ASSERT_NE(new_ns_id, self->current_ns_id);
+
+		/* List mounts in the new namespace */
+		nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0);
+		ASSERT_GE(nr_mounts, 0) {
+			TH_LOG("%m - listmount failed");
+		}
+
+		/* Verify minimum expected mounts */
+		ASSERT_GE(nr_mounts, variant->min_mounts);
+		TH_LOG("Namespace contains %zd mounts", nr_mounts);
+	}
+}
+
+TEST_F(fsmount_ns, setns_into_namespace)
+{
+	uint64_t new_ns_id;
+	pid_t pid;
+	int status;
+	int ret;
+
+	/* Only test with FSMOUNT_NAMESPACE flag */
+	if (!(variant->flags & FSMOUNT_NAMESPACE))
+		SKIP(return, "setns test only for FSMOUNT_NAMESPACE case");
+
+	self->fs_fd = create_tmpfs_fd();
+	ASSERT_GE(self->fs_fd, 0);
+
+	self->fd = sys_fsmount(self->fs_fd, variant->flags, 0);
+	if (self->fd < 0 && errno == EINVAL)
+		SKIP(return, "FSMOUNT_NAMESPACE not supported");
+
+	ASSERT_GE(self->fd, 0);
+
+	/* Get namespace ID and dump all mounts */
+	ret = get_mnt_ns_id(self->fd, &new_ns_id);
+	ASSERT_EQ(ret, 0);
+
+	dump_mounts(_metadata, new_ns_id);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		/* Child: try to enter the namespace */
+		if (setns(self->fd, CLONE_NEWNS) < 0)
+			_exit(1);
+		_exit(0);
+	}
+
+	ASSERT_EQ(waitpid(pid, &status, 0), pid);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+}
+
+TEST_F(fsmount_ns, verify_mount_properties)
+{
+	struct statmount sm;
+	uint64_t new_ns_id;
+	uint64_t list[256];
+	ssize_t nr_mounts;
+	int ret;
+
+	/* Only test with basic FSMOUNT_NAMESPACE flags */
+	if (variant->flags != (FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC))
+		SKIP(return, "mount properties test only for basic case");
+
+	self->fs_fd = create_tmpfs_fd();
+	ASSERT_GE(self->fs_fd, 0);
+
+	self->fd = sys_fsmount(self->fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC, 0);
+	if (self->fd < 0 && errno == EINVAL)
+		SKIP(return, "FSMOUNT_NAMESPACE not supported");
+
+	ASSERT_GE(self->fd, 0);
+
+	ret = get_mnt_ns_id(self->fd, &new_ns_id);
+	ASSERT_EQ(ret, 0);
+
+	nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0);
+	ASSERT_GE(nr_mounts, 1);
+
+	/* Get info about the root mount */
+	ret = statmount(list[0], new_ns_id, 0, STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0);
+	ASSERT_EQ(ret, 0);
+
+	TH_LOG("Root mount id: %llu, parent: %llu",
+	       (unsigned long long)sm.mnt_id,
+	       (unsigned long long)sm.mnt_parent_id);
+}
+
+TEST_F(fsmount_ns, verify_tmpfs_type)
+{
+	struct statmount *sm;
+	uint64_t new_ns_id;
+	uint64_t list[256];
+	ssize_t nr_mounts;
+	const char *fs_type;
+	int ret;
+
+	/* Only test with basic FSMOUNT_NAMESPACE flags */
+	if (variant->flags != (FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC))
+		SKIP(return, "fs type test only for basic case");
+
+	self->fs_fd = create_tmpfs_fd();
+	ASSERT_GE(self->fs_fd, 0);
+
+	self->fd = sys_fsmount(self->fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC, 0);
+	if (self->fd < 0 && errno == EINVAL)
+		SKIP(return, "FSMOUNT_NAMESPACE not supported");
+
+	ASSERT_GE(self->fd, 0);
+
+	ret = get_mnt_ns_id(self->fd, &new_ns_id);
+	ASSERT_EQ(ret, 0);
+
+	nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0);
+	ASSERT_GE(nr_mounts, 1);
+
+	sm = statmount_alloc(list[0], new_ns_id, STATMOUNT_FS_TYPE, 0);
+	ASSERT_NE(sm, NULL);
+
+	fs_type = sm->str + sm->fs_type;
+	ASSERT_STREQ(fs_type, "tmpfs");
+
+	free(sm);
+}
+
+FIXTURE(fsmount_ns_caps)
+{
+	bool has_caps;
+};
+
+FIXTURE_SETUP(fsmount_ns_caps)
+{
+	int ret;
+
+	/* Check if fsopen syscall is supported */
+	ret = sys_fsopen("tmpfs", 0);
+	if (ret == -1 && errno == ENOSYS)
+		SKIP(return, "fsopen() syscall not supported");
+	if (ret >= 0)
+		close(ret);
+
+	self->has_caps = (geteuid() == 0);
+}
+
+FIXTURE_TEARDOWN(fsmount_ns_caps)
+{
+}
+
+TEST_F(fsmount_ns_caps, requires_cap_sys_admin)
+{
+	pid_t pid;
+	int status;
+	int fs_fd;
+
+	/*
+	 * Prepare the configured filesystem fd as root before forking.
+	 * fsopen() requires CAP_SYS_ADMIN in the mount namespace's
+	 * user_ns, which won't be available after enter_userns().
+	 */
+	fs_fd = sys_fsopen("tmpfs", FSOPEN_CLOEXEC);
+	ASSERT_GE(fs_fd, 0);
+
+	ASSERT_EQ(sys_fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		int fd, err;
+
+		/* Child: drop privileges using utils.h helper */
+		if (enter_userns() != 0)
+			_exit(2);
+
+		/* Drop all caps using utils.h helper */
+		if (caps_down() == 0)
+			_exit(3);
+
+		fd = sys_fsmount(fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC, 0);
+		err = errno;	/* sample now: close() below may overwrite errno */
+		close(fs_fd);
+		if (fd >= 0) {
+			close(fd);
+			/* Should have failed without caps */
+			_exit(1);
+		}
+
+		if (err == EPERM)
+			_exit(0);
+
+		/* EINVAL means FSMOUNT_NAMESPACE not supported */
+		if (err == EINVAL)
+			_exit(6);
+
+		/* Unexpected error */
+		_exit(7);
+	}
+
+	close(fs_fd);
+	ASSERT_EQ(waitpid(pid, &status, 0), pid);
+	ASSERT_TRUE(WIFEXITED(status));
+
+	switch (WEXITSTATUS(status)) {
+	case 0:
+		/* Expected: EPERM without caps */
+		break;
+	case 1:
+		ASSERT_FALSE(true) TH_LOG("FSMOUNT_NAMESPACE succeeded without caps");
+		break;
+	case 2:
+		SKIP(return, "enter_userns failed");
+		break;
+	case 3:
+		SKIP(return, "caps_down failed");
+		break;
+	case 6:
+		SKIP(return, "FSMOUNT_NAMESPACE not supported");
+		break;
+	default:
+		ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)",
+					  WEXITSTATUS(status));
+		break;
+	}
+}
+
+FIXTURE(fsmount_ns_userns)
+{
+	int fd;
+	int fs_fd;
+};
+
+FIXTURE_SETUP(fsmount_ns_userns)
+{
+	int ret;
+
+	self->fd = -1;
+	self->fs_fd = -1;
+
+	/* Probe fsopen(): skip only on ENOSYS; close the fd if one came back */
+	ret = sys_fsopen("tmpfs", 0);
+	if (ret == -1 && errno == ENOSYS)
+		SKIP(return, "fsopen() syscall not supported");
+	if (ret >= 0)
+		close(ret);
+
+	/* Probe statmount() with zeroed arguments; listmount() is not probed */
+	ret = statmount(0, 0, 0, 0, NULL, 0, 0);
+	if (ret == -1 && errno == ENOSYS)
+		SKIP(return, "statmount() syscall not supported");
+}
+
+FIXTURE_TEARDOWN(fsmount_ns_userns)
+{
+	if (self->fd >= 0)
+		close(self->fd);
+	if (self->fs_fd >= 0)
+		close(self->fs_fd);
+}
+
+TEST_F(fsmount_ns_userns, create_in_userns)
+{
+	pid_t pid;
+	int status;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		uint64_t new_ns_id;
+		uint64_t list[256];
+		ssize_t nr_mounts;
+		int fs_fd, fd;
+
+		/* Create new user namespace (also creates mount namespace) */
+		if (setup_userns() != 0)
+			_exit(2);
+
+		/* Now we have CAP_SYS_ADMIN in the user namespace */
+		fs_fd = sys_fsopen("tmpfs", FSOPEN_CLOEXEC);
+		if (fs_fd < 0)
+			_exit(3);
+
+		if (sys_fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) {
+			close(fs_fd);
+			_exit(4);
+		}
+
+		fd = sys_fsmount(fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC, 0);
+		close(fs_fd);
+
+		if (fd < 0) {
+			if (errno == EINVAL)
+				_exit(6); /* FSMOUNT_NAMESPACE not supported */
+			_exit(1);
+		}
+
+		/* Verify we can get the namespace ID */
+		if (get_mnt_ns_id(fd, &new_ns_id) != 0)
+			_exit(7);
+
+		/* Verify we can list mounts in the new namespace */
+		nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0);
+		if (nr_mounts < 0)
+			_exit(8);
+
+		/* Should have at least 1 mount (the tmpfs) */
+		if (nr_mounts < 1)
+			_exit(9);
+
+		close(fd);
+		_exit(0);
+	}
+
+	ASSERT_EQ(waitpid(pid, &status, 0), pid);
+	ASSERT_TRUE(WIFEXITED(status));
+
+	switch (WEXITSTATUS(status)) {
+	case 0:
+		/* Success */
+		break;
+	case 1:
+		ASSERT_FALSE(true) TH_LOG("fsmount(FSMOUNT_NAMESPACE) failed in userns");
+		break;
+	case 2:
+		SKIP(return, "setup_userns failed");
+		break;
+	case 3:
+		SKIP(return, "fsopen failed in userns");
+		break;
+	case 4:
+		SKIP(return, "fsconfig CMD_CREATE failed in userns");
+		break;
+	case 6:
+		SKIP(return, "FSMOUNT_NAMESPACE not supported");
+		break;
+	case 7:
+		ASSERT_FALSE(true) TH_LOG("Failed to get mount namespace ID");
+		break;
+	case 8:
+		ASSERT_FALSE(true) TH_LOG("listmount failed in new namespace");
+		break;
+	case 9:
+		ASSERT_FALSE(true) TH_LOG("New namespace has no mounts");
+		break;
+	default:
+		ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)",
+					  WEXITSTATUS(status));
+		break;
+	}
+}
+
+TEST_F(fsmount_ns_userns, setns_in_userns)
+{
+	pid_t pid;
+	int status;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		uint64_t new_ns_id;
+		int fs_fd, fd;
+		pid_t inner_pid;
+		int inner_status;
+
+		/* Create new user namespace */
+		if (setup_userns() != 0)
+			_exit(2);
+
+		fs_fd = sys_fsopen("tmpfs", FSOPEN_CLOEXEC);
+		if (fs_fd < 0)
+			_exit(3);
+
+		if (sys_fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) {
+			close(fs_fd);
+			_exit(4);
+		}
+
+		fd = sys_fsmount(fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC, 0);
+		close(fs_fd);
+
+		if (fd < 0) {
+			if (errno == EINVAL)
+				_exit(6);
+			_exit(1);
+		}
+
+		if (get_mnt_ns_id(fd, &new_ns_id) != 0)
+			_exit(7);
+
+		/* Fork again to test setns into the new namespace */
+		inner_pid = fork();
+		if (inner_pid < 0)
+			_exit(10);
+
+		if (inner_pid == 0) {
+			/* Inner child: enter the new namespace */
+			if (setns(fd, CLONE_NEWNS) < 0)
+				_exit(1);
+			_exit(0);
+		}
+
+		if (waitpid(inner_pid, &inner_status, 0) != inner_pid)
+			_exit(11);
+
+		if (!WIFEXITED(inner_status) || WEXITSTATUS(inner_status) != 0)
+			_exit(12);
+
+		close(fd);
+		_exit(0);
+	}
+
+	ASSERT_EQ(waitpid(pid, &status, 0), pid);
+	ASSERT_TRUE(WIFEXITED(status));
+
+	switch (WEXITSTATUS(status)) {
+	case 0:
+		/* Success */
+		break;
+	case 1:
+		ASSERT_FALSE(true) TH_LOG("fsmount or setns failed in userns");
+		break;
+	case 2:
+		SKIP(return, "setup_userns failed");
+		break;
+	case 3:
+		SKIP(return, "fsopen failed in userns");
+		break;
+	case 4:
+		SKIP(return, "fsconfig CMD_CREATE failed in userns");
+		break;
+	case 6:
+		SKIP(return, "FSMOUNT_NAMESPACE not supported");
+		break;
+	case 7:
+		ASSERT_FALSE(true) TH_LOG("Failed to get mount namespace ID");
+		break;
+	case 10:
+		ASSERT_FALSE(true) TH_LOG("Inner fork failed");
+		break;
+	case 11:
+		ASSERT_FALSE(true) TH_LOG("Inner waitpid failed");
+		break;
+	case 12:
+		ASSERT_FALSE(true) TH_LOG("setns into new namespace failed");
+		break;
+	default:
+		ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)",
+					  WEXITSTATUS(status));
+		break;
+	}
+}
+
+TEST_F(fsmount_ns_userns, umount_fails_einval)
+{
+	pid_t pid;
+	int status;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		uint64_t new_ns_id;
+		uint64_t list[256];
+		ssize_t nr_mounts;
+		int fs_fd, fd;
+		ssize_t i;
+
+		/* Create new user namespace */
+		if (setup_userns() != 0)
+			_exit(2);
+
+		fs_fd = sys_fsopen("tmpfs", FSOPEN_CLOEXEC);
+		if (fs_fd < 0)
+			_exit(3);
+
+		if (sys_fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) {
+			close(fs_fd);
+			_exit(4);
+		}
+
+		fd = sys_fsmount(fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC, 0);
+		close(fs_fd);
+
+		if (fd < 0) {
+			if (errno == EINVAL)
+				_exit(6);
+			_exit(1);
+		}
+
+		if (get_mnt_ns_id(fd, &new_ns_id) != 0)
+			_exit(7);
+
+		/* Get all mounts in the new namespace */
+		nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, LISTMOUNT_REVERSE);
+		if (nr_mounts < 0)
+			_exit(13);
+
+		if (nr_mounts < 1)
+			_exit(14);
+
+		/* Enter the new namespace */
+		if (setns(fd, CLONE_NEWNS) < 0)
+			_exit(8);
+
+		for (i = 0; i < nr_mounts; i++) {
+			struct statmount *sm;
+			const char *mnt_point;
+
+			sm = statmount_alloc(list[i], new_ns_id,
+					     STATMOUNT_MNT_POINT, 0);
+			if (!sm)
+				_exit(15);
+
+			mnt_point = sm->str + sm->mnt_point;
+
+			if (umount2(mnt_point, MNT_DETACH) == 0) {
+				free(sm);
+				_exit(9);
+			}
+
+			if (errno != EINVAL) {
+				/* Wrong error */
+				free(sm);
+				_exit(10);
+			}
+
+			free(sm);
+		}
+
+		close(fd);
+		_exit(0);
+	}
+
+	ASSERT_EQ(waitpid(pid, &status, 0), pid);
+	ASSERT_TRUE(WIFEXITED(status));
+
+	switch (WEXITSTATUS(status)) {
+	case 0:
+		break;
+	case 1:
+		ASSERT_FALSE(true) TH_LOG("fsmount(FSMOUNT_NAMESPACE) failed");
+		break;
+	case 2:
+		SKIP(return, "setup_userns failed");
+		break;
+	case 3:
+		SKIP(return, "fsopen failed in userns");
+		break;
+	case 4:
+		SKIP(return, "fsconfig CMD_CREATE failed in userns");
+		break;
+	case 6:
+		SKIP(return, "FSMOUNT_NAMESPACE not supported");
+		break;
+	case 7:
+		ASSERT_FALSE(true) TH_LOG("Failed to get mount namespace ID");
+		break;
+	case 8:
+		ASSERT_FALSE(true) TH_LOG("setns into new namespace failed");
+		break;
+	case 9:
+		ASSERT_FALSE(true) TH_LOG("umount succeeded but should have failed with EINVAL");
+		break;
+	case 10:
+		ASSERT_FALSE(true) TH_LOG("umount failed with wrong error (expected EINVAL)");
+		break;
+	case 13:
+		ASSERT_FALSE(true) TH_LOG("listmount failed");
+		break;
+	case 14:
+		ASSERT_FALSE(true) TH_LOG("No mounts in new namespace");
+		break;
+	case 15:
+		ASSERT_FALSE(true) TH_LOG("statmount_alloc failed");
+		break;
+	default:
+		ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)",
+					  WEXITSTATUS(status));
+		break;
+	}
+}
+
+TEST_F(fsmount_ns_userns, umount_succeeds)
+{
+	pid_t pid;
+	int status;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		uint64_t new_ns_id;
+		uint64_t list[256];
+		ssize_t nr_mounts;
+		int fs_fd, fd;
+		ssize_t i;
+
+		if (unshare(CLONE_NEWNS))
+			_exit(1);
+
+		if (sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) != 0)
+			_exit(1);
+
+		fs_fd = sys_fsopen("tmpfs", FSOPEN_CLOEXEC);
+		if (fs_fd < 0)
+			_exit(3);
+
+		if (sys_fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) {
+			close(fs_fd);
+			_exit(4);
+		}
+
+		fd = sys_fsmount(fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC, 0);
+		close(fs_fd);
+
+		if (fd < 0) {
+			if (errno == EINVAL)
+				_exit(6);
+			_exit(1);
+		}
+
+		if (get_mnt_ns_id(fd, &new_ns_id) != 0)
+			_exit(7);
+
+		/* Get all mounts in the new namespace */
+		nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, LISTMOUNT_REVERSE);
+		if (nr_mounts < 0)
+			_exit(13);
+
+		if (nr_mounts < 1)
+			_exit(14);
+
+		/* Enter the new namespace */
+		if (setns(fd, CLONE_NEWNS) < 0)
+			_exit(8);
+
+		for (i = 0; i < nr_mounts; i++) {
+			struct statmount *sm;
+			const char *mnt_point;
+
+			sm = statmount_alloc(list[i], new_ns_id,
+					     STATMOUNT_MNT_POINT, 0);
+			if (!sm)
+				_exit(15);
+
+			mnt_point = sm->str + sm->mnt_point;
+
+			if (umount2(mnt_point, MNT_DETACH) != 0) {
+				free(sm);
+				_exit(9);
+			}
+
+			free(sm);
+		}
+
+		close(fd);
+		_exit(0);
+	}
+
+	ASSERT_EQ(waitpid(pid, &status, 0), pid);
+	ASSERT_TRUE(WIFEXITED(status));
+
+	switch (WEXITSTATUS(status)) {
+	case 0:
+		break;
+	case 1:
+		ASSERT_FALSE(true) TH_LOG("fsmount(FSMOUNT_NAMESPACE) failed or unshare failed");
+		break;
+	case 3:
+		SKIP(return, "fsopen failed");
+		break;
+	case 4:
+		SKIP(return, "fsconfig CMD_CREATE failed");
+		break;
+	case 6:
+		SKIP(return, "FSMOUNT_NAMESPACE not supported");
+		break;
+	case 7:
+		ASSERT_FALSE(true) TH_LOG("Failed to get mount namespace ID");
+		break;
+	case 8:
+		ASSERT_FALSE(true) TH_LOG("setns into new namespace failed");
+		break;
+	case 9:
+		ASSERT_FALSE(true) TH_LOG("umount failed but should have succeeded");
+		break;
+	case 13:
+		ASSERT_FALSE(true) TH_LOG("listmount failed");
+		break;
+	case 14:
+		ASSERT_FALSE(true) TH_LOG("No mounts in new namespace");
+		break;
+	case 15:
+		ASSERT_FALSE(true) TH_LOG("statmount_alloc failed");
+		break;
+	default:
+		ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)",
+					  WEXITSTATUS(status));
+		break;
+	}
+}
+
+FIXTURE(fsmount_ns_mount_attrs)
+{
+	int fd;
+	int fs_fd;
+};
+
+FIXTURE_SETUP(fsmount_ns_mount_attrs)
+{
+	int ret;
+
+	self->fd = -1;
+	self->fs_fd = -1;
+
+	/* Probe fsopen(): skip only on ENOSYS; close the fd if one came back */
+	ret = sys_fsopen("tmpfs", 0);
+	if (ret == -1 && errno == ENOSYS)
+		SKIP(return, "fsopen() syscall not supported");
+	if (ret >= 0)
+		close(ret);
+
+	/* Probe statmount() with zeroed arguments; listmount() is not probed */
+	ret = statmount(0, 0, 0, 0, NULL, 0, 0);
+	if (ret == -1 && errno == ENOSYS)
+		SKIP(return, "statmount() syscall not supported");
+}
+
+FIXTURE_TEARDOWN(fsmount_ns_mount_attrs)
+{
+	if (self->fd >= 0)
+		close(self->fd);
+	if (self->fs_fd >= 0)
+		close(self->fs_fd);
+}
+
+TEST_F(fsmount_ns_mount_attrs, readonly)
+{
+	struct statmount sm;
+	uint64_t new_ns_id;
+	uint64_t list[256];
+	ssize_t nr_mounts;
+	int ret;
+
+	self->fs_fd = create_tmpfs_fd();
+	ASSERT_GE(self->fs_fd, 0);
+
+	self->fd = sys_fsmount(self->fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC,
+			       MOUNT_ATTR_RDONLY);
+	if (self->fd < 0 && errno == EINVAL)
+		SKIP(return, "FSMOUNT_NAMESPACE not supported");
+
+	ASSERT_GE(self->fd, 0);
+
+	ret = get_mnt_ns_id(self->fd, &new_ns_id);
+	ASSERT_EQ(ret, 0);
+
+	nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0);
+	ASSERT_GE(nr_mounts, 1);
+
+	ret = statmount(list[0], new_ns_id, 0, STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0);
+	ASSERT_EQ(ret, 0);
+
+	/* Verify the mount is read-only */
+	ASSERT_TRUE(sm.mnt_attr & MOUNT_ATTR_RDONLY);
+}
+
+TEST_F(fsmount_ns_mount_attrs, noexec)
+{
+	struct statmount sm;
+	uint64_t new_ns_id;
+	uint64_t list[256];
+	ssize_t nr_mounts;
+	int ret;
+
+	self->fs_fd = create_tmpfs_fd();
+	ASSERT_GE(self->fs_fd, 0);
+
+	self->fd = sys_fsmount(self->fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC,
+			       MOUNT_ATTR_NOEXEC);
+	if (self->fd < 0 && errno == EINVAL)
+		SKIP(return, "FSMOUNT_NAMESPACE not supported");
+
+	ASSERT_GE(self->fd, 0);
+
+	ret = get_mnt_ns_id(self->fd, &new_ns_id);
+	ASSERT_EQ(ret, 0);
+
+	nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0);
+	ASSERT_GE(nr_mounts, 1);
+
+	ret = statmount(list[0], new_ns_id, 0, STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0);
+	ASSERT_EQ(ret, 0);
+
+	/* Verify the mount is noexec */
+	ASSERT_TRUE(sm.mnt_attr & MOUNT_ATTR_NOEXEC);
+}
+
+TEST_F(fsmount_ns_mount_attrs, nosuid)
+{
+	struct statmount sm;
+	uint64_t new_ns_id;
+	uint64_t list[256];
+	ssize_t nr_mounts;
+	int ret;
+
+	self->fs_fd = create_tmpfs_fd();
+	ASSERT_GE(self->fs_fd, 0);
+
+	self->fd = sys_fsmount(self->fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC,
+			       MOUNT_ATTR_NOSUID);
+	if (self->fd < 0 && errno == EINVAL)
+		SKIP(return, "FSMOUNT_NAMESPACE not supported");
+
+	ASSERT_GE(self->fd, 0);
+
+	ret = get_mnt_ns_id(self->fd, &new_ns_id);
+	ASSERT_EQ(ret, 0);
+
+	nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0);
+	ASSERT_GE(nr_mounts, 1);
+
+	ret = statmount(list[0], new_ns_id, 0, STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0);
+	ASSERT_EQ(ret, 0);
+
+	/* Verify the mount is nosuid */
+	ASSERT_TRUE(sm.mnt_attr & MOUNT_ATTR_NOSUID);
+}
+
+TEST_F(fsmount_ns_mount_attrs, noatime)
+{
+	struct statmount sm;
+	uint64_t new_ns_id;
+	uint64_t list[256];
+	ssize_t nr_mounts;
+	int ret;
+
+	self->fs_fd = create_tmpfs_fd();
+	ASSERT_GE(self->fs_fd, 0);
+
+	self->fd = sys_fsmount(self->fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC,
+			       MOUNT_ATTR_NOATIME);
+	if (self->fd < 0 && errno == EINVAL)
+		SKIP(return, "FSMOUNT_NAMESPACE not supported");
+
+	ASSERT_GE(self->fd, 0);
+
+	ret = get_mnt_ns_id(self->fd, &new_ns_id);
+	ASSERT_EQ(ret, 0);
+
+	nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0);
+	ASSERT_GE(nr_mounts, 1);
+
+	ret = statmount(list[0], new_ns_id, 0, STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0);
+	ASSERT_EQ(ret, 0);
+
+	/* Verify the mount is noatime */
+	ASSERT_TRUE(sm.mnt_attr & MOUNT_ATTR_NOATIME);
+}
+
+TEST_F(fsmount_ns_mount_attrs, combined)
+{
+	struct statmount sm;
+	uint64_t new_ns_id;
+	uint64_t list[256];
+	ssize_t nr_mounts;
+	int ret;
+
+	self->fs_fd = create_tmpfs_fd();
+	ASSERT_GE(self->fs_fd, 0);
+
+	self->fd = sys_fsmount(self->fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC,
+			       MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOEXEC |
+			       MOUNT_ATTR_NOSUID | MOUNT_ATTR_NOATIME);
+	if (self->fd < 0 && errno == EINVAL)
+		SKIP(return, "FSMOUNT_NAMESPACE not supported");
+
+	ASSERT_GE(self->fd, 0);
+
+	ret = get_mnt_ns_id(self->fd, &new_ns_id);
+	ASSERT_EQ(ret, 0);
+
+	nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0);
+	ASSERT_GE(nr_mounts, 1);
+
+	ret = statmount(list[0], new_ns_id, 0, STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0);
+	ASSERT_EQ(ret, 0);
+
+	/* Verify all attributes are set */
+	ASSERT_TRUE(sm.mnt_attr & MOUNT_ATTR_RDONLY);
+	ASSERT_TRUE(sm.mnt_attr & MOUNT_ATTR_NOEXEC);
+	ASSERT_TRUE(sm.mnt_attr & MOUNT_ATTR_NOSUID);
+	ASSERT_TRUE(sm.mnt_attr & MOUNT_ATTR_NOATIME);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/filesystems/move_mount/.gitignore b/tools/testing/selftests/filesystems/move_mount/.gitignore
new file mode 100644
index 000000000000..c7557db30671
--- /dev/null
+++ b/tools/testing/selftests/filesystems/move_mount/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+move_mount_test
diff --git a/tools/testing/selftests/filesystems/move_mount/Makefile b/tools/testing/selftests/filesystems/move_mount/Makefile
new file mode 100644
index 000000000000..5c5b199b464b
--- /dev/null
+++ b/tools/testing/selftests/filesystems/move_mount/Makefile
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+CFLAGS += -Wall -O2 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES)
+LDLIBS += -lcap
+
+TEST_GEN_PROGS := move_mount_test
+
+include ../../lib.mk
+
+$(OUTPUT)/move_mount_test: ../utils.c
diff --git a/tools/testing/selftests/filesystems/move_mount/move_mount_test.c b/tools/testing/selftests/filesystems/move_mount/move_mount_test.c
new file mode 100644
index 000000000000..f08f94b1f0ec
--- /dev/null
+++ b/tools/testing/selftests/filesystems/move_mount/move_mount_test.c
@@ -0,0 +1,492 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+// Copyright (c) 2026 Christian Brauner <brauner@kernel.org>
+
+#define _GNU_SOURCE
+
+#include <errno.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/mount.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+
+#include "../wrappers.h"
+#include "../utils.h"
+#include "../statmount/statmount.h"
+#include "../../kselftest_harness.h"
+
+#include <linux/stat.h>
+
+#ifndef MOVE_MOUNT_BENEATH
+#define MOVE_MOUNT_BENEATH 0x00000200
+#endif
+
+/*
+ * Return the unique mount ID of the mount @fd refers to, or 0 if statx()
+ * fails or does not report STATX_MNT_ID_UNIQUE in stx_mask.
+ */
+static uint64_t get_unique_mnt_id_fd(int fd)
+{
+	struct statx stx;
+
+	if (statx(fd, "", AT_EMPTY_PATH, STATX_MNT_ID_UNIQUE, &stx))
+		return 0;
+	if (!(stx.stx_mask & STATX_MNT_ID_UNIQUE))
+		return 0;
+	return stx.stx_mnt_id;
+}
+
+/*
+ * Create a locked overmount stack at /mnt_dir for testing MNT_LOCKED
+ * transfer on non-rootfs mounts.
+ *
+ * Mounts tmpfs A at /mnt_dir, overmounts with tmpfs B, then enters a
+ * new user+mount namespace where both become locked. Returns 0 on
+ * success, or a distinct code (1-7) identifying the step that failed.
+ */
+static int setup_locked_overmount(void)
+{
+	/* Isolate so mounts don't leak. */
+	if (unshare(CLONE_NEWNS))
+		return 1;
+	if (mount("", "/", NULL, MS_REC | MS_PRIVATE, NULL))
+		return 2;
+
+	/*
+	 * Create mounts while still in the initial user namespace so
+	 * they become locked after the subsequent user namespace
+	 * unshare.
+	 */
+	rmdir("/mnt_dir");
+	if (mkdir("/mnt_dir", 0755))
+		return 3;
+
+	/* Mount tmpfs A */
+	if (mount("tmpfs", "/mnt_dir", "tmpfs", 0, NULL))
+		return 4;
+
+	/* Overmount with tmpfs B */
+	if (mount("tmpfs", "/mnt_dir", "tmpfs", 0, NULL))
+		return 5;
+
+	/*
+	 * Create user+mount namespace. Mounts A and B become locked
+	 * because they might be covering something that is not supposed
+	 * to be revealed.
+	 */
+	if (setup_userns())
+		return 6;
+
+	/* Sanity check: B is locked, so detaching it must fail with EINVAL */
+	if (!umount2("/mnt_dir", MNT_DETACH) || errno != EINVAL)
+		return 7;
+
+	return 0;
+}
+
+/*
+ * Create a tmpfs with fsopen() + FSCONFIG_CMD_CREATE and obtain a detached
+ * mount of it via fsmount(). Returns the mount fd, or -1 on failure.
+ */
+static int create_detached_tmpfs(void)
+{
+	int mnt_fd = -1;
+	int fs_fd;
+
+	fs_fd = sys_fsopen("tmpfs", FSOPEN_CLOEXEC);
+	if (fs_fd < 0)
+		return -1;
+
+	/* Only try to mount when creating the filesystem succeeded. */
+	if (!sys_fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0))
+		mnt_fd = sys_fsmount(fs_fd, FSMOUNT_CLOEXEC, 0);
+
+	close(fs_fd);
+	return mnt_fd;
+}
+
+FIXTURE(move_mount) {
+	uint64_t orig_root_id;	/* unique mount ID of "/" taken in setup */
+};
+
+FIXTURE_SETUP(move_mount)
+{
+	ASSERT_EQ(unshare(CLONE_NEWNS), 0);
+	/* Make the tree private so mounts made by the tests don't leak. */
+	ASSERT_EQ(mount("", "/", NULL, MS_REC | MS_PRIVATE, NULL), 0);
+
+	self->orig_root_id = get_unique_mnt_id("/");
+	ASSERT_NE(self->orig_root_id, 0);
+}
+
+FIXTURE_TEARDOWN(move_mount)
+{
+}
+
+/*
+ * Test successful MOVE_MOUNT_BENEATH on the rootfs.
+ * Clone /, fchdir to the clone, mount the clone beneath /, chroot to
+ * switch root to the clone, then detach the old root.
+ */
+TEST_F(move_mount, beneath_rootfs_success)
+{
+	int fd_tree, ret;
+	uint64_t clone_id, root_id;
+
+	fd_tree = sys_open_tree(AT_FDCWD, "/",
+				OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
+	ASSERT_GE(fd_tree, 0);
+
+	clone_id = get_unique_mnt_id_fd(fd_tree);
+	ASSERT_NE(clone_id, 0);
+	ASSERT_NE(clone_id, self->orig_root_id);
+
+	ASSERT_EQ(fchdir(fd_tree), 0);
+
+	ret = sys_move_mount(fd_tree, "", AT_FDCWD, "/",
+			     MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_BENEATH);
+	ASSERT_EQ(ret, 0);
+
+	close(fd_tree);
+
+	/* Switch root to the clone (the cwd set by fchdir() above) */
+	ASSERT_EQ(chroot("."), 0);
+
+	/* Verify "/" is now the clone */
+	root_id = get_unique_mnt_id("/");
+	ASSERT_NE(root_id, 0);
+	ASSERT_EQ(root_id, clone_id);
+
+	/* Detach old root */
+	ASSERT_EQ(umount2(".", MNT_DETACH), 0);
+}
+
+/*
+ * Test that after MOVE_MOUNT_BENEATH on the rootfs the old root is
+ * stacked on top of the clone. Verify via statmount that the old
+ * root's parent is the clone.
+ */
+TEST_F(move_mount, beneath_rootfs_old_root_stacked)
+{
+	int fd_tree, ret;
+	uint64_t clone_id;
+	struct statmount sm;
+
+	fd_tree = sys_open_tree(AT_FDCWD, "/",
+				OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
+	ASSERT_GE(fd_tree, 0);
+
+	clone_id = get_unique_mnt_id_fd(fd_tree);
+	ASSERT_NE(clone_id, 0);
+	ASSERT_NE(clone_id, self->orig_root_id);
+
+	ASSERT_EQ(fchdir(fd_tree), 0);
+
+	ret = sys_move_mount(fd_tree, "", AT_FDCWD, "/",
+			     MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_BENEATH);
+	ASSERT_EQ(ret, 0);
+
+	close(fd_tree);
+	/* Switch root to the clone */
+	ASSERT_EQ(chroot("."), 0);
+
+	/* Old root's parent should now be the clone */
+	ASSERT_EQ(statmount(self->orig_root_id, 0, 0,
+			     STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0), 0);
+	ASSERT_EQ(sm.mnt_parent_id, clone_id);
+	/* Detach old root */
+	ASSERT_EQ(umount2(".", MNT_DETACH), 0);
+}
+
+/*
+ * Test that MOVE_MOUNT_BENEATH on rootfs fails when chroot'd into a
+ * subdirectory of the same mount. The caller's fs->root.dentry doesn't
+ * match mnt->mnt_root so the kernel rejects it.
+ */
+TEST_F(move_mount, beneath_rootfs_in_chroot_fail)
+{
+	int fd_tree, ret;
+	uint64_t chroot_id, clone_id;
+
+	rmdir("/chroot_dir");
+	ASSERT_EQ(mkdir("/chroot_dir", 0755), 0);
+	/* A plain directory is on the root mount, so the IDs must match */
+	chroot_id = get_unique_mnt_id("/chroot_dir");
+	ASSERT_NE(chroot_id, 0);
+	ASSERT_EQ(self->orig_root_id, chroot_id);
+
+	ASSERT_EQ(chdir("/chroot_dir"), 0);
+	ASSERT_EQ(chroot("."), 0);
+
+	fd_tree = sys_open_tree(AT_FDCWD, "/",
+				OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
+	ASSERT_GE(fd_tree, 0);
+
+	clone_id = get_unique_mnt_id_fd(fd_tree);
+	ASSERT_NE(clone_id, 0);
+	ASSERT_NE(clone_id, chroot_id);
+
+	ASSERT_EQ(fchdir(fd_tree), 0);
+
+	/*
+	 * Should fail with EINVAL: fs->root.dentry (/chroot_dir) doesn't
+	 * match the mount's mnt_root (/).
+	 */
+	ret = sys_move_mount(fd_tree, "", AT_FDCWD, "/",
+			     MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_BENEATH);
+	ASSERT_EQ(ret, -1);
+	ASSERT_EQ(errno, EINVAL);
+
+	close(fd_tree);
+}
+
+/*
+ * Test that MOVE_MOUNT_BENEATH on rootfs succeeds when chroot'd into a
+ * separate tmpfs mount. The caller's root dentry matches the mount's
+ * mnt_root since it's a dedicated mount.
+ */
+TEST_F(move_mount, beneath_rootfs_in_chroot_success)
+{
+	int fd_tree, ret;
+	uint64_t chroot_id, clone_id, root_id;
+	struct statmount sm;
+
+	rmdir("/chroot_dir");
+	ASSERT_EQ(mkdir("/chroot_dir", 0755), 0);
+	ASSERT_EQ(mount("tmpfs", "/chroot_dir", "tmpfs", 0, NULL), 0);
+
+	chroot_id = get_unique_mnt_id("/chroot_dir");
+	ASSERT_NE(chroot_id, 0);
+
+	ASSERT_EQ(chdir("/chroot_dir"), 0);
+	ASSERT_EQ(chroot("."), 0);
+	/* After the chroot, "/" must resolve to the tmpfs mount */
+	ASSERT_EQ(get_unique_mnt_id("/"), chroot_id);
+
+	fd_tree = sys_open_tree(AT_FDCWD, "/",
+				OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
+	ASSERT_GE(fd_tree, 0);
+
+	clone_id = get_unique_mnt_id_fd(fd_tree);
+	ASSERT_NE(clone_id, 0);
+	ASSERT_NE(clone_id, chroot_id);
+
+	ASSERT_EQ(fchdir(fd_tree), 0);
+
+	ret = sys_move_mount(fd_tree, "", AT_FDCWD, "/",
+			     MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_BENEATH);
+	ASSERT_EQ(ret, 0);
+
+	close(fd_tree);
+
+	ASSERT_EQ(chroot("."), 0);
+
+	root_id = get_unique_mnt_id("/");
+	ASSERT_NE(root_id, 0);
+	ASSERT_EQ(root_id, clone_id);
+	/* The tmpfs (old root) must now have the clone as its parent */
+	ASSERT_EQ(statmount(chroot_id, 0, 0,
+			     STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0), 0);
+	ASSERT_EQ(sm.mnt_parent_id, clone_id);
+
+	ASSERT_EQ(umount2(".", MNT_DETACH), 0);
+}
+
+/*
+ * Test MNT_LOCKED transfer when mounting beneath rootfs in a user+mount
+ * namespace. After mount-beneath the new root gets MNT_LOCKED and the
+ * old root has MNT_LOCKED cleared so it can be unmounted.
+ */
+TEST_F(move_mount, beneath_rootfs_locked_transfer)
+{
+	int fd_tree, ret;
+	uint64_t clone_id, root_id;
+
+	ASSERT_EQ(setup_userns(), 0);
+
+	ASSERT_EQ(mount("", "/", NULL, MS_REC | MS_PRIVATE, NULL), 0);
+
+	fd_tree = sys_open_tree(AT_FDCWD, "/",
+				OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC |
+				AT_RECURSIVE);
+	ASSERT_GE(fd_tree, 0);
+
+	clone_id = get_unique_mnt_id_fd(fd_tree);
+	ASSERT_NE(clone_id, 0);
+
+	ASSERT_EQ(fchdir(fd_tree), 0);
+
+	ret = sys_move_mount(fd_tree, "", AT_FDCWD, "/",
+			     MOVE_MOUNT_F_EMPTY_PATH |
+			     MOVE_MOUNT_BENEATH);
+	ASSERT_EQ(ret, 0);
+
+	close(fd_tree);
+
+	ASSERT_EQ(chroot("."), 0);
+
+	root_id = get_unique_mnt_id("/");
+	ASSERT_EQ(root_id, clone_id);
+
+	/*
+	 * Unmounting the old root must now succeed because
+	 * MNT_LOCKED was transferred to the clone. If it had not
+	 * been cleared from the old root, this would fail with EINVAL.
+	 */
+	ASSERT_EQ(umount2(".", MNT_DETACH), 0);
+
+	/* Verify "/" is still the clone after detaching old root */
+	root_id = get_unique_mnt_id("/");
+	ASSERT_EQ(root_id, clone_id);
+}
+
+/*
+ * Test containment invariant: after mount-beneath rootfs in a user+mount
+ * namespace, the new root must be MNT_LOCKED. The lock transfer from the
+ * old root preserves containment -- the process cannot unmount the new root
+ * to escape the namespace.
+ */
+TEST_F(move_mount, beneath_rootfs_locked_containment)
+{
+	int fd_tree, ret;
+	uint64_t clone_id, root_id;
+
+	ASSERT_EQ(setup_userns(), 0);
+
+	ASSERT_EQ(mount("", "/", NULL, MS_REC | MS_PRIVATE, NULL), 0);
+
+	/* Sanity: rootfs must be locked in the new userns (umount => EINVAL) */
+	ASSERT_EQ(umount2("/", MNT_DETACH), -1);
+	ASSERT_EQ(errno, EINVAL);
+
+	fd_tree = sys_open_tree(AT_FDCWD, "/",
+				OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC |
+				AT_RECURSIVE);
+	ASSERT_GE(fd_tree, 0);
+
+	clone_id = get_unique_mnt_id_fd(fd_tree);
+	ASSERT_NE(clone_id, 0);
+
+	ASSERT_EQ(fchdir(fd_tree), 0);
+
+	ret = sys_move_mount(fd_tree, "", AT_FDCWD, "/",
+			     MOVE_MOUNT_F_EMPTY_PATH |
+			     MOVE_MOUNT_BENEATH);
+	ASSERT_EQ(ret, 0);
+
+	close(fd_tree);
+
+	ASSERT_EQ(chroot("."), 0);
+
+	root_id = get_unique_mnt_id("/");
+	ASSERT_EQ(root_id, clone_id);
+
+	/* Detach old root (MNT_LOCKED was cleared from it) */
+	ASSERT_EQ(umount2(".", MNT_DETACH), 0);
+
+	/* Verify "/" is still the clone after detaching old root */
+	root_id = get_unique_mnt_id("/");
+	ASSERT_EQ(root_id, clone_id);
+
+	/*
+	 * The new root must be locked (MNT_LOCKED was transferred
+	 * from the old root). Attempting to unmount it must fail
+	 * with EINVAL, preserving the containment invariant.
+	 */
+	ASSERT_EQ(umount2("/", MNT_DETACH), -1);
+	ASSERT_EQ(errno, EINVAL);
+}
+
+/*
+ * Test MNT_LOCKED transfer when mounting beneath a non-rootfs locked mount.
+ * Mounts created before unshare(CLONE_NEWUSER | CLONE_NEWNS) become locked
+ * in the new namespace. Mount-beneath transfers the lock from the displaced
+ * mount to the new mount, so the displaced mount can be unmounted.
+ */
+TEST_F(move_mount, beneath_non_rootfs_locked_transfer)
+{
+	int mnt_fd, ret;
+	uint64_t mnt_new_id, mnt_visible_id;
+
+	ASSERT_EQ(setup_locked_overmount(), 0);
+
+	mnt_fd = create_detached_tmpfs();
+	ASSERT_GE(mnt_fd, 0);
+
+	mnt_new_id = get_unique_mnt_id_fd(mnt_fd);
+	ASSERT_NE(mnt_new_id, 0);
+
+	/* Move mount beneath B (which is locked) */
+	ret = sys_move_mount(mnt_fd, "", AT_FDCWD, "/mnt_dir",
+			     MOVE_MOUNT_F_EMPTY_PATH |
+			     MOVE_MOUNT_BENEATH);
+	ASSERT_EQ(ret, 0);
+
+	close(mnt_fd);
+
+	/*
+	 * Unmounting B must now succeed because MNT_LOCKED was
+	 * transferred to the new mount beneath it. If it had not
+	 * been cleared from B, this would fail with EINVAL.
+	 */
+	ASSERT_EQ(umount2("/mnt_dir", MNT_DETACH), 0);
+
+	/* Verify the new mount is now visible */
+	mnt_visible_id = get_unique_mnt_id("/mnt_dir");
+	ASSERT_EQ(mnt_visible_id, mnt_new_id);
+}
+
+/*
+ * Test MNT_LOCKED containment when mounting beneath a non-rootfs mount
+ * that was locked during unshare(CLONE_NEWUSER | CLONE_NEWNS).
+ * Mounts created before unshare become locked in the new namespace.
+ * Mount-beneath transfers the lock, preserving containment: the new
+ * mount cannot be unmounted, but the displaced mount can.
+ */
+TEST_F(move_mount, beneath_non_rootfs_locked_containment)
+{
+	int mnt_fd, ret;
+	uint64_t mnt_new_id, mnt_visible_id;
+
+	ASSERT_EQ(setup_locked_overmount(), 0);
+
+	mnt_fd = create_detached_tmpfs();
+	ASSERT_GE(mnt_fd, 0);
+
+	mnt_new_id = get_unique_mnt_id_fd(mnt_fd);
+	ASSERT_NE(mnt_new_id, 0);
+
+	/*
+	 * Move new tmpfs beneath B at /mnt_dir.
+	 * Stack becomes (bottom to top): A -> new -> B
+	 * Lock transfers from B to new.
+	 */
+	ret = sys_move_mount(mnt_fd, "", AT_FDCWD, "/mnt_dir",
+			     MOVE_MOUNT_F_EMPTY_PATH |
+			     MOVE_MOUNT_BENEATH);
+	ASSERT_EQ(ret, 0);
+
+	close(mnt_fd);
+
+	/*
+	 * B lost MNT_LOCKED -- unmounting it must succeed.
+	 * This reveals the new mount at /mnt_dir.
+	 */
+	ASSERT_EQ(umount2("/mnt_dir", MNT_DETACH), 0);
+
+	/* Verify the new mount is now visible */
+	mnt_visible_id = get_unique_mnt_id("/mnt_dir");
+	ASSERT_EQ(mnt_visible_id, mnt_new_id);
+
+	/*
+	 * The new mount gained MNT_LOCKED -- unmounting it must
+	 * fail with EINVAL, preserving the containment invariant.
+	 */
+	ASSERT_EQ(umount2("/mnt_dir", MNT_DETACH), -1);
+	ASSERT_EQ(errno, EINVAL);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/filesystems/open_tree_ns/Makefile b/tools/testing/selftests/filesystems/open_tree_ns/Makefile
index 73c03c4a7ef6..4976ed1d7d4a 100644
--- a/tools/testing/selftests/filesystems/open_tree_ns/Makefile
+++ b/tools/testing/selftests/filesystems/open_tree_ns/Makefile
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 TEST_GEN_PROGS := open_tree_ns_test
 
-CFLAGS := -Wall -Werror -g $(KHDR_INCLUDES)
+CFLAGS += -Wall -O0 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES)
 LDLIBS := -lcap
 
 include ../../lib.mk
diff --git a/tools/testing/selftests/filesystems/open_tree_ns/open_tree_ns_test.c b/tools/testing/selftests/filesystems/open_tree_ns/open_tree_ns_test.c
index 9711556280ae..82f3c8c02c9a 100644
--- a/tools/testing/selftests/filesystems/open_tree_ns/open_tree_ns_test.c
+++ b/tools/testing/selftests/filesystems/open_tree_ns/open_tree_ns_test.c
@@ -1,5 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
+ * Copyright (c) 2026 Christian Brauner <brauner@kernel.org>
+ *
  * Test for OPEN_TREE_NAMESPACE flag.
  *
  * Test that open_tree() with OPEN_TREE_NAMESPACE creates a new mount
@@ -50,31 +52,6 @@ static int get_mnt_ns_id_from_path(const char *path, uint64_t *mnt_ns_id)
 	return ret;
 }
 
-#define STATMOUNT_BUFSIZE (1 << 15)
-
-static struct statmount *statmount_alloc(uint64_t mnt_id, uint64_t mnt_ns_id, uint64_t mask)
-{
-	struct statmount *buf;
-	size_t bufsize = STATMOUNT_BUFSIZE;
-	int ret;
-
-	for (;;) {
-		buf = malloc(bufsize);
-		if (!buf)
-			return NULL;
-
-		ret = statmount(mnt_id, mnt_ns_id, mask, buf, bufsize, 0);
-		if (ret == 0)
-			return buf;
-
-		free(buf);
-		if (errno != EOVERFLOW)
-			return NULL;
-
-		bufsize <<= 1;
-	}
-}
-
 static void log_mount(struct __test_metadata *_metadata, struct statmount *sm)
 {
 	const char *fs_type = "";
@@ -115,7 +92,7 @@ static void dump_mounts(struct __test_metadata *_metadata, uint64_t mnt_ns_id)
 				     STATMOUNT_MNT_BASIC |
 				     STATMOUNT_FS_TYPE |
 				     STATMOUNT_MNT_ROOT |
-				     STATMOUNT_MNT_POINT);
+				     STATMOUNT_MNT_POINT, 0);
 		if (!sm) {
 			TH_LOG("  [%zd] mnt_id %llu: statmount failed: %s",
 			       i, (unsigned long long)list[i], strerror(errno));
@@ -221,7 +198,7 @@ FIXTURE_SETUP(open_tree_ns)
 		SKIP(return, "open_tree() syscall not supported");
 
 	/* Check if statmount/listmount are supported */
-	ret = statmount(0, 0, 0, NULL, 0, 0);
+	ret = statmount(0, 0, 0, 0, NULL, 0, 0);
 	if (ret == -1 && errno == ENOSYS)
 		SKIP(return, "statmount() syscall not supported");
 
@@ -340,7 +317,7 @@ TEST_F(open_tree_ns, verify_mount_properties)
 	ASSERT_GE(nr_mounts, 1);
 
 	/* Get info about the root mount (the bind mount, rootfs is hidden) */
-	ret = statmount(list[0], new_ns_id, STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0);
+	ret = statmount(list[0], new_ns_id, 0, STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0);
 	ASSERT_EQ(ret, 0);
 
 	ASSERT_NE(sm.mnt_id, sm.mnt_parent_id);
@@ -452,7 +429,7 @@ FIXTURE_SETUP(open_tree_ns_userns)
 		SKIP(return, "open_tree() syscall not supported");
 
 	/* Check if statmount/listmount are supported */
-	ret = statmount(0, 0, 0, NULL, 0, 0);
+	ret = statmount(0, 0, 0, 0, NULL, 0, 0);
 	if (ret == -1 && errno == ENOSYS)
 		SKIP(return, "statmount() syscall not supported");
 }
@@ -746,7 +723,7 @@ TEST_F(open_tree_ns_userns, umount_fails_einval)
 			const char *mnt_point;
 
 			sm = statmount_alloc(list[i], new_ns_id,
-					     STATMOUNT_MNT_POINT);
+					     STATMOUNT_MNT_POINT, 0);
 			if (!sm)
 				_exit(11);
 
@@ -863,7 +840,7 @@ TEST_F(open_tree_ns_userns, umount_succeeds)
 			const char *mnt_point;
 
 			sm = statmount_alloc(list[i], new_ns_id,
-					     STATMOUNT_MNT_POINT);
+					     STATMOUNT_MNT_POINT, 0);
 			if (!sm)
 				_exit(11);
 
@@ -904,7 +881,7 @@ TEST_F(open_tree_ns_userns, umount_succeeds)
 		ASSERT_FALSE(true) TH_LOG("setns into new namespace failed");
 		break;
 	case 7:
-		ASSERT_FALSE(true) TH_LOG("umount succeeded but should have failed with EINVAL");
+		ASSERT_FALSE(true) TH_LOG("umount failed but should have succeeded");
 		break;
 	case 9:
 		ASSERT_FALSE(true) TH_LOG("listmount failed");
@@ -1003,7 +980,7 @@ TEST_F(open_tree_ns_unbindable, recursive_skips_on_unbindable)
 		struct statmount *sm;
 		const char *mnt_point;
 
-		sm = statmount_alloc(list[i], new_ns_id, STATMOUNT_MNT_POINT);
+		sm = statmount_alloc(list[i], new_ns_id, STATMOUNT_MNT_POINT, 0);
 		ASSERT_NE(sm, NULL) {
 			TH_LOG("statmount_alloc failed for mnt_id %llu",
 			       (unsigned long long)list[i]);
diff --git a/tools/testing/selftests/filesystems/statmount/statmount.h b/tools/testing/selftests/filesystems/statmount/statmount.h
index e1cba4bfd8d9..675f7cc00076 100644
--- a/tools/testing/selftests/filesystems/statmount/statmount.h
+++ b/tools/testing/selftests/filesystems/statmount/statmount.h
@@ -3,10 +3,14 @@
 #ifndef __STATMOUNT_H
 #define __STATMOUNT_H
 
+#include <errno.h>
 #include <stdint.h>
+#include <stdlib.h>
 #include <linux/mount.h>
 #include <asm/unistd.h>
 
+#define STATMOUNT_BUFSIZE (1 << 15)
+
 #ifndef __NR_statmount
 	#if defined __alpha__
 		#define __NR_statmount 567
@@ -84,4 +88,51 @@ static inline ssize_t listmount(uint64_t mnt_id, uint64_t mnt_ns_id,
 	return syscall(__NR_listmount, &req, list, num, flags);
 }
 
+/*
+ * statmount() into a malloc()ed buffer, doubling it on EOVERFLOW.
+ * Returns the buffer (caller must free()) or NULL with errno set.
+ */
+static inline struct statmount *statmount_alloc(uint64_t mnt_id, uint64_t mnt_ns_id,
+						 uint64_t mask, unsigned int flags)
+{
+	size_t bufsize = STATMOUNT_BUFSIZE;
+	struct statmount *buf;
+	int err;
+
+	while ((buf = malloc(bufsize))) {
+		if (!statmount(mnt_id, mnt_ns_id, 0, mask, buf, bufsize, flags))
+			return buf;
+		err = errno;	/* free() may clobber errno on older libcs */
+		free(buf);
+		errno = err;
+		if (err != EOVERFLOW)
+			break;
+		bufsize <<= 1;
+	}
+	return NULL;
+}
+
+/*
+ * statmount() the mount identified by @fd (STATMOUNT_BY_FD) into a malloc()ed
+ * buffer, doubling it on EOVERFLOW. Caller frees; NULL means failure.
+ */
+static inline struct statmount *statmount_alloc_by_fd(int fd, uint64_t mask)
+{
+	size_t bufsize = STATMOUNT_BUFSIZE;
+	struct statmount *buf;
+	int err;
+
+	while ((buf = malloc(bufsize))) {
+		if (!statmount(0, 0, fd, mask, buf, bufsize, STATMOUNT_BY_FD))
+			return buf;
+		err = errno;	/* free() may clobber errno on older libcs */
+		free(buf);
+		errno = err;
+		if (err != EOVERFLOW)
+			break;
+		bufsize <<= 1;
+	}
+	return NULL;
+}
+
 #endif /* __STATMOUNT_H */
diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test.c b/tools/testing/selftests/filesystems/statmount/statmount_test.c
index a04bcaace126..8dc018d47a93 100644
--- a/tools/testing/selftests/filesystems/statmount/statmount_test.c
+++ b/tools/testing/selftests/filesystems/statmount/statmount_test.c
@@ -33,45 +33,6 @@ static const char *const known_fs[] = {
 	"sysv", "tmpfs", "tracefs", "ubifs", "udf", "ufs", "v7", "vboxsf",
 	"vfat", "virtiofs", "vxfs", "xenfs", "xfs", "zonefs", NULL };
 
-static struct statmount *statmount_alloc(uint64_t mnt_id, int fd, uint64_t mask, unsigned int flags)
-{
-	size_t bufsize = 1 << 15;
-	struct statmount *buf = NULL, *tmp = NULL;
-	int tofree = 0;
-	int ret;
-
-	if (flags & STATMOUNT_BY_FD && fd < 0)
-		return NULL;
-
-	tmp = alloca(bufsize);
-
-	for (;;) {
-		if (flags & STATMOUNT_BY_FD)
-			ret = statmount(0, 0, (uint32_t) fd, mask, tmp, bufsize, flags);
-		else
-			ret = statmount(mnt_id, 0, 0, mask, tmp, bufsize, flags);
-
-		if (ret != -1)
-			break;
-		if (tofree)
-			free(tmp);
-		if (errno != EOVERFLOW)
-			return NULL;
-		bufsize <<= 1;
-		tofree = 1;
-		tmp = malloc(bufsize);
-		if (!tmp)
-			return NULL;
-	}
-	buf = malloc(tmp->size);
-	if (buf)
-		memcpy(buf, tmp, tmp->size);
-	if (tofree)
-		free(tmp);
-
-	return buf;
-}
-
 static void write_file(const char *path, const char *val)
 {
 	int fd = open(path, O_WRONLY);
@@ -715,7 +676,7 @@ static void test_statmount_by_fd(void)
 		goto err_fd;
 	}
 
-	sm = statmount_alloc(0, fd, STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT, STATMOUNT_BY_FD);
+	sm = statmount_alloc_by_fd(fd, STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT);
 	if (!sm) {
 		ksft_test_result_fail("statmount by fd failed: %s\n", strerror(errno));
 		goto err_chroot;
@@ -750,7 +711,7 @@ static void test_statmount_by_fd(void)
 	}
 
 	free(sm);
-	sm = statmount_alloc(0, fd, STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT, STATMOUNT_BY_FD);
+	sm = statmount_alloc_by_fd(fd, STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT);
 	if (!sm) {
 		ksft_test_result_fail("statmount by fd failed: %s\n", strerror(errno));
 		goto err_fd;
@@ -844,7 +805,7 @@ static void test_statmount_by_fd_unmounted(void)
 		goto err_fd;
 	}
 
-	sm = statmount_alloc(0, fd, STATMOUNT_MNT_POINT | STATMOUNT_MNT_ROOT, STATMOUNT_BY_FD);
+	sm = statmount_alloc_by_fd(fd, STATMOUNT_MNT_POINT | STATMOUNT_MNT_ROOT);
 	if (!sm) {
 		ksft_test_result_fail("statmount by fd unmounted: %s\n",
 				      strerror(errno));
diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c b/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c
index 063d9de46431..e500905e4c07 100644
--- a/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c
+++ b/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c
@@ -34,31 +34,6 @@ static void handle_result(int ret, const char *testname)
 		ksft_test_result_skip("%s\n", testname);
 }
 
-static inline int wait_for_pid(pid_t pid)
-{
-	int status, ret;
-
-again:
-	ret = waitpid(pid, &status, 0);
-	if (ret == -1) {
-		if (errno == EINTR)
-			goto again;
-
-		ksft_print_msg("waitpid returned -1, errno=%d\n", errno);
-		return -1;
-	}
-
-	if (!WIFEXITED(status)) {
-		ksft_print_msg(
-		       "waitpid !WIFEXITED, WIFSIGNALED=%d, WTERMSIG=%d\n",
-		       WIFSIGNALED(status), WTERMSIG(status));
-		return -1;
-	}
-
-	ret = WEXITSTATUS(status);
-	return ret;
-}
-
 static int get_mnt_ns_id(const char *mnt_ns, uint64_t *mnt_ns_id)
 {
 	int fd = open(mnt_ns, O_RDONLY);
diff --git a/tools/testing/selftests/filesystems/utils.c b/tools/testing/selftests/filesystems/utils.c
index d6f26f849053..d73d7d8171db 100644
--- a/tools/testing/selftests/filesystems/utils.c
+++ b/tools/testing/selftests/filesystems/utils.c
@@ -158,7 +158,7 @@ static int get_userns_fd_cb(void *data)
 	_exit(0);
 }
 
-static int wait_for_pid(pid_t pid)
+int wait_for_pid(pid_t pid)
 {
 	int status, ret;
 
@@ -450,7 +450,7 @@ out_close:
 	return fret;
 }
 
-static int write_file(const char *path, const char *val)
+int write_file(const char *path, const char *val)
 {
 	int fd = open(path, O_WRONLY);
 	size_t len = strlen(val);
diff --git a/tools/testing/selftests/filesystems/utils.h b/tools/testing/selftests/filesystems/utils.h
index 0bccfed666a9..d03085cef5cb 100644
--- a/tools/testing/selftests/filesystems/utils.h
+++ b/tools/testing/selftests/filesystems/utils.h
@@ -44,6 +44,8 @@ static inline bool switch_userns(int fd, uid_t uid, gid_t gid, bool drop_caps)
 	return true;
 }
 
+extern int wait_for_pid(pid_t pid);
+extern int write_file(const char *path, const char *val);
 extern uint64_t get_unique_mnt_id(const char *path);
 
 #endif /* __IDMAP_UTILS_H */
diff --git a/tools/testing/selftests/namespaces/listns_efault_test.c b/tools/testing/selftests/namespaces/listns_efault_test.c
index c7ed4023d7a8..b570746e917c 100644
--- a/tools/testing/selftests/namespaces/listns_efault_test.c
+++ b/tools/testing/selftests/namespaces/listns_efault_test.c
@@ -19,7 +19,6 @@
 #include <sys/wait.h>
 #include <unistd.h>
 #include "../kselftest_harness.h"
-#include "../filesystems/utils.h"
 #include "../pidfd/pidfd.h"
 #include "wrappers.h"