Merge patch series "ns: fixes for namespace iteration and active reference counting"

Christian Brauner <brauner@kernel.org> says: * Make sure to initialize the active reference count for the initial network namespace and prevent __ns_common_init() from returning too early. * Make sure that passive reference counts are dropped outside of rcu read locks as some namespaces such as the mount namespace do in fact sleep when putting the last reference. * The setns() system call supports: (1) namespace file descriptors (nsfd) (2) process file descriptors (pidfd) When using nsfds the namespaces will remain active because they are pinned by the vfs. However, when pidfds are used things are more complicated. When the target task exits and passes through exit_nsproxy_namespaces() or is reaped and thus also passes through exit_cred_namespaces() after the setns()'ing task has called prepare_nsset() but before it has called commit_nsset(), the active reference count of the set of namespaces it wants to setns() to might have been dropped already: P1 P2 pid_p1 = clone(CLONE_NEWUSER | CLONE_NEWNET | CLONE_NEWNS) pidfd = pidfd_open(pid_p1) setns(pidfd, CLONE_NEWUSER | CLONE_NEWNET | CLONE_NEWNS) prepare_nsset() exit(0) // ns->__ns_active_ref == 1 // parent_ns->__ns_active_ref == 1 -> exit_nsproxy_namespaces() -> exit_cred_namespaces() // ns_active_ref_put() will also put // the reference on the owner of the // namespace. If the only reason the // owning namespace was alive was // because it was a parent of @ns // its active reference count now goes // to zero... -------------------------------- // | // ns->__ns_active_ref == 0 | // parent_ns->__ns_active_ref == 0 | | commit_nsset() -----------------> // If setns() // now manages to install the namespaces // it will call ns_active_ref_get() // on them thus bumping the active reference // count from zero again but without also // taking the required reference on the owner. 
// Thus we get: // // ns->__ns_active_ref == 1 // parent_ns->__ns_active_ref == 0 When later someone does ns_active_ref_put() on @ns it will underflow parent_ns->__ns_active_ref leading to a splat from our asserts thinking there are still active references when in fact the counter just underflowed. So resurrect the ownership chain if necessary as well. If the caller succeeded in grabbing passive references to the set of namespaces the setns() should simply succeed even if the target task exits or gets reaped in the meantime. The race is rare and can only be triggered when using pidfds to setns() to namespaces. Also note that active references on initial namespaces are no-ops. Since we now always handle parent references directly we can drop ns_ref_active_get_owner() when adding a namespace to a namespace tree. This is now all handled uniformly in the places where the new namespaces actually become active. * patches from https://patch.msgid.link/20251109-namespace-6-19-fixes-v1-0-ae8a4ad5a3b3@kernel.org: selftests/namespaces: test for efault selftests/namespaces: add active reference count regression test ns: add asserts for active refcount underflow ns: handle setns(pidfd, ...) cleanly ns: return EFAULT on put_user() error ns: make sure reference are dropped outside of rcu lock ns: don't increment or decrement initial namespaces ns: don't skip active reference count initialization Link: https://patch.msgid.link/20251109-namespace-6-19-fixes-v1-0-ae8a4ad5a3b3@kernel.org Signed-off-by: Christian Brauner <brauner@kernel.org>
author: Christian Brauner <brauner@kernel.org> 2025-11-10 12:21:00 +0300
committer: Christian Brauner <brauner@kernel.org> 2025-11-10 17:54:02 +0300
commit: ae901e5e2e9b079761d26a366e0c80530d8aad22 (patch)
tree: 65f2dcf3975dd9d7ba1d412fa3f116b5fce9e534
parent: 8ebfb9896c97ab609222460e705f425cb3f0aad0 (diff)
parent: 07d7ad46dad48a81ffc796fb7875b1ec141c8b48 (diff)
download: linux-ae901e5e2e9b079761d26a366e0c80530d8aad22.tar.xz
8 files changed, 724 insertions, 74 deletions
diff --git a/fs/nsfs.c b/fs/nsfs.c
index ba6c8975c82e..a80f8d2a4122 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -430,7 +430,7 @@ static int nsfs_init_inode(struct inode *inode, void *data)
 	 * ioctl on such a socket will resurrect the relevant namespace
 	 * subtree.
 	 */
-	__ns_ref_active_resurrect(ns);
+	__ns_ref_active_get(ns);
 	return 0;
 }
 
diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h
index bd4492ef6ffc..66ea09b48377 100644
--- a/include/linux/ns_common.h
+++ b/include/linux/ns_common.h
@@ -141,6 +141,12 @@ static __always_inline bool is_initial_namespace(struct ns_common *ns)
 				 IPC_NS_INIT_INO - MNT_NS_INIT_INO + 1));
 }
 
+static __always_inline bool is_ns_init_id(const struct ns_common *ns)
+{
+	VFS_WARN_ON_ONCE(ns->ns_id == 0);
+	return ns->ns_id <= NS_LAST_INIT_ID;
+}
+
 #define to_ns_common(__ns)                                    \
 	_Generic((__ns),                                      \
 		struct cgroup_namespace *:       &(__ns)->ns, \
@@ -281,54 +287,25 @@ static __always_inline __must_check int __ns_ref_read(const struct ns_common *ns
 #define ns_ref_active_read(__ns) \
 	((__ns) ? __ns_ref_active_read(to_ns_common(__ns)) : 0)
 
-void __ns_ref_active_get_owner(struct ns_common *ns);
-
-static __always_inline void __ns_ref_active_get(struct ns_common *ns)
-{
-	WARN_ON_ONCE(atomic_add_negative(1, &ns->__ns_ref_active));
-	VFS_WARN_ON_ONCE(is_initial_namespace(ns) && __ns_ref_active_read(ns) <= 0);
-}
-#define ns_ref_active_get(__ns) \
-	do { if (__ns) __ns_ref_active_get(to_ns_common(__ns)); } while (0)
-
-static __always_inline bool __ns_ref_active_get_not_zero(struct ns_common *ns)
-{
-	if (atomic_inc_not_zero(&ns->__ns_ref_active)) {
-		VFS_WARN_ON_ONCE(!__ns_ref_read(ns));
-		return true;
-	}
-	return false;
-}
-
-#define ns_ref_active_get_owner(__ns) \
-	do { if (__ns) __ns_ref_active_get_owner(to_ns_common(__ns)); } while (0)
-
-void __ns_ref_active_put_owner(struct ns_common *ns);
+void __ns_ref_active_put(struct ns_common *ns);
 
-static __always_inline void __ns_ref_active_put(struct ns_common *ns)
-{
-	if (atomic_dec_and_test(&ns->__ns_ref_active)) {
-		VFS_WARN_ON_ONCE(is_initial_namespace(ns));
-		VFS_WARN_ON_ONCE(!__ns_ref_read(ns));
-		__ns_ref_active_put_owner(ns);
-	}
-}
 #define ns_ref_active_put(__ns) \
 	do { if (__ns) __ns_ref_active_put(to_ns_common(__ns)); } while (0)
 
 static __always_inline struct ns_common *__must_check ns_get_unless_inactive(struct ns_common *ns)
 {
-	VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) && !__ns_ref_read(ns));
-	if (!__ns_ref_active_read(ns))
+	if (!__ns_ref_active_read(ns)) {
+		VFS_WARN_ON_ONCE(is_ns_init_id(ns));
 		return NULL;
+	}
 	if (!__ns_ref_get(ns))
 		return NULL;
 	return ns;
 }
 
-void __ns_ref_active_resurrect(struct ns_common *ns);
+void __ns_ref_active_get(struct ns_common *ns);
 
-#define ns_ref_active_resurrect(__ns) \
-	do { if (__ns) __ns_ref_active_resurrect(to_ns_common(__ns)); } while (0)
+#define ns_ref_active_get(__ns) \
+	do { if (__ns) __ns_ref_active_get(to_ns_common(__ns)); } while (0)
 
 #endif
diff --git a/kernel/nscommon.c b/kernel/nscommon.c
index 6fe1c747fa46..c910b979e433 100644
--- a/kernel/nscommon.c
+++ b/kernel/nscommon.c
@@ -54,7 +54,7 @@ static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops)
 
 int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum)
 {
-	int ret;
+	int ret = 0;
 
 	refcount_set(&ns->__ns_ref, 1);
 	ns->stashed = NULL;
@@ -74,11 +74,10 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope
 	ns_debug(ns, ops);
 #endif
 
-	if (inum) {
+	if (inum)
 		ns->inum = inum;
-		return 0;
-	}
-	ret = proc_alloc_inum(&ns->inum);
+	else
+		ret = proc_alloc_inum(&ns->inum);
 	if (ret)
 		return ret;
 	/*
@@ -115,13 +114,6 @@ struct ns_common *__must_check ns_owner(struct ns_common *ns)
 	return to_ns_common(owner);
 }
 
-void __ns_ref_active_get_owner(struct ns_common *ns)
-{
-	ns = ns_owner(ns);
-	if (ns)
-		WARN_ON_ONCE(atomic_add_negative(1, &ns->__ns_ref_active));
-}
-
 /*
  * The active reference count works by having each namespace that gets
  * created take a single active reference on its owning user namespace.
@@ -172,14 +164,29 @@ void __ns_ref_active_get_owner(struct ns_common *ns)
  * The iteration stops once we reach a namespace that still has active
  * references.
  */
-void __ns_ref_active_put_owner(struct ns_common *ns)
+void __ns_ref_active_put(struct ns_common *ns)
 {
+	/* Initial namespaces are always active. */
+	if (is_ns_init_id(ns))
+		return;
+
+	if (!atomic_dec_and_test(&ns->__ns_ref_active)) {
+		VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0);
+		return;
+	}
+
+	VFS_WARN_ON_ONCE(is_ns_init_id(ns));
+	VFS_WARN_ON_ONCE(!__ns_ref_read(ns));
+
 	for (;;) {
 		ns = ns_owner(ns);
 		if (!ns)
 			return;
-		if (!atomic_dec_and_test(&ns->__ns_ref_active))
+		VFS_WARN_ON_ONCE(is_ns_init_id(ns));
+		if (!atomic_dec_and_test(&ns->__ns_ref_active)) {
+			VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0);
 			return;
+		}
 	}
 }
 
@@ -275,10 +282,18 @@ void __ns_ref_active_put_owner(struct ns_common *ns)
  * it also needs to take another reference on its owning user namespace
  * and so on.
  */
-void __ns_ref_active_resurrect(struct ns_common *ns)
+void __ns_ref_active_get(struct ns_common *ns)
 {
+	int prev;
+
+	/* Initial namespaces are always active. */
+	if (is_ns_init_id(ns))
+		return;
+
 	/* If we didn't resurrect the namespace we're done. */
-	if (atomic_fetch_add(1, &ns->__ns_ref_active))
+	prev = atomic_fetch_add(1, &ns->__ns_ref_active);
+	VFS_WARN_ON_ONCE(prev < 0);
+	if (likely(prev))
 		return;
 
 	/*
@@ -290,7 +305,10 @@ void __ns_ref_active_resurrect(struct ns_common *ns)
 		if (!ns)
 			return;
 
-		if (atomic_fetch_add(1, &ns->__ns_ref_active))
+		VFS_WARN_ON_ONCE(is_ns_init_id(ns));
+		prev = atomic_fetch_add(1, &ns->__ns_ref_active);
+		VFS_WARN_ON_ONCE(prev < 0);
+		if (likely(prev))
 			return;
 	}
 }
diff --git a/kernel/nstree.c b/kernel/nstree.c
index 4a8838683b6b..97404fb90749 100644
--- a/kernel/nstree.c
+++ b/kernel/nstree.c
@@ -173,14 +173,6 @@ void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree)
 	write_sequnlock(&ns_tree_lock);
 
 	VFS_WARN_ON_ONCE(node);
-
-	/*
-	 * Take an active reference on the owner namespace. This ensures
-	 * that the owner remains visible while any of its child namespaces
-	 * are active. For init namespaces this is a no-op as ns_owner()
-	 * returns NULL for namespaces owned by init_user_ns.
-	 */
-	__ns_ref_active_get_owner(ns);
 }
 
 void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree)
@@ -505,13 +497,13 @@ static inline bool __must_check may_list_ns(const struct klistns *kls,
 	return false;
 }
 
-static void __ns_put(struct ns_common *ns)
+static inline void ns_put(struct ns_common *ns)
 {
-	if (ns->ops)
+	if (ns && ns->ops)
 		ns->ops->put(ns);
 }
 
-DEFINE_FREE(ns_put, struct ns_common *, if (!IS_ERR_OR_NULL(_T)) __ns_put(_T))
+DEFINE_FREE(ns_put, struct ns_common *, if (!IS_ERR_OR_NULL(_T)) ns_put(_T))
 
 static inline struct ns_common *__must_check legitimize_ns(const struct klistns *kls,
 							   struct ns_common *candidate)
@@ -535,7 +527,7 @@ static ssize_t do_listns_userns(struct klistns *kls)
 {
 	u64 __user *ns_ids = kls->uns_ids;
 	size_t nr_ns_ids = kls->nr_ns_ids;
-	struct ns_common *ns = NULL, *first_ns = NULL;
+	struct ns_common *ns = NULL, *first_ns = NULL, *prev = NULL;
 	const struct list_head *head;
 	ssize_t ret;
 
@@ -568,9 +560,10 @@ static ssize_t do_listns_userns(struct klistns *kls)
 
 	if (!first_ns)
 		first_ns = list_entry_rcu(head->next, typeof(*ns), ns_owner_entry);
+
 	for (ns = first_ns; &ns->ns_owner_entry != head && nr_ns_ids;
 	     ns = list_entry_rcu(ns->ns_owner_entry.next, typeof(*ns), ns_owner_entry)) {
-		struct ns_common *valid __free(ns_put);
+		struct ns_common *valid;
 
 		valid = legitimize_ns(kls, ns);
 		if (!valid)
@@ -578,8 +571,14 @@ static ssize_t do_listns_userns(struct klistns *kls)
 
 		rcu_read_unlock();
 
-		if (put_user(valid->ns_id, ns_ids + ret))
-			return -EINVAL;
+		ns_put(prev);
+		prev = valid;
+
+		if (put_user(valid->ns_id, ns_ids + ret)) {
+			ns_put(prev);
+			return -EFAULT;
+		}
+
 		nr_ns_ids--;
 		ret++;
 
@@ -587,6 +586,7 @@ static ssize_t do_listns_userns(struct klistns *kls)
 	}
 
 	rcu_read_unlock();
+	ns_put(prev);
 	return ret;
 }
 
@@ -668,7 +668,7 @@ static ssize_t do_listns(struct klistns *kls)
 {
 	u64 __user *ns_ids = kls->uns_ids;
 	size_t nr_ns_ids = kls->nr_ns_ids;
-	struct ns_common *ns, *first_ns = NULL;
+	struct ns_common *ns, *first_ns = NULL, *prev = NULL;
 	struct ns_tree *ns_tree = NULL;
 	const struct list_head *head;
 	u32 ns_type;
@@ -705,7 +705,7 @@ static ssize_t do_listns(struct klistns *kls)
 
 	for (ns = first_ns; !ns_common_is_head(ns, head, ns_tree) && nr_ns_ids;
 	     ns = next_ns_common(ns, ns_tree)) {
-		struct ns_common *valid __free(ns_put);
+		struct ns_common *valid;
 
 		valid = legitimize_ns(kls, ns);
 		if (!valid)
@@ -713,8 +713,13 @@ static ssize_t do_listns(struct klistns *kls)
 
 		rcu_read_unlock();
 
-		if (put_user(valid->ns_id, ns_ids + ret))
-			return -EINVAL;
+		ns_put(prev);
+		prev = valid;
+
+		if (put_user(valid->ns_id, ns_ids + ret)) {
+			ns_put(prev);
+			return -EFAULT;
+		}
 
 		nr_ns_ids--;
 		ret++;
@@ -723,6 +728,7 @@ static ssize_t do_listns(struct klistns *kls)
 	}
 
 	rcu_read_unlock();
+	ns_put(prev);
 	return ret;
 }
 
diff --git a/tools/testing/selftests/namespaces/.gitignore b/tools/testing/selftests/namespaces/.gitignore
index f4d2209ca4e4..0989e80da457 100644
--- a/tools/testing/selftests/namespaces/.gitignore
+++ b/tools/testing/selftests/namespaces/.gitignore
@@ -4,7 +4,9 @@ init_ino_test
 ns_active_ref_test
 listns_test
 listns_permissions_test
+listns_efault_test
 siocgskns_test
 cred_change_test
 stress_test
 listns_pagination_bug
+regression_pidfd_setns_test
diff --git a/tools/testing/selftests/namespaces/Makefile b/tools/testing/selftests/namespaces/Makefile
index 01569e0abbdb..fbb821652c17 100644
--- a/tools/testing/selftests/namespaces/Makefile
+++ b/tools/testing/selftests/namespaces/Makefile
@@ -8,18 +8,22 @@ TEST_GEN_PROGS := nsid_test \
 		  ns_active_ref_test \
 		  listns_test \
 		  listns_permissions_test \
+		  listns_efault_test \
 		  siocgskns_test \
 		  cred_change_test \
 		  stress_test \
-		  listns_pagination_bug
+		  listns_pagination_bug \
+		  regression_pidfd_setns_test
 
 include ../lib.mk
 
 $(OUTPUT)/ns_active_ref_test: ../filesystems/utils.c
 $(OUTPUT)/listns_test: ../filesystems/utils.c
 $(OUTPUT)/listns_permissions_test: ../filesystems/utils.c
+$(OUTPUT)/listns_efault_test: ../filesystems/utils.c
 $(OUTPUT)/siocgskns_test: ../filesystems/utils.c
 $(OUTPUT)/cred_change_test: ../filesystems/utils.c
 $(OUTPUT)/stress_test: ../filesystems/utils.c
 $(OUTPUT)/listns_pagination_bug: ../filesystems/utils.c
+$(OUTPUT)/regression_pidfd_setns_test: ../filesystems/utils.c
 
diff --git a/tools/testing/selftests/namespaces/listns_efault_test.c b/tools/testing/selftests/namespaces/listns_efault_test.c
new file mode 100644
index 000000000000..c7ed4023d7a8
--- /dev/null
+++ b/tools/testing/selftests/namespaces/listns_efault_test.c
@@ -0,0 +1,530 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <linux/nsfs.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/mount.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include "../kselftest_harness.h"
+#include "../filesystems/utils.h"
+#include "../pidfd/pidfd.h"
+#include "wrappers.h"
+
+/*
+ * Test listns() error handling with invalid buffer addresses.
+ *
+ * When the buffer pointer is invalid (e.g., crossing page boundaries
+ * into unmapped memory), listns() returns EFAULT.
+ *
+ * This test also creates mount namespaces that get destroyed during
+ * iteration, testing that namespace cleanup happens outside the RCU
+ * read lock.
+ */
+TEST(listns_partial_fault_with_ns_cleanup)
+{
+	void *map;
+	__u64 *ns_ids;
+	ssize_t ret;
+	long page_size;
+	pid_t pid, iter_pid;
+	int pidfds[5];
+	int sv[5][2];
+	int iter_pidfd;
+	int i, status;
+	char c;
+
+	page_size = sysconf(_SC_PAGESIZE);
+	ASSERT_GT(page_size, 0);
+
+	/*
+	 * Map two pages:
+	 * - First page: readable and writable
+	 * - Second page: will be unmapped to trigger EFAULT
+	 */
+	map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
+		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	ASSERT_NE(map, MAP_FAILED);
+
+	/* Unmap the second page */
+	ret = munmap((char *)map + page_size, page_size);
+	ASSERT_EQ(ret, 0);
+
+	/*
+	 * Position the buffer pointer so there's room for exactly one u64
+	 * before the page boundary. The second u64 would fall into the
+	 * unmapped page.
+	 */
+	ns_ids = ((__u64 *)((char *)map + page_size)) - 1;
+
+	/*
+	 * Create a separate process to run listns() in a loop concurrently
+	 * with namespace creation and destruction.
+	 */
+	iter_pid = create_child(&iter_pidfd, 0);
+	ASSERT_NE(iter_pid, -1);
+
+	if (iter_pid == 0) {
+		struct ns_id_req req = {
+			.size = sizeof(req),
+			.spare = 0,
+			.ns_id = 0,
+			.ns_type = 0,  /* All types */
+			.spare2 = 0,
+			.user_ns_id = 0,  /* Global listing */
+		};
+		int iter_ret;
+
+		/*
+		 * Loop calling listns() until killed.
+		 * The kernel should:
+		 * 1. Successfully write the first namespace ID (within valid page)
+		 * 2. Fail with EFAULT when trying to write the second ID (unmapped page)
+		 * 3. Handle concurrent namespace destruction without deadlock
+		 */
+		while (1) {
+			iter_ret = sys_listns(&req, ns_ids, 2, 0);
+
+			if (iter_ret == -1 && errno == ENOSYS)
+				_exit(PIDFD_SKIP);
+		}
+	}
+
+	/* Small delay to let iterator start looping */
+	usleep(50000);
+
+	/*
+	 * Create several child processes, each in its own mount namespace.
+	 * These will be destroyed while the iterator is running listns().
+	 */
+	for (i = 0; i < 5; i++) {
+		/* Create socketpair for synchronization */
+		ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0);
+
+		pid = create_child(&pidfds[i], CLONE_NEWNS);
+		ASSERT_NE(pid, -1);
+
+		if (pid == 0) {
+			close(sv[i][0]); /* Close parent end */
+
+			if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
+				_exit(1);
+
+			/* Child: create a couple of tmpfs mounts */
+			if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST)
+				_exit(1);
+			if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST)
+				_exit(1);
+
+			if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1)
+				_exit(1);
+			if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1)
+				_exit(1);
+
+			/* Signal parent that setup is complete */
+			if (write_nointr(sv[i][1], "R", 1) != 1)
+				_exit(1);
+
+			/* Wait for parent to signal us to exit */
+			if (read_nointr(sv[i][1], &c, 1) != 1)
+				_exit(1);
+
+			close(sv[i][1]);
+			_exit(0);
+		}
+
+		close(sv[i][1]); /* Close child end */
+	}
+
+	/* Wait for all children to finish setup */
+	for (i = 0; i < 5; i++) {
+		ret = read_nointr(sv[i][0], &c, 1);
+		ASSERT_EQ(ret, 1);
+		ASSERT_EQ(c, 'R');
+	}
+
+	/*
+	 * Signal children to exit. This will destroy their mount namespaces
+	 * while listns() is iterating the namespace tree.
+	 * This tests that cleanup happens outside the RCU read lock.
+	 */
+	for (i = 0; i < 5; i++)
+		write_nointr(sv[i][0], "X", 1);
+
+	/* Wait for all mount namespace children to exit and cleanup */
+	for (i = 0; i < 5; i++) {
+		waitpid(-1, NULL, 0);
+		close(sv[i][0]);
+		close(pidfds[i]);
+	}
+
+	/* Kill iterator and wait for it */
+	sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0);
+	ret = waitpid(iter_pid, &status, 0);
+	ASSERT_EQ(ret, iter_pid);
+	close(iter_pidfd);
+
+	/* Should have been killed */
+	ASSERT_TRUE(WIFSIGNALED(status));
+	ASSERT_EQ(WTERMSIG(status), SIGKILL);
+
+	/* Clean up */
+	munmap(map, page_size);
+}
+
+/*
+ * Test listns() error handling when the entire buffer is invalid.
+ * This is a sanity check that basic invalid pointer detection works.
+ */
+TEST(listns_complete_fault)
+{
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = 0,
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+	__u64 *ns_ids;
+	ssize_t ret;
+
+	/* Use a clearly invalid pointer */
+	ns_ids = (__u64 *)0xdeadbeef;
+
+	ret = sys_listns(&req, ns_ids, 10, 0);
+
+	if (ret == -1 && errno == ENOSYS)
+		SKIP(return, "listns() not supported");
+
+	/* Should fail with EFAULT */
+	ASSERT_EQ(ret, -1);
+	ASSERT_EQ(errno, EFAULT);
+}
+
+/*
+ * Test listns() error handling when the buffer is NULL.
+ */
+TEST(listns_null_buffer)
+{
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = 0,
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+	ssize_t ret;
+
+	/* NULL buffer with non-zero count should fail */
+	ret = sys_listns(&req, NULL, 10, 0);
+
+	if (ret == -1 && errno == ENOSYS)
+		SKIP(return, "listns() not supported");
+
+	/* Should fail with EFAULT */
+	ASSERT_EQ(ret, -1);
+	ASSERT_EQ(errno, EFAULT);
+}
+
+/*
+ * Test listns() with a buffer that becomes invalid mid-iteration
+ * (after several successful writes), combined with mount namespace
+ * destruction to test RCU cleanup logic.
+ */
+TEST(listns_late_fault_with_ns_cleanup)
+{
+	void *map;
+	__u64 *ns_ids;
+	ssize_t ret;
+	long page_size;
+	pid_t pid, iter_pid;
+	int pidfds[10];
+	int sv[10][2];
+	int iter_pidfd;
+	int i, status;
+	char c;
+
+	page_size = sysconf(_SC_PAGESIZE);
+	ASSERT_GT(page_size, 0);
+
+	/* Map two pages */
+	map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
+		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	ASSERT_NE(map, MAP_FAILED);
+
+	/* Unmap the second page */
+	ret = munmap((char *)map + page_size, page_size);
+	ASSERT_EQ(ret, 0);
+
+	/*
+	 * Position buffer so we can write several u64s successfully
+	 * before hitting the page boundary.
+	 */
+	ns_ids = ((__u64 *)((char *)map + page_size)) - 5;
+
+	/*
+	 * Create a separate process to run listns() concurrently.
+	 */
+	iter_pid = create_child(&iter_pidfd, 0);
+	ASSERT_NE(iter_pid, -1);
+
+	if (iter_pid == 0) {
+		struct ns_id_req req = {
+			.size = sizeof(req),
+			.spare = 0,
+			.ns_id = 0,
+			.ns_type = 0,
+			.spare2 = 0,
+			.user_ns_id = 0,
+		};
+		int iter_ret;
+
+		/*
+		 * Loop calling listns() until killed.
+		 * Request 10 namespace IDs while namespaces are being destroyed.
+		 * This tests:
+		 * 1. EFAULT handling when buffer becomes invalid
+		 * 2. Namespace cleanup outside RCU read lock during iteration
+		 */
+		while (1) {
+			iter_ret = sys_listns(&req, ns_ids, 10, 0);
+
+			if (iter_ret == -1 && errno == ENOSYS)
+				_exit(PIDFD_SKIP);
+		}
+	}
+
+	/* Small delay to let iterator start looping */
+	usleep(50000);
+
+	/*
+	 * Create more children with mount namespaces to increase the
+	 * likelihood that namespace cleanup happens during iteration.
+	 */
+	for (i = 0; i < 10; i++) {
+		/* Create socketpair for synchronization */
+		ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0);
+
+		pid = create_child(&pidfds[i], CLONE_NEWNS);
+		ASSERT_NE(pid, -1);
+
+		if (pid == 0) {
+			close(sv[i][0]); /* Close parent end */
+
+			if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
+				_exit(1);
+
+			/* Child: create tmpfs mounts */
+			if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST)
+				_exit(1);
+			if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST)
+				_exit(1);
+
+			if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1)
+				_exit(1);
+			if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1)
+				_exit(1);
+
+			/* Signal parent that setup is complete */
+			if (write_nointr(sv[i][1], "R", 1) != 1)
+				_exit(1);
+
+			/* Wait for parent to signal us to exit */
+			if (read_nointr(sv[i][1], &c, 1) != 1)
+				_exit(1);
+
+			close(sv[i][1]);
+			_exit(0);
+		}
+
+		close(sv[i][1]); /* Close child end */
+	}
+
+	/* Wait for all children to finish setup */
+	for (i = 0; i < 10; i++) {
+		ret = read_nointr(sv[i][0], &c, 1);
+		ASSERT_EQ(ret, 1);
+		ASSERT_EQ(c, 'R');
+	}
+
+	/* Kill half the children */
+	for (i = 0; i < 5; i++)
+		write_nointr(sv[i][0], "X", 1);
+
+	/* Small delay to let some exit */
+	usleep(10000);
+
+	/* Kill remaining children */
+	for (i = 5; i < 10; i++)
+		write_nointr(sv[i][0], "X", 1);
+
+	/* Wait for all children and cleanup */
+	for (i = 0; i < 10; i++) {
+		waitpid(-1, NULL, 0);
+		close(sv[i][0]);
+		close(pidfds[i]);
+	}
+
+	/* Kill iterator and wait for it */
+	sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0);
+	ret = waitpid(iter_pid, &status, 0);
+	ASSERT_EQ(ret, iter_pid);
+	close(iter_pidfd);
+
+	/* Should have been killed */
+	ASSERT_TRUE(WIFSIGNALED(status));
+	ASSERT_EQ(WTERMSIG(status), SIGKILL);
+
+	/* Clean up */
+	munmap(map, page_size);
+}
+
+/*
+ * Test specifically focused on mount namespace cleanup during EFAULT.
+ * Filter for mount namespaces only.
+ */
+TEST(listns_mnt_ns_cleanup_on_fault)
+{
+	void *map;
+	__u64 *ns_ids;
+	ssize_t ret;
+	long page_size;
+	pid_t pid, iter_pid;
+	int pidfds[8];
+	int sv[8][2];
+	int iter_pidfd;
+	int i, status;
+	char c;
+
+	page_size = sysconf(_SC_PAGESIZE);
+	ASSERT_GT(page_size, 0);
+
+	/* Set up partial fault buffer */
+	map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
+		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	ASSERT_NE(map, MAP_FAILED);
+
+	ret = munmap((char *)map + page_size, page_size);
+	ASSERT_EQ(ret, 0);
+
+	/* Position for 3 successful writes, then fault */
+	ns_ids = ((__u64 *)((char *)map + page_size)) - 3;
+
+	/*
+	 * Create a separate process to run listns() concurrently.
+	 */
+	iter_pid = create_child(&iter_pidfd, 0);
+	ASSERT_NE(iter_pid, -1);
+
+	if (iter_pid == 0) {
+		struct ns_id_req req = {
+			.size = sizeof(req),
+			.spare = 0,
+			.ns_id = 0,
+			.ns_type = CLONE_NEWNS,  /* Only mount namespaces */
+			.spare2 = 0,
+			.user_ns_id = 0,
+		};
+		int iter_ret;
+
+		/*
+		 * Loop calling listns() until killed.
+		 * Call listns() to race with namespace destruction.
+		 */
+		while (1) {
+			iter_ret = sys_listns(&req, ns_ids, 10, 0);
+
+			if (iter_ret == -1 && errno == ENOSYS)
+				_exit(PIDFD_SKIP);
+		}
+	}
+
+	/* Small delay to let iterator start looping */
+	usleep(50000);
+
+	/* Create children with mount namespaces */
+	for (i = 0; i < 8; i++) {
+		/* Create socketpair for synchronization */
+		ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0);
+
+		pid = create_child(&pidfds[i], CLONE_NEWNS);
+		ASSERT_NE(pid, -1);
+
+		if (pid == 0) {
+			close(sv[i][0]); /* Close parent end */
+
+			if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
+				_exit(1);
+
+			/* Do some mount operations to make cleanup more interesting */
+			if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST)
+				_exit(1);
+			if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST)
+				_exit(1);
+
+			if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1)
+				_exit(1);
+			if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1)
+				_exit(1);
+
+			/* Signal parent that setup is complete */
+			if (write_nointr(sv[i][1], "R", 1) != 1)
+				_exit(1);
+
+			/* Wait for parent to signal us to exit */
+			if (read_nointr(sv[i][1], &c, 1) != 1)
+				_exit(1);
+
+			close(sv[i][1]);
+			_exit(0);
+		}
+
+		close(sv[i][1]); /* Close child end */
+	}
+
+	/* Wait for all children to finish setup */
+	for (i = 0; i < 8; i++) {
+		ret = read_nointr(sv[i][0], &c, 1);
+		ASSERT_EQ(ret, 1);
+		ASSERT_EQ(c, 'R');
+	}
+
+	/* Kill children to trigger namespace destruction during iteration */
+	for (i = 0; i < 8; i++)
+		write_nointr(sv[i][0], "X", 1);
+
+	/* Wait for children and cleanup */
+	for (i = 0; i < 8; i++) {
+		waitpid(-1, NULL, 0);
+		close(sv[i][0]);
+		close(pidfds[i]);
+	}
+
+	/* Kill iterator and wait for it */
+	sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0);
+	ret = waitpid(iter_pid, &status, 0);
+	ASSERT_EQ(ret, iter_pid);
+	close(iter_pidfd);
+
+	/* Should have been killed */
+	ASSERT_TRUE(WIFSIGNALED(status));
+	ASSERT_EQ(WTERMSIG(status), SIGKILL);
+
+	munmap(map, page_size);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/regression_pidfd_setns_test.c b/tools/testing/selftests/namespaces/regression_pidfd_setns_test.c
new file mode 100644
index 000000000000..753fd29dffd8
--- /dev/null
+++ b/tools/testing/selftests/namespaces/regression_pidfd_setns_test.c
@@ -0,0 +1,113 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <unistd.h>
+#include "../pidfd/pidfd.h"
+#include "../kselftest_harness.h"
+
+/*
+ * Regression tests for the setns(pidfd) active reference counting bug.
+ *
+ * These tests are based on the reproducers that triggered the race condition
+ * fixed by commit 1c465d0518dc ("ns: handle setns(pidfd, ...) cleanly").
+ *
+ * The bug: When using setns() with a pidfd, if the target task exits between
+ * prepare_nsset() and commit_nsset(), the namespaces would become inactive.
+ * Then ns_ref_active_get() would increment from 0 without properly resurrecting
+ * the owner chain, causing active reference count underflows.
+ */
+
+/*
+ * Simple pidfd setns test using create_child()+unshare().
+ *
+ * Without the fix, this would trigger active refcount warnings when the
+ * parent exits after doing setns(pidfd) on a child that has already exited.
+ */
+TEST(simple_pidfd_setns)
+{
+	pid_t child_pid;
+	int pidfd = -1;
+	int ret;
+	int sv[2];
+	char c;
+
+	/* Ignore SIGCHLD for autoreap */
+	ASSERT_NE(signal(SIGCHLD, SIG_IGN), SIG_ERR);
+
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0);
+
+	/* Create a child process without namespaces initially */
+	child_pid = create_child(&pidfd, 0);
+	ASSERT_GE(child_pid, 0);
+
+	if (child_pid == 0) {
+		close(sv[0]);
+
+		if (unshare(CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWNET | CLONE_NEWUSER) < 0) {
+			close(sv[1]);
+			_exit(1);
+		}
+
+		/* Signal parent that namespaces are ready */
+		if (write_nointr(sv[1], "1", 1) < 0) {
+			close(sv[1]);
+			_exit(1);
+		}
+
+		close(sv[1]);
+		_exit(0);
+	}
+	ASSERT_GE(pidfd, 0);
+	EXPECT_EQ(close(sv[1]), 0);
+
+	ret = read_nointr(sv[0], &c, 1);
+	ASSERT_EQ(ret, 1);
+	EXPECT_EQ(close(sv[0]), 0);
+
+	/* Set to child's namespaces via pidfd */
+	ret = setns(pidfd, CLONE_NEWUTS | CLONE_NEWIPC);
+	TH_LOG("setns() returned %d", ret);
+	close(pidfd);
+}
+
+/*
+ * Simple pidfd setns test using create_child().
+ *
+ * This variation uses create_child() with namespace flags directly.
+ * Namespaces are created immediately at clone time.
+ */
+TEST(simple_pidfd_setns_clone)
+{
+	pid_t child_pid;
+	int pidfd = -1;
+	int ret;
+
+	/* Ignore SIGCHLD for autoreap */
+	ASSERT_NE(signal(SIGCHLD, SIG_IGN), SIG_ERR);
+
+	/* Create a child process with new namespaces using create_child() */
+	child_pid = create_child(&pidfd, CLONE_NEWUSER | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWNET);
+	ASSERT_GE(child_pid, 0);
+
+	if (child_pid == 0) {
+		/* Child: sleep for a while so parent can setns to us */
+		sleep(2);
+		_exit(0);
+	}
+
+	/* Parent: pidfd was already created by create_child() */
+	ASSERT_GE(pidfd, 0);
+
+	/* Set to child's namespaces via pidfd */
+	ret = setns(pidfd, CLONE_NEWUTS | CLONE_NEWIPC);
+	close(pidfd);
+	TH_LOG("setns() returned %d", ret);
+}
+
+TEST_HARNESS_MAIN
author	Christian Brauner <brauner@kernel.org>	2025-11-10 12:21:00 +0300
committer	Christian Brauner <brauner@kernel.org>	2025-11-10 17:54:02 +0300
commit	ae901e5e2e9b079761d26a366e0c80530d8aad22 (patch)
tree	65f2dcf3975dd9d7ba1d412fa3f116b5fce9e534
parent	8ebfb9896c97ab609222460e705f425cb3f0aad0 (diff)
parent	07d7ad46dad48a81ffc796fb7875b1ec141c8b48 (diff)
download	linux-ae901e5e2e9b079761d26a366e0c80530d8aad22.tar.xz