From 912289ee831d92b377309dc832c0c3ce5906deae Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sun, 26 Apr 2026 00:08:42 +0200
Subject: proc: allow to mark /proc files permanent outside of fs/proc/

Add proc_make_permanent() function to mark PDE as permanent to speed up
open/read/close (one alloc/free and lock/unlock less).

Enable it for built-in code and for compiled-in modules.
This function becomes nop magically in modular code.

		Note, note, note!

If built-in code creates and deletes PDEs dynamically (not in init
hook), then proc_make_permanent() must not be used.

It is intended for simple code:

	static int __init xxx_module_init(void)
	{
		g_pde = proc_create_single();
		proc_make_permanent(g_pde);
		return 0;
	}
	static void __exit xxx_module_exit(void)
	{
		remove_proc_entry(g_pde);
	}

If module is built-in then exit hook never executed and PDE is
permanent so it is OK to mark it as such.

If module is module then rmmod will yank PDE, but proc_make_permanent()
is nop and core /proc code will do everything right.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Link: https://patch.msgid.link/20260425220844.1763933-2-mjguzik@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/proc_fs.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include')

diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 19d1c5e5f335..d2860c18dca9 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -248,4 +248,16 @@ static inline struct pid_namespace *proc_pid_ns(struct super_block *sb)
 
 bool proc_ns_file(const struct file *file);
 
+#if defined CONFIG_PROC_FS && !defined MODULE
+void impl_proc_make_permanent(struct proc_dir_entry *pde);
+#endif
+
+static inline void proc_make_permanent(struct proc_dir_entry *pde)
+{
+	/* Don't give matches to modules. */
+#if defined CONFIG_PROC_FS && !defined MODULE
+	impl_proc_make_permanent(pde);
+#endif
+}
+
 #endif /* _LINUX_PROC_FS_H */
-- 
cgit v1.2.3


From dc651e25a6d289bc08cfde032427fa17b3d1b22e Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Sun, 26 Apr 2026 00:08:43 +0200
Subject: fs: RCU-ify filesystems list

The drivers list was protected by an rwlock; every mount, every open
of /proc/filesystems and the legacy sysfs(2) syscall walked a
hand-rolled singly-linked list under it. /proc/filesystems is
especially hot because libselinux causes programs as mundane as
mkdir, ls and sed to open and read it on every invocation.

Convert the list to an RCU-protected hlist and switch the writer side
to a plain spinlock. Writers keep their existing non-sleeping
section while readers walk under rcu_read_lock() with no lock traffic:

  - register_filesystem()/unregister_filesystem() take
    file_systems_lock, publish via hlist_{add_tail,del_init}_rcu()
    and invalidate the cached /proc/filesystems string.
    unregister_filesystem() keeps its synchronize_rcu() after
    dropping the lock so in-flight readers are drained before the
    module (and its embedded file_system_type) can go away.

  - __get_fs_type(), list_bdev_fs_names() and the
    fs_index()/fs_name()/fs_maxindex() helpers walk the list under
    rcu_read_lock(). fs_name() continues to drop the read-side
    lock after try_module_get() and accesses ->name outside the RCU
    section; the module reference pins the embedded file_system_type
    across the boundary.

struct file_system_type::next becomes struct hlist_node list; no
in-tree caller references the old ->next field outside
fs/filesystems.c.

Link: https://patch.msgid.link/20260425220844.1763933-3-mjguzik@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/filesystems.c   | 179 ++++++++++++++++++++++-------------------------------
 fs/ocfs2/super.c   |   1 -
 include/linux/fs.h |   2 +-
 3 files changed, 75 insertions(+), 107 deletions(-)

(limited to 'include')

diff --git a/fs/filesystems.c b/fs/filesystems.c
index 0c7d2b7ac26c..7976366d4197 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -17,22 +17,19 @@
 #include <linux/slab.h>
 #include <linux/uaccess.h>
 #include <linux/fs_parser.h>
+#include <linux/rculist.h>
 
 /*
- * Handling of filesystem drivers list.
- * Rules:
- *	Inclusion to/removals from/scanning of list are protected by spinlock.
- *	During the unload module must call unregister_filesystem().
- *	We can access the fields of list element if:
- *		1) spinlock is held or
- *		2) we hold the reference to the module.
- *	The latter can be guaranteed by call of try_module_get(); if it
- *	returned 0 we must skip the element, otherwise we got the reference.
- *	Once the reference is obtained we can drop the spinlock.
+ * Read-mostly filesystem drivers list.
+ *
+ * Readers walk under rcu_read_lock(); writers take file_systems_lock
+ * and publish via _rcu hlist primitives.  unregister_filesystem()
+ * synchronize_rcu()s after unlock so the embedded file_system_type
+ * can't go away under a reader.  To keep using a filesystem after
+ * the RCU section ends, take a module reference via try_module_get().
  */
-
-static struct file_system_type *file_systems;
-static DEFINE_RWLOCK(file_systems_lock);
+static HLIST_HEAD(file_systems);
+static DEFINE_SPINLOCK(file_systems_lock);
 
 /* WARNING: This can be used only if we _already_ own a reference */
 struct file_system_type *get_filesystem(struct file_system_type *fs)
@@ -46,14 +43,15 @@ void put_filesystem(struct file_system_type *fs)
 	module_put(fs->owner);
 }
 
-static struct file_system_type **find_filesystem(const char *name, unsigned len)
+static struct file_system_type *find_filesystem(const char *name, unsigned len)
 {
-	struct file_system_type **p;
-	for (p = &file_systems; *p; p = &(*p)->next)
-		if (strncmp((*p)->name, name, len) == 0 &&
-		    !(*p)->name[len])
-			break;
-	return p;
+	struct file_system_type *fs;
+
+	hlist_for_each_entry_rcu(fs, &file_systems, list,
+				 lockdep_is_held(&file_systems_lock))
+		if (strncmp(fs->name, name, len) == 0 && !fs->name[len])
+			return fs;
+	return NULL;
 }
 
 /**
@@ -64,33 +62,26 @@ static struct file_system_type **find_filesystem(const char *name, unsigned len)
  *	is aware of for mount and other syscalls. Returns 0 on success,
  *	or a negative errno code on an error.
  *
- *	The &struct file_system_type that is passed is linked into the kernel 
+ *	The &struct file_system_type that is passed is linked into the kernel
  *	structures and must not be freed until the file system has been
  *	unregistered.
  */
- 
-int register_filesystem(struct file_system_type * fs)
+int register_filesystem(struct file_system_type *fs)
 {
-	int res = 0;
-	struct file_system_type ** p;
-
 	if (fs->parameters &&
 	    !fs_validate_description(fs->name, fs->parameters))
 		return -EINVAL;
 
 	BUG_ON(strchr(fs->name, '.'));
-	if (fs->next)
+	if (!hlist_unhashed_lockless(&fs->list))
 		return -EBUSY;
-	write_lock(&file_systems_lock);
-	p = find_filesystem(fs->name, strlen(fs->name));
-	if (*p)
-		res = -EBUSY;
-	else
-		*p = fs;
-	write_unlock(&file_systems_lock);
-	return res;
-}
 
+	guard(spinlock)(&file_systems_lock);
+	if (find_filesystem(fs->name, strlen(fs->name)))
+		return -EBUSY;
+	hlist_add_tail_rcu(&fs->list, &file_systems);
+	return 0;
+}
 EXPORT_SYMBOL(register_filesystem);
 
 /**
@@ -100,94 +91,78 @@ EXPORT_SYMBOL(register_filesystem);
  *	Remove a file system that was previously successfully registered
  *	with the kernel. An error is returned if the file system is not found.
  *	Zero is returned on a success.
- *	
+ *
  *	Once this function has returned the &struct file_system_type structure
  *	may be freed or reused.
  */
- 
-int unregister_filesystem(struct file_system_type * fs)
+int unregister_filesystem(struct file_system_type *fs)
 {
-	struct file_system_type ** tmp;
-
-	write_lock(&file_systems_lock);
-	tmp = &file_systems;
-	while (*tmp) {
-		if (fs == *tmp) {
-			*tmp = fs->next;
-			fs->next = NULL;
-			write_unlock(&file_systems_lock);
-			synchronize_rcu();
-			return 0;
-		}
-		tmp = &(*tmp)->next;
+	scoped_guard(spinlock, &file_systems_lock) {
+		if (hlist_unhashed(&fs->list))
+			return -EINVAL;
+		hlist_del_init_rcu(&fs->list);
 	}
-	write_unlock(&file_systems_lock);
-
-	return -EINVAL;
+	synchronize_rcu();
+	return 0;
 }
-
 EXPORT_SYMBOL(unregister_filesystem);
 
 #ifdef CONFIG_SYSFS_SYSCALL
-static int fs_index(const char __user * __name)
+static int fs_index(const char __user *__name)
 {
-	struct file_system_type * tmp;
+	struct file_system_type *p;
 	char *name __free(kfree) = strndup_user(__name, PATH_MAX);
-	int err, index;
+	int index = 0;
 
 	if (IS_ERR(name))
 		return PTR_ERR(name);
 
-	err = -EINVAL;
-	read_lock(&file_systems_lock);
-	for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) {
-		if (strcmp(tmp->name, name) == 0) {
-			err = index;
-			break;
-		}
+	guard(rcu)();
+	hlist_for_each_entry_rcu(p, &file_systems, list) {
+		if (strcmp(p->name, name) == 0)
+			return index;
+		index++;
 	}
-	read_unlock(&file_systems_lock);
-	return err;
+	return -EINVAL;
 }
 
-static int fs_name(unsigned int index, char __user * buf)
+static int fs_name(unsigned int index, char __user *buf)
 {
-	struct file_system_type * tmp;
-	int len, res = -EINVAL;
-
-	read_lock(&file_systems_lock);
-	for (tmp = file_systems; tmp; tmp = tmp->next, index--) {
-		if (index == 0) {
-			if (try_module_get(tmp->owner))
-				res = 0;
+	struct file_system_type *p, *found = NULL;
+	int len, res;
+
+	scoped_guard(rcu) {
+		hlist_for_each_entry_rcu(p, &file_systems, list) {
+			if (index--)
+				continue;
+			if (try_module_get(p->owner))
+				found = p;
 			break;
 		}
 	}
-	read_unlock(&file_systems_lock);
-	if (res)
-		return res;
+	if (!found)
+		return -EINVAL;
 
 	/* OK, we got the reference, so we can safely block */
-	len = strlen(tmp->name) + 1;
-	res = copy_to_user(buf, tmp->name, len) ? -EFAULT : 0;
-	put_filesystem(tmp);
+	len = strlen(found->name) + 1;
+	res = copy_to_user(buf, found->name, len) ? -EFAULT : 0;
+	put_filesystem(found);
 	return res;
 }
 
 static int fs_maxindex(void)
 {
-	struct file_system_type * tmp;
-	int index;
+	struct file_system_type *p;
+	int index = 0;
 
-	read_lock(&file_systems_lock);
-	for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next, index++)
-		;
-	read_unlock(&file_systems_lock);
+	guard(rcu)();
+	hlist_for_each_entry_rcu(p, &file_systems, list)
+		index++;
 	return index;
 }
 
 /*
- * Whee.. Weird sysv syscall. 
+ * Whee.. Weird sysv syscall.
  */
 SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2)
 {
@@ -216,8 +191,8 @@ int __init list_bdev_fs_names(char *buf, size_t size)
 	size_t len;
 	int count = 0;
 
-	read_lock(&file_systems_lock);
-	for (p = file_systems; p; p = p->next) {
+	guard(rcu)();
+	hlist_for_each_entry_rcu(p, &file_systems, list) {
 		if (!(p->fs_flags & FS_REQUIRES_DEV))
 			continue;
 		len = strlen(p->name) + 1;
@@ -230,24 +205,20 @@ int __init list_bdev_fs_names(char *buf, size_t size)
 		size -= len;
 		count++;
 	}
-	read_unlock(&file_systems_lock);
 	return count;
 }
 
 #ifdef CONFIG_PROC_FS
 static int filesystems_proc_show(struct seq_file *m, void *v)
 {
-	struct file_system_type * tmp;
+	struct file_system_type *p;
 
-	read_lock(&file_systems_lock);
-	tmp = file_systems;
-	while (tmp) {
+	guard(rcu)();
+	hlist_for_each_entry_rcu(p, &file_systems, list) {
 		seq_printf(m, "%s\t%s\n",
-			(tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
-			tmp->name);
-		tmp = tmp->next;
+			   (p->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
+			   p->name);
 	}
-	read_unlock(&file_systems_lock);
 	return 0;
 }
 
@@ -263,11 +234,10 @@ static struct file_system_type *__get_fs_type(const char *name, int len)
 {
 	struct file_system_type *fs;
 
-	read_lock(&file_systems_lock);
-	fs = *(find_filesystem(name, len));
+	guard(rcu)();
+	fs = find_filesystem(name, len);
 	if (fs && !try_module_get(fs->owner))
 		fs = NULL;
-	read_unlock(&file_systems_lock);
 	return fs;
 }
 
@@ -291,5 +261,4 @@ struct file_system_type *get_fs_type(const char *name)
 	}
 	return fs;
 }
-
 EXPORT_SYMBOL(get_fs_type);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index b875f01c9756..4870e680c4e5 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1224,7 +1224,6 @@ static struct file_system_type ocfs2_fs_type = {
 	.name           = "ocfs2",
 	.kill_sb        = kill_block_super,
 	.fs_flags       = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE,
-	.next           = NULL,
 	.init_fs_context = ocfs2_init_fs_context,
 	.parameters	= ocfs2_param_spec,
 };
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 11559c513dfb..c37bb3c7de8b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2286,7 +2286,7 @@ struct file_system_type {
 	const struct fs_parameter_spec *parameters;
 	void (*kill_sb) (struct super_block *);
 	struct module *owner;
-	struct file_system_type * next;
+	struct hlist_node list;
 	struct hlist_head fs_supers;
 
 	struct lock_class_key s_lock_key;
-- 
cgit v1.2.3


From dc4edae7f41dceb236553b61cda0383895293c90 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 27 Apr 2026 10:26:03 +0200
Subject: fs: move SB_I_USERNS_VISIBLE to FS_USERNS_MOUNT_RESTRICTED

Whether a filesystem's mounts need to undergo a visibility check in user
namespaces is a static property of the filesystem type, not a runtime
property of each superblock instance. Both proc and sysfs always set
SB_I_USERNS_VISIBLE on their superblocks unconditionally (sysfs does so
on first creation, and subsequent mounts reuse the same superblock).

Move this flag from sb->s_iflags (SB_I_USERNS_VISIBLE) to
file_system_type->fs_flags (FS_USERNS_MOUNT_RESTRICTED) so the intent
is expressed at the filesystem type level where it belongs.

All check sites are updated to test sb->s_type->fs_flags instead of
sb->s_iflags. The SB_I_NOEXEC and SB_I_NODEV flags remain on the
superblock as they are runtime properties set during fill_super.

Link: https://patch.msgid.link/72887c5b6204dc3adf5a53104f0be6bd8bc4f6cd.1777278334.git.legion@kernel.org
Reviewed-by: Aleksa Sarai <aleksa@amutable.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namespace.c                 | 6 +++---
 fs/proc/root.c                 | 4 ++--
 fs/sysfs/mount.c               | 4 +---
 include/linux/fs.h             | 1 +
 include/linux/fs/super_types.h | 1 -
 kernel/acct.c                  | 2 +-
 6 files changed, 8 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/fs/namespace.c b/fs/namespace.c
index 047efa737ef9..e6e58117c778 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1099,7 +1099,7 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt)
 	rb_link_node(&mnt->mnt_node, parent, link);
 	rb_insert_color(&mnt->mnt_node, &ns->mounts);
 
-	if ((mnt->mnt.mnt_sb->s_iflags & SB_I_USERNS_VISIBLE) &&
+	if ((mnt->mnt.mnt_sb->s_type->fs_flags & FS_USERNS_MOUNT_RESTRICTED) &&
 	    mnt->mnt.mnt_root == mnt->mnt.mnt_sb->s_root)
 		hlist_add_head(&mnt->mnt_ns_visible, &ns->mnt_visible_mounts);
 
@@ -6408,10 +6408,10 @@ static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags
 		return false;
 
 	/* Can this filesystem be too revealing? */
-	s_iflags = sb->s_iflags;
-	if (!(s_iflags & SB_I_USERNS_VISIBLE))
+	if (!(sb->s_type->fs_flags & FS_USERNS_MOUNT_RESTRICTED))
 		return false;
 
+	s_iflags = sb->s_iflags;
 	if ((s_iflags & required_iflags) != required_iflags) {
 		WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n",
 			  required_iflags);
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 0f9100559471..b65053f9f046 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -257,7 +257,7 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc)
 	proc_apply_options(fs_info, fc, current_user_ns());
 
 	/* User space would break if executables or devices appear on proc */
-	s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV;
+	s->s_iflags |= SB_I_NOEXEC | SB_I_NODEV;
 	s->s_flags |= SB_NODIRATIME | SB_NOSUID | SB_NOEXEC;
 	s->s_blocksize = 1024;
 	s->s_blocksize_bits = 10;
@@ -359,7 +359,7 @@ static struct file_system_type proc_fs_type = {
 	.init_fs_context	= proc_init_fs_context,
 	.parameters		= proc_fs_parameters,
 	.kill_sb		= proc_kill_sb,
-	.fs_flags		= FS_USERNS_MOUNT | FS_DISALLOW_NOTIFY_PERM,
+	.fs_flags		= FS_USERNS_MOUNT | FS_USERNS_MOUNT_RESTRICTED | FS_DISALLOW_NOTIFY_PERM,
 };
 
 void __init proc_root_init(void)
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index b199e8ff79b1..b45ea5d511e7 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -32,8 +32,6 @@ static int sysfs_get_tree(struct fs_context *fc)
 	if (ret)
 		return ret;
 
-	if (kfc->new_sb_created)
-		fc->root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE;
 	return 0;
 }
 
@@ -93,7 +91,7 @@ static struct file_system_type sysfs_fs_type = {
 	.name			= "sysfs",
 	.init_fs_context	= sysfs_init_fs_context,
 	.kill_sb		= sysfs_kill_sb,
-	.fs_flags		= FS_USERNS_MOUNT,
+	.fs_flags		= FS_USERNS_MOUNT | FS_USERNS_MOUNT_RESTRICTED,
 };
 
 int __init sysfs_init(void)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c37bb3c7de8b..e7ff9f8b1485 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2281,6 +2281,7 @@ struct file_system_type {
 #define FS_MGTIME		64	/* FS uses multigrain timestamps */
 #define FS_LBS			128	/* FS supports LBS */
 #define FS_POWER_FREEZE		256	/* Always freeze on suspend/hibernate */
+#define FS_USERNS_MOUNT_RESTRICTED 512	/* Restrict mount in userns if not already visible */
 #define FS_RENAME_DOES_D_MOVE	32768	/* FS will handle d_move() during rename() internally. */
 	int (*init_fs_context)(struct fs_context *);
 	const struct fs_parameter_spec *parameters;
diff --git a/include/linux/fs/super_types.h b/include/linux/fs/super_types.h
index 383050e7fdf5..182efbeb9520 100644
--- a/include/linux/fs/super_types.h
+++ b/include/linux/fs/super_types.h
@@ -326,7 +326,6 @@ struct super_block {
 #define SB_I_STABLE_WRITES 0x00000008	/* don't modify blks until WB is done */
 
 /* sb->s_iflags to limit user namespace mounts */
-#define SB_I_USERNS_VISIBLE		0x00000010 /* fstype already mounted */
 #define SB_I_IMA_UNVERIFIABLE_SIGNATURE	0x00000020
 #define SB_I_UNTRUSTED_MOUNTER		0x00000040
 #define SB_I_EVM_HMAC_UNSUPPORTED	0x00000080
diff --git a/kernel/acct.c b/kernel/acct.c
index cbbf79d718cf..c440d43479ca 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -249,7 +249,7 @@ static int acct_on(const char __user *name)
 		return -EINVAL;
 
 	/* Exclude procfs and sysfs. */
-	if (file_inode(file)->i_sb->s_iflags & SB_I_USERNS_VISIBLE)
+	if (file_inode(file)->i_sb->s_type->fs_flags & FS_USERNS_MOUNT_RESTRICTED)
 		return -EINVAL;
 
 	if (!(file->f_mode & FMODE_CAN_WRITE))
-- 
cgit v1.2.3


From a2a5eb6323a7b1987fd8048d94b9ffc7f87e3064 Mon Sep 17 00:00:00 2001
From: Alexey Gladkov <legion@kernel.org>
Date: Mon, 27 Apr 2026 10:26:05 +0200
Subject: proc: subset=pid: Show /proc/self/net only for CAP_NET_ADMIN

Cache the mounters credentials and allow access to the net directories
contingent of the permissions of the mounter of proc.

Do not show /proc/self/net when proc is mounted with subset=pid option
and the mounter does not have CAP_NET_ADMIN. To avoid inadvertently
allowing access to /proc/<pid>/net, updating mounter credentials is not
supported.

Signed-off-by: Alexey Gladkov <legion@kernel.org>
Link: https://patch.msgid.link/d2466fe9085367f1e24693c437ecb8cff2789660.1777278334.git.legion@kernel.org
Reviewed-by: Aleksa Sarai <aleksa@amutable.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/proc/proc_net.c      | 8 ++++++++
 fs/proc/root.c          | 2 ++
 include/linux/proc_fs.h | 1 +
 3 files changed, 11 insertions(+)

(limited to 'include')

diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 184cddeb8215..00cc385bce21 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -23,6 +23,7 @@
 #include <linux/uidgid.h>
 #include <net/net_namespace.h>
 #include <linux/seq_file.h>
+#include <linux/security.h>
 
 #include "internal.h"
 
@@ -270,6 +271,7 @@ static struct net *get_proc_task_net(struct inode *dir)
 	struct task_struct *task;
 	struct nsproxy *ns;
 	struct net *net = NULL;
+	struct proc_fs_info *fs_info = proc_sb_info(dir->i_sb);
 
 	rcu_read_lock();
 	task = pid_task(proc_pid(dir), PIDTYPE_PID);
@@ -282,6 +284,12 @@ static struct net *get_proc_task_net(struct inode *dir)
 	}
 	rcu_read_unlock();
 
+	if (net && (fs_info->pidonly == PROC_PIDONLY_ON) &&
+	    security_capable(fs_info->mounter_cred, net->user_ns, CAP_NET_ADMIN, CAP_OPT_NONE) < 0) {
+		put_net(net);
+		net = NULL;
+	}
+
 	return net;
 }
 
diff --git a/fs/proc/root.c b/fs/proc/root.c
index b65053f9f046..89e5678129e4 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -254,6 +254,7 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc)
 		return -ENOMEM;
 
 	fs_info->pid_ns = get_pid_ns(ctx->pid_ns);
+	fs_info->mounter_cred = get_cred(fc->cred);
 	proc_apply_options(fs_info, fc, current_user_ns());
 
 	/* User space would break if executables or devices appear on proc */
@@ -350,6 +351,7 @@ static void proc_kill_sb(struct super_block *sb)
 	kill_anon_super(sb);
 	if (fs_info) {
 		put_pid_ns(fs_info->pid_ns);
+		put_cred(fs_info->mounter_cred);
 		kfree_rcu(fs_info, rcu);
 	}
 }
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index d2860c18dca9..47d7deaeed8f 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -67,6 +67,7 @@ enum proc_pidonly {
 struct proc_fs_info {
 	struct pid_namespace *pid_ns;
 	kgid_t pid_gid;
+	const struct cred *mounter_cred;
 	enum proc_hidepid hide_pid;
 	enum proc_pidonly pidonly;
 	struct rcu_head rcu;
-- 
cgit v1.2.3


From 05dab768fc2dc7eb9b827201bb39bb5be54bce49 Mon Sep 17 00:00:00 2001
From: Alexey Gladkov <legion@kernel.org>
Date: Mon, 27 Apr 2026 10:26:07 +0200
Subject: proc: handle subset=pid separately in userns visibility checks

When procfs is mounted with subset=pid, only the dynamic process-related
part of the filesystem remains visible. That part cannot be hidden by
overmounts, so checking whether an existing procfs mount is fully
visible does not make sense for this mode.

At the same time, a subset=pid procfs mount must not be used as evidence
that a later procfs mount would not reveal additional information. It
provides a restricted view of procfs, not the full filesystem view.

Mark subset=pid procfs instances as restricted variants. Ignore
restricted variants when looking for an already-visible mount, and allow
new restricted variants without consulting mnt_already_visible().

Signed-off-by: Alexey Gladkov <legion@kernel.org>
Link: https://patch.msgid.link/4d5e760c3d534dd2e05578d119cc408450053a98.1777278334.git.legion@kernel.org
Reviewed-by: Aleksa Sarai <aleksa@amutable.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namespace.c                 | 17 ++++++++++++++++-
 fs/proc/root.c                 |  3 +++
 include/linux/fs/super_types.h |  1 +
 3 files changed, 20 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/fs/namespace.c b/fs/namespace.c
index e6e58117c778..9a66a806a9b8 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -6353,10 +6353,18 @@ static bool mnt_already_visible(struct mnt_namespace *ns,
 
 	guard(namespace_shared)();
 	hlist_for_each_entry(mnt, &ns->mnt_visible_mounts, mnt_ns_visible) {
+		const struct super_block *sb_visible = mnt->mnt.mnt_sb;
 		struct mount *child;
 		int mnt_flags;
 
-		if (mnt->mnt.mnt_sb->s_type != sb->s_type)
+		if (sb_visible->s_type != sb->s_type)
+			continue;
+
+		/*
+		 * Restricted variants are not compatible with anything, even
+		 * other restricted variants.
+		 */
+		if (sb_visible->s_iflags & SB_I_RESTRICTED_VARIANT)
 			continue;
 
 		/* A local view of the mount flags */
@@ -6418,6 +6426,13 @@ static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags
 		return true;
 	}
 
+	/*
+	 * Restricted variants don't need an already visible mount because they
+	 * don't expose the full filesystem view.
+	 */
+	if (s_iflags & SB_I_RESTRICTED_VARIANT)
+		return false;
+
 	return !mnt_already_visible(ns, sb, new_mnt_flags);
 }
 
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 1bf75a4ee146..99adddfeb4a4 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -275,6 +275,9 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc)
 	s->s_time_gran = 1;
 	s->s_fs_info = fs_info;
 
+	if (fs_info->pidonly == PROC_PIDONLY_ON)
+		s->s_iflags |= SB_I_RESTRICTED_VARIANT;
+
 	/*
 	 * procfs isn't actually a stacking filesystem; however, there is
 	 * too much magic going on inside it to permit stacking things on
diff --git a/include/linux/fs/super_types.h b/include/linux/fs/super_types.h
index 182efbeb9520..a6cdc8f6de4e 100644
--- a/include/linux/fs/super_types.h
+++ b/include/linux/fs/super_types.h
@@ -326,6 +326,7 @@ struct super_block {
 #define SB_I_STABLE_WRITES 0x00000008	/* don't modify blks until WB is done */
 
 /* sb->s_iflags to limit user namespace mounts */
+#define SB_I_RESTRICTED_VARIANT		0x00000010
 #define SB_I_IMA_UNVERIFIABLE_SIGNATURE	0x00000020
 #define SB_I_UNTRUSTED_MOUNTER		0x00000040
 #define SB_I_EVM_HMAC_UNSUPPORTED	0x00000080
-- 
cgit v1.2.3