From dc4edae7f41dceb236553b61cda0383895293c90 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 27 Apr 2026 10:26:03 +0200 Subject: fs: move SB_I_USERNS_VISIBLE to FS_USERNS_MOUNT_RESTRICTED Whether a filesystem's mounts need to undergo a visibility check in user namespaces is a static property of the filesystem type, not a runtime property of each superblock instance. Both proc and sysfs always set SB_I_USERNS_VISIBLE on their superblocks unconditionally (sysfs does so on first creation, and subsequent mounts reuse the same superblock). Move this flag from sb->s_iflags (SB_I_USERNS_VISIBLE) to file_system_type->fs_flags (FS_USERNS_MOUNT_RESTRICTED) so the intent is expressed at the filesystem type level where it belongs. All check sites are updated to test sb->s_type->fs_flags instead of sb->s_iflags. The SB_I_NOEXEC and SB_I_NODEV flags remain on the superblock as they are runtime properties set during fill_super. Link: https://patch.msgid.link/72887c5b6204dc3adf5a53104f0be6bd8bc4f6cd.1777278334.git.legion@kernel.org Reviewed-by: Aleksa Sarai Signed-off-by: Christian Brauner --- include/linux/fs.h | 1 + include/linux/fs/super_types.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index c37bb3c7de8b..e7ff9f8b1485 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2281,6 +2281,7 @@ struct file_system_type { #define FS_MGTIME 64 /* FS uses multigrain timestamps */ #define FS_LBS 128 /* FS supports LBS */ #define FS_POWER_FREEZE 256 /* Always freeze on suspend/hibernate */ +#define FS_USERNS_MOUNT_RESTRICTED 512 /* Restrict mount in userns if not already visible */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. 
*/ int (*init_fs_context)(struct fs_context *); const struct fs_parameter_spec *parameters; diff --git a/include/linux/fs/super_types.h b/include/linux/fs/super_types.h index 383050e7fdf5..182efbeb9520 100644 --- a/include/linux/fs/super_types.h +++ b/include/linux/fs/super_types.h @@ -326,7 +326,6 @@ struct super_block { #define SB_I_STABLE_WRITES 0x00000008 /* don't modify blks until WB is done */ /* sb->s_iflags to limit user namespace mounts */ -#define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */ #define SB_I_IMA_UNVERIFIABLE_SIGNATURE 0x00000020 #define SB_I_UNTRUSTED_MOUNTER 0x00000040 #define SB_I_EVM_HMAC_UNSUPPORTED 0x00000080 -- cgit v1.2.3 From a2a5eb6323a7b1987fd8048d94b9ffc7f87e3064 Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Mon, 27 Apr 2026 10:26:05 +0200 Subject: proc: subset=pid: Show /proc/self/net only for CAP_NET_ADMIN Cache the mounter's credentials and allow access to the net directories contingent on the permissions of the mounter of proc. Do not show /proc/self/net when proc is mounted with the subset=pid option and the mounter does not have CAP_NET_ADMIN. To avoid inadvertently allowing access to /proc/<pid>/net, updating mounter credentials is not supported. 
Signed-off-by: Alexey Gladkov Link: https://patch.msgid.link/d2466fe9085367f1e24693c437ecb8cff2789660.1777278334.git.legion@kernel.org Reviewed-by: Aleksa Sarai Signed-off-by: Christian Brauner --- fs/proc/proc_net.c | 8 ++++++++ fs/proc/root.c | 2 ++ include/linux/proc_fs.h | 1 + 3 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 184cddeb8215..00cc385bce21 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "internal.h" @@ -270,6 +271,7 @@ static struct net *get_proc_task_net(struct inode *dir) struct task_struct *task; struct nsproxy *ns; struct net *net = NULL; + struct proc_fs_info *fs_info = proc_sb_info(dir->i_sb); rcu_read_lock(); task = pid_task(proc_pid(dir), PIDTYPE_PID); @@ -282,6 +284,12 @@ static struct net *get_proc_task_net(struct inode *dir) } rcu_read_unlock(); + if (net && (fs_info->pidonly == PROC_PIDONLY_ON) && + security_capable(fs_info->mounter_cred, net->user_ns, CAP_NET_ADMIN, CAP_OPT_NONE) < 0) { + put_net(net); + net = NULL; + } + return net; } diff --git a/fs/proc/root.c b/fs/proc/root.c index b65053f9f046..89e5678129e4 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -254,6 +254,7 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc) return -ENOMEM; fs_info->pid_ns = get_pid_ns(ctx->pid_ns); + fs_info->mounter_cred = get_cred(fc->cred); proc_apply_options(fs_info, fc, current_user_ns()); /* User space would break if executables or devices appear on proc */ @@ -350,6 +351,7 @@ static void proc_kill_sb(struct super_block *sb) kill_anon_super(sb); if (fs_info) { put_pid_ns(fs_info->pid_ns); + put_cred(fs_info->mounter_cred); kfree_rcu(fs_info, rcu); } } diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index d2860c18dca9..47d7deaeed8f 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -67,6 +67,7 @@ enum proc_pidonly { struct proc_fs_info { struct 
pid_namespace *pid_ns; kgid_t pid_gid; + const struct cred *mounter_cred; enum proc_hidepid hide_pid; enum proc_pidonly pidonly; struct rcu_head rcu; -- cgit v1.2.3 From 05dab768fc2dc7eb9b827201bb39bb5be54bce49 Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Mon, 27 Apr 2026 10:26:07 +0200 Subject: proc: handle subset=pid separately in userns visibility checks When procfs is mounted with subset=pid, only the dynamic process-related part of the filesystem remains visible. That part cannot be hidden by overmounts, so checking whether an existing procfs mount is fully visible does not make sense for this mode. At the same time, a subset=pid procfs mount must not be used as evidence that a later procfs mount would not reveal additional information. It provides a restricted view of procfs, not the full filesystem view. Mark subset=pid procfs instances as restricted variants. Ignore restricted variants when looking for an already-visible mount, and allow new restricted variants without consulting mnt_already_visible(). 
Signed-off-by: Alexey Gladkov Link: https://patch.msgid.link/4d5e760c3d534dd2e05578d119cc408450053a98.1777278334.git.legion@kernel.org Reviewed-by: Aleksa Sarai Signed-off-by: Christian Brauner --- fs/namespace.c | 17 ++++++++++++++++- fs/proc/root.c | 3 +++ include/linux/fs/super_types.h | 1 + 3 files changed, 20 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/namespace.c b/fs/namespace.c index e6e58117c778..9a66a806a9b8 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -6353,10 +6353,18 @@ static bool mnt_already_visible(struct mnt_namespace *ns, guard(namespace_shared)(); hlist_for_each_entry(mnt, &ns->mnt_visible_mounts, mnt_ns_visible) { + const struct super_block *sb_visible = mnt->mnt.mnt_sb; struct mount *child; int mnt_flags; - if (mnt->mnt.mnt_sb->s_type != sb->s_type) + if (sb_visible->s_type != sb->s_type) + continue; + + /* + * Restricted variants are not compatible with anything, even + * other restricted variants. + */ + if (sb_visible->s_iflags & SB_I_RESTRICTED_VARIANT) continue; /* A local view of the mount flags */ @@ -6418,6 +6426,13 @@ static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags return true; } + /* + * Restricted variants don't need an already visible mount because they + * don't expose the full filesystem view. 
+ */ + if (s_iflags & SB_I_RESTRICTED_VARIANT) + return false; + return !mnt_already_visible(ns, sb, new_mnt_flags); } diff --git a/fs/proc/root.c b/fs/proc/root.c index 1bf75a4ee146..99adddfeb4a4 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -275,6 +275,9 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc) s->s_time_gran = 1; s->s_fs_info = fs_info; + if (fs_info->pidonly == PROC_PIDONLY_ON) + s->s_iflags |= SB_I_RESTRICTED_VARIANT; + /* * procfs isn't actually a stacking filesystem; however, there is * too much magic going on inside it to permit stacking things on diff --git a/include/linux/fs/super_types.h b/include/linux/fs/super_types.h index 182efbeb9520..a6cdc8f6de4e 100644 --- a/include/linux/fs/super_types.h +++ b/include/linux/fs/super_types.h @@ -326,6 +326,7 @@ struct super_block { #define SB_I_STABLE_WRITES 0x00000008 /* don't modify blks until WB is done */ /* sb->s_iflags to limit user namespace mounts */ +#define SB_I_RESTRICTED_VARIANT 0x00000010 #define SB_I_IMA_UNVERIFIABLE_SIGNATURE 0x00000020 #define SB_I_UNTRUSTED_MOUNTER 0x00000040 #define SB_I_EVM_HMAC_UNSUPPORTED 0x00000080 -- cgit v1.2.3