From 672697c37c9dfe05aabfd44ccb51c2877deb524e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 27 Apr 2026 10:26:02 +0200 Subject: namespace: record fully visible mounts in list Instead of wading through all the mounts in the mount namespace rbtree to find fully visible procfs and sysfs mounts, be honest about them being special cruft and record them in a separate per-mount namespace list. Link: https://patch.msgid.link/684859a8e0ac929cb89c1fbe16ce15b30c70eb1f.1777278334.git.legion@kernel.org Reviewed-by: Aleksa Sarai Signed-off-by: Christian Brauner --- fs/mount.h | 4 ++++ fs/namespace.c | 19 +++++++++++-------- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/fs/mount.h b/fs/mount.h index e0816c11a198..5df134d56d47 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -25,6 +25,7 @@ struct mnt_namespace { __u32 n_fsnotify_mask; struct fsnotify_mark_connector __rcu *n_fsnotify_marks; #endif + struct hlist_head mnt_visible_mounts; /* SB_I_USERNS_VISIBLE mounts */ unsigned int nr_mounts; /* # of mounts in the namespace */ unsigned int pending_mounts; refcount_t passive; /* number references not pinning @mounts */ @@ -90,6 +91,7 @@ struct mount { int mnt_expiry_mark; /* true if marked for expiry */ struct hlist_head mnt_pins; struct hlist_head mnt_stuck_children; + struct hlist_node mnt_ns_visible; /* link in ns->mnt_visible_mounts */ struct mount *overmount; /* mounted on ->mnt_root */ } __randomize_layout; @@ -207,6 +209,8 @@ static inline void move_from_ns(struct mount *mnt) ns->mnt_first_node = rb_next(&mnt->mnt_node); rb_erase(&mnt->mnt_node, &ns->mounts); RB_CLEAR_NODE(&mnt->mnt_node); + if (!hlist_unhashed(&mnt->mnt_ns_visible)) + hlist_del_init(&mnt->mnt_ns_visible); } bool has_locked_children(struct mount *mnt, struct dentry *dentry); diff --git a/fs/namespace.c b/fs/namespace.c index fe919abd2f01..047efa737ef9 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -321,6 +321,7 @@ static struct mount *alloc_vfsmnt(const char *name) INIT_HLIST_NODE(&mnt->mnt_slave); INIT_HLIST_NODE(&mnt->mnt_mp_list); INIT_HLIST_HEAD(&mnt->mnt_stuck_children); + INIT_HLIST_NODE(&mnt->mnt_ns_visible); RB_CLEAR_NODE(&mnt->mnt_node); mnt->mnt.mnt_idmap = &nop_mnt_idmap; } @@ -1098,6 +1099,10 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt) rb_link_node(&mnt->mnt_node, parent, link); rb_insert_color(&mnt->mnt_node, &ns->mounts); + if ((mnt->mnt.mnt_sb->s_iflags & SB_I_USERNS_VISIBLE) && + mnt->mnt.mnt_root == mnt->mnt.mnt_sb->s_root) + hlist_add_head(&mnt->mnt_ns_visible, &ns->mnt_visible_mounts); + mnt_notify_add(mnt); } @@ -6340,22 +6345,20 @@ static bool mnt_already_visible(struct mnt_namespace *ns, int *new_mnt_flags) { int new_flags = *new_mnt_flags; - struct mount *mnt, *n; + struct mount *mnt; + + /* Don't acquire namespace semaphore without a good reason. */ + if (hlist_empty(&ns->mnt_visible_mounts)) + return false; guard(namespace_shared)(); - rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) { + hlist_for_each_entry(mnt, &ns->mnt_visible_mounts, mnt_ns_visible) { struct mount *child; int mnt_flags; if (mnt->mnt.mnt_sb->s_type != sb->s_type) continue; - /* This mount is not fully visible if it's root directory - * is not the root directory of the filesystem. - */ - if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root) - continue; - /* A local view of the mount flags */ mnt_flags = mnt->mnt.mnt_flags; -- cgit v1.2.3 From 912289ee831d92b377309dc832c0c3ce5906deae Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sun, 26 Apr 2026 00:08:42 +0200 Subject: proc: allow to mark /proc files permanent outside of fs/proc/ Add proc_make_permanent() function to mark PDE as permanent to speed up open/read/close (one alloc/free and lock/unlock less). Enable it for built-in code and for compiled-in modules. This function becomes nop magically in modular code. Note, note, note! If built-in code creates and deletes PDEs dynamically (not in init hook), then proc_make_permanent() must not be used. It is intended for simple code: static int __init xxx_module_init(void) { g_pde = proc_create_single(); proc_make_permanent(g_pde); return 0; } static void __exit xxx_module_exit(void) { remove_proc_entry(g_pde); } If module is built-in then exit hook never executed and PDE is permanent so it is OK to mark it as such. If module is module then rmmod will yank PDE, but proc_make_permanent() is nop and core /proc code will do everything right. Signed-off-by: Alexey Dobriyan Link: https://patch.msgid.link/20260425220844.1763933-2-mjguzik@gmail.com Signed-off-by: Christian Brauner --- fs/proc/generic.c | 10 ++++++++++ fs/proc/internal.h | 3 +++ include/linux/proc_fs.h | 12 ++++++++++++ 3 files changed, 25 insertions(+) diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 8bb81e58c9d8..c6ae076e1fa0 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -841,3 +841,13 @@ ssize_t proc_simple_write(struct file *f, const char __user *ubuf, size_t size, kfree(buf); return ret == 0 ? size : ret; } + +/* + * Not exported to modules: + * modules' /proc files aren't permanent because modules aren't permanent. + */ +void impl_proc_make_permanent(struct proc_dir_entry *pde) +{ + if (pde) + pde_make_permanent(pde); +} diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 64dc44832808..1edbabbdbc5d 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -79,8 +79,11 @@ static inline bool pde_is_permanent(const struct proc_dir_entry *pde) return pde->flags & PROC_ENTRY_PERMANENT; } +/* This is for builtin code, not even for modules which are compiled in. */ static inline void pde_make_permanent(struct proc_dir_entry *pde) { + /* Ensure magic flag does something. */ + static_assert(PROC_ENTRY_PERMANENT != 0); pde->flags |= PROC_ENTRY_PERMANENT; } diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 19d1c5e5f335..d2860c18dca9 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -248,4 +248,16 @@ static inline struct pid_namespace *proc_pid_ns(struct super_block *sb) bool proc_ns_file(const struct file *file); +#if defined CONFIG_PROC_FS && !defined MODULE +void impl_proc_make_permanent(struct proc_dir_entry *pde); +#endif + +static inline void proc_make_permanent(struct proc_dir_entry *pde) +{ + /* Don't give matches to modules. */ +#if defined CONFIG_PROC_FS && !defined MODULE + impl_proc_make_permanent(pde); +#endif +} + #endif /* _LINUX_PROC_FS_H */ -- cgit v1.2.3 From dc4edae7f41dceb236553b61cda0383895293c90 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 27 Apr 2026 10:26:03 +0200 Subject: fs: move SB_I_USERNS_VISIBLE to FS_USERNS_MOUNT_RESTRICTED Whether a filesystem's mounts need to undergo a visibility check in user namespaces is a static property of the filesystem type, not a runtime property of each superblock instance. Both proc and sysfs always set SB_I_USERNS_VISIBLE on their superblocks unconditionally (sysfs does so on first creation, and subsequent mounts reuse the same superblock). Move this flag from sb->s_iflags (SB_I_USERNS_VISIBLE) to file_system_type->fs_flags (FS_USERNS_MOUNT_RESTRICTED) so the intent is expressed at the filesystem type level where it belongs. All check sites are updated to test sb->s_type->fs_flags instead of sb->s_iflags. The SB_I_NOEXEC and SB_I_NODEV flags remain on the superblock as they are runtime properties set during fill_super. Link: https://patch.msgid.link/72887c5b6204dc3adf5a53104f0be6bd8bc4f6cd.1777278334.git.legion@kernel.org Reviewed-by: Aleksa Sarai Signed-off-by: Christian Brauner --- fs/namespace.c | 6 +++--- fs/proc/root.c | 4 ++-- fs/sysfs/mount.c | 4 +--- include/linux/fs.h | 1 + include/linux/fs/super_types.h | 1 - kernel/acct.c | 2 +- 6 files changed, 8 insertions(+), 10 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 047efa737ef9..e6e58117c778 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1099,7 +1099,7 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt) rb_link_node(&mnt->mnt_node, parent, link); rb_insert_color(&mnt->mnt_node, &ns->mounts); - if ((mnt->mnt.mnt_sb->s_iflags & SB_I_USERNS_VISIBLE) && + if ((mnt->mnt.mnt_sb->s_type->fs_flags & FS_USERNS_MOUNT_RESTRICTED) && mnt->mnt.mnt_root == mnt->mnt.mnt_sb->s_root) hlist_add_head(&mnt->mnt_ns_visible, &ns->mnt_visible_mounts); @@ -6408,10 +6408,10 @@ static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags return false; /* Can this filesystem be too revealing? */ - s_iflags = sb->s_iflags; - if (!(s_iflags & SB_I_USERNS_VISIBLE)) + if (!(sb->s_type->fs_flags & FS_USERNS_MOUNT_RESTRICTED)) return false; + s_iflags = sb->s_iflags; if ((s_iflags & required_iflags) != required_iflags) { WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n", required_iflags); diff --git a/fs/proc/root.c b/fs/proc/root.c index 0f9100559471..b65053f9f046 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -257,7 +257,7 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc) proc_apply_options(fs_info, fc, current_user_ns()); /* User space would break if executables or devices appear on proc */ - s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV; + s->s_iflags |= SB_I_NOEXEC | SB_I_NODEV; s->s_flags |= SB_NODIRATIME | SB_NOSUID | SB_NOEXEC; s->s_blocksize = 1024; s->s_blocksize_bits = 10; @@ -359,7 +359,7 @@ static struct file_system_type proc_fs_type = { .init_fs_context = proc_init_fs_context, .parameters = proc_fs_parameters, .kill_sb = proc_kill_sb, - .fs_flags = FS_USERNS_MOUNT | FS_DISALLOW_NOTIFY_PERM, + .fs_flags = FS_USERNS_MOUNT | FS_USERNS_MOUNT_RESTRICTED | FS_DISALLOW_NOTIFY_PERM, }; void __init proc_root_init(void) diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index b199e8ff79b1..b45ea5d511e7 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -32,8 +32,6 @@ static int sysfs_get_tree(struct fs_context *fc) if (ret) return ret; - if (kfc->new_sb_created) - fc->root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE; return 0; } @@ -93,7 +91,7 @@ static struct file_system_type sysfs_fs_type = { .name = "sysfs", .init_fs_context = sysfs_init_fs_context, .kill_sb = sysfs_kill_sb, - .fs_flags = FS_USERNS_MOUNT, + .fs_flags = FS_USERNS_MOUNT | FS_USERNS_MOUNT_RESTRICTED, }; int __init sysfs_init(void) diff --git a/include/linux/fs.h b/include/linux/fs.h index c37bb3c7de8b..e7ff9f8b1485 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2281,6 +2281,7 @@ struct file_system_type { #define FS_MGTIME 64 /* FS uses multigrain timestamps */ #define FS_LBS 128 /* FS supports LBS */ #define FS_POWER_FREEZE 256 /* Always freeze on suspend/hibernate */ +#define FS_USERNS_MOUNT_RESTRICTED 512 /* Restrict mount in userns if not already visible */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ int (*init_fs_context)(struct fs_context *); const struct fs_parameter_spec *parameters; diff --git a/include/linux/fs/super_types.h b/include/linux/fs/super_types.h index 383050e7fdf5..182efbeb9520 100644 --- a/include/linux/fs/super_types.h +++ b/include/linux/fs/super_types.h @@ -326,7 +326,6 @@ struct super_block { #define SB_I_STABLE_WRITES 0x00000008 /* don't modify blks until WB is done */ /* sb->s_iflags to limit user namespace mounts */ -#define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */ #define SB_I_IMA_UNVERIFIABLE_SIGNATURE 0x00000020 #define SB_I_UNTRUSTED_MOUNTER 0x00000040 #define SB_I_EVM_HMAC_UNSUPPORTED 0x00000080 diff --git a/kernel/acct.c b/kernel/acct.c index cbbf79d718cf..c440d43479ca 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -249,7 +249,7 @@ static int acct_on(const char __user *name) return -EINVAL; /* Exclude procfs and sysfs. */ - if (file_inode(file)->i_sb->s_iflags & SB_I_USERNS_VISIBLE) + if (file_inode(file)->i_sb->s_type->fs_flags & FS_USERNS_MOUNT_RESTRICTED) return -EINVAL; if (!(file->f_mode & FMODE_CAN_WRITE)) -- cgit v1.2.3 From dc651e25a6d289bc08cfde032427fa17b3d1b22e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 26 Apr 2026 00:08:43 +0200 Subject: fs: RCU-ify filesystems list The drivers list was protected by an rwlock; every mount, every open of /proc/filesystems and the legacy sysfs(2) syscall walked a hand-rolled singly-linked list under it. /proc/filesystems is especially hot because libselinux causes programs as mundane as mkdir, ls and sed to open and read it on every invocation. Convert the list to an RCU-protected hlist and switch the writer side to a plain spinlock. Writers keep their existing non-sleeping section while readers walk under rcu_read_lock() with no lock traffic: - register_filesystem()/unregister_filesystem() take file_systems_lock, publish via hlist_{add_tail,del_init}_rcu() and invalidate the cached /proc/filesystems string. unregister_filesystem() keeps its synchronize_rcu() after dropping the lock so in-flight readers are drained before the module (and its embedded file_system_type) can go away. - __get_fs_type(), list_bdev_fs_names() and the fs_index()/fs_name()/fs_maxindex() helpers walk the list under rcu_read_lock(). fs_name() continues to drop the read-side lock after try_module_get() and accesses ->name outside the RCU section; the module reference pins the embedded file_system_type across the boundary. struct file_system_type::next becomes struct hlist_node list; no in-tree caller references the old ->next field outside fs/filesystems.c. Link: https://patch.msgid.link/20260425220844.1763933-3-mjguzik@gmail.com Signed-off-by: Christian Brauner --- fs/filesystems.c | 179 ++++++++++++++++++++++------------------------------- fs/ocfs2/super.c | 1 - include/linux/fs.h | 2 +- 3 files changed, 75 insertions(+), 107 deletions(-) diff --git a/fs/filesystems.c b/fs/filesystems.c index 0c7d2b7ac26c..7976366d4197 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -17,22 +17,19 @@ #include #include #include +#include /* - * Handling of filesystem drivers list. - * Rules: - * Inclusion to/removals from/scanning of list are protected by spinlock. - * During the unload module must call unregister_filesystem(). - * We can access the fields of list element if: - * 1) spinlock is held or - * 2) we hold the reference to the module. - * The latter can be guaranteed by call of try_module_get(); if it - * returned 0 we must skip the element, otherwise we got the reference. - * Once the reference is obtained we can drop the spinlock. + * Read-mostly filesystem drivers list. + * + * Readers walk under rcu_read_lock(); writers take file_systems_lock + * and publish via _rcu hlist primitives. unregister_filesystem() + * synchronize_rcu()s after unlock so the embedded file_system_type + * can't go away under a reader. To keep using a filesystem after + * the RCU section ends, take a module reference via try_module_get(). */ - -static struct file_system_type *file_systems; -static DEFINE_RWLOCK(file_systems_lock); +static HLIST_HEAD(file_systems); +static DEFINE_SPINLOCK(file_systems_lock); /* WARNING: This can be used only if we _already_ own a reference */ struct file_system_type *get_filesystem(struct file_system_type *fs) @@ -46,14 +43,15 @@ void put_filesystem(struct file_system_type *fs) module_put(fs->owner); } -static struct file_system_type **find_filesystem(const char *name, unsigned len) +static struct file_system_type *find_filesystem(const char *name, unsigned len) { - struct file_system_type **p; - for (p = &file_systems; *p; p = &(*p)->next) - if (strncmp((*p)->name, name, len) == 0 && - !(*p)->name[len]) - break; - return p; + struct file_system_type *fs; + + hlist_for_each_entry_rcu(fs, &file_systems, list, + lockdep_is_held(&file_systems_lock)) + if (strncmp(fs->name, name, len) == 0 && !fs->name[len]) + return fs; + return NULL; } /** @@ -64,33 +62,26 @@ static struct file_system_type **find_filesystem(const char *name, unsigned len) * is aware of for mount and other syscalls. Returns 0 on success, * or a negative errno code on an error. * - * The &struct file_system_type that is passed is linked into the kernel + * The &struct file_system_type that is passed is linked into the kernel * structures and must not be freed until the file system has been * unregistered. */ - -int register_filesystem(struct file_system_type * fs) +int register_filesystem(struct file_system_type *fs) { - int res = 0; - struct file_system_type ** p; - if (fs->parameters && !fs_validate_description(fs->name, fs->parameters)) return -EINVAL; BUG_ON(strchr(fs->name, '.')); - if (fs->next) + if (!hlist_unhashed_lockless(&fs->list)) return -EBUSY; - write_lock(&file_systems_lock); - p = find_filesystem(fs->name, strlen(fs->name)); - if (*p) - res = -EBUSY; - else - *p = fs; - write_unlock(&file_systems_lock); - return res; -} + guard(spinlock)(&file_systems_lock); + if (find_filesystem(fs->name, strlen(fs->name))) + return -EBUSY; + hlist_add_tail_rcu(&fs->list, &file_systems); + return 0; +} EXPORT_SYMBOL(register_filesystem); /** @@ -100,94 +91,78 @@ EXPORT_SYMBOL(register_filesystem); * Remove a file system that was previously successfully registered * with the kernel. An error is returned if the file system is not found. * Zero is returned on a success. - * + * * Once this function has returned the &struct file_system_type structure * may be freed or reused. */ - -int unregister_filesystem(struct file_system_type * fs) +int unregister_filesystem(struct file_system_type *fs) { - struct file_system_type ** tmp; - - write_lock(&file_systems_lock); - tmp = &file_systems; - while (*tmp) { - if (fs == *tmp) { - *tmp = fs->next; - fs->next = NULL; - write_unlock(&file_systems_lock); - synchronize_rcu(); - return 0; - } - tmp = &(*tmp)->next; + scoped_guard(spinlock, &file_systems_lock) { + if (hlist_unhashed(&fs->list)) + return -EINVAL; + hlist_del_init_rcu(&fs->list); } - write_unlock(&file_systems_lock); - - return -EINVAL; + synchronize_rcu(); + return 0; } - EXPORT_SYMBOL(unregister_filesystem); #ifdef CONFIG_SYSFS_SYSCALL -static int fs_index(const char __user * __name) +static int fs_index(const char __user *__name) { - struct file_system_type * tmp; + struct file_system_type *p; char *name __free(kfree) = strndup_user(__name, PATH_MAX); - int err, index; + int index = 0; if (IS_ERR(name)) return PTR_ERR(name); - err = -EINVAL; - read_lock(&file_systems_lock); - for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) { - if (strcmp(tmp->name, name) == 0) { - err = index; - break; - } + guard(rcu)(); + hlist_for_each_entry_rcu(p, &file_systems, list) { + if (strcmp(p->name, name) == 0) + return index; + index++; } - read_unlock(&file_systems_lock); - return err; + return -EINVAL; } -static int fs_name(unsigned int index, char __user * buf) +static int fs_name(unsigned int index, char __user *buf) { - struct file_system_type * tmp; - int len, res = -EINVAL; - - read_lock(&file_systems_lock); - for (tmp = file_systems; tmp; tmp = tmp->next, index--) { - if (index == 0) { - if (try_module_get(tmp->owner)) - res = 0; + struct file_system_type *p, *found = NULL; + int len, res; + + scoped_guard(rcu) { + hlist_for_each_entry_rcu(p, &file_systems, list) { + if (index--) + continue; + if (try_module_get(p->owner)) + found = p; break; } } - read_unlock(&file_systems_lock); - if (res) - return res; + if (!found) + return -EINVAL; /* OK, we got the reference, so we can safely block */ - len = strlen(tmp->name) + 1; - res = copy_to_user(buf, tmp->name, len) ? -EFAULT : 0; - put_filesystem(tmp); + len = strlen(found->name) + 1; + res = copy_to_user(buf, found->name, len) ? -EFAULT : 0; + put_filesystem(found); return res; } static int fs_maxindex(void) { - struct file_system_type * tmp; - int index; + struct file_system_type *p; + int index = 0; - read_lock(&file_systems_lock); - for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next, index++) - ; - read_unlock(&file_systems_lock); + guard(rcu)(); + hlist_for_each_entry_rcu(p, &file_systems, list) + index++; return index; } /* - * Whee.. Weird sysv syscall. + * Whee.. Weird sysv syscall. */ SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2) { @@ -216,8 +191,8 @@ int __init list_bdev_fs_names(char *buf, size_t size) size_t len; int count = 0; - read_lock(&file_systems_lock); - for (p = file_systems; p; p = p->next) { + guard(rcu)(); + hlist_for_each_entry_rcu(p, &file_systems, list) { if (!(p->fs_flags & FS_REQUIRES_DEV)) continue; len = strlen(p->name) + 1; @@ -230,24 +205,20 @@ int __init list_bdev_fs_names(char *buf, size_t size) size -= len; count++; } - read_unlock(&file_systems_lock); return count; } #ifdef CONFIG_PROC_FS static int filesystems_proc_show(struct seq_file *m, void *v) { - struct file_system_type * tmp; + struct file_system_type *p; - read_lock(&file_systems_lock); - tmp = file_systems; - while (tmp) { + guard(rcu)(); + hlist_for_each_entry_rcu(p, &file_systems, list) { seq_printf(m, "%s\t%s\n", - (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev", - tmp->name); - tmp = tmp->next; + (p->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev", + p->name); } - read_unlock(&file_systems_lock); return 0; } @@ -263,11 +234,10 @@ static struct file_system_type *__get_fs_type(const char *name, int len) { struct file_system_type *fs; - read_lock(&file_systems_lock); - fs = *(find_filesystem(name, len)); + guard(rcu)(); + fs = find_filesystem(name, len); if (fs && !try_module_get(fs->owner)) fs = NULL; - read_unlock(&file_systems_lock); return fs; } @@ -291,5 +261,4 @@ struct file_system_type *get_fs_type(const char *name) } return fs; } - EXPORT_SYMBOL(get_fs_type); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index b875f01c9756..4870e680c4e5 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1224,7 +1224,6 @@ static struct file_system_type ocfs2_fs_type = { .name = "ocfs2", .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, - .next = NULL, .init_fs_context = ocfs2_init_fs_context, .parameters = ocfs2_param_spec, }; diff --git a/include/linux/fs.h b/include/linux/fs.h index 11559c513dfb..c37bb3c7de8b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2286,7 +2286,7 @@ struct file_system_type { const struct fs_parameter_spec *parameters; void (*kill_sb) (struct super_block *); struct module *owner; - struct file_system_type * next; + struct hlist_node list; struct hlist_head fs_supers; struct lock_class_key s_lock_key; -- cgit v1.2.3 From 78d797520f6a74ed402cb98c6bf74d96b4937965 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 27 Apr 2026 10:26:04 +0200 Subject: sysfs: remove trivial sysfs_get_tree() wrapper Now that FS_USERNS_MOUNT_RESTRICTED is a file_system_type flag, sysfs_get_tree() is a trivial wrapper around kernfs_get_tree() with no additional logic. Point sysfs_fs_context_ops.get_tree directly at kernfs_get_tree() and remove the wrapper. Link: https://patch.msgid.link/e8ac71fc96ad864c8b58fc0a8e5305550c01db25.1777278334.git.legion@kernel.org Reviewed-by: Aleksa Sarai Signed-off-by: Christian Brauner --- fs/sysfs/mount.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index b45ea5d511e7..88c10823fcaf 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -23,18 +23,6 @@ static struct kernfs_root *sysfs_root; struct kernfs_node *sysfs_root_kn; -static int sysfs_get_tree(struct fs_context *fc) -{ - struct kernfs_fs_context *kfc = fc->fs_private; - int ret; - - ret = kernfs_get_tree(fc); - if (ret) - return ret; - - return 0; -} - static void sysfs_fs_context_free(struct fs_context *fc) { struct kernfs_fs_context *kfc = fc->fs_private; @@ -47,7 +35,7 @@ static void sysfs_fs_context_free(struct fs_context *fc) static const struct fs_context_operations sysfs_fs_context_ops = { .free = sysfs_fs_context_free, - .get_tree = sysfs_get_tree, + .get_tree = kernfs_get_tree, }; static int sysfs_init_fs_context(struct fs_context *fc) -- cgit v1.2.3 From 36b3306779ea58f994a614b3663a196707a66ce2 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Sun, 26 Apr 2026 00:08:44 +0200 Subject: fs: cache the string generated by reading /proc/filesystems It is being read surprisingly often (e.g., by mkdir, ls and even sed!). This is lock-protected pointer chasing over a linked list to pay for sprintf for every fs (32 on my boxen). Instead cache the result. While here make the file as permanent to avoid spurious ref trips in procfs. Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20260425220844.1763933-4-mjguzik@gmail.com Signed-off-by: Christian Brauner --- fs/filesystems.c | 155 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 153 insertions(+), 2 deletions(-) diff --git a/fs/filesystems.c b/fs/filesystems.c index 7976366d4197..771fc31a69b8 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -31,6 +31,36 @@ static HLIST_HEAD(file_systems); static DEFINE_SPINLOCK(file_systems_lock); +#ifdef CONFIG_PROC_FS +/* + * Cache a stringified version of the filesystem list. + * + * The fs list gets queried a lot by userspace because of libselinux, including + * rather surprising programs (would you guess *sed* is on the list?). In order + * to reduce the overhead we cache the resulting string, which normally hangs + * around below 512 bytes in size. + * + * As the list almost never changes, its creation is not particularly optimized + * to keep things simple. + * + * We sort it out on read in order to not introduce a failure point for fs + * registration (in principle we may be unable to alloc memory for the list). + */ +struct file_systems_string { + struct rcu_head rcu; + unsigned long gen; + size_t len; + char string[]; +}; + +static unsigned long file_systems_gen; +static struct file_systems_string __rcu *file_systems_string; + +static void invalidate_filesystems_string(void); +#else +static inline void invalidate_filesystems_string(void) { } +#endif + /* WARNING: This can be used only if we _already_ own a reference */ struct file_system_type *get_filesystem(struct file_system_type *fs) { @@ -80,6 +110,7 @@ int register_filesystem(struct file_system_type *fs) if (find_filesystem(fs->name, strlen(fs->name))) return -EBUSY; hlist_add_tail_rcu(&fs->list, &file_systems); + invalidate_filesystems_string(); return 0; } EXPORT_SYMBOL(register_filesystem); @@ -101,6 +132,7 @@ int unregister_filesystem(struct file_system_type *fs) if (hlist_unhashed(&fs->list)) return -EINVAL; hlist_del_init_rcu(&fs->list); + invalidate_filesystems_string(); } synchronize_rcu(); return 0; @@ -209,7 +241,102 @@ int __init list_bdev_fs_names(char *buf, size_t size) } #ifdef CONFIG_PROC_FS -static int filesystems_proc_show(struct seq_file *m, void *v) +static void invalidate_filesystems_string(void) +{ + struct file_systems_string *old; + + lockdep_assert_held_write(&file_systems_lock); + file_systems_gen++; + old = rcu_replace_pointer(file_systems_string, NULL, + lockdep_is_held(&file_systems_lock)); + if (old) + kfree_rcu(old, rcu); +} + +static __cold noinline int regen_filesystems_string(void) +{ + struct file_system_type *p; + struct file_systems_string *old, *new; + size_t newlen, usedlen; + unsigned long gen; + +retry: + newlen = 0; + + /* pre-calc space for each fs */ + spin_lock(&file_systems_lock); + gen = file_systems_gen; + hlist_for_each_entry_rcu(p, &file_systems, list) { + if (!(p->fs_flags & FS_REQUIRES_DEV)) + newlen += strlen("nodev"); + newlen += strlen("\t") + strlen(p->name) + strlen("\n"); + } + spin_unlock(&file_systems_lock); + + new = kmalloc(offsetof(struct file_systems_string, string) + newlen + 1, + GFP_KERNEL); + if (!new) + return -ENOMEM; + + new->gen = gen; + new->len = newlen; + new->string[newlen] = '\0'; + + spin_lock(&file_systems_lock); + old = file_systems_string; + + /* + * Did someone beat us to it? + */ + if (old && old->gen == file_systems_gen) { + kfree(new); + return 0; + } + + /* + * Did the list change in the meantime? + */ + if (gen != file_systems_gen) { + kfree(new); + goto retry; + } + + /* + * Populate the string. + * + * We know we have just enough space because we calculated the right + * size the previous time we had the lock and confirmed the list has + * not changed after reacquiring it. + */ + usedlen = 0; + hlist_for_each_entry_rcu(p, &file_systems, list) { + usedlen += sprintf(&new->string[usedlen], "%s\t%s\n", + (p->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev", + p->name); + } + + if (WARN_ON_ONCE(new->len != strlen(new->string))) { + /* + * Should never happen of course, keep this in case someone changes string + * generation above and messes it up. + */ + spin_unlock(&file_systems_lock); + if (old) + kfree_rcu(old, rcu); + return -EINVAL; + } + + /* + * Paired with consume fence in READ_ONCE() in filesystems_proc_show() + */ + smp_store_release(&file_systems_string, new); + spin_unlock(&file_systems_lock); + if (old) + kfree_rcu(old, rcu); + return 0; +} + +static __cold noinline int filesystems_proc_show_fallback(struct seq_file *m, void *v) { struct file_system_type *p; @@ -222,9 +349,33 @@ static int filesystems_proc_show(struct seq_file *m, void *v) return 0; } +static int filesystems_proc_show(struct seq_file *m, void *v) +{ + struct file_systems_string *fss; + + for (;;) { + scoped_guard(rcu) { + fss = rcu_dereference(file_systems_string); + if (likely(fss)) { + seq_write(m, fss->string, fss->len); + return 0; + } + } + + int err = regen_filesystems_string(); + if (unlikely(err)) + return filesystems_proc_show_fallback(m, v); + } +} + static int __init proc_filesystems_init(void) { - proc_create_single("filesystems", 0, NULL, filesystems_proc_show); + struct proc_dir_entry *pde; + + pde = proc_create_single("filesystems", 0, NULL, filesystems_proc_show); + if (!pde) + return -ENOMEM; + proc_make_permanent(pde); return 0; } module_init(proc_filesystems_init); -- cgit v1.2.3 From a2a5eb6323a7b1987fd8048d94b9ffc7f87e3064 Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Mon, 27 Apr 2026 10:26:05 +0200 Subject: proc: subset=pid: Show /proc/self/net only for CAP_NET_ADMIN Cache the mounters credentials and allow access to the net directories contingent of the permissions of the mounter of proc. Do not show /proc/self/net when proc is mounted with subset=pid option and the mounter does not have CAP_NET_ADMIN. To avoid inadvertently allowing access to /proc//net, updating mounter credentials is not supported. Signed-off-by: Alexey Gladkov Link: https://patch.msgid.link/d2466fe9085367f1e24693c437ecb8cff2789660.1777278334.git.legion@kernel.org Reviewed-by: Aleksa Sarai Signed-off-by: Christian Brauner --- fs/proc/proc_net.c | 8 ++++++++ fs/proc/root.c | 2 ++ include/linux/proc_fs.h | 1 + 3 files changed, 11 insertions(+) diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 184cddeb8215..00cc385bce21 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "internal.h" @@ -270,6 +271,7 @@ static struct net *get_proc_task_net(struct inode *dir) struct task_struct *task; struct nsproxy *ns; struct net *net = NULL; + struct proc_fs_info *fs_info = proc_sb_info(dir->i_sb); rcu_read_lock(); task = pid_task(proc_pid(dir), PIDTYPE_PID); @@ -282,6 +284,12 @@ static struct net *get_proc_task_net(struct inode *dir) } rcu_read_unlock(); + if (net && (fs_info->pidonly == PROC_PIDONLY_ON) && + security_capable(fs_info->mounter_cred, net->user_ns, CAP_NET_ADMIN, CAP_OPT_NONE) < 0) { + put_net(net); + net = NULL; + } + return net; } diff --git a/fs/proc/root.c b/fs/proc/root.c index b65053f9f046..89e5678129e4 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -254,6 +254,7 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc) return -ENOMEM; fs_info->pid_ns = get_pid_ns(ctx->pid_ns); + fs_info->mounter_cred = get_cred(fc->cred); proc_apply_options(fs_info, fc, current_user_ns()); /* User space would break if executables or devices appear on proc */ @@ -350,6 +351,7 @@ static void proc_kill_sb(struct super_block *sb) kill_anon_super(sb); if (fs_info) { put_pid_ns(fs_info->pid_ns); + put_cred(fs_info->mounter_cred); kfree_rcu(fs_info, rcu); } } diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index d2860c18dca9..47d7deaeed8f 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -67,6 +67,7 @@ enum proc_pidonly { struct proc_fs_info { struct pid_namespace *pid_ns; kgid_t pid_gid; + const struct cred *mounter_cred; enum proc_hidepid hide_pid; enum proc_pidonly pidonly; struct rcu_head rcu; -- cgit v1.2.3 From 1991a8f6932124d880e847885da20a98948b6fed Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Mon, 27 Apr 2026 10:26:06 +0200 Subject: proc: prevent reconfiguring subset=pid Changing subset=pid on an existing procfs instance is not safe. If a full procfs mount has entries hidden by overmounts, switching it to subset=pid would hide the top-level procfs entries from lookup and readdir while leaving the existing overmounts reachable. Reject attempts to change the subset=pid state during reconfigure before applying any other procfs mount options, so a failed reconfigure cannot partially update the instance. Signed-off-by: Alexey Gladkov Link: https://patch.msgid.link/13295f40f642af5d6e7038d681d43132ad80f7b2.1777278334.git.legion@kernel.org Reviewed-by: Aleksa Sarai Signed-off-by: Christian Brauner --- fs/proc/root.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/fs/proc/root.c b/fs/proc/root.c index 89e5678129e4..1bf75a4ee146 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -223,12 +223,17 @@ static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param) return 0; } -static void proc_apply_options(struct proc_fs_info *fs_info, +static int proc_apply_options(struct proc_fs_info *fs_info, struct fs_context *fc, struct user_namespace *user_ns) { struct proc_fs_context *ctx = fc->fs_private; + if ((ctx->mask & (1 << Opt_subset)) && + fc->purpose == FS_CONTEXT_FOR_RECONFIGURE && + ctx->pidonly != fs_info->pidonly) + return invalf(fc, "proc: subset=pid cannot be changed\n"); + if (ctx->mask & (1 << Opt_gid)) fs_info->pid_gid = make_kgid(user_ns, ctx->gid); if (ctx->mask & (1 << Opt_hidepid)) @@ -240,6 +245,7 @@ static void proc_apply_options(struct proc_fs_info *fs_info, put_pid_ns(fs_info->pid_ns); fs_info->pid_ns = get_pid_ns(ctx->pid_ns); } + return 0; } static int proc_fill_super(struct super_block *s, struct fs_context *fc) @@ -255,7 +261,9 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc) fs_info->pid_ns = get_pid_ns(ctx->pid_ns); fs_info->mounter_cred = get_cred(fc->cred); - proc_apply_options(fs_info, fc, current_user_ns()); + ret = proc_apply_options(fs_info, fc, current_user_ns()); + if (ret) + return ret; /* User space would break if executables or devices appear on proc */ s->s_iflags |= SB_I_NOEXEC | SB_I_NODEV; @@ -304,8 +312,7 @@ static int proc_reconfigure(struct fs_context *fc) sync_filesystem(sb); - proc_apply_options(fs_info, fc, current_user_ns()); - return 0; + return proc_apply_options(fs_info, fc, current_user_ns()); } static int proc_get_tree(struct fs_context *fc) -- cgit v1.2.3 From 05dab768fc2dc7eb9b827201bb39bb5be54bce49 Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Mon, 27 Apr 2026 10:26:07 +0200 Subject: proc: handle subset=pid separately in userns visibility checks When procfs is mounted with subset=pid, only the dynamic process-related part of the filesystem remains visible. That part cannot be hidden by overmounts, so checking whether an existing procfs mount is fully visible does not make sense for this mode. At the same time, a subset=pid procfs mount must not be used as evidence that a later procfs mount would not reveal additional information. It provides a restricted view of procfs, not the full filesystem view. Mark subset=pid procfs instances as restricted variants. Ignore restricted variants when looking for an already-visible mount, and allow new restricted variants without consulting mnt_already_visible(). Signed-off-by: Alexey Gladkov Link: https://patch.msgid.link/4d5e760c3d534dd2e05578d119cc408450053a98.1777278334.git.legion@kernel.org Reviewed-by: Aleksa Sarai Signed-off-by: Christian Brauner --- fs/namespace.c | 17 ++++++++++++++++- fs/proc/root.c | 3 +++ include/linux/fs/super_types.h | 1 + 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/fs/namespace.c b/fs/namespace.c index e6e58117c778..9a66a806a9b8 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -6353,10 +6353,18 @@ static bool mnt_already_visible(struct mnt_namespace *ns, guard(namespace_shared)(); hlist_for_each_entry(mnt, &ns->mnt_visible_mounts, mnt_ns_visible) { + const struct super_block *sb_visible = mnt->mnt.mnt_sb; struct mount *child; int mnt_flags; - if (mnt->mnt.mnt_sb->s_type != sb->s_type) + if (sb_visible->s_type != sb->s_type) + continue; + + /* + * Restricted variants are not compatible with anything, even + * other restricted variants. + */ + if (sb_visible->s_iflags & SB_I_RESTRICTED_VARIANT) continue; /* A local view of the mount flags */ @@ -6418,6 +6426,13 @@ static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags return true; } + /* + * Restricted variants don't need an already visible mount because they + * don't expose the full filesystem view. + */ + if (s_iflags & SB_I_RESTRICTED_VARIANT) + return false; + return !mnt_already_visible(ns, sb, new_mnt_flags); } diff --git a/fs/proc/root.c b/fs/proc/root.c index 1bf75a4ee146..99adddfeb4a4 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -275,6 +275,9 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc) s->s_time_gran = 1; s->s_fs_info = fs_info; + if (fs_info->pidonly == PROC_PIDONLY_ON) + s->s_iflags |= SB_I_RESTRICTED_VARIANT; + /* * procfs isn't actually a stacking filesystem; however, there is * too much magic going on inside it to permit stacking things on diff --git a/include/linux/fs/super_types.h b/include/linux/fs/super_types.h index 182efbeb9520..a6cdc8f6de4e 100644 --- a/include/linux/fs/super_types.h +++ b/include/linux/fs/super_types.h @@ -326,6 +326,7 @@ struct super_block { #define SB_I_STABLE_WRITES 0x00000008 /* don't modify blks until WB is done */ /* sb->s_iflags to limit user namespace mounts */ +#define SB_I_RESTRICTED_VARIANT 0x00000010 #define SB_I_IMA_UNVERIFIABLE_SIGNATURE 0x00000020 #define SB_I_UNTRUSTED_MOUNTER 0x00000040 #define SB_I_EVM_HMAC_UNSUPPORTED 0x00000080 -- cgit v1.2.3 From c5dffafb426f927db1630140552dc11d6f76e1a6 Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Mon, 27 Apr 2026 10:26:08 +0200 Subject: docs: proc: add documentation about mount restrictions procfs has a number of mounting restrictions that are not documented anywhere. Signed-off-by: Alexey Gladkov Link: https://patch.msgid.link/e7cb804df3c1759ee17cf9df1dc4c211d63d7a5f.1777278334.git.legion@kernel.org Reviewed-by: Aleksa Sarai Signed-off-by: Christian Brauner --- Documentation/filesystems/proc.rst | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index db6167befb7b..5006644c1d19 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -52,6 +52,7 @@ fixes/update part 1.1 Stefani Seibold June 9 2009 4 Configuring procfs 4.1 Mount options + 4.2 Mount restrictions 5 Filesystem behavior @@ -2425,7 +2426,9 @@ prohibited by hidepid=. If you use some daemon like identd which needs to learn information about processes information, just add identd to this group. subset=pid hides all top level files and directories in the procfs that -are not related to tasks. +are not related to tasks. This option cannot be changed on an existing +procfs instance because overmounts that existed before the change could +otherwise remain reachable after the top level procfs entries are hidden. pidns= specifies a pid namespace (either as a string path to something like `/proc/$pid/ns/pid`, or a file descriptor when using `FSCONFIG_SET_FD`) that @@ -2434,6 +2437,20 @@ will use the calling process's active pid namespace. Note that the pid namespace of an existing procfs instance cannot be modified (attempting to do so will give an `-EBUSY` error). +4.2 Mount restrictions +-------------------------- + +If user namespaces are in use, the kernel additionally checks the instances of +procfs available to the mounter and will not allow procfs to be mounted if: + + 1. This mount is not fully visible unless the new procfs is going to be + mounted with subset=pid option. + + a. Its root directory is not the root directory of the filesystem. + b. If any file or non-empty procfs directory is hidden by another mount. + + 2. A new mount overrides the readonly option or any option from atime family. + Chapter 5: Filesystem behavior ============================== -- cgit v1.2.3 From 6650527444dadc63d84aa939d14ecba4fadb2f69 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 18 May 2026 18:35:15 +0200 Subject: proc: protect ptrace_may_access() with exec_update_lock (part 1) Fix the easy cases where procfs currently calls ptrace_may_access() without exec_update_lock protection, where the fix is to simply add the extra lock or use mm_access(): - do_task_stat(): grab exec_update_lock - proc_pid_wchan(): grab exec_update_lock - proc_map_files_lookup(): use mm_access() instead of get_task_mm() - proc_map_files_readdir(): use mm_access() instead of get_task_mm() - proc_ns_get_link(): grab exec_update_lock - proc_ns_readlink(): grab exec_update_lock Fixes: f83ce3e6b02d ("proc: avoid information leaks to non-privileged processes") Cc: stable@vger.kernel.org Signed-off-by: Jann Horn Link: https://patch.msgid.link/20260518-procfs-lockfix-part1-v1-1-5c3d20e0ac33@google.com Signed-off-by: Christian Brauner (Amutable) --- fs/proc/array.c | 6 ++++++ fs/proc/base.c | 41 ++++++++++++++++++++++------------------- fs/proc/namespaces.c | 12 ++++++++++++ 3 files changed, 40 insertions(+), 19 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index 90fb0c6b5f99..479ea8cb4ef4 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -482,6 +482,11 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, unsigned long flags; int exit_code = task->exit_code; struct signal_struct *sig = task->signal; + int ret; + + ret = down_read_killable(&task->signal->exec_update_lock); + if (ret) + return ret; state = *get_task_state(task); vsize = eip = esp = 0; @@ -657,6 +662,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, seq_puts(m, " 0"); seq_putc(m, '\n'); + up_read(&task->signal->exec_update_lock); if (mm) mmput(mm); return 0; diff --git a/fs/proc/base.c b/fs/proc/base.c index d9acfa89c894..cc99d953a0a3 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -423,18 +423,24 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, { unsigned long wchan; char symname[KSYM_NAME_LEN]; + int err; + err = down_read_killable(&task->signal->exec_update_lock); + if (err) + return err; if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) goto print0; wchan = get_wchan(task); if (wchan && !lookup_symbol_name(wchan, symname)) { seq_puts(m, symname); + up_read(&task->signal->exec_update_lock); return 0; } print0: seq_putc(m, '0'); + up_read(&task->signal->exec_update_lock); return 0; } #endif /* CONFIG_KALLSYMS */ @@ -2360,17 +2366,15 @@ static struct dentry *proc_map_files_lookup(struct inode *dir, if (!task) goto out; - result = ERR_PTR(-EACCES); - if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) - goto out_put_task; - result = ERR_PTR(-ENOENT); if (dname_to_vma_addr(dentry, &vm_start, &vm_end)) goto out_put_task; - mm = get_task_mm(task); - if (!mm) + mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); + if (IS_ERR(mm)) { + result = ERR_CAST(mm); goto out_put_task; + } result = ERR_PTR(-EINTR); if (mmap_read_lock_killable(mm)) @@ -2420,23 +2424,22 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) if (!task) goto out; - ret = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) - goto out_put_task; - ret = 0; if (!dir_emit_dots(file, ctx)) goto out_put_task; - mm = get_task_mm(task); - if (!mm) + mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); + if (IS_ERR(mm)) { + ret = PTR_ERR(mm); + /* if the task has no mm, the directory should just be empty */ + if (ret == -ESRCH) + ret = 0; goto out_put_task; + } ret = mmap_read_lock_killable(mm); - if (ret) { - mmput(mm); - goto out_put_task; - } + if (ret) + goto out_put_mm; nr_files = 0; @@ -2462,8 +2465,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) if (!p) { ret = -ENOMEM; mmap_read_unlock(mm); - mmput(mm); - goto out_put_task; + goto out_put_mm; } p->start = vma->vm_start; @@ -2471,7 +2473,6 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) p->mode = vma->vm_file->f_mode; } mmap_read_unlock(mm); - mmput(mm); for (i = 0; i < nr_files; i++) { char buf[4 * sizeof(long) + 2]; /* max: %lx-%lx\0 */ @@ -2488,6 +2489,8 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) ctx->pos++; } +out_put_mm: + mmput(mm); out_put_task: put_task_struct(task); out: diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 39f4169f669f..2f46f1396744 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -55,6 +55,10 @@ static const char *proc_ns_get_link(struct dentry *dentry, if (!task) return ERR_PTR(-EACCES); + error = down_read_killable(&task->signal->exec_update_lock); + if (error) + goto out_put_task; + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) goto out; @@ -64,6 +68,8 @@ static const char *proc_ns_get_link(struct dentry *dentry, error = nd_jump_link(&ns_path); out: + up_read(&task->signal->exec_update_lock); +out_put_task: put_task_struct(task); return ERR_PTR(error); } @@ -80,11 +86,17 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl if (!task) return res; + res = down_read_killable(&task->signal->exec_update_lock); + if (res) + goto out_put_task; + if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { res = ns_get_name(name, sizeof(name), task, ns_ops); if (res >= 0) res = readlink_copy(buffer, buflen, name, strlen(name)); } + up_read(&task->signal->exec_update_lock); +out_put_task: put_task_struct(task); return res; } -- cgit v1.2.3 From 6255da28d4bb5349fe18e84cb043ccd394eba75d Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 18 May 2026 18:35:16 +0200 Subject: proc: protect ptrace_may_access() with exec_update_lock (FD links) proc_pid_get_link() and proc_pid_readlink() currently look up the task from the pid once, then do the ptrace access check on that task, then look up the task from the pid a second time to do the actual access. That's racy in several ways. To fix it, pass the task to the ->proc_get_link() handler, and instead of proc_fd_access_allowed(), introduce a new helper call_proc_get_link() that looks up and locks the task, does the access check, and calls ->proc_get_link(). Fixes: 778c1144771f ("[PATCH] proc: Use sane permission checks on the /proc//fd/ symlinks") Cc: stable@vger.kernel.org Signed-off-by: Jann Horn Link: https://patch.msgid.link/20260518-procfs-lockfix-part1-v1-2-5c3d20e0ac33@google.com Signed-off-by: Christian Brauner (Amutable) --- fs/proc/base.c | 119 +++++++++++++++++++++-------------------------------- fs/proc/fd.c | 27 +++++------- fs/proc/internal.h | 2 +- 3 files changed, 59 insertions(+), 89 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index cc99d953a0a3..5042f14d24f3 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -218,33 +218,24 @@ static int get_task_root(struct task_struct *task, struct path *root) return result; } -static int proc_cwd_link(struct dentry *dentry, struct path *path) +static int proc_cwd_link(struct dentry *dentry, struct path *path, + struct task_struct *task) { - struct task_struct *task = get_proc_task(d_inode(dentry)); int result = -ENOENT; - if (task) { - task_lock(task); - if (task->fs) { - get_fs_pwd(task->fs, path); - result = 0; - } - task_unlock(task); - put_task_struct(task); + task_lock(task); + if (task->fs) { + get_fs_pwd(task->fs, path); + result = 0; } + task_unlock(task); return result; } -static int proc_root_link(struct dentry *dentry, struct path *path) +static int proc_root_link(struct dentry *dentry, struct path *path, + struct task_struct *task) { - struct task_struct *task = get_proc_task(d_inode(dentry)); - int result = -ENOENT; - - if (task) { - result = get_task_root(task, path); - put_task_struct(task); - } - return result; + return get_task_root(task, path); } /* @@ -710,23 +701,6 @@ static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns, /* Here the fs part begins */ /************************************************************************/ -/* permission checks */ -static bool proc_fd_access_allowed(struct inode *inode) -{ - struct task_struct *task; - bool allowed = false; - /* Allow access to a task's file descriptors if it is us or we - * may use ptrace attach to the process and find out that - * information. - */ - task = get_proc_task(inode); - if (task) { - allowed = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS); - put_task_struct(task); - } - return allowed; -} - int proc_nochmod_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { @@ -1783,16 +1757,12 @@ static const struct file_operations proc_pid_set_comm_operations = { .release = single_release, }; -static int proc_exe_link(struct dentry *dentry, struct path *exe_path) +static int proc_exe_link(struct dentry *dentry, struct path *exe_path, + struct task_struct *task) { - struct task_struct *task; struct file *exe_file; - task = get_proc_task(d_inode(dentry)); - if (!task) - return -ENOENT; exe_file = get_task_exe_file(task); - put_task_struct(task); if (exe_file) { *exe_path = exe_file->f_path; path_get(&exe_file->f_path); @@ -1802,26 +1772,42 @@ static int proc_exe_link(struct dentry *dentry, struct path *exe_path) return -ENOENT; } +static int call_proc_get_link(struct dentry *dentry, struct inode *inode, struct path *path_out) +{ + struct task_struct *task; + int ret; + + task = get_proc_task(inode); + if (!task) + return -ENOENT; + ret = down_read_killable(&task->signal->exec_update_lock); + if (ret) + goto out_put_task; + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { + ret = -EACCES; + goto out; + } + ret = PROC_I(inode)->op.proc_get_link(dentry, path_out, task); + +out: + up_read(&task->signal->exec_update_lock); +out_put_task: + put_task_struct(task); + return ret; +} + static const char *proc_pid_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { struct path path; - int error = -EACCES; + int error; if (!dentry) return ERR_PTR(-ECHILD); - - /* Are we allowed to snoop on the tasks file descriptors? */ - if (!proc_fd_access_allowed(inode)) - goto out; - - error = PROC_I(inode)->op.proc_get_link(dentry, &path); - if (error) - goto out; - - error = nd_jump_link(&path); -out: + error = call_proc_get_link(dentry, inode, &path); + if (!error) + error = nd_jump_link(&path); return ERR_PTR(error); } @@ -1855,17 +1841,11 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b struct inode *inode = d_inode(dentry); struct path path; - /* Are we allowed to snoop on the tasks file descriptors? */ - if (!proc_fd_access_allowed(inode)) - goto out; - - error = PROC_I(inode)->op.proc_get_link(dentry, &path); - if (error) - goto out; - - error = do_proc_readlink(&path, buffer, buflen); - path_put(&path); -out: + error = call_proc_get_link(dentry, inode, &path); + if (!error) { + error = do_proc_readlink(&path, buffer, buflen); + path_put(&path); + } return error; } @@ -2256,21 +2236,16 @@ static const struct dentry_operations tid_map_files_dentry_operations = { .d_delete = pid_delete_dentry, }; -static int map_files_get_link(struct dentry *dentry, struct path *path) +static int map_files_get_link(struct dentry *dentry, struct path *path, + struct task_struct *task) { unsigned long vm_start, vm_end; struct vm_area_struct *vma; - struct task_struct *task; struct mm_struct *mm; int rc; rc = -ENOENT; - task = get_proc_task(d_inode(dentry)); - if (!task) - goto out; - mm = get_task_mm(task); - put_task_struct(task); if (!mm) goto out; diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 05c7513e77c7..0f9a1556f2a3 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -171,24 +171,19 @@ static const struct dentry_operations tid_fd_dentry_operations = { .d_delete = pid_delete_dentry, }; -static int proc_fd_link(struct dentry *dentry, struct path *path) +static int proc_fd_link(struct dentry *dentry, struct path *path, + struct task_struct *task) { - struct task_struct *task; int ret = -ENOENT; - - task = get_proc_task(d_inode(dentry)); - if (task) { - unsigned int fd = proc_fd(d_inode(dentry)); - struct file *fd_file; - - fd_file = fget_task(task, fd); - if (fd_file) { - *path = fd_file->f_path; - path_get(&fd_file->f_path); - ret = 0; - fput(fd_file); - } - put_task_struct(task); + unsigned int fd = proc_fd(d_inode(dentry)); + struct file *fd_file; + + fd_file = fget_task(task, fd); + if (fd_file) { + *path = fd_file->f_path; + path_get(&fd_file->f_path); + ret = 0; + fput(fd_file); } return ret; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 1edbabbdbc5d..b232e1098117 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -110,7 +110,7 @@ extern struct kmem_cache *proc_dir_entry_cache; void pde_free(struct proc_dir_entry *pde); union proc_op { - int (*proc_get_link)(struct dentry *, struct path *); + int (*proc_get_link)(struct dentry *, struct path *, struct task_struct *); int (*proc_show)(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task); -- cgit v1.2.3 From cf30ceccfaec3d2549ff60f7c915625f12dd3a93 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 10 Jun 2026 16:39:54 +0200 Subject: fs: fix ups and tidy ups to /proc/filesystems caching - add missing unlocks in some corner cases - whitespace touch ups - s/smp_store_release/rcu_assign_pointer/ [nop, the macro expands to the same thing] - mark file_systems_string as __read_mostly Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20260610143954.34185-1-mjguzik@gmail.com Signed-off-by: Christian Brauner (Amutable) --- fs/filesystems.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/fs/filesystems.c b/fs/filesystems.c index 771fc31a69b8..673a03b5f32b 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -54,7 +54,7 @@ struct file_systems_string { }; static unsigned long file_systems_gen; -static struct file_systems_string __rcu *file_systems_string; +static struct file_systems_string __read_mostly __rcu *file_systems_string; static void invalidate_filesystems_string(void); #else @@ -269,7 +269,7 @@ retry: hlist_for_each_entry_rcu(p, &file_systems, list) { if (!(p->fs_flags & FS_REQUIRES_DEV)) newlen += strlen("nodev"); - newlen += strlen("\t") + strlen(p->name) + strlen("\n"); + newlen += strlen("\t") + strlen(p->name) + strlen("\n"); } spin_unlock(&file_systems_lock); @@ -289,6 +289,7 @@ retry: * Did someone beat us to it? */ if (old && old->gen == file_systems_gen) { + spin_unlock(&file_systems_lock); kfree(new); return 0; } @@ -297,6 +298,7 @@ retry: * Did the list change in the meantime? */ if (gen != file_systems_gen) { + spin_unlock(&file_systems_lock); kfree(new); goto retry; } @@ -321,15 +323,11 @@ retry: * generation above and messes it up. */ spin_unlock(&file_systems_lock); - if (old) - kfree_rcu(old, rcu); + kfree(new); return -EINVAL; } - /* - * Paired with consume fence in READ_ONCE() in filesystems_proc_show() - */ - smp_store_release(&file_systems_string, new); + rcu_assign_pointer(file_systems_string, new); spin_unlock(&file_systems_lock); if (old) kfree_rcu(old, rcu); -- cgit v1.2.3