diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2026-06-15 01:37:58 +0300 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2026-06-15 01:37:58 +0300 |
| commit | 79169a1624253363fed3e9a447b77e50bb226206 (patch) | |
| tree | e7c5853529da0bd435d04d2445e4d2bed9050021 | |
| parent | 7e0e7bd60d4a812b694c477716597fcb038b00cb (diff) | |
| parent | cf30ceccfaec3d2549ff60f7c915625f12dd3a93 (diff) | |
| download | linux-79169a1624253363fed3e9a447b77e50bb226206.tar.xz | |
Merge tag 'vfs-7.2-rc1.procfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull procfs updates from Christian Brauner:
- Revamp fs/filesystems.c
The file was a mess with a hand-rolled linked list in desperate need
of a cleanup. The filesystems list is now RCU-ified, /proc files can
be marked permanent from outside fs/proc/, and the string emitted
when reading /proc/filesystems is pre-generated and cached instead of
pointer-chasing and printfing entry by entry on every read.
The file is read frequently because libselinux reads it and is linked
into numerous frequently used programs (even ones you would not
suspect, like sed!). Scalability also improves since reference
maintenance on open/close is bypassed.
open+read+close cycle single-threaded (ops/s):
before: 442732
after: 1063462 (+140%)
open+read+close cycle with 20 processes (ops/s):
before: 606177
after: 3300576 (+444%)
A follow-up patch adds missing unlocks in some corner cases and
tidies things up.
- Relax the mount visibility check for subset=pid mounts
When procfs is mounted with subset=pid, all static files become
unavailable and only the dynamic pid information is accessible. In
that case there is no point in imposing the full mount visibility
restrictions on the mounter - everything that can be hidden in procfs
is already inaccessible. These restrictions prevented procfs from
being mounted inside rootless containers since almost all container
implementations overmount parts of procfs to hide certain
directories.
As part of this /proc/self/net is only shown in subset=pid mounts for
CAP_NET_ADMIN, reconfiguring subset=pid is rejected, the
SB_I_USERNS_VISIBLE superblock flag is replaced with an
FS_USERNS_MOUNT_RESTRICTED filesystem flag, fully visible mounts are
recorded in a list, and the mount restrictions are finally
documented.
- Protect ptrace_may_access() with exec_update_lock in procfs
Most uses of ptrace_may_access() in procfs should hold
exec_update_lock to avoid TOCTOU issues with concurrent privileged
execve() (like setuid binary execution).
This fixes the easy cases - the owner and visibility checks and the
FD link permission checks - with the gnarlier ones to follow later.
* tag 'vfs-7.2-rc1.procfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
fs: fix ups and tidy ups to /proc/filesystems caching
proc: protect ptrace_may_access() with exec_update_lock (FD links)
proc: protect ptrace_may_access() with exec_update_lock (part 1)
docs: proc: add documentation about mount restrictions
proc: handle subset=pid separately in userns visibility checks
proc: prevent reconfiguring subset=pid
proc: subset=pid: Show /proc/self/net only for CAP_NET_ADMIN
fs: cache the string generated by reading /proc/filesystems
sysfs: remove trivial sysfs_get_tree() wrapper
fs: RCU-ify filesystems list
fs: move SB_I_USERNS_VISIBLE to FS_USERNS_MOUNT_RESTRICTED
proc: allow to mark /proc files permanent outside of fs/proc/
namespace: record fully visible mounts in list
| -rw-r--r-- | Documentation/filesystems/proc.rst | 19 | ||||
| -rw-r--r-- | fs/filesystems.c | 330 | ||||
| -rw-r--r-- | fs/mount.h | 4 | ||||
| -rw-r--r-- | fs/namespace.c | 34 | ||||
| -rw-r--r-- | fs/ocfs2/super.c | 1 | ||||
| -rw-r--r-- | fs/proc/array.c | 6 | ||||
| -rw-r--r-- | fs/proc/base.c | 160 | ||||
| -rw-r--r-- | fs/proc/fd.c | 27 | ||||
| -rw-r--r-- | fs/proc/generic.c | 10 | ||||
| -rw-r--r-- | fs/proc/internal.h | 5 | ||||
| -rw-r--r-- | fs/proc/namespaces.c | 12 | ||||
| -rw-r--r-- | fs/proc/proc_net.c | 8 | ||||
| -rw-r--r-- | fs/proc/root.c | 24 | ||||
| -rw-r--r-- | fs/sysfs/mount.c | 18 | ||||
| -rw-r--r-- | include/linux/fs.h | 3 | ||||
| -rw-r--r-- | include/linux/fs/super_types.h | 2 | ||||
| -rw-r--r-- | include/linux/proc_fs.h | 13 | ||||
| -rw-r--r-- | kernel/acct.c | 2 |
18 files changed, 429 insertions, 249 deletions
diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index db6167befb7b..5006644c1d19 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -52,6 +52,7 @@ fixes/update part 1.1 Stefani Seibold <stefani@seibold.net> June 9 2009 4 Configuring procfs 4.1 Mount options + 4.2 Mount restrictions 5 Filesystem behavior @@ -2425,7 +2426,9 @@ prohibited by hidepid=. If you use some daemon like identd which needs to learn information about processes information, just add identd to this group. subset=pid hides all top level files and directories in the procfs that -are not related to tasks. +are not related to tasks. This option cannot be changed on an existing +procfs instance because overmounts that existed before the change could +otherwise remain reachable after the top level procfs entries are hidden. pidns= specifies a pid namespace (either as a string path to something like `/proc/$pid/ns/pid`, or a file descriptor when using `FSCONFIG_SET_FD`) that @@ -2434,6 +2437,20 @@ will use the calling process's active pid namespace. Note that the pid namespace of an existing procfs instance cannot be modified (attempting to do so will give an `-EBUSY` error). +4.2 Mount restrictions +-------------------------- + +If user namespaces are in use, the kernel additionally checks the instances of +procfs available to the mounter and will not allow procfs to be mounted if: + + 1. This mount is not fully visible unless the new procfs is going to be + mounted with subset=pid option. + + a. Its root directory is not the root directory of the filesystem. + b. If any file or non-empty procfs directory is hidden by another mount. + + 2. A new mount overrides the readonly option or any option from atime family. + Chapter 5: Filesystem behavior ============================== diff --git a/fs/filesystems.c b/fs/filesystems.c index 0c7d2b7ac26c..673a03b5f32b 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -17,22 +17,49 @@ #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/fs_parser.h> +#include <linux/rculist.h> /* - * Handling of filesystem drivers list. - * Rules: - * Inclusion to/removals from/scanning of list are protected by spinlock. - * During the unload module must call unregister_filesystem(). - * We can access the fields of list element if: - * 1) spinlock is held or - * 2) we hold the reference to the module. - * The latter can be guaranteed by call of try_module_get(); if it - * returned 0 we must skip the element, otherwise we got the reference. - * Once the reference is obtained we can drop the spinlock. + * Read-mostly filesystem drivers list. + * + * Readers walk under rcu_read_lock(); writers take file_systems_lock + * and publish via _rcu hlist primitives. unregister_filesystem() + * synchronize_rcu()s after unlock so the embedded file_system_type + * can't go away under a reader. To keep using a filesystem after + * the RCU section ends, take a module reference via try_module_get(). + */ +static HLIST_HEAD(file_systems); +static DEFINE_SPINLOCK(file_systems_lock); + +#ifdef CONFIG_PROC_FS +/* + * Cache a stringified version of the filesystem list. + * + * The fs list gets queried a lot by userspace because of libselinux, including + * rather surprising programs (would you guess *sed* is on the list?). In order + * to reduce the overhead we cache the resulting string, which normally hangs + * around below 512 bytes in size. + * + * As the list almost never changes, its creation is not particularly optimized + * to keep things simple. + * + * We sort it out on read in order to not introduce a failure point for fs + * registration (in principle we may be unable to alloc memory for the list). */ +struct file_systems_string { + struct rcu_head rcu; + unsigned long gen; + size_t len; + char string[]; +}; + +static unsigned long file_systems_gen; +static struct file_systems_string __read_mostly __rcu *file_systems_string; -static struct file_system_type *file_systems; -static DEFINE_RWLOCK(file_systems_lock); +static void invalidate_filesystems_string(void); +#else +static inline void invalidate_filesystems_string(void) { } +#endif /* WARNING: This can be used only if we _already_ own a reference */ struct file_system_type *get_filesystem(struct file_system_type *fs) @@ -46,14 +73,15 @@ void put_filesystem(struct file_system_type *fs) module_put(fs->owner); } -static struct file_system_type **find_filesystem(const char *name, unsigned len) +static struct file_system_type *find_filesystem(const char *name, unsigned len) { - struct file_system_type **p; - for (p = &file_systems; *p; p = &(*p)->next) - if (strncmp((*p)->name, name, len) == 0 && - !(*p)->name[len]) - break; - return p; + struct file_system_type *fs; + + hlist_for_each_entry_rcu(fs, &file_systems, list, + lockdep_is_held(&file_systems_lock)) + if (strncmp(fs->name, name, len) == 0 && !fs->name[len]) + return fs; + return NULL; } /** @@ -64,33 +92,27 @@ static struct file_system_type **find_filesystem(const char *name, unsigned len) * is aware of for mount and other syscalls. Returns 0 on success, * or a negative errno code on an error. * - * The &struct file_system_type that is passed is linked into the kernel + * The &struct file_system_type that is passed is linked into the kernel * structures and must not be freed until the file system has been * unregistered. */ - -int register_filesystem(struct file_system_type * fs) +int register_filesystem(struct file_system_type *fs) { - int res = 0; - struct file_system_type ** p; - if (fs->parameters && !fs_validate_description(fs->name, fs->parameters)) return -EINVAL; BUG_ON(strchr(fs->name, '.')); - if (fs->next) + if (!hlist_unhashed_lockless(&fs->list)) return -EBUSY; - write_lock(&file_systems_lock); - p = find_filesystem(fs->name, strlen(fs->name)); - if (*p) - res = -EBUSY; - else - *p = fs; - write_unlock(&file_systems_lock); - return res; -} + guard(spinlock)(&file_systems_lock); + if (find_filesystem(fs->name, strlen(fs->name))) + return -EBUSY; + hlist_add_tail_rcu(&fs->list, &file_systems); + invalidate_filesystems_string(); + return 0; +} EXPORT_SYMBOL(register_filesystem); /** @@ -100,94 +122,79 @@ EXPORT_SYMBOL(register_filesystem); * Remove a file system that was previously successfully registered * with the kernel. An error is returned if the file system is not found. * Zero is returned on a success. - * + * * Once this function has returned the &struct file_system_type structure * may be freed or reused. */ - -int unregister_filesystem(struct file_system_type * fs) +int unregister_filesystem(struct file_system_type *fs) { - struct file_system_type ** tmp; - - write_lock(&file_systems_lock); - tmp = &file_systems; - while (*tmp) { - if (fs == *tmp) { - *tmp = fs->next; - fs->next = NULL; - write_unlock(&file_systems_lock); - synchronize_rcu(); - return 0; - } - tmp = &(*tmp)->next; + scoped_guard(spinlock, &file_systems_lock) { + if (hlist_unhashed(&fs->list)) + return -EINVAL; + hlist_del_init_rcu(&fs->list); + invalidate_filesystems_string(); } - write_unlock(&file_systems_lock); - - return -EINVAL; + synchronize_rcu(); + return 0; } - EXPORT_SYMBOL(unregister_filesystem); #ifdef CONFIG_SYSFS_SYSCALL -static int fs_index(const char __user * __name) +static int fs_index(const char __user *__name) { - struct file_system_type * tmp; + struct file_system_type *p; char *name __free(kfree) = strndup_user(__name, PATH_MAX); - int err, index; + int index = 0; if (IS_ERR(name)) return PTR_ERR(name); - err = -EINVAL; - read_lock(&file_systems_lock); - for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) { - if (strcmp(tmp->name, name) == 0) { - err = index; - break; - } + guard(rcu)(); + hlist_for_each_entry_rcu(p, &file_systems, list) { + if (strcmp(p->name, name) == 0) + return index; + index++; } - read_unlock(&file_systems_lock); - return err; + return -EINVAL; } -static int fs_name(unsigned int index, char __user * buf) +static int fs_name(unsigned int index, char __user *buf) { - struct file_system_type * tmp; - int len, res = -EINVAL; - - read_lock(&file_systems_lock); - for (tmp = file_systems; tmp; tmp = tmp->next, index--) { - if (index == 0) { - if (try_module_get(tmp->owner)) - res = 0; + struct file_system_type *p, *found = NULL; + int len, res; + + scoped_guard(rcu) { + hlist_for_each_entry_rcu(p, &file_systems, list) { + if (index--) + continue; + if (try_module_get(p->owner)) + found = p; break; } } - read_unlock(&file_systems_lock); - if (res) - return res; + if (!found) + return -EINVAL; /* OK, we got the reference, so we can safely block */ - len = strlen(tmp->name) + 1; - res = copy_to_user(buf, tmp->name, len) ? -EFAULT : 0; - put_filesystem(tmp); + len = strlen(found->name) + 1; + res = copy_to_user(buf, found->name, len) ? -EFAULT : 0; + put_filesystem(found); return res; } static int fs_maxindex(void) { - struct file_system_type * tmp; - int index; + struct file_system_type *p; + int index = 0; - read_lock(&file_systems_lock); - for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next, index++) - ; - read_unlock(&file_systems_lock); + guard(rcu)(); + hlist_for_each_entry_rcu(p, &file_systems, list) + index++; return index; } /* - * Whee.. Weird sysv syscall. + * Whee.. Weird sysv syscall. */ SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2) { @@ -216,8 +223,8 @@ int __init list_bdev_fs_names(char *buf, size_t size) size_t len; int count = 0; - read_lock(&file_systems_lock); - for (p = file_systems; p; p = p->next) { + guard(rcu)(); + hlist_for_each_entry_rcu(p, &file_systems, list) { if (!(p->fs_flags & FS_REQUIRES_DEV)) continue; len = strlen(p->name) + 1; @@ -230,30 +237,143 @@ int __init list_bdev_fs_names(char *buf, size_t size) size -= len; count++; } - read_unlock(&file_systems_lock); return count; } #ifdef CONFIG_PROC_FS -static int filesystems_proc_show(struct seq_file *m, void *v) +static void invalidate_filesystems_string(void) +{ + struct file_systems_string *old; + + lockdep_assert_held_write(&file_systems_lock); + file_systems_gen++; + old = rcu_replace_pointer(file_systems_string, NULL, + lockdep_is_held(&file_systems_lock)); + if (old) + kfree_rcu(old, rcu); +} + +static __cold noinline int regen_filesystems_string(void) +{ + struct file_system_type *p; + struct file_systems_string *old, *new; + size_t newlen, usedlen; + unsigned long gen; + +retry: + newlen = 0; + + /* pre-calc space for each fs */ + spin_lock(&file_systems_lock); + gen = file_systems_gen; + hlist_for_each_entry_rcu(p, &file_systems, list) { + if (!(p->fs_flags & FS_REQUIRES_DEV)) + newlen += strlen("nodev"); + newlen += strlen("\t") + strlen(p->name) + strlen("\n"); + } + spin_unlock(&file_systems_lock); + + new = kmalloc(offsetof(struct file_systems_string, string) + newlen + 1, + GFP_KERNEL); + if (!new) + return -ENOMEM; + + new->gen = gen; + new->len = newlen; + new->string[newlen] = '\0'; + + spin_lock(&file_systems_lock); + old = file_systems_string; + + /* + * Did someone beat us to it? + */ + if (old && old->gen == file_systems_gen) { + spin_unlock(&file_systems_lock); + kfree(new); + return 0; + } + + /* + * Did the list change in the meantime? + */ + if (gen != file_systems_gen) { + spin_unlock(&file_systems_lock); + kfree(new); + goto retry; + } + + /* + * Populate the string. + * + * We know we have just enough space because we calculated the right + * size the previous time we had the lock and confirmed the list has + * not changed after reacquiring it. + */ + usedlen = 0; + hlist_for_each_entry_rcu(p, &file_systems, list) { + usedlen += sprintf(&new->string[usedlen], "%s\t%s\n", + (p->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev", + p->name); + } + + if (WARN_ON_ONCE(new->len != strlen(new->string))) { + /* + * Should never happen of course, keep this in case someone changes string + * generation above and messes it up. + */ + spin_unlock(&file_systems_lock); + kfree(new); + return -EINVAL; + } + + rcu_assign_pointer(file_systems_string, new); + spin_unlock(&file_systems_lock); + if (old) + kfree_rcu(old, rcu); + return 0; +} + +static __cold noinline int filesystems_proc_show_fallback(struct seq_file *m, void *v) { - struct file_system_type * tmp; + struct file_system_type *p; - read_lock(&file_systems_lock); - tmp = file_systems; - while (tmp) { + guard(rcu)(); + hlist_for_each_entry_rcu(p, &file_systems, list) { seq_printf(m, "%s\t%s\n", - (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev", - tmp->name); - tmp = tmp->next; + (p->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev", + p->name); } - read_unlock(&file_systems_lock); return 0; } +static int filesystems_proc_show(struct seq_file *m, void *v) +{ + struct file_systems_string *fss; + + for (;;) { + scoped_guard(rcu) { + fss = rcu_dereference(file_systems_string); + if (likely(fss)) { + seq_write(m, fss->string, fss->len); + return 0; + } + } + + int err = regen_filesystems_string(); + if (unlikely(err)) + return filesystems_proc_show_fallback(m, v); + } +} + static int __init proc_filesystems_init(void) { - proc_create_single("filesystems", 0, NULL, filesystems_proc_show); + struct proc_dir_entry *pde; + + pde = proc_create_single("filesystems", 0, NULL, filesystems_proc_show); + if (!pde) + return -ENOMEM; + proc_make_permanent(pde); return 0; } module_init(proc_filesystems_init); @@ -263,11 +383,10 @@ static struct file_system_type *__get_fs_type(const char *name, int len) { struct file_system_type *fs; - read_lock(&file_systems_lock); - fs = *(find_filesystem(name, len)); + guard(rcu)(); + fs = find_filesystem(name, len); if (fs && !try_module_get(fs->owner)) fs = NULL; - read_unlock(&file_systems_lock); return fs; } @@ -291,5 +410,4 @@ struct file_system_type *get_fs_type(const char *name) } return fs; } - EXPORT_SYMBOL(get_fs_type); diff --git a/fs/mount.h b/fs/mount.h index 5c120f8361bd..94fcc306d21e 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -25,6 +25,7 @@ struct mnt_namespace { __u32 n_fsnotify_mask; struct fsnotify_mark_connector __rcu *n_fsnotify_marks; #endif + struct hlist_head mnt_visible_mounts; /* SB_I_USERNS_VISIBLE mounts */ unsigned int nr_mounts; /* # of mounts in the namespace */ unsigned int pending_mounts; refcount_t passive; /* number references not pinning @mounts */ @@ -98,6 +99,7 @@ struct mount { int mnt_expiry_mark; /* true if marked for expiry */ struct hlist_head mnt_pins; struct hlist_head mnt_stuck_children; + struct hlist_node mnt_ns_visible; /* link in ns->mnt_visible_mounts */ struct mount *overmount; /* mounted on ->mnt_root */ } __randomize_layout; @@ -215,6 +217,8 @@ static inline void move_from_ns(struct mount *mnt) ns->mnt_first_node = rb_next(&mnt->mnt_node); rb_erase(&mnt->mnt_node, &ns->mounts); RB_CLEAR_NODE(&mnt->mnt_node); + if (!hlist_unhashed(&mnt->mnt_ns_visible)) + hlist_del_init(&mnt->mnt_ns_visible); } bool has_locked_children(struct mount *mnt, struct dentry *dentry); diff --git a/fs/namespace.c b/fs/namespace.c index d4cf40198e92..3d5cd5bf3b05 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -321,6 +321,7 @@ static struct mount *alloc_vfsmnt(const char *name) INIT_HLIST_NODE(&mnt->mnt_slave); INIT_HLIST_NODE(&mnt->mnt_mp_list); INIT_HLIST_HEAD(&mnt->mnt_stuck_children); + INIT_HLIST_NODE(&mnt->mnt_ns_visible); RB_CLEAR_NODE(&mnt->mnt_node); mnt->mnt.mnt_idmap = &nop_mnt_idmap; } @@ -1098,6 +1099,10 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt) rb_link_node(&mnt->mnt_node, parent, link); rb_insert_color(&mnt->mnt_node, &ns->mounts); + if ((mnt->mnt.mnt_sb->s_type->fs_flags & FS_USERNS_MOUNT_RESTRICTED) && + mnt->mnt.mnt_root == mnt->mnt.mnt_sb->s_root) + hlist_add_head(&mnt->mnt_ns_visible, &ns->mnt_visible_mounts); + mnt_notify_add(mnt); } @@ -6346,20 +6351,26 @@ static bool mnt_already_visible(struct mnt_namespace *ns, int *new_mnt_flags) { int new_flags = *new_mnt_flags; - struct mount *mnt, *n; + struct mount *mnt; + + /* Don't acquire namespace semaphore without a good reason. */ + if (hlist_empty(&ns->mnt_visible_mounts)) + return false; guard(namespace_shared)(); - rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) { + hlist_for_each_entry(mnt, &ns->mnt_visible_mounts, mnt_ns_visible) { + const struct super_block *sb_visible = mnt->mnt.mnt_sb; struct mount *child; int mnt_flags; - if (mnt->mnt.mnt_sb->s_type != sb->s_type) + if (sb_visible->s_type != sb->s_type) continue; - /* This mount is not fully visible if it's root directory - * is not the root directory of the filesystem. + /* + * Restricted variants are not compatible with anything, even + * other restricted variants. */ - if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root) + if (sb_visible->s_iflags & SB_I_RESTRICTED_VARIANT) continue; /* A local view of the mount flags */ @@ -6411,16 +6422,23 @@ static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags return false; /* Can this filesystem be too revealing? */ - s_iflags = sb->s_iflags; - if (!(s_iflags & SB_I_USERNS_VISIBLE)) + if (!(sb->s_type->fs_flags & FS_USERNS_MOUNT_RESTRICTED)) return false; + s_iflags = sb->s_iflags; if ((s_iflags & required_iflags) != required_iflags) { WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n", required_iflags); return true; } + /* + * Restricted variants don't need an already visible mount because they + * don't expose the full filesystem view. + */ + if (s_iflags & SB_I_RESTRICTED_VARIANT) + return false; + return !mnt_already_visible(ns, sb, new_mnt_flags); } diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index b875f01c9756..4870e680c4e5 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1224,7 +1224,6 @@ static struct file_system_type ocfs2_fs_type = { .name = "ocfs2", .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, - .next = NULL, .init_fs_context = ocfs2_init_fs_context, .parameters = ocfs2_param_spec, }; diff --git a/fs/proc/array.c b/fs/proc/array.c index 90fb0c6b5f99..479ea8cb4ef4 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -482,6 +482,11 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, unsigned long flags; int exit_code = task->exit_code; struct signal_struct *sig = task->signal; + int ret; + + ret = down_read_killable(&task->signal->exec_update_lock); + if (ret) + return ret; state = *get_task_state(task); vsize = eip = esp = 0; @@ -657,6 +662,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, seq_puts(m, " 0"); seq_putc(m, '\n'); + up_read(&task->signal->exec_update_lock); if (mm) mmput(mm); return 0; diff --git a/fs/proc/base.c b/fs/proc/base.c index 126b98419adb..b67e8c3605fb 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -219,33 +219,24 @@ static int get_task_root(struct task_struct *task, struct path *root) return result; } -static int proc_cwd_link(struct dentry *dentry, struct path *path) +static int proc_cwd_link(struct dentry *dentry, struct path *path, + struct task_struct *task) { - struct task_struct *task = get_proc_task(d_inode(dentry)); int result = -ENOENT; - if (task) { - task_lock(task); - if (task->fs) { - get_fs_pwd(task->fs, path); - result = 0; - } - task_unlock(task); - put_task_struct(task); + task_lock(task); + if (task->fs) { + get_fs_pwd(task->fs, path); + result = 0; } + task_unlock(task); return result; } -static int proc_root_link(struct dentry *dentry, struct path *path) +static int proc_root_link(struct dentry *dentry, struct path *path, + struct task_struct *task) { - struct task_struct *task = get_proc_task(d_inode(dentry)); - int result = -ENOENT; - - if (task) { - result = get_task_root(task, path); - put_task_struct(task); - } - return result; + return get_task_root(task, path); } /* @@ -424,18 +415,24 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, { unsigned long wchan; char symname[KSYM_NAME_LEN]; + int err; + err = down_read_killable(&task->signal->exec_update_lock); + if (err) + return err; if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) goto print0; wchan = get_wchan(task); if (wchan && !lookup_symbol_name(wchan, symname)) { seq_puts(m, symname); + up_read(&task->signal->exec_update_lock); return 0; } print0: seq_putc(m, '0'); + up_read(&task->signal->exec_update_lock); return 0; } #endif /* CONFIG_KALLSYMS */ @@ -705,23 +702,6 @@ static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns, /* Here the fs part begins */ /************************************************************************/ -/* permission checks */ -static bool proc_fd_access_allowed(struct inode *inode) -{ - struct task_struct *task; - bool allowed = false; - /* Allow access to a task's file descriptors if it is us or we - * may use ptrace attach to the process and find out that - * information. - */ - task = get_proc_task(inode); - if (task) { - allowed = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS); - put_task_struct(task); - } - return allowed; -} - int proc_nochmod_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { @@ -1778,16 +1758,12 @@ static const struct file_operations proc_pid_set_comm_operations = { .release = single_release, }; -static int proc_exe_link(struct dentry *dentry, struct path *exe_path) +static int proc_exe_link(struct dentry *dentry, struct path *exe_path, + struct task_struct *task) { - struct task_struct *task; struct file *exe_file; - task = get_proc_task(d_inode(dentry)); - if (!task) - return -ENOENT; exe_file = get_task_exe_file(task); - put_task_struct(task); if (exe_file) { *exe_path = exe_file->f_path; path_get(&exe_file->f_path); @@ -1797,26 +1773,42 @@ static int proc_exe_link(struct dentry *dentry, struct path *exe_path) return -ENOENT; } +static int call_proc_get_link(struct dentry *dentry, struct inode *inode, struct path *path_out) +{ + struct task_struct *task; + int ret; + + task = get_proc_task(inode); + if (!task) + return -ENOENT; + ret = down_read_killable(&task->signal->exec_update_lock); + if (ret) + goto out_put_task; + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { + ret = -EACCES; + goto out; + } + ret = PROC_I(inode)->op.proc_get_link(dentry, path_out, task); + +out: + up_read(&task->signal->exec_update_lock); +out_put_task: + put_task_struct(task); + return ret; +} + static const char *proc_pid_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { struct path path; - int error = -EACCES; + int error; if (!dentry) return ERR_PTR(-ECHILD); - - /* Are we allowed to snoop on the tasks file descriptors? */ - if (!proc_fd_access_allowed(inode)) - goto out; - - error = PROC_I(inode)->op.proc_get_link(dentry, &path); - if (error) - goto out; - - error = nd_jump_link(&path); -out: + error = call_proc_get_link(dentry, inode, &path); + if (!error) + error = nd_jump_link(&path); return ERR_PTR(error); } @@ -1850,17 +1842,11 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b struct inode *inode = d_inode(dentry); struct path path; - /* Are we allowed to snoop on the tasks file descriptors? */ - if (!proc_fd_access_allowed(inode)) - goto out; - - error = PROC_I(inode)->op.proc_get_link(dentry, &path); - if (error) - goto out; - - error = do_proc_readlink(&path, buffer, buflen); - path_put(&path); -out: + error = call_proc_get_link(dentry, inode, &path); + if (!error) { + error = do_proc_readlink(&path, buffer, buflen); + path_put(&path); + } return error; } @@ -2243,21 +2229,16 @@ static const struct dentry_operations tid_map_files_dentry_operations = { .d_delete = pid_delete_dentry, }; -static int map_files_get_link(struct dentry *dentry, struct path *path) +static int map_files_get_link(struct dentry *dentry, struct path *path, + struct task_struct *task) { unsigned long vm_start, vm_end; struct vm_area_struct *vma; - struct task_struct *task; struct mm_struct *mm; int rc; rc = -ENOENT; - task = get_proc_task(d_inode(dentry)); - if (!task) - goto out; - mm = get_task_mm(task); - put_task_struct(task); if (!mm) goto out; @@ -2353,17 +2334,15 @@ static struct dentry *proc_map_files_lookup(struct inode *dir, if (!task) goto out; - result = ERR_PTR(-EACCES); - if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) - goto out_put_task; - result = ERR_PTR(-ENOENT); if (dname_to_vma_addr(dentry, &vm_start, &vm_end)) goto out_put_task; - mm = get_task_mm(task); - if (!mm) + mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); + if (IS_ERR(mm)) { + result = ERR_CAST(mm); goto out_put_task; + } result = ERR_PTR(-EINTR); if (mmap_read_lock_killable(mm)) @@ -2413,23 +2392,22 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) if (!task) goto out; - ret = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) - goto out_put_task; - ret = 0; if (!dir_emit_dots(file, ctx)) goto out_put_task; - mm = get_task_mm(task); - if (!mm) + mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); + if (IS_ERR(mm)) { + ret = PTR_ERR(mm); + /* if the task has no mm, the directory should just be empty */ + if (ret == -ESRCH) + ret = 0; goto out_put_task; + } ret = mmap_read_lock_killable(mm); - if (ret) { - mmput(mm); - goto out_put_task; - } + if (ret) + goto out_put_mm; nr_files = 0; @@ -2455,8 +2433,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) if (!p) { ret = -ENOMEM; mmap_read_unlock(mm); - mmput(mm); - goto out_put_task; + goto out_put_mm; } p->start = vma->vm_start; @@ -2464,7 +2441,6 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) p->mode = vma->vm_file->f_mode; } mmap_read_unlock(mm); - mmput(mm); for (i = 0; i < nr_files; i++) { char buf[4 * sizeof(long) + 2]; /* max: %lx-%lx\0 */ @@ -2481,6 +2457,8 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) ctx->pos++; } +out_put_mm: + mmput(mm); out_put_task: put_task_struct(task); out: diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 05c7513e77c7..0f9a1556f2a3 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -171,24 +171,19 @@ static const struct dentry_operations tid_fd_dentry_operations = { .d_delete = pid_delete_dentry, }; -static int proc_fd_link(struct dentry *dentry, struct path *path) +static int proc_fd_link(struct dentry *dentry, struct path *path, + struct task_struct *task) { - struct task_struct *task; int ret = -ENOENT; - - task = get_proc_task(d_inode(dentry)); - if (task) { - unsigned int fd = proc_fd(d_inode(dentry)); - struct file *fd_file; - - fd_file = fget_task(task, fd); - if (fd_file) { - *path = fd_file->f_path; - path_get(&fd_file->f_path); - ret = 0; - fput(fd_file); - } - put_task_struct(task); + unsigned int fd = proc_fd(d_inode(dentry)); + struct file *fd_file; + + fd_file = fget_task(task, fd); + if (fd_file) { + *path = fd_file->f_path; + path_get(&fd_file->f_path); + ret = 0; + fput(fd_file); } return ret; diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 8bb81e58c9d8..c6ae076e1fa0 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -841,3 +841,13 @@ ssize_t proc_simple_write(struct file *f, const char __user *ubuf, size_t size, kfree(buf); return ret == 0 ? size : ret; } + +/* + * Not exported to modules: + * modules' /proc files aren't permanent because modules aren't permanent. + */ +void impl_proc_make_permanent(struct proc_dir_entry *pde) +{ + if (pde) + pde_make_permanent(pde); +} diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 64dc44832808..b232e1098117 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -79,8 +79,11 @@ static inline bool pde_is_permanent(const struct proc_dir_entry *pde) return pde->flags & PROC_ENTRY_PERMANENT; } +/* This is for builtin code, not even for modules which are compiled in. */ static inline void pde_make_permanent(struct proc_dir_entry *pde) { + /* Ensure magic flag does something. */ + static_assert(PROC_ENTRY_PERMANENT != 0); pde->flags |= PROC_ENTRY_PERMANENT; } @@ -107,7 +110,7 @@ extern struct kmem_cache *proc_dir_entry_cache; void pde_free(struct proc_dir_entry *pde); union proc_op { - int (*proc_get_link)(struct dentry *, struct path *); + int (*proc_get_link)(struct dentry *, struct path *, struct task_struct *); int (*proc_show)(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task); diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 39f4169f669f..2f46f1396744 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -55,6 +55,10 @@ static const char *proc_ns_get_link(struct dentry *dentry, if (!task) return ERR_PTR(-EACCES); + error = down_read_killable(&task->signal->exec_update_lock); + if (error) + goto out_put_task; + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) goto out; @@ -64,6 +68,8 @@ static const char *proc_ns_get_link(struct dentry *dentry, error = nd_jump_link(&ns_path); out: + up_read(&task->signal->exec_update_lock); +out_put_task: put_task_struct(task); return ERR_PTR(error); } @@ -80,11 +86,17 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl if (!task) return res; + res = down_read_killable(&task->signal->exec_update_lock); + if (res) + goto out_put_task; + if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { res = ns_get_name(name, sizeof(name), task, ns_ops); if (res >= 0) res = readlink_copy(buffer, buflen, name, strlen(name)); } + up_read(&task->signal->exec_update_lock); +out_put_task: put_task_struct(task); return res; } diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 184cddeb8215..00cc385bce21 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -23,6 +23,7 @@ #include <linux/uidgid.h> #include <net/net_namespace.h> #include <linux/seq_file.h> +#include <linux/security.h> #include "internal.h" @@ -270,6 +271,7 @@ static struct net *get_proc_task_net(struct inode *dir) struct task_struct *task; struct nsproxy *ns; struct net *net = NULL; + struct proc_fs_info *fs_info = proc_sb_info(dir->i_sb); rcu_read_lock(); task = pid_task(proc_pid(dir), PIDTYPE_PID); @@ -282,6 +284,12 @@ static struct net *get_proc_task_net(struct inode *dir) } rcu_read_unlock(); + if (net && (fs_info->pidonly == PROC_PIDONLY_ON) && + security_capable(fs_info->mounter_cred, net->user_ns, CAP_NET_ADMIN, CAP_OPT_NONE) < 0) { + put_net(net); + net = NULL; + } + return net; } diff --git a/fs/proc/root.c b/fs/proc/root.c index 0f9100559471..99adddfeb4a4 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -223,12 +223,17 @@ static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param) return 0; } -static void proc_apply_options(struct proc_fs_info *fs_info, +static int proc_apply_options(struct proc_fs_info *fs_info, struct fs_context *fc, struct user_namespace *user_ns) { struct proc_fs_context *ctx = fc->fs_private; + if ((ctx->mask & (1 << Opt_subset)) && + fc->purpose == FS_CONTEXT_FOR_RECONFIGURE && + ctx->pidonly != fs_info->pidonly) + return invalf(fc, "proc: subset=pid cannot be changed\n"); + if (ctx->mask & (1 << Opt_gid)) fs_info->pid_gid = make_kgid(user_ns, ctx->gid); if (ctx->mask & (1 << Opt_hidepid)) @@ -240,6 +245,7 @@ static void proc_apply_options(struct proc_fs_info *fs_info, put_pid_ns(fs_info->pid_ns); fs_info->pid_ns = get_pid_ns(ctx->pid_ns); } + return 0; } static int proc_fill_super(struct super_block *s, struct fs_context *fc) @@ -254,10 +260,13 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc) return -ENOMEM; fs_info->pid_ns = get_pid_ns(ctx->pid_ns); - proc_apply_options(fs_info, fc, current_user_ns()); + fs_info->mounter_cred = get_cred(fc->cred); + ret = proc_apply_options(fs_info, fc, current_user_ns()); + if (ret) + return ret; /* User space would break if executables or devices appear on proc */ - s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV; + s->s_iflags |= SB_I_NOEXEC | SB_I_NODEV; s->s_flags |= SB_NODIRATIME | SB_NOSUID | SB_NOEXEC; s->s_blocksize = 1024; s->s_blocksize_bits = 10; @@ -266,6 +275,9 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc) s->s_time_gran = 1; s->s_fs_info = fs_info; + if (fs_info->pidonly == PROC_PIDONLY_ON) + s->s_iflags |= SB_I_RESTRICTED_VARIANT; + /* * procfs isn't actually a stacking filesystem; however, there is * too much magic going on inside it to permit stacking things on @@ -303,8 +315,7 @@ static int proc_reconfigure(struct fs_context *fc) sync_filesystem(sb); - proc_apply_options(fs_info, fc, current_user_ns()); - return 0; + return proc_apply_options(fs_info, fc, current_user_ns()); } static int proc_get_tree(struct fs_context *fc) @@ -350,6 +361,7 @@ static void proc_kill_sb(struct super_block *sb) kill_anon_super(sb); if (fs_info) { put_pid_ns(fs_info->pid_ns); + put_cred(fs_info->mounter_cred); kfree_rcu(fs_info, rcu); } } @@ -359,7 +371,7 @@ static struct file_system_type proc_fs_type = { .init_fs_context = proc_init_fs_context, .parameters = proc_fs_parameters, .kill_sb = proc_kill_sb, - .fs_flags = FS_USERNS_MOUNT | FS_DISALLOW_NOTIFY_PERM, + .fs_flags = FS_USERNS_MOUNT | FS_USERNS_MOUNT_RESTRICTED | FS_DISALLOW_NOTIFY_PERM, }; void __init proc_root_init(void) diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index b199e8ff79b1..88c10823fcaf 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -23,20 +23,6 @@ static struct kernfs_root *sysfs_root; struct kernfs_node *sysfs_root_kn; -static int sysfs_get_tree(struct fs_context *fc) -{ - struct kernfs_fs_context *kfc = fc->fs_private; - int ret; - - ret = kernfs_get_tree(fc); - if (ret) - return ret; - - if (kfc->new_sb_created) - fc->root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE; - return 0; -} - static void sysfs_fs_context_free(struct fs_context *fc) { struct kernfs_fs_context *kfc = fc->fs_private; @@ -49,7 +35,7 @@ static void sysfs_fs_context_free(struct fs_context *fc) static const struct fs_context_operations sysfs_fs_context_ops = { .free = sysfs_fs_context_free, - .get_tree = sysfs_get_tree, + .get_tree = kernfs_get_tree, }; static int sysfs_init_fs_context(struct fs_context *fc) @@ -93,7 +79,7 @@ static struct file_system_type sysfs_fs_type = { .name = "sysfs", .init_fs_context = sysfs_init_fs_context, .kill_sb = sysfs_kill_sb, - .fs_flags = FS_USERNS_MOUNT, + .fs_flags = FS_USERNS_MOUNT | FS_USERNS_MOUNT_RESTRICTED, }; int __init sysfs_init(void) diff --git a/include/linux/fs.h b/include/linux/fs.h index 9674c3d1cb3f..6da44573ce45 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2294,13 +2294,14 @@ struct file_system_type { #define FS_MGTIME 64 /* FS uses multigrain timestamps */ #define FS_LBS 128 /* FS supports LBS */ #define FS_POWER_FREEZE 256 /* Always freeze on suspend/hibernate */ +#define FS_USERNS_MOUNT_RESTRICTED 512 /* Restrict mount in userns if not already visible */ #define FS_USERNS_DELEGATABLE 1024 /* Can be mounted inside userns from outside */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ int (*init_fs_context)(struct fs_context *); const struct fs_parameter_spec *parameters; void (*kill_sb) (struct super_block *); struct module *owner; - struct file_system_type * next; + struct hlist_node list; struct hlist_head fs_supers; struct lock_class_key s_lock_key; diff --git a/include/linux/fs/super_types.h b/include/linux/fs/super_types.h index 1ab4e2265129..aa86e4944dbf 100644 --- a/include/linux/fs/super_types.h +++ b/include/linux/fs/super_types.h @@ -334,7 +334,7 @@ struct super_block { #define SB_I_STABLE_WRITES 0x00000008 /* don't modify blks until WB is done */ /* sb->s_iflags to limit user namespace mounts */ -#define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */ +#define SB_I_RESTRICTED_VARIANT 0x00000010 #define SB_I_IMA_UNVERIFIABLE_SIGNATURE 0x00000020 #define SB_I_UNTRUSTED_MOUNTER 0x00000040 #define SB_I_EVM_HMAC_UNSUPPORTED 0x00000080 diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 19d1c5e5f335..47d7deaeed8f 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -67,6 +67,7 @@ enum proc_pidonly { struct proc_fs_info { struct pid_namespace *pid_ns; kgid_t pid_gid; + const struct cred *mounter_cred; enum proc_hidepid hide_pid; enum proc_pidonly pidonly; struct rcu_head rcu; @@ -248,4 +249,16 @@ static inline struct pid_namespace *proc_pid_ns(struct super_block *sb) bool proc_ns_file(const struct file *file); +#if defined CONFIG_PROC_FS && !defined MODULE +void impl_proc_make_permanent(struct proc_dir_entry *pde); +#endif + +static inline void proc_make_permanent(struct proc_dir_entry *pde) +{ + /* Don't give matches to modules. */ +#if defined CONFIG_PROC_FS && !defined MODULE + impl_proc_make_permanent(pde); +#endif +} + #endif /* _LINUX_PROC_FS_H */ diff --git a/kernel/acct.c b/kernel/acct.c index cbbf79d718cf..c440d43479ca 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -249,7 +249,7 @@ static int acct_on(const char __user *name) return -EINVAL; /* Exclude procfs and sysfs. */ - if (file_inode(file)->i_sb->s_iflags & SB_I_USERNS_VISIBLE) + if (file_inode(file)->i_sb->s_type->fs_flags & FS_USERNS_MOUNT_RESTRICTED) return -EINVAL; if (!(file->f_mode & FMODE_CAN_WRITE)) |
