diff options
| -rw-r--r-- | Documentation/filesystems/proc.rst | 19 | ||||
| -rw-r--r-- | fs/filesystems.c | 330 | ||||
| -rw-r--r-- | fs/mount.h | 4 | ||||
| -rw-r--r-- | fs/namespace.c | 34 | ||||
| -rw-r--r-- | fs/ocfs2/super.c | 1 | ||||
| -rw-r--r-- | fs/proc/array.c | 6 | ||||
| -rw-r--r-- | fs/proc/base.c | 160 | ||||
| -rw-r--r-- | fs/proc/fd.c | 27 | ||||
| -rw-r--r-- | fs/proc/generic.c | 10 | ||||
| -rw-r--r-- | fs/proc/internal.h | 5 | ||||
| -rw-r--r-- | fs/proc/namespaces.c | 12 | ||||
| -rw-r--r-- | fs/proc/proc_net.c | 8 | ||||
| -rw-r--r-- | fs/proc/root.c | 24 | ||||
| -rw-r--r-- | fs/sysfs/mount.c | 18 | ||||
| -rw-r--r-- | include/linux/fs.h | 3 | ||||
| -rw-r--r-- | include/linux/fs/super_types.h | 2 | ||||
| -rw-r--r-- | include/linux/proc_fs.h | 13 | ||||
| -rw-r--r-- | kernel/acct.c | 2 |
18 files changed, 429 insertions, 249 deletions
diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index db6167befb7b..5006644c1d19 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -52,6 +52,7 @@ fixes/update part 1.1 Stefani Seibold <stefani@seibold.net> June 9 2009 4 Configuring procfs 4.1 Mount options + 4.2 Mount restrictions 5 Filesystem behavior @@ -2425,7 +2426,9 @@ prohibited by hidepid=. If you use some daemon like identd which needs to learn information about processes information, just add identd to this group. subset=pid hides all top level files and directories in the procfs that -are not related to tasks. +are not related to tasks. This option cannot be changed on an existing +procfs instance because overmounts that existed before the change could +otherwise remain reachable after the top level procfs entries are hidden. pidns= specifies a pid namespace (either as a string path to something like `/proc/$pid/ns/pid`, or a file descriptor when using `FSCONFIG_SET_FD`) that @@ -2434,6 +2437,20 @@ will use the calling process's active pid namespace. Note that the pid namespace of an existing procfs instance cannot be modified (attempting to do so will give an `-EBUSY` error). +4.2 Mount restrictions +-------------------------- + +If user namespaces are in use, the kernel additionally checks the instances of +procfs available to the mounter and will not allow procfs to be mounted if: + + 1. This mount is not fully visible unless the new procfs is going to be + mounted with subset=pid option. + + a. Its root directory is not the root directory of the filesystem. + b. If any file or non-empty procfs directory is hidden by another mount. + + 2. A new mount overrides the readonly option or any option from atime family. + Chapter 5: Filesystem behavior ============================== diff --git a/fs/filesystems.c b/fs/filesystems.c index 0c7d2b7ac26c..673a03b5f32b 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -17,22 +17,49 @@ #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/fs_parser.h> +#include <linux/rculist.h> /* - * Handling of filesystem drivers list. - * Rules: - * Inclusion to/removals from/scanning of list are protected by spinlock. - * During the unload module must call unregister_filesystem(). - * We can access the fields of list element if: - * 1) spinlock is held or - * 2) we hold the reference to the module. - * The latter can be guaranteed by call of try_module_get(); if it - * returned 0 we must skip the element, otherwise we got the reference. - * Once the reference is obtained we can drop the spinlock. + * Read-mostly filesystem drivers list. + * + * Readers walk under rcu_read_lock(); writers take file_systems_lock + * and publish via _rcu hlist primitives. unregister_filesystem() + * synchronize_rcu()s after unlock so the embedded file_system_type + * can't go away under a reader. To keep using a filesystem after + * the RCU section ends, take a module reference via try_module_get(). + */ +static HLIST_HEAD(file_systems); +static DEFINE_SPINLOCK(file_systems_lock); + +#ifdef CONFIG_PROC_FS +/* + * Cache a stringified version of the filesystem list. + * + * The fs list gets queried a lot by userspace because of libselinux, including + * rather surprising programs (would you guess *sed* is on the list?). In order + * to reduce the overhead we cache the resulting string, which normally hangs + * around below 512 bytes in size. + * + * As the list almost never changes, its creation is not particularly optimized + * to keep things simple. + * + * We sort it out on read in order to not introduce a failure point for fs + * registration (in principle we may be unable to alloc memory for the list). */ +struct file_systems_string { + struct rcu_head rcu; + unsigned long gen; + size_t len; + char string[]; +}; + +static unsigned long file_systems_gen; +static struct file_systems_string __read_mostly __rcu *file_systems_string; -static struct file_system_type *file_systems; -static DEFINE_RWLOCK(file_systems_lock); +static void invalidate_filesystems_string(void); +#else +static inline void invalidate_filesystems_string(void) { } +#endif /* WARNING: This can be used only if we _already_ own a reference */ struct file_system_type *get_filesystem(struct file_system_type *fs) @@ -46,14 +73,15 @@ void put_filesystem(struct file_system_type *fs) module_put(fs->owner); } -static struct file_system_type **find_filesystem(const char *name, unsigned len) +static struct file_system_type *find_filesystem(const char *name, unsigned len) { - struct file_system_type **p; - for (p = &file_systems; *p; p = &(*p)->next) - if (strncmp((*p)->name, name, len) == 0 && - !(*p)->name[len]) - break; - return p; + struct file_system_type *fs; + + hlist_for_each_entry_rcu(fs, &file_systems, list, + lockdep_is_held(&file_systems_lock)) + if (strncmp(fs->name, name, len) == 0 && !fs->name[len]) + return fs; + return NULL; } /** @@ -64,33 +92,27 @@ static struct file_system_type **find_filesystem(const char *name, unsigned len) * is aware of for mount and other syscalls. Returns 0 on success, * or a negative errno code on an error. * - * The &struct file_system_type that is passed is linked into the kernel + * The &struct file_system_type that is passed is linked into the kernel * structures and must not be freed until the file system has been * unregistered. */ - -int register_filesystem(struct file_system_type * fs) +int register_filesystem(struct file_system_type *fs) { - int res = 0; - struct file_system_type ** p; - if (fs->parameters && !fs_validate_description(fs->name, fs->parameters)) return -EINVAL; BUG_ON(strchr(fs->name, '.')); - if (fs->next) + if (!hlist_unhashed_lockless(&fs->list)) return -EBUSY; - write_lock(&file_systems_lock); - p = find_filesystem(fs->name, strlen(fs->name)); - if (*p) - res = -EBUSY; - else - *p = fs; - write_unlock(&file_systems_lock); - return res; -} + guard(spinlock)(&file_systems_lock); + if (find_filesystem(fs->name, strlen(fs->name))) + return -EBUSY; + hlist_add_tail_rcu(&fs->list, &file_systems); + invalidate_filesystems_string(); + return 0; +} EXPORT_SYMBOL(register_filesystem); /** @@ -100,94 +122,79 @@ EXPORT_SYMBOL(register_filesystem); * Remove a file system that was previously successfully registered * with the kernel. An error is returned if the file system is not found. * Zero is returned on a success. - * + * * Once this function has returned the &struct file_system_type structure * may be freed or reused. */ - -int unregister_filesystem(struct file_system_type * fs) +int unregister_filesystem(struct file_system_type *fs) { - struct file_system_type ** tmp; - - write_lock(&file_systems_lock); - tmp = &file_systems; - while (*tmp) { - if (fs == *tmp) { - *tmp = fs->next; - fs->next = NULL; - write_unlock(&file_systems_lock); - synchronize_rcu(); - return 0; - } - tmp = &(*tmp)->next; + scoped_guard(spinlock, &file_systems_lock) { + if (hlist_unhashed(&fs->list)) + return -EINVAL; + hlist_del_init_rcu(&fs->list); + invalidate_filesystems_string(); } - write_unlock(&file_systems_lock); - - return -EINVAL; + synchronize_rcu(); + return 0; } - EXPORT_SYMBOL(unregister_filesystem); #ifdef CONFIG_SYSFS_SYSCALL -static int fs_index(const char __user * __name) +static int fs_index(const char __user *__name) { - struct file_system_type * tmp; + struct file_system_type *p; char *name __free(kfree) = strndup_user(__name, PATH_MAX); - int err, index; + int index = 0; if (IS_ERR(name)) return PTR_ERR(name); - err = -EINVAL; - read_lock(&file_systems_lock); - for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) { - if (strcmp(tmp->name, name) == 0) { - err = index; - break; - } + guard(rcu)(); + hlist_for_each_entry_rcu(p, &file_systems, list) { + if (strcmp(p->name, name) == 0) + return index; + index++; } - read_unlock(&file_systems_lock); - return err; + return -EINVAL; } -static int fs_name(unsigned int index, char __user * buf) +static int fs_name(unsigned int index, char __user *buf) { - struct file_system_type * tmp; - int len, res = -EINVAL; - - read_lock(&file_systems_lock); - for (tmp = file_systems; tmp; tmp = tmp->next, index--) { - if (index == 0) { - if (try_module_get(tmp->owner)) - res = 0; + struct file_system_type *p, *found = NULL; + int len, res; + + scoped_guard(rcu) { + hlist_for_each_entry_rcu(p, &file_systems, list) { + if (index--) + continue; + if (try_module_get(p->owner)) + found = p; break; } } - read_unlock(&file_systems_lock); - if (res) - return res; + if (!found) + return -EINVAL; /* OK, we got the reference, so we can safely block */ - len = strlen(tmp->name) + 1; - res = copy_to_user(buf, tmp->name, len) ? -EFAULT : 0; - put_filesystem(tmp); + len = strlen(found->name) + 1; + res = copy_to_user(buf, found->name, len) ? -EFAULT : 0; + put_filesystem(found); return res; } static int fs_maxindex(void) { - struct file_system_type * tmp; - int index; + struct file_system_type *p; + int index = 0; - read_lock(&file_systems_lock); - for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next, index++) - ; - read_unlock(&file_systems_lock); + guard(rcu)(); + hlist_for_each_entry_rcu(p, &file_systems, list) + index++; return index; } /* - * Whee.. Weird sysv syscall. + * Whee.. Weird sysv syscall. */ SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2) { @@ -216,8 +223,8 @@ int __init list_bdev_fs_names(char *buf, size_t size) size_t len; int count = 0; - read_lock(&file_systems_lock); - for (p = file_systems; p; p = p->next) { + guard(rcu)(); + hlist_for_each_entry_rcu(p, &file_systems, list) { if (!(p->fs_flags & FS_REQUIRES_DEV)) continue; len = strlen(p->name) + 1; @@ -230,30 +237,143 @@ int __init list_bdev_fs_names(char *buf, size_t size) size -= len; count++; } - read_unlock(&file_systems_lock); return count; } #ifdef CONFIG_PROC_FS -static int filesystems_proc_show(struct seq_file *m, void *v) +static void invalidate_filesystems_string(void) +{ + struct file_systems_string *old; + + lockdep_assert_held_write(&file_systems_lock); + file_systems_gen++; + old = rcu_replace_pointer(file_systems_string, NULL, + lockdep_is_held(&file_systems_lock)); + if (old) + kfree_rcu(old, rcu); +} + +static __cold noinline int regen_filesystems_string(void) +{ + struct file_system_type *p; + struct file_systems_string *old, *new; + size_t newlen, usedlen; + unsigned long gen; + +retry: + newlen = 0; + + /* pre-calc space for each fs */ + spin_lock(&file_systems_lock); + gen = file_systems_gen; + hlist_for_each_entry_rcu(p, &file_systems, list) { + if (!(p->fs_flags & FS_REQUIRES_DEV)) + newlen += strlen("nodev"); + newlen += strlen("\t") + strlen(p->name) + strlen("\n"); + } + spin_unlock(&file_systems_lock); + + new = kmalloc(offsetof(struct file_systems_string, string) + newlen + 1, + GFP_KERNEL); + if (!new) + return -ENOMEM; + + new->gen = gen; + new->len = newlen; + new->string[newlen] = '\0'; + + spin_lock(&file_systems_lock); + old = file_systems_string; + + /* + * Did someone beat us to it? + */ + if (old && old->gen == file_systems_gen) { + spin_unlock(&file_systems_lock); + kfree(new); + return 0; + } + + /* + * Did the list change in the meantime? + */ + if (gen != file_systems_gen) { + spin_unlock(&file_systems_lock); + kfree(new); + goto retry; + } + + /* + * Populate the string. + * + * We know we have just enough space because we calculated the right + * size the previous time we had the lock and confirmed the list has + * not changed after reacquiring it. + */ + usedlen = 0; + hlist_for_each_entry_rcu(p, &file_systems, list) { + usedlen += sprintf(&new->string[usedlen], "%s\t%s\n", + (p->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev", + p->name); + } + + if (WARN_ON_ONCE(new->len != strlen(new->string))) { + /* + * Should never happen of course, keep this in case someone changes string + * generation above and messes it up. + */ + spin_unlock(&file_systems_lock); + kfree(new); + return -EINVAL; + } + + rcu_assign_pointer(file_systems_string, new); + spin_unlock(&file_systems_lock); + if (old) + kfree_rcu(old, rcu); + return 0; +} + +static __cold noinline int filesystems_proc_show_fallback(struct seq_file *m, void *v) { - struct file_system_type * tmp; + struct file_system_type *p; - read_lock(&file_systems_lock); - tmp = file_systems; - while (tmp) { + guard(rcu)(); + hlist_for_each_entry_rcu(p, &file_systems, list) { seq_printf(m, "%s\t%s\n", - (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev", - tmp->name); - tmp = tmp->next; + (p->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev", + p->name); } - read_unlock(&file_systems_lock); return 0; } +static int filesystems_proc_show(struct seq_file *m, void *v) +{ + struct file_systems_string *fss; + + for (;;) { + scoped_guard(rcu) { + fss = rcu_dereference(file_systems_string); + if (likely(fss)) { + seq_write(m, fss->string, fss->len); + return 0; + } + } + + int err = regen_filesystems_string(); + if (unlikely(err)) + return filesystems_proc_show_fallback(m, v); + } +} + static int __init proc_filesystems_init(void) { - proc_create_single("filesystems", 0, NULL, filesystems_proc_show); + struct proc_dir_entry *pde; + + pde = proc_create_single("filesystems", 0, NULL, filesystems_proc_show); + if (!pde) + return -ENOMEM; + proc_make_permanent(pde); return 0; } module_init(proc_filesystems_init); @@ -263,11 +383,10 @@ static struct file_system_type *__get_fs_type(const char *name, int len) { struct file_system_type *fs; - read_lock(&file_systems_lock); - fs = *(find_filesystem(name, len)); + guard(rcu)(); + fs = find_filesystem(name, len); if (fs && !try_module_get(fs->owner)) fs = NULL; - read_unlock(&file_systems_lock); return fs; } @@ -291,5 +410,4 @@ struct file_system_type *get_fs_type(const char *name) } return fs; } - EXPORT_SYMBOL(get_fs_type); diff --git a/fs/mount.h b/fs/mount.h index 5c120f8361bd..94fcc306d21e 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -25,6 +25,7 @@ struct mnt_namespace { __u32 n_fsnotify_mask; struct fsnotify_mark_connector __rcu *n_fsnotify_marks; #endif + struct hlist_head mnt_visible_mounts; /* SB_I_USERNS_VISIBLE mounts */ unsigned int nr_mounts; /* # of mounts in the namespace */ unsigned int pending_mounts; refcount_t passive; /* number references not pinning @mounts */ @@ -98,6 +99,7 @@ struct mount { int mnt_expiry_mark; /* true if marked for expiry */ struct hlist_head mnt_pins; struct hlist_head mnt_stuck_children; + struct hlist_node mnt_ns_visible; /* link in ns->mnt_visible_mounts */ struct mount *overmount; /* mounted on ->mnt_root */ } __randomize_layout; @@ -215,6 +217,8 @@ static inline void move_from_ns(struct mount *mnt) ns->mnt_first_node = rb_next(&mnt->mnt_node); rb_erase(&mnt->mnt_node, &ns->mounts); RB_CLEAR_NODE(&mnt->mnt_node); + if (!hlist_unhashed(&mnt->mnt_ns_visible)) + hlist_del_init(&mnt->mnt_ns_visible); } bool has_locked_children(struct mount *mnt, struct dentry *dentry); diff --git a/fs/namespace.c b/fs/namespace.c index d4cf40198e92..3d5cd5bf3b05 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -321,6 +321,7 @@ static struct mount *alloc_vfsmnt(const char *name) INIT_HLIST_NODE(&mnt->mnt_slave); INIT_HLIST_NODE(&mnt->mnt_mp_list); INIT_HLIST_HEAD(&mnt->mnt_stuck_children); + INIT_HLIST_NODE(&mnt->mnt_ns_visible); RB_CLEAR_NODE(&mnt->mnt_node); mnt->mnt.mnt_idmap = &nop_mnt_idmap; } @@ -1098,6 +1099,10 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt) rb_link_node(&mnt->mnt_node, parent, link); rb_insert_color(&mnt->mnt_node, &ns->mounts); + if ((mnt->mnt.mnt_sb->s_type->fs_flags & FS_USERNS_MOUNT_RESTRICTED) && + mnt->mnt.mnt_root == mnt->mnt.mnt_sb->s_root) + hlist_add_head(&mnt->mnt_ns_visible, &ns->mnt_visible_mounts); + mnt_notify_add(mnt); } @@ -6346,20 +6351,26 @@ static bool mnt_already_visible(struct mnt_namespace *ns, int *new_mnt_flags) { int new_flags = *new_mnt_flags; - struct mount *mnt, *n; + struct mount *mnt; + + /* Don't acquire namespace semaphore without a good reason. */ + if (hlist_empty(&ns->mnt_visible_mounts)) + return false; guard(namespace_shared)(); - rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) { + hlist_for_each_entry(mnt, &ns->mnt_visible_mounts, mnt_ns_visible) { + const struct super_block *sb_visible = mnt->mnt.mnt_sb; struct mount *child; int mnt_flags; - if (mnt->mnt.mnt_sb->s_type != sb->s_type) + if (sb_visible->s_type != sb->s_type) continue; - /* This mount is not fully visible if it's root directory - * is not the root directory of the filesystem. + /* + * Restricted variants are not compatible with anything, even + * other restricted variants. */ - if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root) + if (sb_visible->s_iflags & SB_I_RESTRICTED_VARIANT) continue; /* A local view of the mount flags */ @@ -6411,16 +6422,23 @@ static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags return false; /* Can this filesystem be too revealing? */ - s_iflags = sb->s_iflags; - if (!(s_iflags & SB_I_USERNS_VISIBLE)) + if (!(sb->s_type->fs_flags & FS_USERNS_MOUNT_RESTRICTED)) return false; + s_iflags = sb->s_iflags; if ((s_iflags & required_iflags) != required_iflags) { WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n", required_iflags); return true; } + /* + * Restricted variants don't need an already visible mount because they + * don't expose the full filesystem view. + */ + if (s_iflags & SB_I_RESTRICTED_VARIANT) + return false; + return !mnt_already_visible(ns, sb, new_mnt_flags); } diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index b875f01c9756..4870e680c4e5 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1224,7 +1224,6 @@ static struct file_system_type ocfs2_fs_type = { .name = "ocfs2", .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, - .next = NULL, .init_fs_context = ocfs2_init_fs_context, .parameters = ocfs2_param_spec, }; diff --git a/fs/proc/array.c b/fs/proc/array.c index 90fb0c6b5f99..479ea8cb4ef4 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -482,6 +482,11 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, unsigned long flags; int exit_code = task->exit_code; struct signal_struct *sig = task->signal; + int ret; + + ret = down_read_killable(&task->signal->exec_update_lock); + if (ret) + return ret; state = *get_task_state(task); vsize = eip = esp = 0; @@ -657,6 +662,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, seq_puts(m, " 0"); seq_putc(m, '\n'); + up_read(&task->signal->exec_update_lock); if (mm) mmput(mm); return 0; diff --git a/fs/proc/base.c b/fs/proc/base.c index 126b98419adb..b67e8c3605fb 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -219,33 +219,24 @@ static int get_task_root(struct task_struct *task, struct path *root) return result; } -static int proc_cwd_link(struct dentry *dentry, struct path *path) +static int proc_cwd_link(struct dentry *dentry, struct path *path, + struct task_struct *task) { - struct task_struct *task = get_proc_task(d_inode(dentry)); int result = -ENOENT; - if (task) { - task_lock(task); - if (task->fs) { - get_fs_pwd(task->fs, path); - result = 0; - } - task_unlock(task); - put_task_struct(task); + task_lock(task); + if (task->fs) { + get_fs_pwd(task->fs, path); + result = 0; } + task_unlock(task); return result; } -static int proc_root_link(struct dentry *dentry, struct path *path) +static int proc_root_link(struct dentry *dentry, struct path *path, + struct task_struct *task) { - struct task_struct *task = get_proc_task(d_inode(dentry)); - int result = -ENOENT; - - if (task) { - result = get_task_root(task, path); - put_task_struct(task); - } - return result; + return get_task_root(task, path); } /* @@ -424,18 +415,24 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, { unsigned long wchan; char symname[KSYM_NAME_LEN]; + int err; + err = down_read_killable(&task->signal->exec_update_lock); + if (err) + return err; if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) goto print0; wchan = get_wchan(task); if (wchan && !lookup_symbol_name(wchan, symname)) { seq_puts(m, symname); + up_read(&task->signal->exec_update_lock); return 0; } print0: seq_putc(m, '0'); + up_read(&task->signal->exec_update_lock); return 0; } #endif /* CONFIG_KALLSYMS */ @@ -705,23 +702,6 @@ static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns, /* Here the fs part begins */ /************************************************************************/ -/* permission checks */ -static bool proc_fd_access_allowed(struct inode *inode) -{ - struct task_struct *task; - bool allowed = false; - /* Allow access to a task's file descriptors if it is us or we - * may use ptrace attach to the process and find out that - * information. - */ - task = get_proc_task(inode); - if (task) { - allowed = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS); - put_task_struct(task); - } - return allowed; -} - int proc_nochmod_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { @@ -1778,16 +1758,12 @@ static const struct file_operations proc_pid_set_comm_operations = { .release = single_release, }; -static int proc_exe_link(struct dentry *dentry, struct path *exe_path) +static int proc_exe_link(struct dentry *dentry, struct path *exe_path, + struct task_struct *task) { - struct task_struct *task; struct file *exe_file; - task = get_proc_task(d_inode(dentry)); - if (!task) - return -ENOENT; exe_file = get_task_exe_file(task); - put_task_struct(task); if (exe_file) { *exe_path = exe_file->f_path; path_get(&exe_file->f_path); @@ -1797,26 +1773,42 @@ static int proc_exe_link(struct dentry *dentry, struct path *exe_path) return -ENOENT; } +static int call_proc_get_link(struct dentry *dentry, struct inode *inode, struct path *path_out) +{ + struct task_struct *task; + int ret; + + task = get_proc_task(inode); + if (!task) + return -ENOENT; + ret = down_read_killable(&task->signal->exec_update_lock); + if (ret) + goto out_put_task; + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { + ret = -EACCES; + goto out; + } + ret = PROC_I(inode)->op.proc_get_link(dentry, path_out, task); + +out: + up_read(&task->signal->exec_update_lock); +out_put_task: + put_task_struct(task); + return ret; +} + static const char *proc_pid_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { struct path path; - int error = -EACCES; + int error; if (!dentry) return ERR_PTR(-ECHILD); - - /* Are we allowed to snoop on the tasks file descriptors? */ - if (!proc_fd_access_allowed(inode)) - goto out; - - error = PROC_I(inode)->op.proc_get_link(dentry, &path); - if (error) - goto out; - - error = nd_jump_link(&path); -out: + error = call_proc_get_link(dentry, inode, &path); + if (!error) + error = nd_jump_link(&path); return ERR_PTR(error); } @@ -1850,17 +1842,11 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b struct inode *inode = d_inode(dentry); struct path path; - /* Are we allowed to snoop on the tasks file descriptors? */ - if (!proc_fd_access_allowed(inode)) - goto out; - - error = PROC_I(inode)->op.proc_get_link(dentry, &path); - if (error) - goto out; - - error = do_proc_readlink(&path, buffer, buflen); - path_put(&path); -out: + error = call_proc_get_link(dentry, inode, &path); + if (!error) { + error = do_proc_readlink(&path, buffer, buflen); + path_put(&path); + } return error; } @@ -2243,21 +2229,16 @@ static const struct dentry_operations tid_map_files_dentry_operations = { .d_delete = pid_delete_dentry, }; -static int map_files_get_link(struct dentry *dentry, struct path *path) +static int map_files_get_link(struct dentry *dentry, struct path *path, + struct task_struct *task) { unsigned long vm_start, vm_end; struct vm_area_struct *vma; - struct task_struct *task; struct mm_struct *mm; int rc; rc = -ENOENT; - task = get_proc_task(d_inode(dentry)); - if (!task) - goto out; - mm = get_task_mm(task); - put_task_struct(task); if (!mm) goto out; @@ -2353,17 +2334,15 @@ static struct dentry *proc_map_files_lookup(struct inode *dir, if (!task) goto out; - result = ERR_PTR(-EACCES); - if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) - goto out_put_task; - result = ERR_PTR(-ENOENT); if (dname_to_vma_addr(dentry, &vm_start, &vm_end)) goto out_put_task; - mm = get_task_mm(task); - if (!mm) + mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); + if (IS_ERR(mm)) { + result = ERR_CAST(mm); goto out_put_task; + } result = ERR_PTR(-EINTR); if (mmap_read_lock_killable(mm)) @@ -2413,23 +2392,22 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) if (!task) goto out; - ret = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) - goto out_put_task; - ret = 0; if (!dir_emit_dots(file, ctx)) goto out_put_task; - mm = get_task_mm(task); - if (!mm) + mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); + if (IS_ERR(mm)) { + ret = PTR_ERR(mm); + /* if the task has no mm, the directory should just be empty */ + if (ret == -ESRCH) + ret = 0; goto out_put_task; + } ret = mmap_read_lock_killable(mm); - if (ret) { - mmput(mm); - goto out_put_task; - } + if (ret) + goto out_put_mm; nr_files = 0; @@ -2455,8 +2433,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) if (!p) { ret = -ENOMEM; mmap_read_unlock(mm); - mmput(mm); - goto out_put_task; + goto out_put_mm; } p->start = vma->vm_start; @@ -2464,7 +2441,6 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) p->mode = vma->vm_file->f_mode; } mmap_read_unlock(mm); - mmput(mm); for (i = 0; i < nr_files; i++) { char buf[4 * sizeof(long) + 2]; /* max: %lx-%lx\0 */ @@ -2481,6 +2457,8 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) ctx->pos++; } +out_put_mm: + mmput(mm); out_put_task: put_task_struct(task); out: diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 05c7513e77c7..0f9a1556f2a3 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -171,24 +171,19 @@ static const struct dentry_operations tid_fd_dentry_operations = { .d_delete = pid_delete_dentry, }; -static int proc_fd_link(struct dentry *dentry, struct path *path) +static int proc_fd_link(struct dentry *dentry, struct path *path, + struct task_struct *task) { - struct task_struct *task; int ret = -ENOENT; - - task = get_proc_task(d_inode(dentry)); - if (task) { - unsigned int fd = proc_fd(d_inode(dentry)); - struct file *fd_file; - - fd_file = fget_task(task, fd); - if (fd_file) { - *path = fd_file->f_path; - path_get(&fd_file->f_path); - ret = 0; - fput(fd_file); - } - put_task_struct(task); + unsigned int fd = proc_fd(d_inode(dentry)); + struct file *fd_file; + + fd_file = fget_task(task, fd); + if (fd_file) { + *path = fd_file->f_path; + path_get(&fd_file->f_path); + ret = 0; + fput(fd_file); } return ret; diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 8bb81e58c9d8..c6ae076e1fa0 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -841,3 +841,13 @@ ssize_t proc_simple_write(struct file *f, const char __user *ubuf, size_t size, kfree(buf); return ret == 0 ? size : ret; } + +/* + * Not exported to modules: + * modules' /proc files aren't permanent because modules aren't permanent. + */ +void impl_proc_make_permanent(struct proc_dir_entry *pde) +{ + if (pde) + pde_make_permanent(pde); +} diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 64dc44832808..b232e1098117 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -79,8 +79,11 @@ static inline bool pde_is_permanent(const struct proc_dir_entry *pde) return pde->flags & PROC_ENTRY_PERMANENT; } +/* This is for builtin code, not even for modules which are compiled in. */ static inline void pde_make_permanent(struct proc_dir_entry *pde) { + /* Ensure magic flag does something. */ + static_assert(PROC_ENTRY_PERMANENT != 0); pde->flags |= PROC_ENTRY_PERMANENT; } @@ -107,7 +110,7 @@ extern struct kmem_cache *proc_dir_entry_cache; void pde_free(struct proc_dir_entry *pde); union proc_op { - int (*proc_get_link)(struct dentry *, struct path *); + int (*proc_get_link)(struct dentry *, struct path *, struct task_struct *); int (*proc_show)(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task); diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 39f4169f669f..2f46f1396744 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -55,6 +55,10 @@ static const char *proc_ns_get_link(struct dentry *dentry, if (!task) return ERR_PTR(-EACCES); + error = down_read_killable(&task->signal->exec_update_lock); + if (error) + goto out_put_task; + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) goto out; @@ -64,6 +68,8 @@ static const char *proc_ns_get_link(struct dentry *dentry, error = nd_jump_link(&ns_path); out: + up_read(&task->signal->exec_update_lock); +out_put_task: put_task_struct(task); return ERR_PTR(error); } @@ -80,11 +86,17 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl if (!task) return res; + res = down_read_killable(&task->signal->exec_update_lock); + if (res) + goto out_put_task; + if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { res = ns_get_name(name, sizeof(name), task, ns_ops); if (res >= 0) res = readlink_copy(buffer, buflen, name, strlen(name)); } + up_read(&task->signal->exec_update_lock); +out_put_task: put_task_struct(task); return res; } diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 184cddeb8215..00cc385bce21 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -23,6 +23,7 @@ #include <linux/uidgid.h> #include <net/net_namespace.h> #include <linux/seq_file.h> +#include <linux/security.h> #include "internal.h" @@ -270,6 +271,7 @@ static struct net *get_proc_task_net(struct inode *dir) struct task_struct *task; struct nsproxy *ns; struct net *net = NULL; + struct proc_fs_info *fs_info = proc_sb_info(dir->i_sb); rcu_read_lock(); task = pid_task(proc_pid(dir), PIDTYPE_PID); @@ -282,6 +284,12 @@ static struct net *get_proc_task_net(struct inode *dir) } rcu_read_unlock(); + if (net && (fs_info->pidonly == PROC_PIDONLY_ON) && + security_capable(fs_info->mounter_cred, net->user_ns, CAP_NET_ADMIN, CAP_OPT_NONE) < 0) { + put_net(net); + net = NULL; + } + return net; } diff --git a/fs/proc/root.c b/fs/proc/root.c index 0f9100559471..99adddfeb4a4 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -223,12 +223,17 @@ static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param) return 0; } -static void proc_apply_options(struct proc_fs_info *fs_info, +static int proc_apply_options(struct proc_fs_info *fs_info, struct fs_context *fc, struct user_namespace *user_ns) { struct proc_fs_context *ctx = fc->fs_private; + if ((ctx->mask & (1 << Opt_subset)) && + fc->purpose == FS_CONTEXT_FOR_RECONFIGURE && + ctx->pidonly != fs_info->pidonly) + return invalf(fc, "proc: subset=pid cannot be changed\n"); + if (ctx->mask & (1 << Opt_gid)) fs_info->pid_gid = make_kgid(user_ns, ctx->gid); if (ctx->mask & (1 << Opt_hidepid)) @@ -240,6 +245,7 @@ static void proc_apply_options(struct proc_fs_info *fs_info, put_pid_ns(fs_info->pid_ns); fs_info->pid_ns = get_pid_ns(ctx->pid_ns); } + return 0; } static int proc_fill_super(struct super_block *s, struct fs_context *fc) @@ -254,10 +260,13 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc) return -ENOMEM; fs_info->pid_ns = get_pid_ns(ctx->pid_ns); - proc_apply_options(fs_info, fc, current_user_ns()); + fs_info->mounter_cred = get_cred(fc->cred); + ret = proc_apply_options(fs_info, fc, current_user_ns()); + if (ret) + return ret; /* User space would break if executables or devices appear on proc */ - s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV; + s->s_iflags |= SB_I_NOEXEC | SB_I_NODEV; s->s_flags |= SB_NODIRATIME | SB_NOSUID | SB_NOEXEC; s->s_blocksize = 1024; s->s_blocksize_bits = 10; @@ -266,6 +275,9 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc) s->s_time_gran = 1; s->s_fs_info = fs_info; + if (fs_info->pidonly == PROC_PIDONLY_ON) + s->s_iflags |= SB_I_RESTRICTED_VARIANT; + /* * procfs isn't actually a stacking filesystem; however, there is * too much magic going on inside it to permit stacking things on @@ -303,8 +315,7 @@ static int proc_reconfigure(struct fs_context *fc) sync_filesystem(sb); - proc_apply_options(fs_info, fc, current_user_ns()); - return 0; + return proc_apply_options(fs_info, fc, current_user_ns()); } static int proc_get_tree(struct fs_context *fc) @@ -350,6 +361,7 @@ static void proc_kill_sb(struct super_block *sb) kill_anon_super(sb); if (fs_info) { put_pid_ns(fs_info->pid_ns); + put_cred(fs_info->mounter_cred); kfree_rcu(fs_info, rcu); } } @@ -359,7 +371,7 @@ static struct file_system_type proc_fs_type = { .init_fs_context = proc_init_fs_context, .parameters = proc_fs_parameters, .kill_sb = proc_kill_sb, - .fs_flags = FS_USERNS_MOUNT | FS_DISALLOW_NOTIFY_PERM, + .fs_flags = FS_USERNS_MOUNT | FS_USERNS_MOUNT_RESTRICTED | FS_DISALLOW_NOTIFY_PERM, }; void __init proc_root_init(void) diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index b199e8ff79b1..88c10823fcaf 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -23,20 +23,6 @@ static struct kernfs_root *sysfs_root; struct kernfs_node *sysfs_root_kn; -static int sysfs_get_tree(struct fs_context *fc) -{ - struct kernfs_fs_context *kfc = fc->fs_private; - int ret; - - ret = kernfs_get_tree(fc); - if (ret) - return ret; - - if (kfc->new_sb_created) - fc->root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE; - return 0; -} - static void sysfs_fs_context_free(struct fs_context *fc) { struct kernfs_fs_context *kfc = fc->fs_private; @@ -49,7 +35,7 @@ static void sysfs_fs_context_free(struct fs_context *fc) static const struct fs_context_operations sysfs_fs_context_ops = { .free = sysfs_fs_context_free, - .get_tree = sysfs_get_tree, + .get_tree = kernfs_get_tree, }; static int sysfs_init_fs_context(struct fs_context *fc) @@ -93,7 +79,7 @@ static struct file_system_type sysfs_fs_type = { .name = "sysfs", .init_fs_context = sysfs_init_fs_context, .kill_sb = sysfs_kill_sb, - .fs_flags = FS_USERNS_MOUNT, + .fs_flags = FS_USERNS_MOUNT | FS_USERNS_MOUNT_RESTRICTED, }; int __init sysfs_init(void) diff --git a/include/linux/fs.h b/include/linux/fs.h index 9674c3d1cb3f..6da44573ce45 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2294,13 +2294,14 @@ struct file_system_type { #define FS_MGTIME 64 /* FS uses multigrain timestamps */ #define FS_LBS 128 /* FS supports LBS */ #define FS_POWER_FREEZE 256 /* Always freeze on suspend/hibernate */ +#define FS_USERNS_MOUNT_RESTRICTED 512 /* Restrict mount in userns if not already visible */ #define FS_USERNS_DELEGATABLE 1024 /* Can be mounted inside userns from outside */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ int (*init_fs_context)(struct fs_context *); const struct fs_parameter_spec *parameters; void (*kill_sb) (struct super_block *); struct module *owner; - struct file_system_type * next; + struct hlist_node list; struct hlist_head fs_supers; struct lock_class_key s_lock_key; diff --git a/include/linux/fs/super_types.h b/include/linux/fs/super_types.h index 1ab4e2265129..aa86e4944dbf 100644 --- a/include/linux/fs/super_types.h +++ b/include/linux/fs/super_types.h @@ -334,7 +334,7 @@ struct super_block { #define SB_I_STABLE_WRITES 0x00000008 /* don't modify blks until WB is done */ /* sb->s_iflags to limit user namespace mounts */ -#define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */ +#define SB_I_RESTRICTED_VARIANT 0x00000010 #define SB_I_IMA_UNVERIFIABLE_SIGNATURE 0x00000020 #define SB_I_UNTRUSTED_MOUNTER 0x00000040 #define SB_I_EVM_HMAC_UNSUPPORTED 0x00000080 diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 19d1c5e5f335..47d7deaeed8f 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -67,6 +67,7 @@ enum proc_pidonly { struct proc_fs_info { struct pid_namespace *pid_ns; kgid_t pid_gid; + const struct cred *mounter_cred; enum proc_hidepid hide_pid; enum proc_pidonly pidonly; struct rcu_head rcu; @@ -248,4 +249,16 @@ static inline struct pid_namespace *proc_pid_ns(struct super_block *sb) bool proc_ns_file(const struct file *file); +#if defined CONFIG_PROC_FS && !defined MODULE +void impl_proc_make_permanent(struct proc_dir_entry *pde); +#endif + +static inline void proc_make_permanent(struct proc_dir_entry *pde) +{ + /* Don't give matches to modules. */ +#if defined CONFIG_PROC_FS && !defined MODULE + impl_proc_make_permanent(pde); +#endif +} + #endif /* _LINUX_PROC_FS_H */ diff --git a/kernel/acct.c b/kernel/acct.c index cbbf79d718cf..c440d43479ca 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -249,7 +249,7 @@ static int acct_on(const char __user *name) return -EINVAL; /* Exclude procfs and sysfs. */ - if (file_inode(file)->i_sb->s_iflags & SB_I_USERNS_VISIBLE) + if (file_inode(file)->i_sb->s_type->fs_flags & FS_USERNS_MOUNT_RESTRICTED) return -EINVAL; if (!(file->f_mode & FMODE_CAN_WRITE)) |
