summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/filesystems/proc.rst19
-rw-r--r--fs/filesystems.c330
-rw-r--r--fs/mount.h4
-rw-r--r--fs/namespace.c34
-rw-r--r--fs/ocfs2/super.c1
-rw-r--r--fs/proc/array.c6
-rw-r--r--fs/proc/base.c160
-rw-r--r--fs/proc/fd.c27
-rw-r--r--fs/proc/generic.c10
-rw-r--r--fs/proc/internal.h5
-rw-r--r--fs/proc/namespaces.c12
-rw-r--r--fs/proc/proc_net.c8
-rw-r--r--fs/proc/root.c24
-rw-r--r--fs/sysfs/mount.c18
-rw-r--r--include/linux/fs.h3
-rw-r--r--include/linux/fs/super_types.h2
-rw-r--r--include/linux/proc_fs.h13
-rw-r--r--kernel/acct.c2
18 files changed, 429 insertions, 249 deletions
diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index db6167befb7b..5006644c1d19 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -52,6 +52,7 @@ fixes/update part 1.1 Stefani Seibold <stefani@seibold.net> June 9 2009
4 Configuring procfs
4.1 Mount options
+ 4.2 Mount restrictions
5 Filesystem behavior
@@ -2425,7 +2426,9 @@ prohibited by hidepid=. If you use some daemon like identd which needs to learn
information about processes information, just add identd to this group.
subset=pid hides all top level files and directories in the procfs that
-are not related to tasks.
+are not related to tasks. This option cannot be changed on an existing
+procfs instance because overmounts that existed before the change could
+otherwise remain reachable after the top level procfs entries are hidden.
pidns= specifies a pid namespace (either as a string path to something like
`/proc/$pid/ns/pid`, or a file descriptor when using `FSCONFIG_SET_FD`) that
@@ -2434,6 +2437,20 @@ will use the calling process's active pid namespace. Note that the pid
namespace of an existing procfs instance cannot be modified (attempting to do
so will give an `-EBUSY` error).
+4.2 Mount restrictions
+--------------------------
+
+If user namespaces are in use, the kernel additionally checks the instances of
+procfs available to the mounter and will not allow procfs to be mounted if:
+
+ 1. None of those instances is fully visible, unless the new procfs is going
+ to be mounted with the subset=pid option. An instance is not fully visible if:
+
+ a. Its root directory is not the root directory of the filesystem.
+ b. Any file or non-empty procfs directory is hidden by another mount.
+
+ 2. The new mount overrides the readonly option or any atime family option.
+
Chapter 5: Filesystem behavior
==============================
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 0c7d2b7ac26c..673a03b5f32b 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -17,22 +17,49 @@
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/fs_parser.h>
+#include <linux/rculist.h>
/*
- * Handling of filesystem drivers list.
- * Rules:
- * Inclusion to/removals from/scanning of list are protected by spinlock.
- * During the unload module must call unregister_filesystem().
- * We can access the fields of list element if:
- * 1) spinlock is held or
- * 2) we hold the reference to the module.
- * The latter can be guaranteed by call of try_module_get(); if it
- * returned 0 we must skip the element, otherwise we got the reference.
- * Once the reference is obtained we can drop the spinlock.
+ * Read-mostly filesystem drivers list.
+ *
+ * Readers walk under rcu_read_lock(); writers take file_systems_lock
+ * and publish via _rcu hlist primitives. unregister_filesystem()
+ * synchronize_rcu()s after unlock so the embedded file_system_type
+ * can't go away under a reader. To keep using a filesystem after
+ * the RCU section ends, take a module reference via try_module_get().
+ */
+static HLIST_HEAD(file_systems);
+static DEFINE_SPINLOCK(file_systems_lock);
+
+#ifdef CONFIG_PROC_FS
+/*
+ * Cache a stringified version of the filesystem list.
+ *
+ * The fs list gets queried a lot by userspace because of libselinux, including
+ * rather surprising programs (would you guess *sed* is on the list?). In order
+ * to reduce the overhead we cache the resulting string, which normally hangs
+ * around below 512 bytes in size.
+ *
+ * As the list almost never changes, its creation is not particularly optimized
+ * to keep things simple.
+ *
+ * The string is (re)generated lazily on read so that fs registration does not
+ * gain a failure point (in principle the allocation for the string may fail).
*/
+struct file_systems_string {
+ struct rcu_head rcu;
+ unsigned long gen;
+ size_t len;
+ char string[];
+};
+
+static unsigned long file_systems_gen;
+static struct file_systems_string __read_mostly __rcu *file_systems_string;
-static struct file_system_type *file_systems;
-static DEFINE_RWLOCK(file_systems_lock);
+static void invalidate_filesystems_string(void);
+#else
+static inline void invalidate_filesystems_string(void) { }
+#endif
/* WARNING: This can be used only if we _already_ own a reference */
struct file_system_type *get_filesystem(struct file_system_type *fs)
@@ -46,14 +73,15 @@ void put_filesystem(struct file_system_type *fs)
module_put(fs->owner);
}
-static struct file_system_type **find_filesystem(const char *name, unsigned len)
+static struct file_system_type *find_filesystem(const char *name, unsigned len)
{
- struct file_system_type **p;
- for (p = &file_systems; *p; p = &(*p)->next)
- if (strncmp((*p)->name, name, len) == 0 &&
- !(*p)->name[len])
- break;
- return p;
+ struct file_system_type *fs;
+
+ hlist_for_each_entry_rcu(fs, &file_systems, list,
+ lockdep_is_held(&file_systems_lock))
+ if (strncmp(fs->name, name, len) == 0 && !fs->name[len])
+ return fs;
+ return NULL;
}
/**
@@ -64,33 +92,27 @@ static struct file_system_type **find_filesystem(const char *name, unsigned len)
* is aware of for mount and other syscalls. Returns 0 on success,
* or a negative errno code on an error.
*
- * The &struct file_system_type that is passed is linked into the kernel
+ * The &struct file_system_type that is passed is linked into the kernel
* structures and must not be freed until the file system has been
* unregistered.
*/
-
-int register_filesystem(struct file_system_type * fs)
+int register_filesystem(struct file_system_type *fs)
{
- int res = 0;
- struct file_system_type ** p;
-
if (fs->parameters &&
!fs_validate_description(fs->name, fs->parameters))
return -EINVAL;
BUG_ON(strchr(fs->name, '.'));
- if (fs->next)
+ if (!hlist_unhashed_lockless(&fs->list))
return -EBUSY;
- write_lock(&file_systems_lock);
- p = find_filesystem(fs->name, strlen(fs->name));
- if (*p)
- res = -EBUSY;
- else
- *p = fs;
- write_unlock(&file_systems_lock);
- return res;
-}
+ guard(spinlock)(&file_systems_lock);
+ if (find_filesystem(fs->name, strlen(fs->name)))
+ return -EBUSY;
+ hlist_add_tail_rcu(&fs->list, &file_systems);
+ invalidate_filesystems_string();
+ return 0;
+}
EXPORT_SYMBOL(register_filesystem);
/**
@@ -100,94 +122,79 @@ EXPORT_SYMBOL(register_filesystem);
* Remove a file system that was previously successfully registered
* with the kernel. An error is returned if the file system is not found.
* Zero is returned on a success.
- *
+ *
* Once this function has returned the &struct file_system_type structure
* may be freed or reused.
*/
-
-int unregister_filesystem(struct file_system_type * fs)
+int unregister_filesystem(struct file_system_type *fs)
{
- struct file_system_type ** tmp;
-
- write_lock(&file_systems_lock);
- tmp = &file_systems;
- while (*tmp) {
- if (fs == *tmp) {
- *tmp = fs->next;
- fs->next = NULL;
- write_unlock(&file_systems_lock);
- synchronize_rcu();
- return 0;
- }
- tmp = &(*tmp)->next;
+ scoped_guard(spinlock, &file_systems_lock) {
+ if (hlist_unhashed(&fs->list))
+ return -EINVAL;
+ hlist_del_init_rcu(&fs->list);
+ invalidate_filesystems_string();
}
- write_unlock(&file_systems_lock);
-
- return -EINVAL;
+ synchronize_rcu();
+ return 0;
}
-
EXPORT_SYMBOL(unregister_filesystem);
#ifdef CONFIG_SYSFS_SYSCALL
-static int fs_index(const char __user * __name)
+static int fs_index(const char __user *__name)
{
- struct file_system_type * tmp;
+ struct file_system_type *p;
char *name __free(kfree) = strndup_user(__name, PATH_MAX);
- int err, index;
+ int index = 0;
if (IS_ERR(name))
return PTR_ERR(name);
- err = -EINVAL;
- read_lock(&file_systems_lock);
- for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) {
- if (strcmp(tmp->name, name) == 0) {
- err = index;
- break;
- }
+ guard(rcu)();
+ hlist_for_each_entry_rcu(p, &file_systems, list) {
+ if (strcmp(p->name, name) == 0)
+ return index;
+ index++;
}
- read_unlock(&file_systems_lock);
- return err;
+ return -EINVAL;
}
-static int fs_name(unsigned int index, char __user * buf)
+static int fs_name(unsigned int index, char __user *buf)
{
- struct file_system_type * tmp;
- int len, res = -EINVAL;
-
- read_lock(&file_systems_lock);
- for (tmp = file_systems; tmp; tmp = tmp->next, index--) {
- if (index == 0) {
- if (try_module_get(tmp->owner))
- res = 0;
+ struct file_system_type *p, *found = NULL;
+ int len, res;
+
+ scoped_guard(rcu) {
+ hlist_for_each_entry_rcu(p, &file_systems, list) {
+ if (index--)
+ continue;
+ if (try_module_get(p->owner))
+ found = p;
break;
}
}
- read_unlock(&file_systems_lock);
- if (res)
- return res;
+ if (!found)
+ return -EINVAL;
/* OK, we got the reference, so we can safely block */
- len = strlen(tmp->name) + 1;
- res = copy_to_user(buf, tmp->name, len) ? -EFAULT : 0;
- put_filesystem(tmp);
+ len = strlen(found->name) + 1;
+ res = copy_to_user(buf, found->name, len) ? -EFAULT : 0;
+ put_filesystem(found);
return res;
}
static int fs_maxindex(void)
{
- struct file_system_type * tmp;
- int index;
+ struct file_system_type *p;
+ int index = 0;
- read_lock(&file_systems_lock);
- for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next, index++)
- ;
- read_unlock(&file_systems_lock);
+ guard(rcu)();
+ hlist_for_each_entry_rcu(p, &file_systems, list)
+ index++;
return index;
}
/*
- * Whee.. Weird sysv syscall.
+ * Whee.. Weird sysv syscall.
*/
SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2)
{
@@ -216,8 +223,8 @@ int __init list_bdev_fs_names(char *buf, size_t size)
size_t len;
int count = 0;
- read_lock(&file_systems_lock);
- for (p = file_systems; p; p = p->next) {
+ guard(rcu)();
+ hlist_for_each_entry_rcu(p, &file_systems, list) {
if (!(p->fs_flags & FS_REQUIRES_DEV))
continue;
len = strlen(p->name) + 1;
@@ -230,30 +237,143 @@ int __init list_bdev_fs_names(char *buf, size_t size)
size -= len;
count++;
}
- read_unlock(&file_systems_lock);
return count;
}
#ifdef CONFIG_PROC_FS
-static int filesystems_proc_show(struct seq_file *m, void *v)
+static void invalidate_filesystems_string(void)
+{
+ struct file_systems_string *old;
+
+ lockdep_assert_held_write(&file_systems_lock);
+ file_systems_gen++;
+ old = rcu_replace_pointer(file_systems_string, NULL,
+ lockdep_is_held(&file_systems_lock));
+ if (old)
+ kfree_rcu(old, rcu);
+}
+
+static __cold noinline int regen_filesystems_string(void)
+{
+ struct file_system_type *p;
+ struct file_systems_string *old, *new;
+ size_t newlen, usedlen;
+ unsigned long gen;
+
+retry:
+ newlen = 0;
+
+ /* pre-calc space for each fs */
+ spin_lock(&file_systems_lock);
+ gen = file_systems_gen;
+ hlist_for_each_entry_rcu(p, &file_systems, list) {
+ if (!(p->fs_flags & FS_REQUIRES_DEV))
+ newlen += strlen("nodev");
+ newlen += strlen("\t") + strlen(p->name) + strlen("\n");
+ }
+ spin_unlock(&file_systems_lock);
+
+ new = kmalloc(offsetof(struct file_systems_string, string) + newlen + 1,
+ GFP_KERNEL);
+ if (!new)
+ return -ENOMEM;
+
+ new->gen = gen;
+ new->len = newlen;
+ new->string[newlen] = '\0';
+
+ spin_lock(&file_systems_lock);
+ old = file_systems_string;
+
+ /*
+ * Did someone beat us to it?
+ */
+ if (old && old->gen == file_systems_gen) {
+ spin_unlock(&file_systems_lock);
+ kfree(new);
+ return 0;
+ }
+
+ /*
+ * Did the list change in the meantime?
+ */
+ if (gen != file_systems_gen) {
+ spin_unlock(&file_systems_lock);
+ kfree(new);
+ goto retry;
+ }
+
+ /*
+ * Populate the string.
+ *
+ * We know we have just enough space because we calculated the right
+ * size the previous time we had the lock and confirmed the list has
+ * not changed after reacquiring it.
+ */
+ usedlen = 0;
+ hlist_for_each_entry_rcu(p, &file_systems, list) {
+ usedlen += sprintf(&new->string[usedlen], "%s\t%s\n",
+ (p->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
+ p->name);
+ }
+
+ if (WARN_ON_ONCE(new->len != strlen(new->string))) {
+ /*
+ * Should never happen of course, keep this in case someone changes string
+ * generation above and messes it up.
+ */
+ spin_unlock(&file_systems_lock);
+ kfree(new);
+ return -EINVAL;
+ }
+
+ rcu_assign_pointer(file_systems_string, new);
+ spin_unlock(&file_systems_lock);
+ if (old)
+ kfree_rcu(old, rcu);
+ return 0;
+}
+
+static __cold noinline int filesystems_proc_show_fallback(struct seq_file *m, void *v)
{
- struct file_system_type * tmp;
+ struct file_system_type *p;
- read_lock(&file_systems_lock);
- tmp = file_systems;
- while (tmp) {
+ guard(rcu)();
+ hlist_for_each_entry_rcu(p, &file_systems, list) {
seq_printf(m, "%s\t%s\n",
- (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
- tmp->name);
- tmp = tmp->next;
+ (p->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
+ p->name);
}
- read_unlock(&file_systems_lock);
return 0;
}
+static int filesystems_proc_show(struct seq_file *m, void *v)
+{
+ struct file_systems_string *fss;
+
+ for (;;) {
+ scoped_guard(rcu) {
+ fss = rcu_dereference(file_systems_string);
+ if (likely(fss)) {
+ seq_write(m, fss->string, fss->len);
+ return 0;
+ }
+ }
+
+ int err = regen_filesystems_string();
+ if (unlikely(err))
+ return filesystems_proc_show_fallback(m, v);
+ }
+}
+
static int __init proc_filesystems_init(void)
{
- proc_create_single("filesystems", 0, NULL, filesystems_proc_show);
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_single("filesystems", 0, NULL, filesystems_proc_show);
+ if (!pde)
+ return -ENOMEM;
+ proc_make_permanent(pde);
return 0;
}
module_init(proc_filesystems_init);
@@ -263,11 +383,10 @@ static struct file_system_type *__get_fs_type(const char *name, int len)
{
struct file_system_type *fs;
- read_lock(&file_systems_lock);
- fs = *(find_filesystem(name, len));
+ guard(rcu)();
+ fs = find_filesystem(name, len);
if (fs && !try_module_get(fs->owner))
fs = NULL;
- read_unlock(&file_systems_lock);
return fs;
}
@@ -291,5 +410,4 @@ struct file_system_type *get_fs_type(const char *name)
}
return fs;
}
-
EXPORT_SYMBOL(get_fs_type);
diff --git a/fs/mount.h b/fs/mount.h
index 5c120f8361bd..94fcc306d21e 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -25,6 +25,7 @@ struct mnt_namespace {
__u32 n_fsnotify_mask;
struct fsnotify_mark_connector __rcu *n_fsnotify_marks;
#endif
+ struct hlist_head mnt_visible_mounts; /* FS_USERNS_MOUNT_RESTRICTED mounts rooted at s_root */
unsigned int nr_mounts; /* # of mounts in the namespace */
unsigned int pending_mounts;
refcount_t passive; /* number references not pinning @mounts */
@@ -98,6 +99,7 @@ struct mount {
int mnt_expiry_mark; /* true if marked for expiry */
struct hlist_head mnt_pins;
struct hlist_head mnt_stuck_children;
+ struct hlist_node mnt_ns_visible; /* link in ns->mnt_visible_mounts */
struct mount *overmount; /* mounted on ->mnt_root */
} __randomize_layout;
@@ -215,6 +217,8 @@ static inline void move_from_ns(struct mount *mnt)
ns->mnt_first_node = rb_next(&mnt->mnt_node);
rb_erase(&mnt->mnt_node, &ns->mounts);
RB_CLEAR_NODE(&mnt->mnt_node);
+ if (!hlist_unhashed(&mnt->mnt_ns_visible))
+ hlist_del_init(&mnt->mnt_ns_visible);
}
bool has_locked_children(struct mount *mnt, struct dentry *dentry);
diff --git a/fs/namespace.c b/fs/namespace.c
index d4cf40198e92..3d5cd5bf3b05 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -321,6 +321,7 @@ static struct mount *alloc_vfsmnt(const char *name)
INIT_HLIST_NODE(&mnt->mnt_slave);
INIT_HLIST_NODE(&mnt->mnt_mp_list);
INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
+ INIT_HLIST_NODE(&mnt->mnt_ns_visible);
RB_CLEAR_NODE(&mnt->mnt_node);
mnt->mnt.mnt_idmap = &nop_mnt_idmap;
}
@@ -1098,6 +1099,10 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt)
rb_link_node(&mnt->mnt_node, parent, link);
rb_insert_color(&mnt->mnt_node, &ns->mounts);
+ if ((mnt->mnt.mnt_sb->s_type->fs_flags & FS_USERNS_MOUNT_RESTRICTED) &&
+ mnt->mnt.mnt_root == mnt->mnt.mnt_sb->s_root)
+ hlist_add_head(&mnt->mnt_ns_visible, &ns->mnt_visible_mounts);
+
mnt_notify_add(mnt);
}
@@ -6346,20 +6351,26 @@ static bool mnt_already_visible(struct mnt_namespace *ns,
int *new_mnt_flags)
{
int new_flags = *new_mnt_flags;
- struct mount *mnt, *n;
+ struct mount *mnt;
+
+ /* Don't acquire namespace semaphore without a good reason. */
+ if (hlist_empty(&ns->mnt_visible_mounts))
+ return false;
guard(namespace_shared)();
- rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) {
+ hlist_for_each_entry(mnt, &ns->mnt_visible_mounts, mnt_ns_visible) {
+ const struct super_block *sb_visible = mnt->mnt.mnt_sb;
struct mount *child;
int mnt_flags;
- if (mnt->mnt.mnt_sb->s_type != sb->s_type)
+ if (sb_visible->s_type != sb->s_type)
continue;
- /* This mount is not fully visible if it's root directory
- * is not the root directory of the filesystem.
+ /*
+ * Restricted variants are not compatible with anything, even
+ * other restricted variants.
*/
- if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)
+ if (sb_visible->s_iflags & SB_I_RESTRICTED_VARIANT)
continue;
/* A local view of the mount flags */
@@ -6411,16 +6422,23 @@ static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags
return false;
/* Can this filesystem be too revealing? */
- s_iflags = sb->s_iflags;
- if (!(s_iflags & SB_I_USERNS_VISIBLE))
+ if (!(sb->s_type->fs_flags & FS_USERNS_MOUNT_RESTRICTED))
return false;
+ s_iflags = sb->s_iflags;
if ((s_iflags & required_iflags) != required_iflags) {
WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n",
required_iflags);
return true;
}
+ /*
+ * Restricted variants don't need an already visible mount because they
+ * don't expose the full filesystem view.
+ */
+ if (s_iflags & SB_I_RESTRICTED_VARIANT)
+ return false;
+
return !mnt_already_visible(ns, sb, new_mnt_flags);
}
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index b875f01c9756..4870e680c4e5 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1224,7 +1224,6 @@ static struct file_system_type ocfs2_fs_type = {
.name = "ocfs2",
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE,
- .next = NULL,
.init_fs_context = ocfs2_init_fs_context,
.parameters = ocfs2_param_spec,
};
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 90fb0c6b5f99..479ea8cb4ef4 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -482,6 +482,11 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
unsigned long flags;
int exit_code = task->exit_code;
struct signal_struct *sig = task->signal;
+ int ret;
+
+ ret = down_read_killable(&task->signal->exec_update_lock);
+ if (ret)
+ return ret;
state = *get_task_state(task);
vsize = eip = esp = 0;
@@ -657,6 +662,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
seq_puts(m, " 0");
seq_putc(m, '\n');
+ up_read(&task->signal->exec_update_lock);
if (mm)
mmput(mm);
return 0;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 126b98419adb..b67e8c3605fb 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -219,33 +219,24 @@ static int get_task_root(struct task_struct *task, struct path *root)
return result;
}
-static int proc_cwd_link(struct dentry *dentry, struct path *path)
+static int proc_cwd_link(struct dentry *dentry, struct path *path,
+ struct task_struct *task)
{
- struct task_struct *task = get_proc_task(d_inode(dentry));
int result = -ENOENT;
- if (task) {
- task_lock(task);
- if (task->fs) {
- get_fs_pwd(task->fs, path);
- result = 0;
- }
- task_unlock(task);
- put_task_struct(task);
+ task_lock(task);
+ if (task->fs) {
+ get_fs_pwd(task->fs, path);
+ result = 0;
}
+ task_unlock(task);
return result;
}
-static int proc_root_link(struct dentry *dentry, struct path *path)
+static int proc_root_link(struct dentry *dentry, struct path *path,
+ struct task_struct *task)
{
- struct task_struct *task = get_proc_task(d_inode(dentry));
- int result = -ENOENT;
-
- if (task) {
- result = get_task_root(task, path);
- put_task_struct(task);
- }
- return result;
+ return get_task_root(task, path);
}
/*
@@ -424,18 +415,24 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
{
unsigned long wchan;
char symname[KSYM_NAME_LEN];
+ int err;
+ err = down_read_killable(&task->signal->exec_update_lock);
+ if (err)
+ return err;
if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
goto print0;
wchan = get_wchan(task);
if (wchan && !lookup_symbol_name(wchan, symname)) {
seq_puts(m, symname);
+ up_read(&task->signal->exec_update_lock);
return 0;
}
print0:
seq_putc(m, '0');
+ up_read(&task->signal->exec_update_lock);
return 0;
}
#endif /* CONFIG_KALLSYMS */
@@ -705,23 +702,6 @@ static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns,
/* Here the fs part begins */
/************************************************************************/
-/* permission checks */
-static bool proc_fd_access_allowed(struct inode *inode)
-{
- struct task_struct *task;
- bool allowed = false;
- /* Allow access to a task's file descriptors if it is us or we
- * may use ptrace attach to the process and find out that
- * information.
- */
- task = get_proc_task(inode);
- if (task) {
- allowed = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
- put_task_struct(task);
- }
- return allowed;
-}
-
int proc_nochmod_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
struct iattr *attr)
{
@@ -1778,16 +1758,12 @@ static const struct file_operations proc_pid_set_comm_operations = {
.release = single_release,
};
-static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
+static int proc_exe_link(struct dentry *dentry, struct path *exe_path,
+ struct task_struct *task)
{
- struct task_struct *task;
struct file *exe_file;
- task = get_proc_task(d_inode(dentry));
- if (!task)
- return -ENOENT;
exe_file = get_task_exe_file(task);
- put_task_struct(task);
if (exe_file) {
*exe_path = exe_file->f_path;
path_get(&exe_file->f_path);
@@ -1797,26 +1773,42 @@ static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
return -ENOENT;
}
+static int call_proc_get_link(struct dentry *dentry, struct inode *inode, struct path *path_out)
+{
+ struct task_struct *task;
+ int ret;
+
+ task = get_proc_task(inode);
+ if (!task)
+ return -ENOENT;
+ ret = down_read_killable(&task->signal->exec_update_lock);
+ if (ret)
+ goto out_put_task;
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
+ ret = -EACCES;
+ goto out;
+ }
+ ret = PROC_I(inode)->op.proc_get_link(dentry, path_out, task);
+
+out:
+ up_read(&task->signal->exec_update_lock);
+out_put_task:
+ put_task_struct(task);
+ return ret;
+}
+
static const char *proc_pid_get_link(struct dentry *dentry,
struct inode *inode,
struct delayed_call *done)
{
struct path path;
- int error = -EACCES;
+ int error;
if (!dentry)
return ERR_PTR(-ECHILD);
-
- /* Are we allowed to snoop on the tasks file descriptors? */
- if (!proc_fd_access_allowed(inode))
- goto out;
-
- error = PROC_I(inode)->op.proc_get_link(dentry, &path);
- if (error)
- goto out;
-
- error = nd_jump_link(&path);
-out:
+ error = call_proc_get_link(dentry, inode, &path);
+ if (!error)
+ error = nd_jump_link(&path);
return ERR_PTR(error);
}
@@ -1850,17 +1842,11 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b
struct inode *inode = d_inode(dentry);
struct path path;
- /* Are we allowed to snoop on the tasks file descriptors? */
- if (!proc_fd_access_allowed(inode))
- goto out;
-
- error = PROC_I(inode)->op.proc_get_link(dentry, &path);
- if (error)
- goto out;
-
- error = do_proc_readlink(&path, buffer, buflen);
- path_put(&path);
-out:
+ error = call_proc_get_link(dentry, inode, &path);
+ if (!error) {
+ error = do_proc_readlink(&path, buffer, buflen);
+ path_put(&path);
+ }
return error;
}
@@ -2243,21 +2229,16 @@ static const struct dentry_operations tid_map_files_dentry_operations = {
.d_delete = pid_delete_dentry,
};
-static int map_files_get_link(struct dentry *dentry, struct path *path)
+static int map_files_get_link(struct dentry *dentry, struct path *path,
+ struct task_struct *task)
{
unsigned long vm_start, vm_end;
struct vm_area_struct *vma;
- struct task_struct *task;
struct mm_struct *mm;
int rc;
rc = -ENOENT;
- task = get_proc_task(d_inode(dentry));
- if (!task)
- goto out;
-
mm = get_task_mm(task);
- put_task_struct(task);
if (!mm)
goto out;
@@ -2353,17 +2334,15 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
if (!task)
goto out;
- result = ERR_PTR(-EACCES);
- if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
- goto out_put_task;
-
result = ERR_PTR(-ENOENT);
if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
goto out_put_task;
- mm = get_task_mm(task);
- if (!mm)
+ mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
+ if (IS_ERR(mm)) {
+ result = ERR_CAST(mm);
goto out_put_task;
+ }
result = ERR_PTR(-EINTR);
if (mmap_read_lock_killable(mm))
@@ -2413,23 +2392,22 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
if (!task)
goto out;
- ret = -EACCES;
- if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
- goto out_put_task;
-
ret = 0;
if (!dir_emit_dots(file, ctx))
goto out_put_task;
- mm = get_task_mm(task);
- if (!mm)
+ mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
+ if (IS_ERR(mm)) {
+ ret = PTR_ERR(mm);
+ /* if the task has no mm, the directory should just be empty */
+ if (ret == -ESRCH)
+ ret = 0;
goto out_put_task;
+ }
ret = mmap_read_lock_killable(mm);
- if (ret) {
- mmput(mm);
- goto out_put_task;
- }
+ if (ret)
+ goto out_put_mm;
nr_files = 0;
@@ -2455,8 +2433,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
if (!p) {
ret = -ENOMEM;
mmap_read_unlock(mm);
- mmput(mm);
- goto out_put_task;
+ goto out_put_mm;
}
p->start = vma->vm_start;
@@ -2464,7 +2441,6 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
p->mode = vma->vm_file->f_mode;
}
mmap_read_unlock(mm);
- mmput(mm);
for (i = 0; i < nr_files; i++) {
char buf[4 * sizeof(long) + 2]; /* max: %lx-%lx\0 */
@@ -2481,6 +2457,8 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
ctx->pos++;
}
+out_put_mm:
+ mmput(mm);
out_put_task:
put_task_struct(task);
out:
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 05c7513e77c7..0f9a1556f2a3 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -171,24 +171,19 @@ static const struct dentry_operations tid_fd_dentry_operations = {
.d_delete = pid_delete_dentry,
};
-static int proc_fd_link(struct dentry *dentry, struct path *path)
+static int proc_fd_link(struct dentry *dentry, struct path *path,
+ struct task_struct *task)
{
- struct task_struct *task;
int ret = -ENOENT;
-
- task = get_proc_task(d_inode(dentry));
- if (task) {
- unsigned int fd = proc_fd(d_inode(dentry));
- struct file *fd_file;
-
- fd_file = fget_task(task, fd);
- if (fd_file) {
- *path = fd_file->f_path;
- path_get(&fd_file->f_path);
- ret = 0;
- fput(fd_file);
- }
- put_task_struct(task);
+ unsigned int fd = proc_fd(d_inode(dentry));
+ struct file *fd_file;
+
+ fd_file = fget_task(task, fd);
+ if (fd_file) {
+ *path = fd_file->f_path;
+ path_get(&fd_file->f_path);
+ ret = 0;
+ fput(fd_file);
}
return ret;
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 8bb81e58c9d8..c6ae076e1fa0 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -841,3 +841,13 @@ ssize_t proc_simple_write(struct file *f, const char __user *ubuf, size_t size,
kfree(buf);
return ret == 0 ? size : ret;
}
+
+/*
+ * Not exported to modules:
+ * modules' /proc files aren't permanent because modules aren't permanent.
+ */
+void impl_proc_make_permanent(struct proc_dir_entry *pde)
+{
+ if (pde)
+ pde_make_permanent(pde);
+}
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 64dc44832808..b232e1098117 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -79,8 +79,11 @@ static inline bool pde_is_permanent(const struct proc_dir_entry *pde)
return pde->flags & PROC_ENTRY_PERMANENT;
}
+/* This is for builtin code, not even for modules which are compiled in. */
static inline void pde_make_permanent(struct proc_dir_entry *pde)
{
+ /* Ensure magic flag does something. */
+ static_assert(PROC_ENTRY_PERMANENT != 0);
pde->flags |= PROC_ENTRY_PERMANENT;
}
@@ -107,7 +110,7 @@ extern struct kmem_cache *proc_dir_entry_cache;
void pde_free(struct proc_dir_entry *pde);
union proc_op {
- int (*proc_get_link)(struct dentry *, struct path *);
+ int (*proc_get_link)(struct dentry *, struct path *, struct task_struct *);
int (*proc_show)(struct seq_file *m,
struct pid_namespace *ns, struct pid *pid,
struct task_struct *task);
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 39f4169f669f..2f46f1396744 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -55,6 +55,10 @@ static const char *proc_ns_get_link(struct dentry *dentry,
if (!task)
return ERR_PTR(-EACCES);
+ error = down_read_killable(&task->signal->exec_update_lock);
+ if (error)
+ goto out_put_task;
+
if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
goto out;
@@ -64,6 +68,8 @@ static const char *proc_ns_get_link(struct dentry *dentry,
error = nd_jump_link(&ns_path);
out:
+ up_read(&task->signal->exec_update_lock);
+out_put_task:
put_task_struct(task);
return ERR_PTR(error);
}
@@ -80,11 +86,17 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl
if (!task)
return res;
+ res = down_read_killable(&task->signal->exec_update_lock);
+ if (res)
+ goto out_put_task;
+
if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
res = ns_get_name(name, sizeof(name), task, ns_ops);
if (res >= 0)
res = readlink_copy(buffer, buflen, name, strlen(name));
}
+ up_read(&task->signal->exec_update_lock);
+out_put_task:
put_task_struct(task);
return res;
}
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 184cddeb8215..00cc385bce21 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -23,6 +23,7 @@
#include <linux/uidgid.h>
#include <net/net_namespace.h>
#include <linux/seq_file.h>
+#include <linux/security.h>
#include "internal.h"
@@ -270,6 +271,7 @@ static struct net *get_proc_task_net(struct inode *dir)
struct task_struct *task;
struct nsproxy *ns;
struct net *net = NULL;
+ struct proc_fs_info *fs_info = proc_sb_info(dir->i_sb);
rcu_read_lock();
task = pid_task(proc_pid(dir), PIDTYPE_PID);
@@ -282,6 +284,12 @@ static struct net *get_proc_task_net(struct inode *dir)
}
rcu_read_unlock();
+ if (net && (fs_info->pidonly == PROC_PIDONLY_ON) &&
+ security_capable(fs_info->mounter_cred, net->user_ns, CAP_NET_ADMIN, CAP_OPT_NONE) < 0) {
+ put_net(net);
+ net = NULL;
+ }
+
return net;
}
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 0f9100559471..99adddfeb4a4 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -223,12 +223,17 @@ static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param)
return 0;
}
-static void proc_apply_options(struct proc_fs_info *fs_info,
+static int proc_apply_options(struct proc_fs_info *fs_info,
struct fs_context *fc,
struct user_namespace *user_ns)
{
struct proc_fs_context *ctx = fc->fs_private;
+ if ((ctx->mask & (1 << Opt_subset)) &&
+ fc->purpose == FS_CONTEXT_FOR_RECONFIGURE &&
+ ctx->pidonly != fs_info->pidonly)
+ return invalf(fc, "proc: subset=pid cannot be changed\n");
+
if (ctx->mask & (1 << Opt_gid))
fs_info->pid_gid = make_kgid(user_ns, ctx->gid);
if (ctx->mask & (1 << Opt_hidepid))
@@ -240,6 +245,7 @@ static void proc_apply_options(struct proc_fs_info *fs_info,
put_pid_ns(fs_info->pid_ns);
fs_info->pid_ns = get_pid_ns(ctx->pid_ns);
}
+ return 0;
}
static int proc_fill_super(struct super_block *s, struct fs_context *fc)
@@ -254,10 +260,13 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc)
return -ENOMEM;
fs_info->pid_ns = get_pid_ns(ctx->pid_ns);
- proc_apply_options(fs_info, fc, current_user_ns());
+ fs_info->mounter_cred = get_cred(fc->cred);
+ ret = proc_apply_options(fs_info, fc, current_user_ns());
+ if (ret)
+ return ret;
/* User space would break if executables or devices appear on proc */
- s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV;
+ s->s_iflags |= SB_I_NOEXEC | SB_I_NODEV;
s->s_flags |= SB_NODIRATIME | SB_NOSUID | SB_NOEXEC;
s->s_blocksize = 1024;
s->s_blocksize_bits = 10;
@@ -266,6 +275,9 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc)
s->s_time_gran = 1;
s->s_fs_info = fs_info;
+ if (fs_info->pidonly == PROC_PIDONLY_ON)
+ s->s_iflags |= SB_I_RESTRICTED_VARIANT;
+
/*
* procfs isn't actually a stacking filesystem; however, there is
* too much magic going on inside it to permit stacking things on
@@ -303,8 +315,7 @@ static int proc_reconfigure(struct fs_context *fc)
sync_filesystem(sb);
- proc_apply_options(fs_info, fc, current_user_ns());
- return 0;
+ return proc_apply_options(fs_info, fc, current_user_ns());
}
static int proc_get_tree(struct fs_context *fc)
@@ -350,6 +361,7 @@ static void proc_kill_sb(struct super_block *sb)
kill_anon_super(sb);
if (fs_info) {
put_pid_ns(fs_info->pid_ns);
+ put_cred(fs_info->mounter_cred);
kfree_rcu(fs_info, rcu);
}
}
@@ -359,7 +371,7 @@ static struct file_system_type proc_fs_type = {
.init_fs_context = proc_init_fs_context,
.parameters = proc_fs_parameters,
.kill_sb = proc_kill_sb,
- .fs_flags = FS_USERNS_MOUNT | FS_DISALLOW_NOTIFY_PERM,
+ .fs_flags = FS_USERNS_MOUNT | FS_USERNS_MOUNT_RESTRICTED | FS_DISALLOW_NOTIFY_PERM,
};
void __init proc_root_init(void)
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index b199e8ff79b1..88c10823fcaf 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -23,20 +23,6 @@
static struct kernfs_root *sysfs_root;
struct kernfs_node *sysfs_root_kn;
-static int sysfs_get_tree(struct fs_context *fc)
-{
- struct kernfs_fs_context *kfc = fc->fs_private;
- int ret;
-
- ret = kernfs_get_tree(fc);
- if (ret)
- return ret;
-
- if (kfc->new_sb_created)
- fc->root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE;
- return 0;
-}
-
static void sysfs_fs_context_free(struct fs_context *fc)
{
struct kernfs_fs_context *kfc = fc->fs_private;
@@ -49,7 +35,7 @@ static void sysfs_fs_context_free(struct fs_context *fc)
static const struct fs_context_operations sysfs_fs_context_ops = {
.free = sysfs_fs_context_free,
- .get_tree = sysfs_get_tree,
+ .get_tree = kernfs_get_tree,
};
static int sysfs_init_fs_context(struct fs_context *fc)
@@ -93,7 +79,7 @@ static struct file_system_type sysfs_fs_type = {
.name = "sysfs",
.init_fs_context = sysfs_init_fs_context,
.kill_sb = sysfs_kill_sb,
- .fs_flags = FS_USERNS_MOUNT,
+ .fs_flags = FS_USERNS_MOUNT | FS_USERNS_MOUNT_RESTRICTED,
};
int __init sysfs_init(void)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9674c3d1cb3f..6da44573ce45 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2294,13 +2294,14 @@ struct file_system_type {
#define FS_MGTIME 64 /* FS uses multigrain timestamps */
#define FS_LBS 128 /* FS supports LBS */
#define FS_POWER_FREEZE 256 /* Always freeze on suspend/hibernate */
+#define FS_USERNS_MOUNT_RESTRICTED 512 /* Restrict mount in userns if not already visible */
#define FS_USERNS_DELEGATABLE 1024 /* Can be mounted inside userns from outside */
#define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */
int (*init_fs_context)(struct fs_context *);
const struct fs_parameter_spec *parameters;
void (*kill_sb) (struct super_block *);
struct module *owner;
- struct file_system_type * next;
+ struct hlist_node list;
struct hlist_head fs_supers;
struct lock_class_key s_lock_key;
diff --git a/include/linux/fs/super_types.h b/include/linux/fs/super_types.h
index 1ab4e2265129..aa86e4944dbf 100644
--- a/include/linux/fs/super_types.h
+++ b/include/linux/fs/super_types.h
@@ -334,7 +334,7 @@ struct super_block {
#define SB_I_STABLE_WRITES 0x00000008 /* don't modify blks until WB is done */
/* sb->s_iflags to limit user namespace mounts */
-#define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */
+#define SB_I_RESTRICTED_VARIANT 0x00000010 /* restricted fs variant, e.g. procfs mounted with subset=pid */
#define SB_I_IMA_UNVERIFIABLE_SIGNATURE 0x00000020
#define SB_I_UNTRUSTED_MOUNTER 0x00000040
#define SB_I_EVM_HMAC_UNSUPPORTED 0x00000080
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 19d1c5e5f335..47d7deaeed8f 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -67,6 +67,7 @@ enum proc_pidonly {
struct proc_fs_info {
struct pid_namespace *pid_ns;
kgid_t pid_gid;
+ const struct cred *mounter_cred;
enum proc_hidepid hide_pid;
enum proc_pidonly pidonly;
struct rcu_head rcu;
@@ -248,4 +249,16 @@ static inline struct pid_namespace *proc_pid_ns(struct super_block *sb)
bool proc_ns_file(const struct file *file);
+#if defined CONFIG_PROC_FS && !defined MODULE
+void impl_proc_make_permanent(struct proc_dir_entry *pde);
+#endif
+
+static inline void proc_make_permanent(struct proc_dir_entry *pde)
+{
+ /*
+  * Don't give matches to modules: the call below is compiled in only
+  * when CONFIG_PROC_FS is defined and MODULE is not; in every other
+  * build this function has an empty body and @pde is left untouched.
+  */
+#if defined CONFIG_PROC_FS && !defined MODULE
+ impl_proc_make_permanent(pde);
+#endif
+}
+
#endif /* _LINUX_PROC_FS_H */
diff --git a/kernel/acct.c b/kernel/acct.c
index cbbf79d718cf..c440d43479ca 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -249,7 +249,7 @@ static int acct_on(const char __user *name)
return -EINVAL;
/* Exclude procfs and sysfs. */
- if (file_inode(file)->i_sb->s_iflags & SB_I_USERNS_VISIBLE)
+ if (file_inode(file)->i_sb->s_type->fs_flags & FS_USERNS_MOUNT_RESTRICTED)
return -EINVAL;
if (!(file->f_mode & FMODE_CAN_WRITE))