diff options
Diffstat (limited to 'fs/proc')
-rw-r--r-- | fs/proc/Kconfig | 6 | ||||
-rw-r--r-- | fs/proc/array.c | 52 | ||||
-rw-r--r-- | fs/proc/base.c | 264 | ||||
-rw-r--r-- | fs/proc/bootconfig.c | 15 | ||||
-rw-r--r-- | fs/proc/cpuinfo.c | 1 | ||||
-rw-r--r-- | fs/proc/devices.c | 1 | ||||
-rw-r--r-- | fs/proc/generic.c | 40 | ||||
-rw-r--r-- | fs/proc/inode.c | 293 | ||||
-rw-r--r-- | fs/proc/internal.h | 10 | ||||
-rw-r--r-- | fs/proc/kcore.c | 3 | ||||
-rw-r--r-- | fs/proc/kmsg.c | 1 | ||||
-rw-r--r-- | fs/proc/meminfo.c | 16 | ||||
-rw-r--r-- | fs/proc/nommu.c | 1 | ||||
-rw-r--r-- | fs/proc/proc_net.c | 19 | ||||
-rw-r--r-- | fs/proc/proc_sysctl.c | 241 | ||||
-rw-r--r-- | fs/proc/root.c | 164 | ||||
-rw-r--r-- | fs/proc/self.c | 10 | ||||
-rw-r--r-- | fs/proc/stat.c | 1 | ||||
-rw-r--r-- | fs/proc/task_mmu.c | 153 | ||||
-rw-r--r-- | fs/proc/task_nommu.c | 18 | ||||
-rw-r--r-- | fs/proc/thread_self.c | 10 | ||||
-rw-r--r-- | fs/proc/vmcore.c | 6 |
22 files changed, 859 insertions, 466 deletions
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index 27ef84d99f59..c930001056f9 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -23,7 +23,7 @@ config PROC_FS /proc" or the equivalent line in /etc/fstab does the job. The /proc file system is explained in the file - <file:Documentation/filesystems/proc.txt> and on the proc(5) manpage + <file:Documentation/filesystems/proc.rst> and on the proc(5) manpage ("man 5 proc"). This option will enlarge your kernel by about 67 KB. Several @@ -66,7 +66,7 @@ config PROC_SYSCTL depends on PROC_FS select SYSCTL default y - ---help--- + help The sysctl interface provides a means of dynamically changing certain kernel parameters and variables on the fly without requiring a recompile of the kernel or reboot of the system. The primary @@ -95,7 +95,7 @@ config PROC_CHILDREN default n help Provides a fast way to retrieve first level children pids of a task. See - <file:Documentation/filesystems/proc.txt> for more information. + <file:Documentation/filesystems/proc.rst> for more information. Say Y if you are running any user-space software which takes benefit from this interface. For example, rkt is such a piece of software. 
diff --git a/fs/proc/array.c b/fs/proc/array.c index 5efaf3708ec6..65ec2029fa80 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -92,7 +92,6 @@ #include <linux/user_namespace.h> #include <linux/fs_struct.h> -#include <asm/pgtable.h> #include <asm/processor.h> #include "internal.h" @@ -248,8 +247,8 @@ void render_sigset_t(struct seq_file *m, const char *header, seq_putc(m, '\n'); } -static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign, - sigset_t *catch) +static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *sigign, + sigset_t *sigcatch) { struct k_sigaction *k; int i; @@ -257,9 +256,9 @@ static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign, k = p->sighand->action; for (i = 1; i <= _NSIG; ++i, ++k) { if (k->sa.sa_handler == SIG_IGN) - sigaddset(ign, i); + sigaddset(sigign, i); else if (k->sa.sa_handler != SIG_DFL) - sigaddset(catch, i); + sigaddset(sigcatch, i); } } @@ -342,6 +341,8 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p) seq_put_decimal_ull(m, "NoNewPrivs:\t", task_no_new_privs(p)); #ifdef CONFIG_SECCOMP seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode); + seq_put_decimal_ull(m, "\nSeccomp_filters:\t", + atomic_read(&p->seccomp.filter_count)); #endif seq_puts(m, "\nSpeculation_Store_Bypass:\t"); switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_STORE_BYPASS)) { @@ -635,28 +636,35 @@ int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns, int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { - unsigned long size = 0, resident = 0, shared = 0, text = 0, data = 0; struct mm_struct *mm = get_task_mm(task); if (mm) { + unsigned long size; + unsigned long resident = 0; + unsigned long shared = 0; + unsigned long text = 0; + unsigned long data = 0; + size = task_statm(mm, &shared, &text, &data, &resident); mmput(mm); - } - /* - * For quick read, open code by putting numbers directly - * expected format is - * 
seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n", - * size, resident, shared, text, data); - */ - seq_put_decimal_ull(m, "", size); - seq_put_decimal_ull(m, " ", resident); - seq_put_decimal_ull(m, " ", shared); - seq_put_decimal_ull(m, " ", text); - seq_put_decimal_ull(m, " ", 0); - seq_put_decimal_ull(m, " ", data); - seq_put_decimal_ull(m, " ", 0); - seq_putc(m, '\n'); + /* + * For quick read, open code by putting numbers directly + * expected format is + * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n", + * size, resident, shared, text, data); + */ + seq_put_decimal_ull(m, "", size); + seq_put_decimal_ull(m, " ", resident); + seq_put_decimal_ull(m, " ", shared); + seq_put_decimal_ull(m, " ", text); + seq_put_decimal_ull(m, " ", 0); + seq_put_decimal_ull(m, " ", data); + seq_put_decimal_ull(m, " ", 0); + seq_putc(m, '\n'); + } else { + seq_write(m, "0 0 0 0 0 0 0\n", 14); + } return 0; } @@ -721,7 +729,7 @@ static int children_seq_show(struct seq_file *seq, void *v) { struct inode *inode = file_inode(seq->file); - seq_printf(seq, "%d ", pid_nr_ns(v, proc_pid_ns(inode))); + seq_printf(seq, "%d ", pid_nr_ns(v, proc_pid_ns(inode->i_sb))); return 0; } diff --git a/fs/proc/base.c b/fs/proc/base.c index c7c64272b0fa..617db4e0faa0 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -405,11 +405,11 @@ print0: static int lock_trace(struct task_struct *task) { - int err = mutex_lock_killable(&task->signal->cred_guard_mutex); + int err = mutex_lock_killable(&task->signal->exec_update_mutex); if (err) return err; if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS)) { - mutex_unlock(&task->signal->cred_guard_mutex); + mutex_unlock(&task->signal->exec_update_mutex); return -EPERM; } return 0; @@ -417,7 +417,7 @@ static int lock_trace(struct task_struct *task) static void unlock_trace(struct task_struct *task) { - mutex_unlock(&task->signal->cred_guard_mutex); + mutex_unlock(&task->signal->exec_update_mutex); } #ifdef CONFIG_STACKTRACE @@ -551,8 +551,17 @@ static int 
proc_oom_score(struct seq_file *m, struct pid_namespace *ns, { unsigned long totalpages = totalram_pages() + total_swap_pages; unsigned long points = 0; + long badness; + + badness = oom_badness(task, totalpages); + /* + * Special case OOM_SCORE_ADJ_MIN for all others scale the + * badness value into [0, 2000] range which we have been + * exporting for a long time so userspace might depend on it. + */ + if (badness != LONG_MIN) + points = (1000 + badness * 1000 / (long)totalpages) * 2 / 3; - points = oom_badness(task, totalpages) * 1000 / totalpages; seq_printf(m, "%lu\n", points); return 0; @@ -697,13 +706,21 @@ int proc_setattr(struct dentry *dentry, struct iattr *attr) * May current process learn task's sched/cmdline info (for hide_pid_min=1) * or euid/egid (for hide_pid_min=2)? */ -static bool has_pid_permissions(struct pid_namespace *pid, +static bool has_pid_permissions(struct proc_fs_info *fs_info, struct task_struct *task, - int hide_pid_min) + enum proc_hidepid hide_pid_min) { - if (pid->hide_pid < hide_pid_min) + /* + * If 'hidpid' mount option is set force a ptrace check, + * we indicate that we are using a filesystem syscall + * by passing PTRACE_MODE_READ_FSCREDS + */ + if (fs_info->hide_pid == HIDEPID_NOT_PTRACEABLE) + return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS); + + if (fs_info->hide_pid < hide_pid_min) return true; - if (in_group_p(pid->pid_gid)) + if (in_group_p(fs_info->pid_gid)) return true; return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS); } @@ -711,18 +728,18 @@ static bool has_pid_permissions(struct pid_namespace *pid, static int proc_pid_permission(struct inode *inode, int mask) { - struct pid_namespace *pid = proc_pid_ns(inode); + struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb); struct task_struct *task; bool has_perms; task = get_proc_task(inode); if (!task) return -ESRCH; - has_perms = has_pid_permissions(pid, task, HIDEPID_NO_ACCESS); + has_perms = has_pid_permissions(fs_info, task, HIDEPID_NO_ACCESS); 
put_task_struct(task); if (!has_perms) { - if (pid->hide_pid == HIDEPID_INVISIBLE) { + if (fs_info->hide_pid == HIDEPID_INVISIBLE) { /* * Let's make getdents(), stat(), and open() * consistent with each other. If a process @@ -746,7 +763,7 @@ static const struct inode_operations proc_def_inode_operations = { static int proc_single_show(struct seq_file *m, void *v) { struct inode *inode = m->private; - struct pid_namespace *ns = proc_pid_ns(inode); + struct pid_namespace *ns = proc_pid_ns(inode->i_sb); struct pid *pid = proc_pid(inode); struct task_struct *task; int ret; @@ -1415,7 +1432,7 @@ static const struct file_operations proc_fail_nth_operations = { static int sched_show(struct seq_file *m, void *v) { struct inode *inode = m->private; - struct pid_namespace *ns = proc_pid_ns(inode); + struct pid_namespace *ns = proc_pid_ns(inode->i_sb); struct task_struct *p; p = get_proc_task(inode); @@ -1573,6 +1590,7 @@ static ssize_t timens_offsets_write(struct file *file, const char __user *buf, noffsets = 0; for (pos = kbuf; pos; pos = next_line) { struct proc_timens_offset *off = &offsets[noffsets]; + char clock[10]; int err; /* Find the end of line and ensure we don't look past it */ @@ -1584,10 +1602,21 @@ static ssize_t timens_offsets_write(struct file *file, const char __user *buf, next_line = NULL; } - err = sscanf(pos, "%u %lld %lu", &off->clockid, + err = sscanf(pos, "%9s %lld %lu", clock, &off->val.tv_sec, &off->val.tv_nsec); if (err != 3 || off->val.tv_nsec >= NSEC_PER_SEC) goto out; + + clock[sizeof(clock) - 1] = 0; + if (strcmp(clock, "monotonic") == 0 || + strcmp(clock, __stringify(CLOCK_MONOTONIC)) == 0) + off->clockid = CLOCK_MONOTONIC; + else if (strcmp(clock, "boottime") == 0 || + strcmp(clock, __stringify(CLOCK_BOOTTIME)) == 0) + off->clockid = CLOCK_BOOTTIME; + else + goto out; + noffsets++; if (noffsets == ARRAY_SIZE(offsets)) { if (next_line) @@ -1834,11 +1863,25 @@ void task_dump_owner(struct task_struct *task, umode_t mode, *rgid = gid; } +void 
proc_pid_evict_inode(struct proc_inode *ei) +{ + struct pid *pid = ei->pid; + + if (S_ISDIR(ei->vfs_inode.i_mode)) { + spin_lock(&pid->lock); + hlist_del_init_rcu(&ei->sibling_inodes); + spin_unlock(&pid->lock); + } + + put_pid(pid); +} + struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task, umode_t mode) { struct inode * inode; struct proc_inode *ei; + struct pid *pid; /* We need a new inode */ @@ -1856,10 +1899,18 @@ struct inode *proc_pid_make_inode(struct super_block * sb, /* * grab the reference to task. */ - ei->pid = get_task_pid(task, PIDTYPE_PID); - if (!ei->pid) + pid = get_task_pid(task, PIDTYPE_PID); + if (!pid) goto out_unlock; + /* Let the pid remember us for quick removal */ + ei->pid = pid; + if (S_ISDIR(mode)) { + spin_lock(&pid->lock); + hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes); + spin_unlock(&pid->lock); + } + task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid); security_task_to_inode(task, inode); @@ -1875,7 +1926,7 @@ int pid_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); - struct pid_namespace *pid = proc_pid_ns(inode); + struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb); struct task_struct *task; generic_fillattr(inode, stat); @@ -1885,7 +1936,7 @@ int pid_getattr(const struct path *path, struct kstat *stat, rcu_read_lock(); task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) { - if (!has_pid_permissions(pid, task, HIDEPID_INVISIBLE)) { + if (!has_pid_permissions(fs_info, task, HIDEPID_INVISIBLE)) { rcu_read_unlock(); /* * This doesn't prevent learning whether PID exists, @@ -2070,11 +2121,11 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags) goto out; if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) { - status = down_read_killable(&mm->mmap_sem); + status = mmap_read_lock_killable(mm); if (!status) { exact_vma_exists = !!find_exact_vma(mm, vm_start, 
vm_end); - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); } } @@ -2121,7 +2172,7 @@ static int map_files_get_link(struct dentry *dentry, struct path *path) if (rc) goto out_mmput; - rc = down_read_killable(&mm->mmap_sem); + rc = mmap_read_lock_killable(mm); if (rc) goto out_mmput; @@ -2132,7 +2183,7 @@ static int map_files_get_link(struct dentry *dentry, struct path *path) path_get(path); rc = 0; } - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); out_mmput: mmput(mm); @@ -2147,16 +2198,16 @@ struct map_files_info { }; /* - * Only allow CAP_SYS_ADMIN to follow the links, due to concerns about how the - * symlinks may be used to bypass permissions on ancestor directories in the - * path to the file in question. + * Only allow CAP_SYS_ADMIN and CAP_CHECKPOINT_RESTORE to follow the links, due + * to concerns about how the symlinks may be used to bypass permissions on + * ancestor directories in the path to the file in question. */ static const char * proc_map_files_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { - if (!capable(CAP_SYS_ADMIN)) + if (!checkpoint_restore_ns_capable(&init_user_ns)) return ERR_PTR(-EPERM); return proc_pid_get_link(dentry, inode, done); @@ -2222,7 +2273,7 @@ static struct dentry *proc_map_files_lookup(struct inode *dir, goto out_put_task; result = ERR_PTR(-EINTR); - if (down_read_killable(&mm->mmap_sem)) + if (mmap_read_lock_killable(mm)) goto out_put_mm; result = ERR_PTR(-ENOENT); @@ -2235,7 +2286,7 @@ static struct dentry *proc_map_files_lookup(struct inode *dir, (void *)(unsigned long)vma->vm_file->f_mode); out_no_vma: - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); out_put_mm: mmput(mm); out_put_task: @@ -2280,7 +2331,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) if (!mm) goto out_put_task; - ret = down_read_killable(&mm->mmap_sem); + ret = mmap_read_lock_killable(mm); if (ret) { mmput(mm); goto out_put_task; @@ -2291,11 +2342,11 @@ proc_map_files_readdir(struct file *file, 
struct dir_context *ctx) /* * We need two passes here: * - * 1) Collect vmas of mapped files with mmap_sem taken - * 2) Release mmap_sem and instantiate entries + * 1) Collect vmas of mapped files with mmap_lock taken + * 2) Release mmap_lock and instantiate entries * * otherwise we get lockdep complained, since filldir() - * routine might require mmap_sem taken in might_fault(). + * routine might require mmap_lock taken in might_fault(). */ for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) { @@ -2307,7 +2358,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) p = genradix_ptr_alloc(&fa, nr_files++, GFP_KERNEL); if (!p) { ret = -ENOMEM; - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); mmput(mm); goto out_put_task; } @@ -2316,7 +2367,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) p->end = vma->vm_end; p->mode = vma->vm_file->f_mode; } - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); mmput(mm); for (i = 0; i < nr_files; i++) { @@ -2436,7 +2487,7 @@ static int proc_timers_open(struct inode *inode, struct file *file) return -ENOMEM; tp->pid = proc_pid(inode); - tp->ns = proc_pid_ns(inode); + tp->ns = proc_pid_ns(inode->i_sb); return 0; } @@ -2736,6 +2787,15 @@ static const struct pid_entry smack_attr_dir_stuff[] = { LSM_DIR_OPS(smack); #endif +#ifdef CONFIG_SECURITY_APPARMOR +static const struct pid_entry apparmor_attr_dir_stuff[] = { + ATTR("apparmor", "current", 0666), + ATTR("apparmor", "prev", 0444), + ATTR("apparmor", "exec", 0666), +}; +LSM_DIR_OPS(apparmor); +#endif + static const struct pid_entry attr_dir_stuff[] = { ATTR(NULL, "current", 0666), ATTR(NULL, "prev", 0444), @@ -2747,6 +2807,10 @@ static const struct pid_entry attr_dir_stuff[] = { DIR("smack", 0555, proc_smack_attr_dir_inode_ops, proc_smack_attr_dir_ops), #endif +#ifdef CONFIG_SECURITY_APPARMOR + DIR("apparmor", 0555, + proc_apparmor_attr_dir_inode_ops, proc_apparmor_attr_dir_ops), +#endif }; static int proc_attr_dir_readdir(struct file *file, 
struct dir_context *ctx) @@ -2861,7 +2925,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh unsigned long flags; int result; - result = mutex_lock_killable(&task->signal->cred_guard_mutex); + result = mutex_lock_killable(&task->signal->exec_update_mutex); if (result) return result; @@ -2897,7 +2961,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh result = 0; out_unlock: - mutex_unlock(&task->signal->cred_guard_mutex); + mutex_unlock(&task->signal->exec_update_mutex); return result; } @@ -3230,90 +3294,28 @@ static const struct inode_operations proc_tgid_base_inode_operations = { .permission = proc_pid_permission, }; -static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) -{ - struct dentry *dentry, *leader, *dir; - char buf[10 + 1]; - struct qstr name; - - name.name = buf; - name.len = snprintf(buf, sizeof(buf), "%u", pid); - /* no ->d_hash() rejects on procfs */ - dentry = d_hash_and_lookup(mnt->mnt_root, &name); - if (dentry) { - d_invalidate(dentry); - dput(dentry); - } - - if (pid == tgid) - return; - - name.name = buf; - name.len = snprintf(buf, sizeof(buf), "%u", tgid); - leader = d_hash_and_lookup(mnt->mnt_root, &name); - if (!leader) - goto out; - - name.name = "task"; - name.len = strlen(name.name); - dir = d_hash_and_lookup(leader, &name); - if (!dir) - goto out_put_leader; - - name.name = buf; - name.len = snprintf(buf, sizeof(buf), "%u", pid); - dentry = d_hash_and_lookup(dir, &name); - if (dentry) { - d_invalidate(dentry); - dput(dentry); - } - - dput(dir); -out_put_leader: - dput(leader); -out: - return; -} - /** - * proc_flush_task - Remove dcache entries for @task from the /proc dcache. - * @task: task that should be flushed. - * - * When flushing dentries from proc, one needs to flush them from global - * proc (proc_mnt) and from all the namespaces' procs this task was seen - * in. This call is supposed to do all of this job. 
+ * proc_flush_pid - Remove dcache entries for @pid from the /proc dcache. + * @pid: pid that should be flushed. * - * Looks in the dcache for - * /proc/@pid - * /proc/@tgid/task/@pid - * if either directory is present flushes it and all of it'ts children - * from the dcache. + * This function walks a list of inodes (that belong to any proc + * filesystem) that are attached to the pid and flushes them from + * the dentry cache. * * It is safe and reasonable to cache /proc entries for a task until * that task exits. After that they just clog up the dcache with * useless entries, possibly causing useful dcache entries to be - * flushed instead. This routine is proved to flush those useless - * dcache entries at process exit time. + * flushed instead. This routine is provided to flush those useless + * dcache entries when a process is reaped. * * NOTE: This routine is just an optimization so it does not guarantee - * that no dcache entries will exist at process exit time it - * just makes it very unlikely that any will persist. + * that no dcache entries will exist after a process is reaped + * it just makes it very unlikely that any will persist. 
*/ -void proc_flush_task(struct task_struct *task) +void proc_flush_pid(struct pid *pid) { - int i; - struct pid *pid, *tgid; - struct upid *upid; - - pid = task_pid(task); - tgid = task_tgid(task); - - for (i = 0; i <= pid->level; i++) { - upid = &pid->numbers[i]; - proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr, - tgid->numbers[i].nr); - } + proc_invalidate_siblings_dcache(&pid->inodes, &pid->lock); } static struct dentry *proc_pid_instantiate(struct dentry * dentry, @@ -3340,6 +3342,7 @@ struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags) { struct task_struct *task; unsigned tgid; + struct proc_fs_info *fs_info; struct pid_namespace *ns; struct dentry *result = ERR_PTR(-ENOENT); @@ -3347,7 +3350,8 @@ struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags) if (tgid == ~0U) goto out; - ns = dentry->d_sb->s_fs_info; + fs_info = proc_sb_info(dentry->d_sb); + ns = fs_info->pid_ns; rcu_read_lock(); task = find_task_by_pid_ns(tgid, ns); if (task) @@ -3356,7 +3360,14 @@ struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags) if (!task) goto out; + /* Limit procfs to only ptraceable tasks */ + if (fs_info->hide_pid == HIDEPID_NOT_PTRACEABLE) { + if (!has_pid_permissions(fs_info, task, HIDEPID_NO_ACCESS)) + goto out_put_task; + } + result = proc_pid_instantiate(dentry, task, NULL); +out_put_task: put_task_struct(task); out: return result; @@ -3382,20 +3393,8 @@ retry: pid = find_ge_pid(iter.tgid, ns); if (pid) { iter.tgid = pid_nr_ns(pid, ns); - iter.task = pid_task(pid, PIDTYPE_PID); - /* What we to know is if the pid we have find is the - * pid of a thread_group_leader. Testing for task - * being a thread_group_leader is the obvious thing - * todo but there is a window when it fails, due to - * the pid transfer logic in de_thread. 
- * - * So we perform the straight forward test of seeing - * if the pid we have found is the pid of a thread - * group leader, and don't worry if the task we have - * found doesn't happen to be a thread group leader. - * As we don't care in the case of readdir. - */ - if (!iter.task || !has_group_leader_pid(iter.task)) { + iter.task = pid_task(pid, PIDTYPE_TGID); + if (!iter.task) { iter.tgid += 1; goto retry; } @@ -3411,20 +3410,21 @@ retry: int proc_pid_readdir(struct file *file, struct dir_context *ctx) { struct tgid_iter iter; - struct pid_namespace *ns = proc_pid_ns(file_inode(file)); + struct proc_fs_info *fs_info = proc_sb_info(file_inode(file)->i_sb); + struct pid_namespace *ns = proc_pid_ns(file_inode(file)->i_sb); loff_t pos = ctx->pos; if (pos >= PID_MAX_LIMIT + TGID_OFFSET) return 0; if (pos == TGID_OFFSET - 2) { - struct inode *inode = d_inode(ns->proc_self); + struct inode *inode = d_inode(fs_info->proc_self); if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK)) return 0; ctx->pos = pos = pos + 1; } if (pos == TGID_OFFSET - 1) { - struct inode *inode = d_inode(ns->proc_thread_self); + struct inode *inode = d_inode(fs_info->proc_thread_self); if (!dir_emit(ctx, "thread-self", 11, inode->i_ino, DT_LNK)) return 0; ctx->pos = pos = pos + 1; @@ -3438,7 +3438,7 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx) unsigned int len; cond_resched(); - if (!has_pid_permissions(ns, iter.task, HIDEPID_INVISIBLE)) + if (!has_pid_permissions(fs_info, iter.task, HIDEPID_INVISIBLE)) continue; len = snprintf(name, sizeof(name), "%u", iter.tgid); @@ -3638,6 +3638,7 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry struct task_struct *task; struct task_struct *leader = get_proc_task(dir); unsigned tid; + struct proc_fs_info *fs_info; struct pid_namespace *ns; struct dentry *result = ERR_PTR(-ENOENT); @@ -3648,7 +3649,8 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry if (tid == ~0U) 
goto out; - ns = dentry->d_sb->s_fs_info; + fs_info = proc_sb_info(dentry->d_sb); + ns = fs_info->pid_ns; rcu_read_lock(); task = find_task_by_pid_ns(tid, ns); if (task) @@ -3762,7 +3764,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) /* f_version caches the tgid value that the last readdir call couldn't * return. lseek aka telldir automagically resets f_version to 0. */ - ns = proc_pid_ns(inode); + ns = proc_pid_ns(inode->i_sb); tid = (int)file->f_version; file->f_version = 0; for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns); diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c index 9955d75c0585..ad31ec4ad627 100644 --- a/fs/proc/bootconfig.c +++ b/fs/proc/bootconfig.c @@ -26,8 +26,9 @@ static int boot_config_proc_show(struct seq_file *m, void *v) static int __init copy_xbc_key_value_list(char *dst, size_t size) { struct xbc_node *leaf, *vnode; - const char *val; char *key, *end = dst + size; + const char *val; + char q; int ret = 0; key = kzalloc(XBC_KEYLEN_MAX, GFP_KERNEL); @@ -41,16 +42,20 @@ static int __init copy_xbc_key_value_list(char *dst, size_t size) break; dst += ret; vnode = xbc_node_get_child(leaf); - if (vnode && xbc_node_is_array(vnode)) { + if (vnode) { xbc_array_for_each_value(vnode, val) { - ret = snprintf(dst, rest(dst, end), "\"%s\"%s", - val, vnode->next ? ", " : "\n"); + if (strchr(val, '"')) + q = '\''; + else + q = '"'; + ret = snprintf(dst, rest(dst, end), "%c%s%c%s", + q, val, q, vnode->next ? 
", " : "\n"); if (ret < 0) goto out; dst += ret; } } else { - ret = snprintf(dst, rest(dst, end), "\"%s\"\n", val); + ret = snprintf(dst, rest(dst, end), "\"\"\n"); if (ret < 0) break; dst += ret; diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c index c1dea9b8222e..d0989a443c77 100644 --- a/fs/proc/cpuinfo.c +++ b/fs/proc/cpuinfo.c @@ -17,6 +17,7 @@ static int cpuinfo_open(struct inode *inode, struct file *file) } static const struct proc_ops cpuinfo_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = cpuinfo_open, .proc_read = seq_read, .proc_lseek = seq_lseek, diff --git a/fs/proc/devices.c b/fs/proc/devices.c index 37d38697eaf8..837971e74109 100644 --- a/fs/proc/devices.c +++ b/fs/proc/devices.c @@ -3,6 +3,7 @@ #include <linux/init.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include <linux/blkdev.h> static int devinfo_show(struct seq_file *f, void *v) { diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 3faed94e4b65..2f9fa179194d 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -269,6 +269,11 @@ struct dentry *proc_lookup_de(struct inode *dir, struct dentry *dentry, struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { + struct proc_fs_info *fs_info = proc_sb_info(dir->i_sb); + + if (fs_info->pidonly == PROC_PIDONLY_ON) + return ERR_PTR(-ENOENT); + return proc_lookup_de(dir, dentry, PDE(dir)); } @@ -325,6 +330,10 @@ int proc_readdir_de(struct file *file, struct dir_context *ctx, int proc_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); + struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb); + + if (fs_info->pidonly == PROC_PIDONLY_ON) + return 1; return proc_readdir_de(file, ctx, PDE(inode)); } @@ -531,6 +540,12 @@ struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode, return p; } +static inline void pde_set_flags(struct proc_dir_entry *pde) +{ + if (pde->proc_ops->proc_flags & PROC_ENTRY_PERMANENT) + pde->flags |= 
PROC_ENTRY_PERMANENT; +} + struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct proc_ops *proc_ops, void *data) @@ -541,6 +556,7 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, if (!p) return NULL; p->proc_ops = proc_ops; + pde_set_flags(p); return proc_register(parent, p); } EXPORT_SYMBOL(proc_create_data); @@ -572,6 +588,7 @@ static int proc_seq_release(struct inode *inode, struct file *file) } static const struct proc_ops proc_seq_ops = { + /* not permanent -- can call into arbitrary seq_operations */ .proc_open = proc_seq_open, .proc_read = seq_read, .proc_lseek = seq_lseek, @@ -602,6 +619,7 @@ static int proc_single_open(struct inode *inode, struct file *file) } static const struct proc_ops proc_single_ops = { + /* not permanent -- can call into arbitrary ->single_show */ .proc_open = proc_single_open, .proc_read = seq_read, .proc_lseek = seq_lseek, @@ -662,9 +680,13 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) de = pde_subdir_find(parent, fn, len); if (de) { - rb_erase(&de->subdir_node, &parent->subdir); - if (S_ISDIR(de->mode)) { - parent->nlink--; + if (unlikely(pde_is_permanent(de))) { + WARN(1, "removing permanent /proc entry '%s'", de->name); + de = NULL; + } else { + rb_erase(&de->subdir_node, &parent->subdir); + if (S_ISDIR(de->mode)) + parent->nlink--; } } write_unlock(&proc_subdir_lock); @@ -700,12 +722,24 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) write_unlock(&proc_subdir_lock); return -ENOENT; } + if (unlikely(pde_is_permanent(root))) { + write_unlock(&proc_subdir_lock); + WARN(1, "removing permanent /proc entry '%s/%s'", + root->parent->name, root->name); + return -EINVAL; + } rb_erase(&root->subdir_node, &parent->subdir); de = root; while (1) { next = pde_subdir_first(de); if (next) { + if (unlikely(pde_is_permanent(next))) { + write_unlock(&proc_subdir_lock); + WARN(1, "removing permanent /proc 
entry '%s/%s'", + next->parent->name, next->name); + return -EINVAL; + } rb_erase(&next->subdir_node, &de->subdir); de = next; continue; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 6da18316d209..28d6105e908e 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -24,6 +24,7 @@ #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/mount.h> +#include <linux/bug.h> #include <linux/uaccess.h> @@ -33,21 +34,27 @@ static void proc_evict_inode(struct inode *inode) { struct proc_dir_entry *de; struct ctl_table_header *head; + struct proc_inode *ei = PROC_I(inode); truncate_inode_pages_final(&inode->i_data); clear_inode(inode); /* Stop tracking associated processes */ - put_pid(PROC_I(inode)->pid); + if (ei->pid) { + proc_pid_evict_inode(ei); + ei->pid = NULL; + } /* Let go of any associated proc directory entry */ - de = PDE(inode); - if (de) + de = ei->pde; + if (de) { pde_put(de); + ei->pde = NULL; + } - head = PROC_I(inode)->sysctl; + head = ei->sysctl; if (head) { - RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL); + RCU_INIT_POINTER(ei->sysctl, NULL); proc_sys_evict_inode(inode, head); } } @@ -68,6 +75,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb) ei->pde = NULL; ei->sysctl = NULL; ei->sysctl_entry = NULL; + INIT_HLIST_NODE(&ei->sibling_inodes); ei->ns_ops = NULL; return &ei->vfs_inode; } @@ -102,15 +110,84 @@ void __init proc_init_kmemcache(void) BUILD_BUG_ON(sizeof(struct proc_dir_entry) >= SIZEOF_PDE); } +void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock) +{ + struct inode *inode; + struct proc_inode *ei; + struct hlist_node *node; + struct super_block *old_sb = NULL; + + rcu_read_lock(); + for (;;) { + struct super_block *sb; + node = hlist_first_rcu(inodes); + if (!node) + break; + ei = hlist_entry(node, struct proc_inode, sibling_inodes); + spin_lock(lock); + hlist_del_init_rcu(&ei->sibling_inodes); + spin_unlock(lock); + + inode = &ei->vfs_inode; + sb = inode->i_sb; + if ((sb != old_sb) && 
!atomic_inc_not_zero(&sb->s_active)) + continue; + inode = igrab(inode); + rcu_read_unlock(); + if (sb != old_sb) { + if (old_sb) + deactivate_super(old_sb); + old_sb = sb; + } + if (unlikely(!inode)) { + rcu_read_lock(); + continue; + } + + if (S_ISDIR(inode->i_mode)) { + struct dentry *dir = d_find_any_alias(inode); + if (dir) { + d_invalidate(dir); + dput(dir); + } + } else { + struct dentry *dentry; + while ((dentry = d_find_alias(inode))) { + d_invalidate(dentry); + dput(dentry); + } + } + iput(inode); + + rcu_read_lock(); + } + rcu_read_unlock(); + if (old_sb) + deactivate_super(old_sb); +} + +static inline const char *hidepid2str(enum proc_hidepid v) +{ + switch (v) { + case HIDEPID_OFF: return "off"; + case HIDEPID_NO_ACCESS: return "noaccess"; + case HIDEPID_INVISIBLE: return "invisible"; + case HIDEPID_NOT_PTRACEABLE: return "ptraceable"; + } + WARN_ONCE(1, "bad hide_pid value: %d\n", v); + return "unknown"; +} + static int proc_show_options(struct seq_file *seq, struct dentry *root) { - struct super_block *sb = root->d_sb; - struct pid_namespace *pid = sb->s_fs_info; + struct proc_fs_info *fs_info = proc_sb_info(root->d_sb); - if (!gid_eq(pid->pid_gid, GLOBAL_ROOT_GID)) - seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, pid->pid_gid)); - if (pid->hide_pid != HIDEPID_OFF) - seq_printf(seq, ",hidepid=%u", pid->hide_pid); + if (!gid_eq(fs_info->pid_gid, GLOBAL_ROOT_GID)) + seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, fs_info->pid_gid)); + if (fs_info->hide_pid != HIDEPID_OFF) + seq_printf(seq, ",hidepid=%s", hidepid2str(fs_info->hide_pid)); + if (fs_info->pidonly != PROC_PIDONLY_OFF) + seq_printf(seq, ",subset=pid"); return 0; } @@ -139,6 +216,7 @@ static void unuse_pde(struct proc_dir_entry *pde) /* pde is locked on entry, unlocked on exit */ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) + __releases(&pde->pde_unload_lock) { /* * close() (proc_reg_release()) can't delete an entry and proceed: @@ 
-195,135 +273,204 @@ void proc_entry_rundown(struct proc_dir_entry *de) spin_unlock(&de->pde_unload_lock); } +static loff_t pde_lseek(struct proc_dir_entry *pde, struct file *file, loff_t offset, int whence) +{ + typeof_member(struct proc_ops, proc_lseek) lseek; + + lseek = pde->proc_ops->proc_lseek; + if (!lseek) + lseek = default_llseek; + return lseek(file, offset, whence); +} + static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence) { struct proc_dir_entry *pde = PDE(file_inode(file)); loff_t rv = -EINVAL; - if (use_pde(pde)) { - typeof_member(struct proc_ops, proc_lseek) lseek; - lseek = pde->proc_ops->proc_lseek; - if (!lseek) - lseek = default_llseek; - rv = lseek(file, offset, whence); + if (pde_is_permanent(pde)) { + return pde_lseek(pde, file, offset, whence); + } else if (use_pde(pde)) { + rv = pde_lseek(pde, file, offset, whence); unuse_pde(pde); } return rv; } +static ssize_t pde_read(struct proc_dir_entry *pde, struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ + typeof_member(struct proc_ops, proc_read) read; + + read = pde->proc_ops->proc_read; + if (read) + return read(file, buf, count, ppos); + return -EIO; +} + static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; - if (use_pde(pde)) { - typeof_member(struct proc_ops, proc_read) read; - read = pde->proc_ops->proc_read; - if (read) - rv = read(file, buf, count, ppos); + if (pde_is_permanent(pde)) { + return pde_read(pde, file, buf, count, ppos); + } else if (use_pde(pde)) { + rv = pde_read(pde, file, buf, count, ppos); unuse_pde(pde); } return rv; } +static ssize_t pde_write(struct proc_dir_entry *pde, struct file *file, const char __user *buf, size_t count, loff_t *ppos) +{ + typeof_member(struct proc_ops, proc_write) write; + + write = pde->proc_ops->proc_write; + if (write) + return write(file, buf, count, ppos); + return -EIO; +} + static ssize_t 
proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; - if (use_pde(pde)) { - typeof_member(struct proc_ops, proc_write) write; - write = pde->proc_ops->proc_write; - if (write) - rv = write(file, buf, count, ppos); + if (pde_is_permanent(pde)) { + return pde_write(pde, file, buf, count, ppos); + } else if (use_pde(pde)) { + rv = pde_write(pde, file, buf, count, ppos); unuse_pde(pde); } return rv; } +static __poll_t pde_poll(struct proc_dir_entry *pde, struct file *file, struct poll_table_struct *pts) +{ + typeof_member(struct proc_ops, proc_poll) poll; + + poll = pde->proc_ops->proc_poll; + if (poll) + return poll(file, pts); + return DEFAULT_POLLMASK; +} + static __poll_t proc_reg_poll(struct file *file, struct poll_table_struct *pts) { struct proc_dir_entry *pde = PDE(file_inode(file)); __poll_t rv = DEFAULT_POLLMASK; - if (use_pde(pde)) { - typeof_member(struct proc_ops, proc_poll) poll; - poll = pde->proc_ops->proc_poll; - if (poll) - rv = poll(file, pts); + if (pde_is_permanent(pde)) { + return pde_poll(pde, file, pts); + } else if (use_pde(pde)) { + rv = pde_poll(pde, file, pts); unuse_pde(pde); } return rv; } +static long pde_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg) +{ + typeof_member(struct proc_ops, proc_ioctl) ioctl; + + ioctl = pde->proc_ops->proc_ioctl; + if (ioctl) + return ioctl(file, cmd, arg); + return -ENOTTY; +} + static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct proc_dir_entry *pde = PDE(file_inode(file)); long rv = -ENOTTY; - if (use_pde(pde)) { - typeof_member(struct proc_ops, proc_ioctl) ioctl; - ioctl = pde->proc_ops->proc_ioctl; - if (ioctl) - rv = ioctl(file, cmd, arg); + if (pde_is_permanent(pde)) { + return pde_ioctl(pde, file, cmd, arg); + } else if (use_pde(pde)) { + rv = pde_ioctl(pde, file, cmd, arg); unuse_pde(pde); } return rv; } 
#ifdef CONFIG_COMPAT +static long pde_compat_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg) +{ + typeof_member(struct proc_ops, proc_compat_ioctl) compat_ioctl; + + compat_ioctl = pde->proc_ops->proc_compat_ioctl; + if (compat_ioctl) + return compat_ioctl(file, cmd, arg); + return -ENOTTY; +} + static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct proc_dir_entry *pde = PDE(file_inode(file)); long rv = -ENOTTY; - if (use_pde(pde)) { - typeof_member(struct proc_ops, proc_compat_ioctl) compat_ioctl; - - compat_ioctl = pde->proc_ops->proc_compat_ioctl; - if (compat_ioctl) - rv = compat_ioctl(file, cmd, arg); + if (pde_is_permanent(pde)) { + return pde_compat_ioctl(pde, file, cmd, arg); + } else if (use_pde(pde)) { + rv = pde_compat_ioctl(pde, file, cmd, arg); unuse_pde(pde); } return rv; } #endif +static int pde_mmap(struct proc_dir_entry *pde, struct file *file, struct vm_area_struct *vma) +{ + typeof_member(struct proc_ops, proc_mmap) mmap; + + mmap = pde->proc_ops->proc_mmap; + if (mmap) + return mmap(file, vma); + return -EIO; +} + static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma) { struct proc_dir_entry *pde = PDE(file_inode(file)); int rv = -EIO; - if (use_pde(pde)) { - typeof_member(struct proc_ops, proc_mmap) mmap; - mmap = pde->proc_ops->proc_mmap; - if (mmap) - rv = mmap(file, vma); + if (pde_is_permanent(pde)) { + return pde_mmap(pde, file, vma); + } else if (use_pde(pde)) { + rv = pde_mmap(pde, file, vma); unuse_pde(pde); } return rv; } static unsigned long -proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, +pde_get_unmapped_area(struct proc_dir_entry *pde, struct file *file, unsigned long orig_addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - struct proc_dir_entry *pde = PDE(file_inode(file)); - unsigned long rv = -EIO; + typeof_member(struct proc_ops, proc_get_unmapped_area) get_area; - if (use_pde(pde)) { 
- typeof_member(struct proc_ops, proc_get_unmapped_area) get_area; - - get_area = pde->proc_ops->proc_get_unmapped_area; + get_area = pde->proc_ops->proc_get_unmapped_area; #ifdef CONFIG_MMU - if (!get_area) - get_area = current->mm->get_unmapped_area; + if (!get_area) + get_area = current->mm->get_unmapped_area; #endif + if (get_area) + return get_area(file, orig_addr, len, pgoff, flags); + return orig_addr; +} - if (get_area) - rv = get_area(file, orig_addr, len, pgoff, flags); - else - rv = orig_addr; +static unsigned long +proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + struct proc_dir_entry *pde = PDE(file_inode(file)); + unsigned long rv = -EIO; + + if (pde_is_permanent(pde)) { + return pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags); + } else if (use_pde(pde)) { + rv = pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags); unuse_pde(pde); } return rv; @@ -331,12 +478,23 @@ proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, static int proc_reg_open(struct inode *inode, struct file *file) { + struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb); struct proc_dir_entry *pde = PDE(inode); int rv = 0; typeof_member(struct proc_ops, proc_open) open; typeof_member(struct proc_ops, proc_release) release; struct pde_opener *pdeo; + if (pde_is_permanent(pde)) { + open = pde->proc_ops->proc_open; + if (open) + rv = open(inode, file); + return rv; + } + + if (fs_info->pidonly == PROC_PIDONLY_ON) + return -ENOENT; + /* * Ensure that * 1) PDE's ->release hook will be called no matter what @@ -386,6 +544,17 @@ static int proc_reg_release(struct inode *inode, struct file *file) { struct proc_dir_entry *pde = PDE(inode); struct pde_opener *pdeo; + + if (pde_is_permanent(pde)) { + typeof_member(struct proc_ops, proc_release) release; + + release = pde->proc_ops->proc_release; + if (release) { + return release(inode, file); + } + return 0; 
+ } + spin_lock(&pde->pde_unload_lock); list_for_each_entry(pdeo, &pde->pde_openers, lh) { if (pdeo->file == file) { @@ -448,7 +617,7 @@ const struct inode_operations proc_link_inode_operations = { struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) { - struct inode *inode = new_inode_pseudo(sb); + struct inode *inode = new_inode(sb); if (inode) { inode->i_ino = de->low_ino; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 41587276798e..917cc85e3466 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -61,6 +61,7 @@ struct proc_dir_entry { struct rb_node subdir_node; char *name; umode_t mode; + u8 flags; u8 namelen; char inline_name[]; } __randomize_layout; @@ -73,6 +74,11 @@ struct proc_dir_entry { 0) #define SIZEOF_PDE_INLINE_NAME (SIZEOF_PDE - sizeof(struct proc_dir_entry)) +static inline bool pde_is_permanent(const struct proc_dir_entry *pde) +{ + return pde->flags & PROC_ENTRY_PERMANENT; +} + extern struct kmem_cache *proc_dir_entry_cache; void pde_free(struct proc_dir_entry *pde); @@ -91,7 +97,7 @@ struct proc_inode { struct proc_dir_entry *pde; struct ctl_table_header *sysctl; struct ctl_table *sysctl_entry; - struct hlist_node sysctl_inodes; + struct hlist_node sibling_inodes; const struct proc_ns_operations *ns_ops; struct inode vfs_inode; } __randomize_layout; @@ -158,6 +164,7 @@ extern int proc_pid_statm(struct seq_file *, struct pid_namespace *, extern const struct dentry_operations pid_dentry_operations; extern int pid_getattr(const struct path *, struct kstat *, u32, unsigned int); extern int proc_setattr(struct dentry *, struct iattr *); +extern void proc_pid_evict_inode(struct proc_inode *); extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t); extern void pid_update_inode(struct task_struct *, struct inode *); extern int pid_delete_dentry(const struct dentry *); @@ -210,6 +217,7 @@ extern const struct inode_operations proc_pid_link_inode_operations; extern const 
struct super_operations proc_sops; void proc_init_kmemcache(void); +void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock); void set_proc_pid_nlink(void); extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); extern void proc_entry_rundown(struct proc_dir_entry *); diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 8ba492d44e68..e502414b3556 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -512,7 +512,8 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) * Using bounce buffer to bypass the * hardened user copy kernel text checks. */ - if (probe_kernel_read(buf, (void *) start, tsz)) { + if (copy_from_kernel_nofault(buf, (void *)start, + tsz)) { if (clear_user(buffer, tsz)) { ret = -EFAULT; goto out; diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index ec1b7d2fb773..b38ad552887f 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -50,6 +50,7 @@ static __poll_t kmsg_poll(struct file *file, poll_table *wait) static const struct proc_ops kmsg_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_read = kmsg_read, .proc_poll = kmsg_poll, .proc_open = kmsg_open, diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 8c1f1bb1a5ce..887a5532e449 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -17,7 +17,6 @@ #include <linux/cma.h> #endif #include <asm/page.h> -#include <asm/pgtable.h> #include "internal.h" void __attribute__((weak)) arch_report_meminfo(struct seq_file *m) @@ -42,7 +41,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) si_meminfo(&i); si_swapinfo(&i); - committed = percpu_counter_read_positive(&vm_committed_as); + committed = vm_memory_committed(); cached = global_node_page_state(NR_FILE_PAGES) - total_swapcache_pages() - i.bufferram; @@ -53,8 +52,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) pages[lru] = global_node_page_state(NR_LRU_BASE + lru); available = si_mem_available(); - sreclaimable = 
global_node_page_state(NR_SLAB_RECLAIMABLE); - sunreclaim = global_node_page_state(NR_SLAB_UNRECLAIMABLE); + sreclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B); + sunreclaim = global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B); show_val_kb(m, "MemTotal: ", i.totalram); show_val_kb(m, "MemFree: ", i.freeram); @@ -102,12 +101,15 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "SReclaimable: ", sreclaimable); show_val_kb(m, "SUnreclaim: ", sunreclaim); seq_printf(m, "KernelStack: %8lu kB\n", - global_zone_page_state(NR_KERNEL_STACK_KB)); + global_node_page_state(NR_KERNEL_STACK_KB)); +#ifdef CONFIG_SHADOW_CALL_STACK + seq_printf(m, "ShadowCallStack:%8lu kB\n", + global_node_page_state(NR_KERNEL_SCS_KB)); +#endif show_val_kb(m, "PageTables: ", global_zone_page_state(NR_PAGETABLE)); - show_val_kb(m, "NFS_Unstable: ", - global_node_page_state(NR_UNSTABLE_NFS)); + show_val_kb(m, "NFS_Unstable: ", 0); show_val_kb(m, "Bounce: ", global_zone_page_state(NR_BOUNCE)); show_val_kb(m, "WritebackTmp: ", diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index 14c2badb8fd9..13452b32e2bd 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -22,7 +22,6 @@ #include <linux/hugetlb.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> -#include <asm/pgtable.h> #include <asm/tlb.h> #include <asm/div64.h> #include "internal.h" diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 4888c5224442..ed8a6306990c 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -98,6 +98,25 @@ static const struct proc_ops proc_net_seq_ops = { .proc_release = seq_release_net, }; +int bpf_iter_init_seq_net(void *priv_data, struct bpf_iter_aux_info *aux) +{ +#ifdef CONFIG_NET_NS + struct seq_net_private *p = priv_data; + + p->net = get_net(current->nsproxy->net_ns); +#endif + return 0; +} + +void bpf_iter_fini_seq_net(void *priv_data) +{ +#ifdef CONFIG_NET_NS + struct seq_net_private *p = priv_data; + + put_net(p->net); +#endif +} + struct 
proc_dir_entry *proc_create_net_data(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct seq_operations *ops, unsigned int state_size, void *data) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index c75bb4632ed1..6c1166ccdaea 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -14,6 +14,7 @@ #include <linux/mm.h> #include <linux/module.h> #include <linux/bpf-cgroup.h> +#include <linux/mount.h> #include "internal.h" static const struct dentry_operations proc_sys_dentry_operations; @@ -267,42 +268,9 @@ static void unuse_table(struct ctl_table_header *p) complete(p->unregistering); } -static void proc_sys_prune_dcache(struct ctl_table_header *head) +static void proc_sys_invalidate_dcache(struct ctl_table_header *head) { - struct inode *inode; - struct proc_inode *ei; - struct hlist_node *node; - struct super_block *sb; - - rcu_read_lock(); - for (;;) { - node = hlist_first_rcu(&head->inodes); - if (!node) - break; - ei = hlist_entry(node, struct proc_inode, sysctl_inodes); - spin_lock(&sysctl_lock); - hlist_del_init_rcu(&ei->sysctl_inodes); - spin_unlock(&sysctl_lock); - - inode = &ei->vfs_inode; - sb = inode->i_sb; - if (!atomic_inc_not_zero(&sb->s_active)) - continue; - inode = igrab(inode); - rcu_read_unlock(); - if (unlikely(!inode)) { - deactivate_super(sb); - rcu_read_lock(); - continue; - } - - d_prune_aliases(inode); - iput(inode); - deactivate_super(sb); - - rcu_read_lock(); - } - rcu_read_unlock(); + proc_invalidate_siblings_dcache(&head->inodes, &sysctl_lock); } /* called under sysctl_lock, will reacquire if has to wait */ @@ -324,10 +292,10 @@ static void start_unregistering(struct ctl_table_header *p) spin_unlock(&sysctl_lock); } /* - * Prune dentries for unregistered sysctls: namespaced sysctls + * Invalidate dentries for unregistered sysctls: namespaced sysctls * can have duplicate names and contaminate dcache very badly. 
*/ - proc_sys_prune_dcache(p); + proc_sys_invalidate_dcache(p); /* * do not remove from the list until nobody holds it; walking the * list in do_sysctl() relies on that. @@ -483,7 +451,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, } ei->sysctl = head; ei->sysctl_entry = table; - hlist_add_head_rcu(&ei->sysctl_inodes, &head->inodes); + hlist_add_head_rcu(&ei->sibling_inodes, &head->inodes); head->count++; spin_unlock(&sysctl_lock); @@ -514,7 +482,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head) { spin_lock(&sysctl_lock); - hlist_del_init_rcu(&PROC_I(inode)->sysctl_inodes); + hlist_del_init_rcu(&PROC_I(inode)->sibling_inodes); if (!--head->count) kfree_rcu(head, rcu); spin_unlock(&sysctl_lock); @@ -572,13 +540,13 @@ out: return err; } -static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, +static ssize_t proc_sys_call_handler(struct file *filp, void __user *ubuf, size_t count, loff_t *ppos, int write) { struct inode *inode = file_inode(filp); struct ctl_table_header *head = grab_header(inode); struct ctl_table *table = PROC_I(inode)->sysctl_entry; - void *new_buf = NULL; + void *kbuf; ssize_t error; if (IS_ERR(head)) @@ -597,27 +565,42 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, if (!table->proc_handler) goto out; - error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, &count, - ppos, &new_buf); - if (error) + /* don't even try if the size is too large */ + error = -ENOMEM; + if (count >= KMALLOC_MAX_SIZE) goto out; - /* careful: calling conventions are nasty here */ - if (new_buf) { - mm_segment_t old_fs; - - old_fs = get_fs(); - set_fs(KERNEL_DS); - error = table->proc_handler(table, write, (void __user *)new_buf, - &count, ppos); - set_fs(old_fs); - kfree(new_buf); + if (write) { + kbuf = memdup_user_nul(ubuf, count); + if (IS_ERR(kbuf)) { + error = PTR_ERR(kbuf); + goto out; + } } 
else { - error = table->proc_handler(table, write, buf, &count, ppos); + kbuf = kzalloc(count, GFP_KERNEL); + if (!kbuf) + goto out; } - if (!error) - error = count; + error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, &kbuf, &count, + ppos); + if (error) + goto out_free_buf; + + /* careful: calling conventions are nasty here */ + error = table->proc_handler(table, write, kbuf, &count, ppos); + if (error) + goto out_free_buf; + + if (!write) { + error = -EFAULT; + if (copy_to_user(ubuf, kbuf, count)) + goto out_free_buf; + } + + error = count; +out_free_buf: + kfree(kbuf); out: sysctl_head_finish(head); @@ -1725,3 +1708,147 @@ int __init proc_sys_init(void) return sysctl_init(); } + +struct sysctl_alias { + const char *kernel_param; + const char *sysctl_param; +}; + +/* + * Historically some settings had both sysctl and a command line parameter. + * With the generic sysctl. parameter support, we can handle them at a single + * place and only keep the historical name for compatibility. This is not meant + * to add brand new aliases. When adding existing aliases, consider whether + * the possibly different moment of changing the value (e.g. from early_param + * to the moment do_sysctl_args() is called) is an issue for the specific + * parameter. + */ +static const struct sysctl_alias sysctl_aliases[] = { + {"hardlockup_all_cpu_backtrace", "kernel.hardlockup_all_cpu_backtrace" }, + {"hung_task_panic", "kernel.hung_task_panic" }, + {"numa_zonelist_order", "vm.numa_zonelist_order" }, + {"softlockup_all_cpu_backtrace", "kernel.softlockup_all_cpu_backtrace" }, + {"softlockup_panic", "kernel.softlockup_panic" }, + { } +}; + +static const char *sysctl_find_alias(char *param) +{ + const struct sysctl_alias *alias; + + for (alias = &sysctl_aliases[0]; alias->kernel_param != NULL; alias++) { + if (strcmp(alias->kernel_param, param) == 0) + return alias->sysctl_param; + } + + return NULL; +} + +/* Set sysctl value passed on kernel command line. 
*/ +static int process_sysctl_arg(char *param, char *val, + const char *unused, void *arg) +{ + char *path; + struct vfsmount **proc_mnt = arg; + struct file_system_type *proc_fs_type; + struct file *file; + int len; + int err; + loff_t pos = 0; + ssize_t wret; + + if (strncmp(param, "sysctl", sizeof("sysctl") - 1) == 0) { + param += sizeof("sysctl") - 1; + + if (param[0] != '/' && param[0] != '.') + return 0; + + param++; + } else { + param = (char *) sysctl_find_alias(param); + if (!param) + return 0; + } + + /* + * To set sysctl options, we use a temporary mount of proc, look up the + * respective sys/ file and write to it. To avoid mounting it when no + * options were given, we mount it only when the first sysctl option is + * found. Why not a persistent mount? There are problems with a + * persistent mount of proc in that it forces userspace not to use any + * proc mount options. + */ + if (!*proc_mnt) { + proc_fs_type = get_fs_type("proc"); + if (!proc_fs_type) { + pr_err("Failed to find procfs to set sysctl from command line\n"); + return 0; + } + *proc_mnt = kern_mount(proc_fs_type); + put_filesystem(proc_fs_type); + if (IS_ERR(*proc_mnt)) { + pr_err("Failed to mount procfs to set sysctl from command line\n"); + return 0; + } + } + + path = kasprintf(GFP_KERNEL, "sys/%s", param); + if (!path) + panic("%s: Failed to allocate path for %s\n", __func__, param); + strreplace(path, '.', '/'); + + file = file_open_root((*proc_mnt)->mnt_root, *proc_mnt, path, O_WRONLY, 0); + if (IS_ERR(file)) { + err = PTR_ERR(file); + if (err == -ENOENT) + pr_err("Failed to set sysctl parameter '%s=%s': parameter not found\n", + param, val); + else if (err == -EACCES) + pr_err("Failed to set sysctl parameter '%s=%s': permission denied (read-only?)\n", + param, val); + else + pr_err("Error %pe opening proc file to set sysctl parameter '%s=%s'\n", + file, param, val); + goto out; + } + len = strlen(val); + wret = kernel_write(file, val, len, &pos); + if (wret < 0) { + err = wret; + 
if (err == -EINVAL) + pr_err("Failed to set sysctl parameter '%s=%s': invalid value\n", + param, val); + else + pr_err("Error %pe writing to proc file to set sysctl parameter '%s=%s'\n", + ERR_PTR(err), param, val); + } else if (wret != len) { + pr_err("Wrote only %zd bytes of %d writing to proc file %s to set sysctl parameter '%s=%s\n", + wret, len, path, param, val); + } + + err = filp_close(file, NULL); + if (err) + pr_err("Error %pe closing proc file to set sysctl parameter '%s=%s\n", + ERR_PTR(err), param, val); +out: + kfree(path); + return 0; +} + +void do_sysctl_args(void) +{ + char *command_line; + struct vfsmount *proc_mnt = NULL; + + command_line = kstrdup(saved_command_line, GFP_KERNEL); + if (!command_line) + panic("%s: Failed to allocate copy of command line\n", __func__); + + parse_args("Setting sysctl args", command_line, + NULL, 0, -1, -1, &proc_mnt, process_sysctl_arg); + + if (proc_mnt) + kern_unmount(proc_mnt); + + kfree(command_line); +} diff --git a/fs/proc/root.c b/fs/proc/root.c index 608233dfd29c..5e444d4f9717 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -32,21 +32,86 @@ struct proc_fs_context { struct pid_namespace *pid_ns; unsigned int mask; - int hidepid; + enum proc_hidepid hidepid; int gid; + enum proc_pidonly pidonly; }; enum proc_param { Opt_gid, Opt_hidepid, + Opt_subset, }; static const struct fs_parameter_spec proc_fs_parameters[] = { fsparam_u32("gid", Opt_gid), - fsparam_u32("hidepid", Opt_hidepid), + fsparam_string("hidepid", Opt_hidepid), + fsparam_string("subset", Opt_subset), {} }; +static inline int valid_hidepid(unsigned int value) +{ + return (value == HIDEPID_OFF || + value == HIDEPID_NO_ACCESS || + value == HIDEPID_INVISIBLE || + value == HIDEPID_NOT_PTRACEABLE); +} + +static int proc_parse_hidepid_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct proc_fs_context *ctx = fc->fs_private; + struct fs_parameter_spec hidepid_u32_spec = fsparam_u32("hidepid", Opt_hidepid); + struct fs_parse_result 
result; + int base = (unsigned long)hidepid_u32_spec.data; + + if (param->type != fs_value_is_string) + return invalf(fc, "proc: unexpected type of hidepid value\n"); + + if (!kstrtouint(param->string, base, &result.uint_32)) { + if (!valid_hidepid(result.uint_32)) + return invalf(fc, "proc: unknown value of hidepid - %s\n", param->string); + ctx->hidepid = result.uint_32; + return 0; + } + + if (!strcmp(param->string, "off")) + ctx->hidepid = HIDEPID_OFF; + else if (!strcmp(param->string, "noaccess")) + ctx->hidepid = HIDEPID_NO_ACCESS; + else if (!strcmp(param->string, "invisible")) + ctx->hidepid = HIDEPID_INVISIBLE; + else if (!strcmp(param->string, "ptraceable")) + ctx->hidepid = HIDEPID_NOT_PTRACEABLE; + else + return invalf(fc, "proc: unknown value of hidepid - %s\n", param->string); + + return 0; +} + +static int proc_parse_subset_param(struct fs_context *fc, char *value) +{ + struct proc_fs_context *ctx = fc->fs_private; + + while (value) { + char *ptr = strchr(value, ','); + + if (ptr != NULL) + *ptr++ = '\0'; + + if (*value != '\0') { + if (!strcmp(value, "pid")) { + ctx->pidonly = PROC_PIDONLY_ON; + } else { + return invalf(fc, "proc: unsupported subset option - %s\n", value); + } + } + value = ptr; + } + + return 0; +} + static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct proc_fs_context *ctx = fc->fs_private; @@ -63,10 +128,13 @@ static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param) break; case Opt_hidepid: - ctx->hidepid = result.uint_32; - if (ctx->hidepid < HIDEPID_OFF || - ctx->hidepid > HIDEPID_INVISIBLE) - return invalfc(fc, "hidepid value must be between 0 and 2.\n"); + if (proc_parse_hidepid_param(fc, param)) + return -EINVAL; + break; + + case Opt_subset: + if (proc_parse_subset_param(fc, param->string) < 0) + return -EINVAL; break; default: @@ -77,26 +145,33 @@ static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param) return 0; } -static void 
proc_apply_options(struct super_block *s, +static void proc_apply_options(struct proc_fs_info *fs_info, struct fs_context *fc, - struct pid_namespace *pid_ns, struct user_namespace *user_ns) { struct proc_fs_context *ctx = fc->fs_private; if (ctx->mask & (1 << Opt_gid)) - pid_ns->pid_gid = make_kgid(user_ns, ctx->gid); + fs_info->pid_gid = make_kgid(user_ns, ctx->gid); if (ctx->mask & (1 << Opt_hidepid)) - pid_ns->hide_pid = ctx->hidepid; + fs_info->hide_pid = ctx->hidepid; + if (ctx->mask & (1 << Opt_subset)) + fs_info->pidonly = ctx->pidonly; } static int proc_fill_super(struct super_block *s, struct fs_context *fc) { - struct pid_namespace *pid_ns = get_pid_ns(s->s_fs_info); + struct proc_fs_context *ctx = fc->fs_private; struct inode *root_inode; + struct proc_fs_info *fs_info; int ret; - proc_apply_options(s, fc, pid_ns, current_user_ns()); + fs_info = kzalloc(sizeof(*fs_info), GFP_KERNEL); + if (!fs_info) + return -ENOMEM; + + fs_info->pid_ns = get_pid_ns(ctx->pid_ns); + proc_apply_options(fs_info, fc, current_user_ns()); /* User space would break if executables or devices appear on proc */ s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV; @@ -106,6 +181,7 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc) s->s_magic = PROC_SUPER_MAGIC; s->s_op = &proc_sops; s->s_time_gran = 1; + s->s_fs_info = fs_info; /* * procfs isn't actually a stacking filesystem; however, there is @@ -113,7 +189,7 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc) * top of it */ s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; - + /* procfs dentries and inodes don't require IO to create */ s->s_shrink.seeks = 0; @@ -140,19 +216,17 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc) static int proc_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; - struct pid_namespace *pid = sb->s_fs_info; + struct proc_fs_info *fs_info = proc_sb_info(sb); sync_filesystem(sb); - 
proc_apply_options(sb, fc, pid, current_user_ns()); + proc_apply_options(fs_info, fc, current_user_ns()); return 0; } static int proc_get_tree(struct fs_context *fc) { - struct proc_fs_context *ctx = fc->fs_private; - - return get_tree_keyed(fc, proc_fill_super, ctx->pid_ns); + return get_tree_nodev(fc, proc_fill_super); } static void proc_fs_context_free(struct fs_context *fc) @@ -188,15 +262,19 @@ static int proc_init_fs_context(struct fs_context *fc) static void proc_kill_sb(struct super_block *sb) { - struct pid_namespace *ns; + struct proc_fs_info *fs_info = proc_sb_info(sb); + + if (!fs_info) { + kill_anon_super(sb); + return; + } + + dput(fs_info->proc_self); + dput(fs_info->proc_thread_self); - ns = (struct pid_namespace *)sb->s_fs_info; - if (ns->proc_self) - dput(ns->proc_self); - if (ns->proc_thread_self) - dput(ns->proc_thread_self); kill_anon_super(sb); - put_pid_ns(ns); + put_pid_ns(fs_info->pid_ns); + kfree(fs_info); } static struct file_system_type proc_fs_type = { @@ -292,39 +370,3 @@ struct proc_dir_entry proc_root = { .subdir = RB_ROOT, .name = "/proc", }; - -int pid_ns_prepare_proc(struct pid_namespace *ns) -{ - struct proc_fs_context *ctx; - struct fs_context *fc; - struct vfsmount *mnt; - - fc = fs_context_for_mount(&proc_fs_type, SB_KERNMOUNT); - if (IS_ERR(fc)) - return PTR_ERR(fc); - - if (fc->user_ns != ns->user_ns) { - put_user_ns(fc->user_ns); - fc->user_ns = get_user_ns(ns->user_ns); - } - - ctx = fc->fs_private; - if (ctx->pid_ns != ns) { - put_pid_ns(ctx->pid_ns); - get_pid_ns(ns); - ctx->pid_ns = ns; - } - - mnt = fc_mount(fc); - put_fs_context(fc); - if (IS_ERR(mnt)) - return PTR_ERR(mnt); - - ns->proc_mnt = mnt; - return 0; -} - -void pid_ns_release_proc(struct pid_namespace *ns) -{ - kern_unmount(ns->proc_mnt); -} diff --git a/fs/proc/self.c b/fs/proc/self.c index 57c0a1047250..72cd69bcaf4a 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -12,7 +12,7 @@ static const char *proc_self_get_link(struct dentry *dentry, struct inode 
*inode, struct delayed_call *done) { - struct pid_namespace *ns = proc_pid_ns(inode); + struct pid_namespace *ns = proc_pid_ns(inode->i_sb); pid_t tgid = task_tgid_nr_ns(current, ns); char *name; @@ -36,14 +36,14 @@ static unsigned self_inum __ro_after_init; int proc_setup_self(struct super_block *s) { struct inode *root_inode = d_inode(s->s_root); - struct pid_namespace *ns = proc_pid_ns(root_inode); + struct proc_fs_info *fs_info = proc_sb_info(s); struct dentry *self; int ret = -ENOMEM; - + inode_lock(root_inode); self = d_alloc_name(s->s_root, "self"); if (self) { - struct inode *inode = new_inode_pseudo(s); + struct inode *inode = new_inode(s); if (inode) { inode->i_ino = self_inum; inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); @@ -62,7 +62,7 @@ int proc_setup_self(struct super_block *s) if (ret) pr_err("proc_fill_super: can't allocate /proc/self\n"); else - ns->proc_self = self; + fs_info->proc_self = self; return ret; } diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 0449edf460f5..46b3293015fe 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -224,6 +224,7 @@ static int stat_open(struct inode *inode, struct file *file) } static const struct proc_ops stat_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = stat_open, .proc_read = seq_read, .proc_lseek = seq_lseek, diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 3ba9ae83bff5..5066b0251ed8 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -123,38 +123,14 @@ static void release_task_mempolicy(struct proc_maps_private *priv) } #endif -static void vma_stop(struct proc_maps_private *priv) -{ - struct mm_struct *mm = priv->mm; - - release_task_mempolicy(priv); - up_read(&mm->mmap_sem); - mmput(mm); -} - -static struct vm_area_struct * -m_next_vma(struct proc_maps_private *priv, struct vm_area_struct *vma) -{ - if (vma == priv->tail_vma) - return NULL; - return vma->vm_next ?: priv->tail_vma; -} - -static void m_cache_vma(struct seq_file *m, struct 
vm_area_struct *vma) -{ - if (m->count < m->size) /* vma is copied successfully */ - m->version = m_next_vma(m->private, vma) ? vma->vm_end : -1UL; -} - static void *m_start(struct seq_file *m, loff_t *ppos) { struct proc_maps_private *priv = m->private; - unsigned long last_addr = m->version; + unsigned long last_addr = *ppos; struct mm_struct *mm; struct vm_area_struct *vma; - unsigned int pos = *ppos; - /* See m_cache_vma(). Zero at the start or after lseek. */ + /* See m_next(). Zero at the start or after lseek. */ if (last_addr == -1UL) return NULL; @@ -163,64 +139,59 @@ static void *m_start(struct seq_file *m, loff_t *ppos) return ERR_PTR(-ESRCH); mm = priv->mm; - if (!mm || !mmget_not_zero(mm)) + if (!mm || !mmget_not_zero(mm)) { + put_task_struct(priv->task); + priv->task = NULL; return NULL; + } - if (down_read_killable(&mm->mmap_sem)) { + if (mmap_read_lock_killable(mm)) { mmput(mm); + put_task_struct(priv->task); + priv->task = NULL; return ERR_PTR(-EINTR); } hold_task_mempolicy(priv); priv->tail_vma = get_gate_vma(mm); - if (last_addr) { - vma = find_vma(mm, last_addr - 1); - if (vma && vma->vm_start <= last_addr) - vma = m_next_vma(priv, vma); - if (vma) - return vma; - } - - m->version = 0; - if (pos < mm->map_count) { - for (vma = mm->mmap; pos; pos--) { - m->version = vma->vm_start; - vma = vma->vm_next; - } + vma = find_vma(mm, last_addr); + if (vma) return vma; - } - - /* we do not bother to update m->version in this case */ - if (pos == mm->map_count && priv->tail_vma) - return priv->tail_vma; - vma_stop(priv); - return NULL; + return priv->tail_vma; } -static void *m_next(struct seq_file *m, void *v, loff_t *pos) +static void *m_next(struct seq_file *m, void *v, loff_t *ppos) { struct proc_maps_private *priv = m->private; - struct vm_area_struct *next; + struct vm_area_struct *next, *vma = v; + + if (vma == priv->tail_vma) + next = NULL; + else if (vma->vm_next) + next = vma->vm_next; + else + next = priv->tail_vma; + + *ppos = next ? 
next->vm_start : -1UL; - (*pos)++; - next = m_next_vma(priv, v); - if (!next) - vma_stop(priv); return next; } static void m_stop(struct seq_file *m, void *v) { struct proc_maps_private *priv = m->private; + struct mm_struct *mm = priv->mm; - if (!IS_ERR_OR_NULL(v)) - vma_stop(priv); - if (priv->task) { - put_task_struct(priv->task); - priv->task = NULL; - } + if (!priv->task) + return; + + release_task_mempolicy(priv); + mmap_read_unlock(mm); + mmput(mm); + put_task_struct(priv->task); + priv->task = NULL; } static int proc_maps_open(struct inode *inode, struct file *file, @@ -363,7 +334,6 @@ done: static int show_map(struct seq_file *m, void *v) { show_map_vma(m, v); - m_cache_vma(m, v); return 0; } @@ -576,10 +546,17 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = walk->vma; bool locked = !!(vma->vm_flags & VM_LOCKED); - struct page *page; + struct page *page = NULL; + + if (pmd_present(*pmd)) { + /* FOLL_DUMP will return -EFAULT on huge zero page */ + page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP); + } else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) { + swp_entry_t entry = pmd_to_swp_entry(*pmd); - /* FOLL_DUMP will return -EFAULT on huge zero page */ - page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP); + if (is_migration_entry(entry)) + page = migration_entry_to_page(entry); + } if (IS_ERR_OR_NULL(page)) return; if (PageAnon(page)) @@ -608,8 +585,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { - if (pmd_present(*pmd)) - smaps_pmd_entry(pmd, addr, walk); + smaps_pmd_entry(pmd, addr, walk); spin_unlock(ptl); goto out; } @@ -617,7 +593,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (pmd_trans_unstable(pmd)) goto out; /* - * The mmap_sem held all the way back in m_start() is what + * The mmap_lock held all the way back in 
m_start() is what * keeps khugepaged out of here and from collapsing things * in here. */ @@ -652,9 +628,6 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_GROWSDOWN)] = "gd", [ilog2(VM_PFNMAP)] = "pf", [ilog2(VM_DENYWRITE)] = "dw", -#ifdef CONFIG_X86_INTEL_MPX - [ilog2(VM_MPX)] = "mp", -#endif [ilog2(VM_LOCKED)] = "lo", [ilog2(VM_IO)] = "io", [ilog2(VM_SEQ_READ)] = "sr", @@ -668,6 +641,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_ARCH_1)] = "ar", [ilog2(VM_WIPEONFORK)] = "wf", [ilog2(VM_DONTDUMP)] = "dd", +#ifdef CONFIG_ARM64_BTI + [ilog2(VM_ARM64_BTI)] = "bt", +#endif #ifdef CONFIG_MEM_SOFT_DIRTY [ilog2(VM_SOFTDIRTY)] = "sd", #endif @@ -776,7 +752,7 @@ static void smap_gather_stats(struct vm_area_struct *vma, } } #endif - /* mmap_sem is held in m_start */ + /* mmap_lock is held in m_start */ walk_page_vma(vma, &smaps_walk_ops, mss); } @@ -810,7 +786,7 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss, SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree); SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp); SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp); - SEQ_PUT_DEC(" kB\nFilePmdMapped: ", mss->file_thp); + SEQ_PUT_DEC(" kB\nFilePmdMapped: ", mss->file_thp); SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb); seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ", mss->private_hugetlb >> 10, 7); @@ -840,15 +816,13 @@ static int show_smap(struct seq_file *m, void *v) __show_smap(m, &mss, false); - seq_printf(m, "THPeligible: %d\n", + seq_printf(m, "THPeligible: %d\n", transparent_hugepage_enabled(vma)); if (arch_pkeys_enabled()) seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); show_smap_vma_flags(m, vma); - m_cache_vma(m, vma); - return 0; } @@ -873,7 +847,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v) memset(&mss, 0, sizeof(mss)); - ret = down_read_killable(&mm->mmap_sem); + ret = mmap_read_lock_killable(mm); 
if (ret) goto out_put_mm; @@ -892,7 +866,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v) __show_smap(m, &mss, true); release_task_mempolicy(priv); - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); out_put_mm: mmput(mm); @@ -1166,7 +1140,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, }; if (type == CLEAR_REFS_MM_HIWATER_RSS) { - if (down_write_killable(&mm->mmap_sem)) { + if (mmap_write_lock_killable(mm)) { count = -EINTR; goto out_mm; } @@ -1176,11 +1150,11 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, * resident set size to this mm's current rss value. */ reset_mm_hiwater_rss(mm); - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); goto out_mm; } - if (down_read_killable(&mm->mmap_sem)) { + if (mmap_read_lock_killable(mm)) { count = -EINTR; goto out_mm; } @@ -1189,8 +1163,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, for (vma = mm->mmap; vma; vma = vma->vm_next) { if (!(vma->vm_flags & VM_SOFTDIRTY)) continue; - up_read(&mm->mmap_sem); - if (down_write_killable(&mm->mmap_sem)) { + mmap_read_unlock(mm); + if (mmap_write_lock_killable(mm)) { count = -EINTR; goto out_mm; } @@ -1209,14 +1183,14 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, * failed like if * get_proc_task() fails? */ - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); goto out_mm; } for (vma = mm->mmap; vma; vma = vma->vm_next) { vma->vm_flags &= ~VM_SOFTDIRTY; vma_set_page_prot(vma); } - downgrade_write(&mm->mmap_sem); + mmap_write_downgrade(mm); break; } @@ -1229,7 +1203,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, if (type == CLEAR_REFS_SOFT_DIRTY) mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb, 0, -1); - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); out_mm: mmput(mm); } @@ -1590,11 +1564,11 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, /* overflow ? 
*/ if (end < start_vaddr || end > end_vaddr) end = end_vaddr; - ret = down_read_killable(&mm->mmap_sem); + ret = mmap_read_lock_killable(mm); if (ret) goto out_free; ret = walk_page_range(mm, start_vaddr, end, &pagemap_ops, &pm); - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); start_vaddr = end; len = min(count, PM_ENTRY_BYTES * pm.pos); @@ -1853,7 +1827,7 @@ static int show_numa_map(struct seq_file *m, void *v) if (is_vm_hugetlb_page(vma)) seq_puts(m, " huge"); - /* mmap_sem is held by m_start */ + /* mmap_lock is held by m_start */ walk_page_vma(vma, &show_numa_ops, md); if (!md->pages) @@ -1887,7 +1861,6 @@ static int show_numa_map(struct seq_file *m, void *v) seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10); out: seq_putc(m, '\n'); - m_cache_vma(m, vma); return 0; } diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 7907e6419e57..a6d21fc0033c 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -25,7 +25,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) struct rb_node *p; unsigned long bytes = 0, sbytes = 0, slack = 0, size; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { vma = rb_entry(p, struct vm_area_struct, vm_rb); @@ -77,7 +77,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) "Shared:\t%8lu bytes\n", bytes, slack, sbytes); - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); } unsigned long task_vsize(struct mm_struct *mm) @@ -86,12 +86,12 @@ unsigned long task_vsize(struct mm_struct *mm) struct rb_node *p; unsigned long vsize = 0; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { vma = rb_entry(p, struct vm_area_struct, vm_rb); vsize += vma->vm_end - vma->vm_start; } - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); return vsize; } @@ -104,7 +104,7 @@ unsigned long task_statm(struct mm_struct *mm, struct rb_node *p; unsigned long size = kobjsize(mm); - down_read(&mm->mmap_sem); + 
mmap_read_lock(mm); for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { vma = rb_entry(p, struct vm_area_struct, vm_rb); size += kobjsize(vma); @@ -119,7 +119,7 @@ unsigned long task_statm(struct mm_struct *mm, >> PAGE_SHIFT; *data = (PAGE_ALIGN(mm->start_stack) - (mm->start_data & PAGE_MASK)) >> PAGE_SHIFT; - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); size >>= PAGE_SHIFT; size += *text + *data; *resident = size; @@ -211,7 +211,7 @@ static void *m_start(struct seq_file *m, loff_t *pos) if (!mm || !mmget_not_zero(mm)) return NULL; - if (down_read_killable(&mm->mmap_sem)) { + if (mmap_read_lock_killable(mm)) { mmput(mm); return ERR_PTR(-EINTR); } @@ -221,7 +221,7 @@ static void *m_start(struct seq_file *m, loff_t *pos) if (n-- == 0) return p; - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); mmput(mm); return NULL; } @@ -231,7 +231,7 @@ static void m_stop(struct seq_file *m, void *_vml) struct proc_maps_private *priv = m->private; if (!IS_ERR_OR_NULL(_vml)) { - up_read(&priv->mm->mmap_sem); + mmap_read_unlock(priv->mm); mmput(priv->mm); } if (priv->task) { diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index f61ae53533f5..a553273fbd41 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -12,7 +12,7 @@ static const char *proc_thread_self_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { - struct pid_namespace *ns = proc_pid_ns(inode); + struct pid_namespace *ns = proc_pid_ns(inode->i_sb); pid_t tgid = task_tgid_nr_ns(current, ns); pid_t pid = task_pid_nr_ns(current, ns); char *name; @@ -36,14 +36,14 @@ static unsigned thread_self_inum __ro_after_init; int proc_setup_thread_self(struct super_block *s) { struct inode *root_inode = d_inode(s->s_root); - struct pid_namespace *ns = proc_pid_ns(root_inode); + struct proc_fs_info *fs_info = proc_sb_info(s); struct dentry *thread_self; int ret = -ENOMEM; inode_lock(root_inode); thread_self = d_alloc_name(s->s_root, "thread-self"); if (thread_self) { - struct inode 
*inode = new_inode_pseudo(s); + struct inode *inode = new_inode(s); if (inode) { inode->i_ino = thread_self_inum; inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); @@ -60,9 +60,9 @@ int proc_setup_thread_self(struct super_block *s) inode_unlock(root_inode); if (ret) - pr_err("proc_fill_super: can't allocate /proc/thread_self\n"); + pr_err("proc_fill_super: can't allocate /proc/thread-self\n"); else - ns->proc_thread_self = thread_self; + fs_info->proc_thread_self = thread_self; return ret; } diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 7dc800cce354..c3a345c28a93 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -27,7 +27,6 @@ #include <linux/pagemap.h> #include <linux/uaccess.h> #include <linux/mem_encrypt.h> -#include <asm/pgtable.h> #include <asm/io.h> #include "internal.h" @@ -266,7 +265,8 @@ static int vmcoredd_mmap_dumps(struct vm_area_struct *vma, unsigned long dst, if (start < offset + dump->size) { tsz = min(offset + (u64)dump->size - start, (u64)size); buf = dump->buf + start - offset; - if (remap_vmalloc_range_partial(vma, dst, buf, tsz)) { + if (remap_vmalloc_range_partial(vma, dst, buf, 0, + tsz)) { ret = -EFAULT; goto out_unlock; } @@ -624,7 +624,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)start, size); kaddr = elfnotes_buf + start - elfcorebuf_sz - vmcoredd_orig_sz; if (remap_vmalloc_range_partial(vma, vma->vm_start + len, - kaddr, tsz)) + kaddr, 0, tsz)) goto fail; size -= tsz; |