From 25bc5b0de91bc5e7afa65f1face0087fb9e331c7 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 19 Jan 2022 18:07:57 -0800 Subject: proc/vmcore: don't fake reading zeroes on surprise vmcore_cb unregistration In commit cc5f2704c934 ("proc/vmcore: convert oldmem_pfn_is_ram callback to more generic vmcore callbacks"), we added detection of surprise vmcore_cb unregistration after the vmcore was already opened. Once detected, we warn the user and simulate reading zeroes from that point on when accessing the vmcore. The basic reason was that unexpected unregistration, for example, by manually unbinding a driver from a device after opening the vmcore, is not supported and could result in reading oldmem the vmcore_cb would have actually prohibited while registered. However, something like that can similarly be trigger by a user that's really looking for trouble simply by unbinding the relevant driver before opening the vmcore -- or by disallowing loading the driver in the first place. So it's actually of limited help. Currently, unregistration can only be triggered via virtio-mem when manually unbinding the driver from the device inside the VM; there is no way to trigger it from the hypervisor, as hypervisors don't allow for unplugging virtio-mem devices -- ripping out system RAM from a VM without coordination with the guest is usually not a good idea. The important part is that unbinding the driver and unregistering the vmcore_cb while concurrently reading the vmcore won't crash the system, and that is handled by the rwsem. To make the mechanism more future proof, let's remove the "read zero" part, but leave the warning in place. For example, we could have a future driver (like virtio-balloon) that will contact the hypervisor to figure out if we already populated a page for a given PFN. Hotunplugging such a device and consequently unregistering the vmcore_cb could be triggered from the hypervisor without harming the system even while kdump is running. In that case, we don't want to silently end up with a vmcore that contains wrong data, because the user inside the VM might be unaware of the hypervisor action and might easily miss the warning in the log. Link: https://lkml.kernel.org/r/20211111192243.22002-1-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Baoquan He Cc: Dave Young Cc: Vivek Goyal Cc: Philipp Rudo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/vmcore.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 509f85148fee..702754dd1daf 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -65,8 +65,6 @@ static size_t vmcoredd_orig_sz; static DECLARE_RWSEM(vmcore_cb_rwsem); /* List of registered vmcore callbacks. */ static LIST_HEAD(vmcore_cb_list); -/* Whether we had a surprise unregistration of a callback. */ -static bool vmcore_cb_unstable; /* Whether the vmcore has been opened once. */ static bool vmcore_opened; @@ -94,10 +92,8 @@ void unregister_vmcore_cb(struct vmcore_cb *cb) * very unusual (e.g., forced driver removal), but we cannot stop * unregistering. */ - if (vmcore_opened) { + if (vmcore_opened) pr_warn_once("Unexpected vmcore callback unregistration\n"); - vmcore_cb_unstable = true; - } up_write(&vmcore_cb_rwsem); } EXPORT_SYMBOL_GPL(unregister_vmcore_cb); @@ -108,8 +104,6 @@ static bool pfn_is_ram(unsigned long pfn) bool ret = true; lockdep_assert_held_read(&vmcore_cb_rwsem); - if (unlikely(vmcore_cb_unstable)) - return false; list_for_each_entry(cb, &vmcore_cb_list, next) { if (unlikely(!cb->pfn_is_ram)) @@ -581,7 +575,7 @@ static int vmcore_remap_oldmem_pfn(struct vm_area_struct *vma, * looping over all pages without a reason. */ down_read(&vmcore_cb_rwsem); - if (!list_empty(&vmcore_cb_list) || vmcore_cb_unstable) + if (!list_empty(&vmcore_cb_list)) ret = remap_oldmem_pfn_checked(vma, from, pfn, size, prot); else ret = remap_oldmem_pfn_range(vma, from, pfn, size, prot); -- cgit v1.2.3 From 51a18734402874382ccfab288342c72d7227e122 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 19 Jan 2022 18:08:03 -0800 Subject: proc: convert the return type of proc_fd_access_allowed() to be boolean Convert return type of proc_fd_access_allowed() and the 'allowed' in it to be boolean since the return type of ptrace_may_access() is boolean. Link: https://lkml.kernel.org/r/20211219024404.29779-1-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Kees Cook Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index 13eda8de2998..d654ce7150fd 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -670,10 +670,10 @@ static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns, /************************************************************************/ /* permission checks */ -static int proc_fd_access_allowed(struct inode *inode) +static bool proc_fd_access_allowed(struct inode *inode) { struct task_struct *task; - int allowed = 0; + bool allowed = false; /* Allow access to a task's file descriptors if it is us or we * may use ptrace attach to the process and find out that * information. -- cgit v1.2.3 From 153ee1c41a3ec707438ae0ca6b0061f72de334ef Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 19 Jan 2022 18:08:06 -0800 Subject: sysctl: fix duplicate path separator in printed entries sysctl_print_dir() always terminates the printed path name with a slash, so printing a slash before the file part causes a duplicate like in sysctl duplicate entry: /kernel//perf_user_access Fix this by dropping the extra slash. Link: https://lkml.kernel.org/r/e3054d605dc56f83971e4b6d2f5fa63a978720ad.1641551872.git.geert+renesas@glider.be Signed-off-by: Geert Uytterhoeven Acked-by: Christian Brauner Acked-by: Luis Chamberlain Cc: Iurii Zaikin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 5d66faecd4ef..4f6168ec5079 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -163,7 +163,7 @@ static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry) else { pr_err("sysctl duplicate entry: "); sysctl_print_dir(head->parent); - pr_cont("/%s\n", entry->procname); + pr_cont("%s\n", entry->procname); return -EEXIST; } } @@ -1020,8 +1020,8 @@ failed: if (IS_ERR(subdir)) { pr_err("sysctl could not get directory: "); sysctl_print_dir(dir); - pr_cont("/%*.*s %ld\n", - namelen, namelen, name, PTR_ERR(subdir)); + pr_cont("%*.*s %ld\n", namelen, namelen, name, + PTR_ERR(subdir)); } drop_sysctl_table(&dir->header); if (new) @@ -1626,7 +1626,7 @@ static void put_links(struct ctl_table_header *header) else { pr_err("sysctl link missing during unregister: "); sysctl_print_dir(parent); - pr_cont("/%s\n", name); + pr_cont("%s\n", name); } } } -- cgit v1.2.3 From 7080cead5d45b79ec0c86fa285cf9b6abc413ed8 Mon Sep 17 00:00:00 2001 From: luo penghao Date: Wed, 19 Jan 2022 18:08:09 -0800 Subject: sysctl: remove redundant ret assignment Subsequent if judgments will assign new values to ret, so the statement here should be deleted The clang_analyzer complains as follows: fs/proc/proc_sysctl.c: Value stored to 'ret' is never read Link: https://lkml.kernel.org/r/20211230063622.586360-1-luo.penghao@zte.com.cn Signed-off-by: luo penghao Reported-by: Zeal Robot Acked-by: Luis Chamberlain Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 4f6168ec5079..389e1e42e7d9 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1053,7 +1053,6 @@ static int sysctl_follow_link(struct ctl_table_header **phead, struct ctl_dir *dir; int ret; - ret = 0; spin_lock(&sysctl_lock); root = (*pentry)->data; set = lookup_header_set(root); -- cgit v1.2.3 From d6986ce24fc00b0638bd29efe8fb7ba7619ed2aa Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Wed, 19 Jan 2022 18:08:43 -0800 Subject: kthread: dynamically allocate memory to store kthread's full name When I was implementing a new per-cpu kthread cfs_migration, I found the comm of it "cfs_migration/%u" is truncated due to the limitation of TASK_COMM_LEN. For example, the comm of the percpu thread on CPU10~19 all have the same name "cfs_migration/1", which will confuse the user. This issue is not critical, because we can get the corresponding CPU from the task's Cpus_allowed. But for kthreads corresponding to other hardware devices, it is not easy to get the detailed device info from task comm, for example, jbd2/nvme0n1p2- xfs-reclaim/sdf Currently there are so many truncated kthreads: rcu_tasks_kthre rcu_tasks_rude_ rcu_tasks_trace poll_mpt3sas0_s ext4-rsv-conver xfs-reclaim/sd{a, b, c, ...} xfs-blockgc/sd{a, b, c, ...} xfs-inodegc/sd{a, b, c, ...} audit_send_repl ecryptfs-kthrea vfio-irqfd-clea jbd2/nvme0n1p2- ... We can shorten these names to work around this problem, but it may be not applied to all of the truncated kthreads. Take 'jbd2/nvme0n1p2-' for example, it is a nice name, and it is not a good idea to shorten it. One possible way to fix this issue is extending the task comm size, but as task->comm is used in lots of places, that may cause some potential buffer overflows. Another more conservative approach is introducing a new pointer to store kthread's full name if it is truncated, which won't introduce too much overhead as it is in the non-critical path. Finally we make a dicision to use the second approach. See also the discussions in this thread: https://lore.kernel.org/lkml/20211101060419.4682-1-laoar.shao@gmail.com/ After this change, the full name of these truncated kthreads will be displayed via /proc/[pid]/comm: rcu_tasks_kthread rcu_tasks_rude_kthread rcu_tasks_trace_kthread poll_mpt3sas0_statu ext4-rsv-conversion xfs-reclaim/sdf1 xfs-blockgc/sdf1 xfs-inodegc/sdf1 audit_send_reply ecryptfs-kthread vfio-irqfd-cleanup jbd2/nvme0n1p2-8 Link: https://lkml.kernel.org/r/20211120112850.46047-1-laoar.shao@gmail.com Signed-off-by: Yafang Shao Reviewed-by: David Hildenbrand Reviewed-by: Petr Mladek Suggested-by: Petr Mladek Suggested-by: Steven Rostedt Cc: Mathieu Desnoyers Cc: Arnaldo Carvalho de Melo Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Michal Miroslaw Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Matthew Wilcox Cc: Al Viro Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 3 +++ include/linux/kthread.h | 1 + kernel/kthread.c | 32 ++++++++++++++++++++++++++++++-- 3 files changed, 34 insertions(+), 2 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/array.c b/fs/proc/array.c index ff869a66b34e..4321aa63835d 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -92,6 +92,7 @@ #include #include #include +#include #include #include "internal.h" @@ -102,6 +103,8 @@ void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape) if (p->flags & PF_WQ_WORKER) wq_worker_comm(tcomm, sizeof(tcomm), p); + else if (p->flags & PF_KTHREAD) + get_kthread_comm(tcomm, sizeof(tcomm), p); else __get_task_comm(tcomm, sizeof(tcomm), p); diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 346b0f269161..2a5c04494663 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -33,6 +33,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), unsigned int cpu, const char *namefmt); +void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk); void set_kthread_struct(struct task_struct *p); void kthread_set_per_cpu(struct task_struct *k, int cpu); diff --git a/kernel/kthread.c b/kernel/kthread.c index 7113003fab63..a70cd5dc94e3 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -60,6 +60,8 @@ struct kthread { #ifdef CONFIG_BLK_CGROUP struct cgroup_subsys_state *blkcg_css; #endif + /* To store the full name if task comm is truncated. */ + char *full_name; }; enum KTHREAD_BITS { @@ -93,6 +95,18 @@ static inline struct kthread *__to_kthread(struct task_struct *p) return kthread; } +void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk) +{ + struct kthread *kthread = to_kthread(tsk); + + if (!kthread || !kthread->full_name) { + __get_task_comm(buf, buf_size, tsk); + return; + } + + strscpy_pad(buf, kthread->full_name, buf_size); +} + void set_kthread_struct(struct task_struct *p) { struct kthread *kthread; @@ -118,9 +132,13 @@ void free_kthread_struct(struct task_struct *k) * or if kmalloc() in kthread() failed. */ kthread = to_kthread(k); + if (!kthread) + return; + #ifdef CONFIG_BLK_CGROUP - WARN_ON_ONCE(kthread && kthread->blkcg_css); + WARN_ON_ONCE(kthread->blkcg_css); #endif + kfree(kthread->full_name); kfree(kthread); } @@ -406,12 +424,22 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), task = create->result; if (!IS_ERR(task)) { char name[TASK_COMM_LEN]; + va_list aq; + int len; /* * task is already visible to other tasks, so updating * COMM must be protected. */ - vsnprintf(name, sizeof(name), namefmt, args); + va_copy(aq, args); + len = vsnprintf(name, sizeof(name), namefmt, aq); + va_end(aq); + if (len >= TASK_COMM_LEN) { + struct kthread *kthread = to_kthread(task); + + /* leave it truncated when out of memory. */ + kthread->full_name = kvasprintf(GFP_KERNEL, namefmt, args); + } set_task_comm(task, name); } kfree(create); -- cgit v1.2.3