From 200f091c95bbc4b8660636bd345805c45d6eced7 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 28 Sep 2024 14:08:31 -0700 Subject: coredump: Do not lock during 'comm' reporting The 'comm' member will always be NUL terminated, and this is not fast-path, so we can just perform a direct memcpy during a coredump instead of potentially deadlocking while holding the task struct lock. Reported-by: Vegard Nossum Closes: https://lore.kernel.org/all/d122ece6-3606-49de-ae4d-8da88846bef2@oracle.com Fixes: c114e9948c2b ("coredump: Standartize and fix logging") Tested-by: Vegard Nossum Link: https://lore.kernel.org/r/20240928210830.work.307-kees@kernel.org Signed-off-by: Kees Cook --- include/linux/coredump.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/coredump.h b/include/linux/coredump.h index 45e598fe3476..77e6e195d1d6 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -52,8 +52,8 @@ extern void do_coredump(const kernel_siginfo_t *siginfo); #define __COREDUMP_PRINTK(Level, Format, ...) \ do { \ char comm[TASK_COMM_LEN]; \ - \ - get_task_comm(comm, current); \ + /* This will always be NUL terminated. */ \ + memcpy(comm, current->comm, sizeof(comm)); \ printk_ratelimited(Level "coredump: %d(%*pE): " Format "\n", \ task_tgid_vnr(current), (int)strlen(comm), comm, ##__VA_ARGS__); \ } while (0) \ -- cgit v1.2.3 From 3a3f61ce5e0b4bcf730acc09c1af91012d241f85 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 29 Nov 2024 20:06:55 -0800 Subject: exec: Make sure task->comm is always NUL-terminated Using strscpy() meant that the final character in task->comm may be non-NUL for a moment before the "string too long" truncation happens. Instead of adding a new use of the ambiguous strncpy(), we'd want to use memtostr_pad() which enforces being able to check at compile time that sizes are sensible, but this requires being able to see string buffer lengths. Instead of trying to inline __set_task_comm() (which needs to call trace and perf functions), just open-code it. But to make sure we're always safe, add compile-time checking like we already do for get_task_comm(). Suggested-by: Linus Torvalds Suggested-by: "Eric W. Biederman" Signed-off-by: Kees Cook --- fs/exec.c | 12 ++++++------ include/linux/sched.h | 9 ++++----- io_uring/io-wq.c | 2 +- io_uring/sqpoll.c | 2 +- kernel/kthread.c | 3 ++- 5 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/fs/exec.c b/fs/exec.c index e0435b31a811..5f16500ac325 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1200,16 +1200,16 @@ char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk) EXPORT_SYMBOL_GPL(__get_task_comm); /* - * These functions flushes out all traces of the currently running executable - * so that a new one can be started + * This is unlocked -- the string will always be NUL-terminated, but + * may show overlapping contents if racing concurrent reads. */ - void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec) { - task_lock(tsk); + size_t len = min(strlen(buf), sizeof(tsk->comm) - 1); + trace_task_rename(tsk, buf); - strscpy_pad(tsk->comm, buf, sizeof(tsk->comm)); - task_unlock(tsk); + memcpy(tsk->comm, buf, len); + memset(&tsk->comm[len], 0, sizeof(tsk->comm) - len); perf_event_comm(tsk, exec); } diff --git a/include/linux/sched.h b/include/linux/sched.h index e6ee4258169a..ac9f429ddc17 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1932,11 +1932,10 @@ static inline void kick_process(struct task_struct *tsk) { } #endif extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec); - -static inline void set_task_comm(struct task_struct *tsk, const char *from) -{ - __set_task_comm(tsk, from, false); -} +#define set_task_comm(tsk, from) ({ \ + BUILD_BUG_ON(sizeof(from) != TASK_COMM_LEN); \ + __set_task_comm(tsk, from, false); \ +}) extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk); #define get_task_comm(buf, tsk) ({ \ diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index a38f36b68060..5d0928f37471 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -634,7 +634,7 @@ static int io_wq_worker(void *data) struct io_wq_acct *acct = io_wq_get_acct(worker); struct io_wq *wq = worker->wq; bool exit_mask = false, last_timeout = false; - char buf[TASK_COMM_LEN]; + char buf[TASK_COMM_LEN] = {}; set_mask_bits(&worker->flags, 0, BIT(IO_WORKER_F_UP) | BIT(IO_WORKER_F_RUNNING)); diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index a26593979887..90011f06c7fb 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -271,7 +271,7 @@ static int io_sq_thread(void *data) struct io_ring_ctx *ctx; struct rusage start; unsigned long timeout = 0; - char buf[TASK_COMM_LEN]; + char buf[TASK_COMM_LEN] = {}; DEFINE_WAIT(wait); /* offload context creation failed, just exit */ diff --git a/kernel/kthread.c b/kernel/kthread.c index db4ceb0f503c..162d55811744 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -736,10 +736,11 @@ EXPORT_SYMBOL(kthread_stop_put); int kthreadd(void *unused) { + static const char comm[TASK_COMM_LEN] = "kthreadd"; struct task_struct *tsk = current; /* Setup a clean context for our children to inherit. */ - set_task_comm(tsk, "kthreadd"); + set_task_comm(tsk, comm); ignore_signals(tsk); set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_TYPE_KTHREAD)); set_mems_allowed(node_states[N_MEMORY]); -- cgit v1.2.3 From 543841d1806029889c2f69f040e88b247aba8e22 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 21 Nov 2024 07:07:05 -0800 Subject: exec: fix up /proc/pid/comm in the execveat(AT_EMPTY_PATH) case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Zbigniew mentioned at Linux Plumber's that systemd is interested in switching to execveat() for service execution, but can't, because the contents of /proc/pid/comm are the file descriptor which was used, instead of the path to the binary[1]. This makes the output of tools like top and ps useless, especially in a world where most fds are opened CLOEXEC so the number is truly meaningless. When the filename passed in is empty (e.g. with AT_EMPTY_PATH), use the dentry's filename for "comm" instead of using the useless numeral from the synthetic fdpath construction. This way the actual exec machinery is unchanged, but cosmetically the comm looks reasonable to admins investigating things. Instead of adding TASK_COMM_LEN more bytes to bprm, use one of the unused flag bits to indicate that we need to set "comm" from the dentry. Suggested-by: Zbigniew Jędrzejewski-Szmek Suggested-by: Tycho Andersen Suggested-by: Al Viro Suggested-by: Linus Torvalds Link: https://github.com/uapi-group/kernel-features#set-comm-field-before-exec [1] Reviewed-by: Aleksa Sarai Tested-by: Zbigniew Jędrzejewski-Szmek Signed-off-by: Kees Cook --- fs/exec.c | 29 ++++++++++++++++++++++++++--- include/linux/binfmts.h | 4 +++- 2 files changed, 29 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/exec.c b/fs/exec.c index 5f16500ac325..1843366be6ff 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1347,7 +1347,28 @@ int begin_new_exec(struct linux_binprm * bprm) set_dumpable(current->mm, SUID_DUMP_USER); perf_event_exec(); - __set_task_comm(me, kbasename(bprm->filename), true); + + /* + * If the original filename was empty, alloc_bprm() made up a path + * that will probably not be useful to admins running ps or similar. + * Let's fix it up to be something reasonable. + */ + if (bprm->comm_from_dentry) { + /* + * Hold RCU lock to keep the name from being freed behind our back. + * Use acquire semantics to make sure the terminating NUL from + * __d_alloc() is seen. + * + * Note, we're deliberately sloppy here. We don't need to care about + * detecting a concurrent rename and just want a terminated name. + */ + rcu_read_lock(); + __set_task_comm(me, smp_load_acquire(&bprm->file->f_path.dentry->d_name.name), + true); + rcu_read_unlock(); + } else { + __set_task_comm(me, kbasename(bprm->filename), true); + } /* An exec changes our domain. We are no longer part of the thread group */ @@ -1521,11 +1542,13 @@ static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int fl if (fd == AT_FDCWD || filename->name[0] == '/') { bprm->filename = filename->name; } else { - if (filename->name[0] == '\0') + if (filename->name[0] == '\0') { bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd); - else + bprm->comm_from_dentry = 1; + } else { bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s", fd, filename->name); + } if (!bprm->fdpath) goto out_free; diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index e6c00e860951..3305c849abd6 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -42,7 +42,9 @@ struct linux_binprm { * Set when errors can no longer be returned to the * original userspace. */ - point_of_no_return:1; + point_of_no_return:1, + /* Set when "comm" must come from the dentry. */ + comm_from_dentry:1; struct file *executable; /* Executable to pass to the interpreter */ struct file *interpreter; struct file *file; -- cgit v1.2.3