diff options
author | Benjamin Berg <benjamin@sipsolutions.net> | 2025-06-02 16:00:50 +0300 |
---|---|---|
committer | Johannes Berg <johannes.berg@intel.com> | 2025-06-02 16:17:19 +0300 |
commit | 406d17c6c370a33cfb54067d9e205305293d4604 (patch) | |
tree | bbe9ab55591e1dd773f8539473fda10fa15a7276 /arch/um/os-Linux | |
parent | 8420e08fe3a594b6ffa07705ac270faa2ed452c5 (diff) | |
download | linux-406d17c6c370a33cfb54067d9e205305293d4604.tar.xz |
um: Implement kernel side of SECCOMP based process handling
This adds the kernel side of the seccomp based process handling.
Co-authored-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: Benjamin Berg <benjamin@sipsolutions.net>
Signed-off-by: Benjamin Berg <benjamin.berg@intel.com>
Link: https://patch.msgid.link/20250602130052.545733-6-benjamin@sipsolutions.net
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Diffstat (limited to 'arch/um/os-Linux')
-rw-r--r-- | arch/um/os-Linux/internal.h | 5 | ||||
-rw-r--r-- | arch/um/os-Linux/skas/mem.c | 37 | ||||
-rw-r--r-- | arch/um/os-Linux/skas/process.c | 374 |
3 files changed, 299 insertions, 117 deletions
diff --git a/arch/um/os-Linux/internal.h b/arch/um/os-Linux/internal.h index 317fca190c2b..5d8d3b0817a9 100644 --- a/arch/um/os-Linux/internal.h +++ b/arch/um/os-Linux/internal.h @@ -2,6 +2,9 @@ #ifndef __UM_OS_LINUX_INTERNAL_H #define __UM_OS_LINUX_INTERNAL_H +#include <mm_id.h> +#include <stub-data.h> + /* * elf_aux.c */ @@ -16,5 +19,5 @@ void check_tmpexec(void); * skas/process.c */ void wait_stub_done(int pid); - +void wait_stub_done_seccomp(struct mm_id *mm_idp, int running, int wait_sigsys); #endif /* __UM_OS_LINUX_INTERNAL_H */ diff --git a/arch/um/os-Linux/skas/mem.c b/arch/um/os-Linux/skas/mem.c index d7f1814b0e5a..31bf3a52047a 100644 --- a/arch/um/os-Linux/skas/mem.c +++ b/arch/um/os-Linux/skas/mem.c @@ -80,27 +80,32 @@ static inline long do_syscall_stub(struct mm_id *mm_idp) int n, i; int err, pid = mm_idp->pid; - n = ptrace_setregs(pid, syscall_regs); - if (n < 0) { - printk(UM_KERN_ERR "Registers - \n"); - for (i = 0; i < MAX_REG_NR; i++) - printk(UM_KERN_ERR "\t%d\t0x%lx\n", i, syscall_regs[i]); - panic("%s : PTRACE_SETREGS failed, errno = %d\n", - __func__, -n); - } - /* Inform process how much we have filled in. */ proc_data->syscall_data_len = mm_idp->syscall_data_len; - err = ptrace(PTRACE_CONT, pid, 0, 0); - if (err) - panic("Failed to continue stub, pid = %d, errno = %d\n", pid, - errno); - - wait_stub_done(pid); + if (using_seccomp) { + proc_data->restart_wait = 1; + wait_stub_done_seccomp(mm_idp, 0, 1); + } else { + n = ptrace_setregs(pid, syscall_regs); + if (n < 0) { + printk(UM_KERN_ERR "Registers -\n"); + for (i = 0; i < MAX_REG_NR; i++) + printk(UM_KERN_ERR "\t%d\t0x%lx\n", i, syscall_regs[i]); + panic("%s : PTRACE_SETREGS failed, errno = %d\n", + __func__, -n); + } + + err = ptrace(PTRACE_CONT, pid, 0, 0); + if (err) + panic("Failed to continue stub, pid = %d, errno = %d\n", + pid, errno); + + wait_stub_done(pid); + } /* - * proc_data->err will be non-zero if there was an (unexpected) error. 
+ * proc_data->err will be negative if there was an (unexpected) error. * In that case, syscall_data_len points to the last executed syscall, * otherwise it will be zero (but we do not need to rely on that). */ diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c index 150e4f6ba633..e1aaa144e273 100644 --- a/arch/um/os-Linux/skas/process.c +++ b/arch/um/os-Linux/skas/process.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* + * Copyright (C) 2021 Benjamin Berg <benjamin@sipsolutions.net> * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de) * Copyright (C) 2002- 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) */ @@ -25,8 +26,11 @@ #include <registers.h> #include <skas.h> #include <sysdep/stub.h> +#include <sysdep/mcontext.h> +#include <linux/futex.h> #include <linux/threads.h> #include <timetravel.h> +#include <asm-generic/rwonce.h> #include "../internal.h" int is_skas_winch(int pid, int fd, void *data) @@ -142,6 +146,73 @@ bad_wait: fatal_sigsegv(); } +void wait_stub_done_seccomp(struct mm_id *mm_idp, int running, int wait_sigsys) +{ + struct stub_data *data = (void *)mm_idp->stack; + int ret; + + do { + if (!running) { + data->signal = 0; + data->futex = FUTEX_IN_CHILD; + CATCH_EINTR(syscall(__NR_futex, &data->futex, + FUTEX_WAKE, 1, NULL, NULL, 0)); + } + + do { + /* + * We need to check whether the child is still alive + * before and after the FUTEX_WAIT call. Before, in + * case it just died but we still updated data->futex + * to FUTEX_IN_CHILD. And after, in case it died while + * we were waiting (and SIGCHLD woke us up, see the + * IRQ handler in mmu.c). + * + * Either way, if PID is negative, then we have no + * choice but to kill the task. 
+ */ + if (__READ_ONCE(mm_idp->pid) < 0) + goto out_kill; + + ret = syscall(__NR_futex, &data->futex, + FUTEX_WAIT, FUTEX_IN_CHILD, + NULL, NULL, 0); + if (ret < 0 && errno != EINTR && errno != EAGAIN) { + printk(UM_KERN_ERR "%s : FUTEX_WAIT failed, errno = %d\n", + __func__, errno); + goto out_kill; + } + } while (data->futex == FUTEX_IN_CHILD); + + if (__READ_ONCE(mm_idp->pid) < 0) + goto out_kill; + + running = 0; + + /* We may receive a SIGALRM before SIGSYS, iterate again. */ + } while (wait_sigsys && data->signal == SIGALRM); + + if (data->mctx_offset > sizeof(data->sigstack) - sizeof(mcontext_t)) { + printk(UM_KERN_ERR "%s : invalid mcontext offset", __func__); + goto out_kill; + } + + if (wait_sigsys && data->signal != SIGSYS) { + printk(UM_KERN_ERR "%s : expected SIGSYS but got %d", + __func__, data->signal); + goto out_kill; + } + + return; + +out_kill: + printk(UM_KERN_ERR "%s : failed to wait for stub, pid = %d, errno = %d\n", + __func__, mm_idp->pid, errno); + /* This is not true inside start_userspace */ + if (current_mm_id() == mm_idp) + fatal_sigsegv(); +} + extern unsigned long current_stub_stack(void); static void get_skas_faultinfo(int pid, struct faultinfo *fi) @@ -185,14 +256,26 @@ static int userspace_tramp(void *stack) int pipe_fds[2]; unsigned long long offset; struct stub_init_data init_data = { + .seccomp = using_seccomp, .stub_start = STUB_START, - .segv_handler = STUB_CODE + - (unsigned long) stub_segv_handler - - (unsigned long) __syscall_stub_start, }; struct iomem_region *iomem; int ret; + if (using_seccomp) { + init_data.signal_handler = STUB_CODE + + (unsigned long) stub_signal_interrupt - + (unsigned long) __syscall_stub_start; + init_data.signal_restorer = STUB_CODE + + (unsigned long) stub_signal_restorer - + (unsigned long) __syscall_stub_start; + } else { + init_data.signal_handler = STUB_CODE + + (unsigned long) stub_segv_handler - + (unsigned long) __syscall_stub_start; + init_data.signal_restorer = 0; + } + 
init_data.stub_code_fd = phys_mapping(uml_to_phys(__syscall_stub_start), &offset); init_data.stub_code_offset = MMAP_OFFSET(offset); @@ -323,8 +406,9 @@ int userspace_pid[NR_CPUS]; * when negative: an error number. * FIXME: can PIDs become negative?! */ -int start_userspace(unsigned long stub_stack) +int start_userspace(struct mm_id *mm_id) { + struct stub_data *proc_data = (void *)mm_id->stack; void *stack; unsigned long sp; int pid, status, n, err; @@ -343,10 +427,13 @@ int start_userspace(unsigned long stub_stack) /* set stack pointer to the end of the stack page, so it can grow downwards */ sp = (unsigned long)stack + UM_KERN_PAGE_SIZE; + if (using_seccomp) + proc_data->futex = FUTEX_IN_CHILD; + /* clone into new userspace process */ pid = clone(userspace_tramp, (void *) sp, CLONE_VFORK | CLONE_VM | SIGCHLD, - (void *)stub_stack); + (void *)mm_id->stack); if (pid < 0) { err = -errno; printk(UM_KERN_ERR "%s : clone failed, errno = %d\n", @@ -354,29 +441,34 @@ int start_userspace(unsigned long stub_stack) return err; } - do { - CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED | __WALL)); - if (n < 0) { + if (using_seccomp) { + wait_stub_done_seccomp(mm_id, 1, 1); + } else { + do { + CATCH_EINTR(n = waitpid(pid, &status, + WUNTRACED | __WALL)); + if (n < 0) { + err = -errno; + printk(UM_KERN_ERR "%s : wait failed, errno = %d\n", + __func__, errno); + goto out_kill; + } + } while (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGALRM)); + + if (!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP)) { + err = -EINVAL; + printk(UM_KERN_ERR "%s : expected SIGSTOP, got status = %d\n", + __func__, status); + goto out_kill; + } + + if (ptrace(PTRACE_SETOPTIONS, pid, NULL, + (void *) PTRACE_O_TRACESYSGOOD) < 0) { err = -errno; - printk(UM_KERN_ERR "%s : wait failed, errno = %d\n", + printk(UM_KERN_ERR "%s : PTRACE_SETOPTIONS failed, errno = %d\n", __func__, errno); goto out_kill; } - } while (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGALRM)); - - if (!WIFSTOPPED(status) || 
(WSTOPSIG(status) != SIGSTOP)) { - err = -EINVAL; - printk(UM_KERN_ERR "%s : expected SIGSTOP, got status = %d\n", - __func__, status); - goto out_kill; - } - - if (ptrace(PTRACE_SETOPTIONS, pid, NULL, - (void *) PTRACE_O_TRACESYSGOOD) < 0) { - err = -errno; - printk(UM_KERN_ERR "%s : PTRACE_SETOPTIONS failed, errno = %d\n", - __func__, errno); - goto out_kill; } if (munmap(stack, UM_KERN_PAGE_SIZE) < 0) { @@ -386,6 +478,8 @@ int start_userspace(unsigned long stub_stack) goto out_kill; } + mm_id->pid = pid; + return pid; out_kill: @@ -399,7 +493,9 @@ extern unsigned long tt_extra_sched_jiffies; void userspace(struct uml_pt_regs *regs) { int err, status, op, pid = userspace_pid[0]; - siginfo_t si; + siginfo_t si_ptrace; + siginfo_t *si; + int sig; /* Handle any immediate reschedules or signals */ interrupt_end(); @@ -432,104 +528,182 @@ void userspace(struct uml_pt_regs *regs) current_mm_sync(); - /* Flush out any pending syscalls */ - err = syscall_stub_flush(current_mm_id()); - if (err) { - if (err == -ENOMEM) - report_enomem(); + if (using_seccomp) { + struct mm_id *mm_id = current_mm_id(); + struct stub_data *proc_data = (void *) mm_id->stack; + int ret; - printk(UM_KERN_ERR "%s - Error flushing stub syscalls: %d", - __func__, -err); - fatal_sigsegv(); - } + ret = set_stub_state(regs, proc_data, singlestepping()); + if (ret) { + printk(UM_KERN_ERR "%s - failed to set regs: %d", + __func__, ret); + fatal_sigsegv(); + } - /* - * This can legitimately fail if the process loads a - * bogus value into a segment register. It will - * segfault and PTRACE_GETREGS will read that value - * out of the process. However, PTRACE_SETREGS will - * fail. In this case, there is nothing to do but - * just kill the process. 
- */ - if (ptrace(PTRACE_SETREGS, pid, 0, regs->gp)) { - printk(UM_KERN_ERR "%s - ptrace set regs failed, errno = %d\n", - __func__, errno); - fatal_sigsegv(); - } + /* Must have been reset by the syscall caller */ + if (proc_data->restart_wait != 0) + panic("Programming error: Flag to only run syscalls in child was not cleared!"); + + /* Mark pending syscalls for flushing */ + proc_data->syscall_data_len = mm_id->syscall_data_len; + mm_id->syscall_data_len = 0; + + proc_data->signal = 0; + proc_data->futex = FUTEX_IN_CHILD; + CATCH_EINTR(syscall(__NR_futex, &proc_data->futex, + FUTEX_WAKE, 1, NULL, NULL, 0)); + do { + ret = syscall(__NR_futex, &proc_data->futex, + FUTEX_WAIT, FUTEX_IN_CHILD, NULL, NULL, 0); + } while ((ret == -1 && errno == EINTR) || + proc_data->futex == FUTEX_IN_CHILD); + + sig = proc_data->signal; + + if (sig == SIGTRAP && proc_data->err != 0) { + printk(UM_KERN_ERR "%s - Error flushing stub syscalls", + __func__); + syscall_stub_dump_error(mm_id); + fatal_sigsegv(); + } - if (put_fp_registers(pid, regs->fp)) { - printk(UM_KERN_ERR "%s - ptrace set fp regs failed, errno = %d\n", - __func__, errno); - fatal_sigsegv(); - } + ret = get_stub_state(regs, proc_data, NULL); + if (ret) { + printk(UM_KERN_ERR "%s - failed to get regs: %d", + __func__, ret); + fatal_sigsegv(); + } - if (singlestepping()) - op = PTRACE_SYSEMU_SINGLESTEP; - else - op = PTRACE_SYSEMU; + if (proc_data->si_offset > sizeof(proc_data->sigstack) - sizeof(*si)) + panic("%s - Invalid siginfo offset from child", + __func__); + si = (void *)&proc_data->sigstack[proc_data->si_offset]; - if (ptrace(op, pid, 0, 0)) { - printk(UM_KERN_ERR "%s - ptrace continue failed, op = %d, errno = %d\n", - __func__, op, errno); - fatal_sigsegv(); - } + regs->is_user = 1; - CATCH_EINTR(err = waitpid(pid, &status, WUNTRACED | __WALL)); - if (err < 0) { - printk(UM_KERN_ERR "%s - wait failed, errno = %d\n", - __func__, errno); - fatal_sigsegv(); - } + /* Fill in ORIG_RAX and extract fault information 
*/ + PT_SYSCALL_NR(regs->gp) = si->si_syscall; + if (sig == SIGSEGV) { + mcontext_t *mcontext = (void *)&proc_data->sigstack[proc_data->mctx_offset]; - regs->is_user = 1; - if (ptrace(PTRACE_GETREGS, pid, 0, regs->gp)) { - printk(UM_KERN_ERR "%s - PTRACE_GETREGS failed, errno = %d\n", - __func__, errno); - fatal_sigsegv(); - } + GET_FAULTINFO_FROM_MC(regs->faultinfo, mcontext); + } + } else { + /* Flush out any pending syscalls */ + err = syscall_stub_flush(current_mm_id()); + if (err) { + if (err == -ENOMEM) + report_enomem(); + + printk(UM_KERN_ERR "%s - Error flushing stub syscalls: %d", + __func__, -err); + fatal_sigsegv(); + } - if (get_fp_registers(pid, regs->fp)) { - printk(UM_KERN_ERR "%s - get_fp_registers failed, errno = %d\n", - __func__, errno); - fatal_sigsegv(); - } + /* + * This can legitimately fail if the process loads a + * bogus value into a segment register. It will + * segfault and PTRACE_GETREGS will read that value + * out of the process. However, PTRACE_SETREGS will + * fail. In this case, there is nothing to do but + * just kill the process. + */ + if (ptrace(PTRACE_SETREGS, pid, 0, regs->gp)) { + printk(UM_KERN_ERR "%s - ptrace set regs failed, errno = %d\n", + __func__, errno); + fatal_sigsegv(); + } - UPT_SYSCALL_NR(regs) = -1; /* Assume: It's not a syscall */ + if (put_fp_registers(pid, regs->fp)) { + printk(UM_KERN_ERR "%s - ptrace set fp regs failed, errno = %d\n", + __func__, errno); + fatal_sigsegv(); + } - if (WIFSTOPPED(status)) { - int sig = WSTOPSIG(status); + if (singlestepping()) + op = PTRACE_SYSEMU_SINGLESTEP; + else + op = PTRACE_SYSEMU; - /* These signal handlers need the si argument. - * The SIGIO and SIGALARM handlers which constitute the - * majority of invocations, do not use it. 
- */ - switch (sig) { - case SIGSEGV: - case SIGTRAP: - case SIGILL: - case SIGBUS: - case SIGFPE: - case SIGWINCH: - ptrace(PTRACE_GETSIGINFO, pid, 0, (struct siginfo *)&si); - break; + if (ptrace(op, pid, 0, 0)) { + printk(UM_KERN_ERR "%s - ptrace continue failed, op = %d, errno = %d\n", + __func__, op, errno); + fatal_sigsegv(); + } + + CATCH_EINTR(err = waitpid(pid, &status, WUNTRACED | __WALL)); + if (err < 0) { + printk(UM_KERN_ERR "%s - wait failed, errno = %d\n", + __func__, errno); + fatal_sigsegv(); } + regs->is_user = 1; + if (ptrace(PTRACE_GETREGS, pid, 0, regs->gp)) { + printk(UM_KERN_ERR "%s - PTRACE_GETREGS failed, errno = %d\n", + __func__, errno); + fatal_sigsegv(); + } + + if (get_fp_registers(pid, regs->fp)) { + printk(UM_KERN_ERR "%s - get_fp_registers failed, errno = %d\n", + __func__, errno); + fatal_sigsegv(); + } + + if (WIFSTOPPED(status)) { + sig = WSTOPSIG(status); + + /* + * These signal handlers need the si argument + * and SIGSEGV needs the faultinfo. + * The SIGIO and SIGALARM handlers which constitute + * the majority of invocations, do not use it. 
+ */ + switch (sig) { + case SIGSEGV: + get_skas_faultinfo(pid, + &regs->faultinfo); + fallthrough; + case SIGTRAP: + case SIGILL: + case SIGBUS: + case SIGFPE: + case SIGWINCH: + ptrace(PTRACE_GETSIGINFO, pid, 0, + (struct siginfo *)&si_ptrace); + si = &si_ptrace; + break; + default: + si = NULL; + break; + } + } else { + sig = 0; + } + } + + UPT_SYSCALL_NR(regs) = -1; /* Assume: It's not a syscall */ + + if (sig) { switch (sig) { case SIGSEGV: - get_skas_faultinfo(pid, &regs->faultinfo); - - if (PTRACE_FULL_FAULTINFO) - (*sig_info[SIGSEGV])(SIGSEGV, (struct siginfo *)&si, + if (using_seccomp || PTRACE_FULL_FAULTINFO) + (*sig_info[SIGSEGV])(SIGSEGV, + (struct siginfo *)si, regs, NULL); else segv(regs->faultinfo, 0, 1, NULL, NULL); break; + case SIGSYS: + handle_syscall(regs); + break; case SIGTRAP + 0x80: handle_trap(pid, regs); break; case SIGTRAP: - relay_signal(SIGTRAP, (struct siginfo *)&si, regs, NULL); + relay_signal(SIGTRAP, (struct siginfo *)si, regs, NULL); break; case SIGALRM: break; @@ -539,7 +713,7 @@ void userspace(struct uml_pt_regs *regs) case SIGFPE: case SIGWINCH: block_signals_trace(); - (*sig_info[sig])(sig, (struct siginfo *)&si, regs, NULL); + (*sig_info[sig])(sig, (struct siginfo *)si, regs, NULL); unblock_signals_trace(); break; default: |