diff options
Diffstat (limited to 'fs/exec.c')
-rw-r--r-- | fs/exec.c | 330 |
1 files changed, 173 insertions, 157 deletions
diff --git a/fs/exec.c b/fs/exec.c index 7776209d98c1..d60794372963 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -66,6 +66,8 @@ #include <linux/coredump.h> #include <linux/time_namespace.h> #include <linux/user_events.h> +#include <linux/rseq.h> +#include <linux/ksm.h> #include <linux/uaccess.h> #include <asm/mmu_context.h> @@ -127,7 +129,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library) struct filename *tmp = getname(library); int error = PTR_ERR(tmp); static const struct open_flags uselib_flags = { - .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, + .open_flag = O_LARGEFILE | O_RDONLY, .acc_mode = MAY_READ | MAY_EXEC, .intent = LOOKUP_OPEN, .lookup_flags = LOOKUP_FOLLOW, @@ -265,6 +267,14 @@ static int __bprm_mm_init(struct linux_binprm *bprm) } /* + * Need to be called with mmap write lock + * held, to avoid race with ksmd. + */ + err = ksm_execve(mm); + if (err) + goto err_ksm; + + /* * Place the stack at the largest stack address the architecture * supports. Later, we'll move this to an appropriate place. We don't * use STACK_TOP because that can depend on attributes which aren't @@ -285,6 +295,8 @@ static int __bprm_mm_init(struct linux_binprm *bprm) bprm->p = vma->vm_end - sizeof(void *); return 0; err: + ksm_exit(mm); +err_ksm: mmap_write_unlock(mm); err_free: bprm->vma = NULL; @@ -472,6 +484,35 @@ static int count_strings_kernel(const char *const *argv) return i; } +static inline int bprm_set_stack_limit(struct linux_binprm *bprm, + unsigned long limit) +{ +#ifdef CONFIG_MMU + /* Avoid a pathological bprm->p. */ + if (bprm->p < limit) + return -E2BIG; + bprm->argmin = bprm->p - limit; +#endif + return 0; +} +static inline bool bprm_hit_stack_limit(struct linux_binprm *bprm) +{ +#ifdef CONFIG_MMU + return bprm->p < bprm->argmin; +#else + return false; +#endif +} + +/* + * Calculate bprm->argmin from: + * - _STK_LIM + * - ARG_MAX + * - bprm->rlim_stack.rlim_cur + * - bprm->argc + * - bprm->envc + * - bprm->p + */ static int bprm_stack_limits(struct linux_binprm *bprm) { unsigned long limit, ptr_size; @@ -491,6 +532,9 @@ static int bprm_stack_limits(struct linux_binprm *bprm) * of argument strings even with small stacks */ limit = max_t(unsigned long, limit, ARG_MAX); + /* Reject totally pathological counts. */ + if (bprm->argc < 0 || bprm->envc < 0) + return -E2BIG; /* * We must account for the size of all the argv and envp pointers to * the argv and envp strings, since they will also take up space in @@ -504,13 +548,14 @@ static int bprm_stack_limits(struct linux_binprm *bprm) * argc can never be 0, to keep them from walking envp by accident. * See do_execveat_common(). */ - ptr_size = (max(bprm->argc, 1) + bprm->envc) * sizeof(void *); + if (check_add_overflow(max(bprm->argc, 1), bprm->envc, &ptr_size) || + check_mul_overflow(ptr_size, sizeof(void *), &ptr_size)) + return -E2BIG; if (limit <= ptr_size) return -E2BIG; limit -= ptr_size; - bprm->argmin = bprm->p - limit; - return 0; + return bprm_set_stack_limit(bprm, limit); } /* @@ -548,10 +593,8 @@ static int copy_strings(int argc, struct user_arg_ptr argv, pos = bprm->p; str += len; bprm->p -= len; -#ifdef CONFIG_MMU - if (bprm->p < bprm->argmin) + if (bprm_hit_stack_limit(bprm)) goto out; -#endif while (len > 0) { int offset, bytes_to_copy; @@ -626,7 +669,7 @@ int copy_string_kernel(const char *arg, struct linux_binprm *bprm) /* We're going to work our way backwards. */ arg += len; bprm->p -= len; - if (IS_ENABLED(CONFIG_MMU) && bprm->p < bprm->argmin) + if (bprm_hit_stack_limit(bprm)) return -E2BIG; while (len > 0) { @@ -667,80 +710,6 @@ static int copy_strings_kernel(int argc, const char *const *argv, #ifdef CONFIG_MMU /* - * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX. Once - * the binfmt code determines where the new stack should reside, we shift it to - * its final location. The process proceeds as follows: - * - * 1) Use shift to calculate the new vma endpoints. - * 2) Extend vma to cover both the old and new ranges. This ensures the - * arguments passed to subsequent functions are consistent. - * 3) Move vma's page tables to the new range. - * 4) Free up any cleared pgd range. - * 5) Shrink the vma to cover only the new range. - */ -static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) -{ - struct mm_struct *mm = vma->vm_mm; - unsigned long old_start = vma->vm_start; - unsigned long old_end = vma->vm_end; - unsigned long length = old_end - old_start; - unsigned long new_start = old_start - shift; - unsigned long new_end = old_end - shift; - VMA_ITERATOR(vmi, mm, new_start); - struct vm_area_struct *next; - struct mmu_gather tlb; - - BUG_ON(new_start > new_end); - - /* - * ensure there are no vmas between where we want to go - * and where we are - */ - if (vma != vma_next(&vmi)) - return -EFAULT; - - vma_iter_prev_range(&vmi); - /* - * cover the whole range: [new_start, old_end) - */ - if (vma_expand(&vmi, vma, new_start, old_end, vma->vm_pgoff, NULL)) - return -ENOMEM; - - /* - * move the page tables downwards, on failure we rely on - * process cleanup to remove whatever mess we made. - */ - if (length != move_page_tables(vma, old_start, - vma, new_start, length, false)) - return -ENOMEM; - - lru_add_drain(); - tlb_gather_mmu(&tlb, mm); - next = vma_next(&vmi); - if (new_end > old_start) { - /* - * when the old and new regions overlap clear from new_end. - */ - free_pgd_range(&tlb, new_end, old_end, new_end, - next ? next->vm_start : USER_PGTABLES_CEILING); - } else { - /* - * otherwise, clean from old_start; this is done to not touch - * the address space in [new_end, old_start) some architectures - * have constraints on va-space that make this illegal (IA64) - - * for the others its just a little faster. - */ - free_pgd_range(&tlb, old_start, old_end, new_end, - next ? next->vm_start : USER_PGTABLES_CEILING); - } - tlb_finish_mmu(&tlb); - - vma_prev(&vmi); - /* Shrink the vma to just the new range */ - return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff); -} - -/* * Finalizes the stack vm_area_struct. The flags and permissions are updated, * the stack is optionally relocated, and some extra space is added. */ @@ -833,7 +802,12 @@ int setup_arg_pages(struct linux_binprm *bprm, /* Move stack pages down in memory. */ if (stack_shift) { - ret = shift_arg_pages(vma, stack_shift); + /* + * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX. Once + * the binfmt code determines where the new stack should reside, we shift it to + * its final location. + */ + ret = relocate_vma_down(vma, stack_shift); if (ret) goto out_unlock; } @@ -903,10 +877,14 @@ EXPORT_SYMBOL(transfer_args_to_stack); #endif /* CONFIG_MMU */ +/* + * On success, caller must call do_close_execat() on the returned + * struct file to close it. + */ static struct file *do_open_execat(int fd, struct filename *name, int flags) { - struct file *file; int err; + struct file *file __free(fput) = NULL; struct open_flags open_exec_flags = { .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, .acc_mode = MAY_EXEC, @@ -930,22 +908,28 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags) * 633fb6ac3980 ("exec: move S_ISREG() check earlier"). Since then it is * an invariant that all non-regular files error out before we get here. */ - err = -EACCES; if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode)) || path_noexec(&file->f_path)) - goto exit; + return ERR_PTR(-EACCES); err = deny_write_access(file); if (err) - goto exit; + return ERR_PTR(err); - return file; - -exit: - fput(file); - return ERR_PTR(err); + return no_free_ptr(file); } +/** + * open_exec - Open a path name for execution + * + * @name: path name to open with the intent of executing it. + * + * Returns ERR_PTR on failure or allocated struct file on success. + * + * As this is a wrapper for the internal do_open_execat(), callers + * must call allow_write_access() before fput() on release. Also see + * do_close_execat(). + */ struct file *open_exec(const char *name) { struct filename *filename = getname_kernel(name); @@ -985,8 +969,6 @@ static int exec_mmap(struct mm_struct *mm) tsk = current; old_mm = current->mm; exec_mm_release(tsk, old_mm); - if (old_mm) - sync_mm_rss(old_mm); ret = down_write_killable(&tsk->signal->exec_update_lock); if (ret) @@ -1143,7 +1125,6 @@ static int de_thread(struct task_struct *tsk) BUG_ON(leader->exit_state != EXIT_ZOMBIE); leader->exit_state = EXIT_DEAD; - /* * We are going to release_task()->ptrace_unlink() silently, * the tracer can sleep in do_wait(). EXIT_DEAD guarantees @@ -1253,17 +1234,24 @@ int begin_new_exec(struct linux_binprm * bprm) return retval; /* - * Ensure all future errors are fatal. + * This tracepoint marks the point before flushing the old exec where + * the current task is still unchanged, but errors are fatal (point of + * no return). The later "sched_process_exec" tracepoint is called after + * the current task has successfully switched to the new exec. */ - bprm->point_of_no_return = true; + trace_sched_prepare_exec(current, bprm); /* - * Make this the only thread in the thread group. + * Ensure all future errors are fatal. */ + bprm->point_of_no_return = true; + + /* Make this the only thread in the thread group */ retval = de_thread(me); if (retval) goto out; - + /* see the comment in check_unsafe_exec() */ + current->fs->in_exec = 0; /* * Cancel any io_uring activity across execve */ @@ -1362,7 +1350,28 @@ int begin_new_exec(struct linux_binprm * bprm) set_dumpable(current->mm, SUID_DUMP_USER); perf_event_exec(); - __set_task_comm(me, kbasename(bprm->filename), true); + + /* + * If the original filename was empty, alloc_bprm() made up a path + * that will probably not be useful to admins running ps or similar. + * Let's fix it up to be something reasonable. + */ + if (bprm->comm_from_dentry) { + /* + * Hold RCU lock to keep the name from being freed behind our back. + * Use acquire semantics to make sure the terminating NUL from + * __d_alloc() is seen. + * + * Note, we're deliberately sloppy here. We don't need to care about + * detecting a concurrent rename and just want a terminated name. + */ + rcu_read_lock(); + __set_task_comm(me, smp_load_acquire(&bprm->file->f_path.dentry->d_name.name), + true); + rcu_read_unlock(); + } else { + __set_task_comm(me, kbasename(bprm->filename), true); + } /* An exec changes our domain. We are no longer part of the thread group */ @@ -1487,6 +1496,15 @@ static int prepare_bprm_creds(struct linux_binprm *bprm) return -ENOMEM; } +/* Matches do_open_execat() */ +static void do_close_execat(struct file *file) +{ + if (!file) + return; + allow_write_access(file); + fput(file); +} + static void free_bprm(struct linux_binprm *bprm) { if (bprm->mm) { @@ -1495,13 +1513,12 @@ static void free_bprm(struct linux_binprm *bprm) } free_arg_pages(bprm); if (bprm->cred) { + /* in case exec fails before de_thread() succeeds */ + current->fs->in_exec = 0; mutex_unlock(¤t->signal->cred_guard_mutex); abort_creds(bprm->cred); } - if (bprm->file) { - allow_write_access(bprm->file); - fput(bprm->file); - } + do_close_execat(bprm->file); if (bprm->executable) fput(bprm->executable); /* If a binfmt changed the interp, free it. */ @@ -1511,36 +1528,59 @@ static void free_bprm(struct linux_binprm *bprm) kfree(bprm); } -static struct linux_binprm *alloc_bprm(int fd, struct filename *filename) +static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int flags) { - struct linux_binprm *bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); + struct linux_binprm *bprm; + struct file *file; int retval = -ENOMEM; - if (!bprm) - goto out; + + file = do_open_execat(fd, filename, flags); + if (IS_ERR(file)) + return ERR_CAST(file); + + bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); + if (!bprm) { + do_close_execat(file); + return ERR_PTR(-ENOMEM); + } + + bprm->file = file; if (fd == AT_FDCWD || filename->name[0] == '/') { bprm->filename = filename->name; } else { - if (filename->name[0] == '\0') + if (filename->name[0] == '\0') { bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd); - else + bprm->comm_from_dentry = 1; + } else { bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s", fd, filename->name); + } if (!bprm->fdpath) goto out_free; + /* + * Record that a name derived from an O_CLOEXEC fd will be + * inaccessible after exec. This allows the code in exec to + * choose to fail when the executable is not mmaped into the + * interpreter and an open file descriptor is not passed to + * the interpreter. This makes for a better user experience + * than having the interpreter start and then immediately fail + * when it finds the executable is inaccessible. + */ + if (get_close_on_exec(fd)) + bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE; + bprm->filename = bprm->fdpath; } bprm->interp = bprm->filename; retval = bprm_mm_init(bprm); - if (retval) - goto out_free; - return bprm; + if (!retval) + return bprm; out_free: free_bprm(bprm); -out: return ERR_PTR(retval); } @@ -1581,17 +1621,21 @@ static void check_unsafe_exec(struct linux_binprm *bprm) * suid exec because the differently privileged task * will be able to manipulate the current directory, etc. * It would be nice to force an unshare instead... + * + * Otherwise we set fs->in_exec = 1 to deny clone(CLONE_FS) + * from another sub-thread until de_thread() succeeds, this + * state is protected by cred_guard_mutex we hold. */ - t = p; n_fs = 1; spin_lock(&p->fs->lock); rcu_read_lock(); - while_each_thread(p, t) { + for_other_threads(p, t) { if (t->fs == p->fs) n_fs++; } rcu_read_unlock(); + /* "users" and "in_exec" locked for copy_fs() */ if (p->fs->users > n_fs) bprm->unsafe |= LSM_UNSAFE_SHARE; else @@ -1684,7 +1728,6 @@ static int prepare_binprm(struct linux_binprm *bprm) */ int remove_arg_zero(struct linux_binprm *bprm) { - int ret = 0; unsigned long offset; char *kaddr; struct page *page; @@ -1695,10 +1738,8 @@ int remove_arg_zero(struct linux_binprm *bprm) do { offset = bprm->p & ~PAGE_MASK; page = get_arg_page(bprm, bprm->p, 0); - if (!page) { - ret = -EFAULT; - goto out; - } + if (!page) + return -EFAULT; kaddr = kmap_local_page(page); for (; offset < PAGE_SIZE && kaddr[offset]; @@ -1711,10 +1752,8 @@ int remove_arg_zero(struct linux_binprm *bprm) bprm->p++; bprm->argc--; - ret = 0; -out: - return ret; + return 0; } EXPORT_SYMBOL(remove_arg_zero); @@ -1814,13 +1853,8 @@ static int exec_binprm(struct linux_binprm *bprm) return 0; } -/* - * sys_execve() executes a new program. - */ -static int bprm_execve(struct linux_binprm *bprm, - int fd, struct filename *filename, int flags) +static int bprm_execve(struct linux_binprm *bprm) { - struct file *file; int retval; retval = prepare_bprm_creds(bprm); @@ -1836,26 +1870,8 @@ static int bprm_execve(struct linux_binprm *bprm, current->in_execve = 1; sched_mm_cid_before_execve(current); - file = do_open_execat(fd, filename, flags); - retval = PTR_ERR(file); - if (IS_ERR(file)) - goto out_unmark; - sched_exec(); - bprm->file = file; - /* - * Record that a name derived from an O_CLOEXEC fd will be - * inaccessible after exec. This allows the code in exec to - * choose to fail when the executable is not mmaped into the - * interpreter and an open file descriptor is not passed to - * the interpreter. This makes for a better user experience - * than having the interpreter start and then immediately fail - * when it finds the executable is inaccessible. - */ - if (bprm->fdpath && get_close_on_exec(fd)) - bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE; - /* Set the unchanging part of bprm->cred */ retval = security_bprm_creds_for_exec(bprm); if (retval) @@ -1867,7 +1883,6 @@ static int bprm_execve(struct linux_binprm *bprm, sched_mm_cid_after_execve(current); /* execve succeeded */ - current->fs->in_exec = 0; current->in_execve = 0; rseq_execve(current); user_events_execve(current); @@ -1885,9 +1900,7 @@ out: if (bprm->point_of_no_return && !fatal_signal_pending(current)) force_fatal_sig(SIGSEGV); -out_unmark: sched_mm_cid_after_execve(current); - current->fs->in_exec = 0; current->in_execve = 0; return retval; @@ -1920,7 +1933,7 @@ static int do_execveat_common(int fd, struct filename *filename, * further execve() calls fail. */ current->flags &= ~PF_NPROC_EXCEEDED; - bprm = alloc_bprm(fd, filename); + bprm = alloc_bprm(fd, filename, flags); if (IS_ERR(bprm)) { retval = PTR_ERR(bprm); goto out_ret; @@ -1969,7 +1982,7 @@ static int do_execveat_common(int fd, struct filename *filename, bprm->argc = 1; } - retval = bprm_execve(bprm, fd, filename, flags); + retval = bprm_execve(bprm); out_free: free_bprm(bprm); @@ -1994,7 +2007,7 @@ int kernel_execve(const char *kernel_filename, if (IS_ERR(filename)) return PTR_ERR(filename); - bprm = alloc_bprm(fd, filename); + bprm = alloc_bprm(fd, filename, 0); if (IS_ERR(bprm)) { retval = PTR_ERR(bprm); goto out_ret; @@ -2029,7 +2042,7 @@ int kernel_execve(const char *kernel_filename, if (retval < 0) goto out_free; - retval = bprm_execve(bprm, fd, filename, 0); + retval = bprm_execve(bprm); out_free: free_bprm(bprm); out_ret: @@ -2155,7 +2168,7 @@ COMPAT_SYSCALL_DEFINE5(execveat, int, fd, #ifdef CONFIG_SYSCTL -static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, +static int proc_dointvec_minmax_coredump(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); @@ -2175,7 +2188,6 @@ static struct ctl_table fs_exec_sysctls[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, - { } }; static int __init init_fs_exec_sysctls(void) @@ -2186,3 +2198,7 @@ static int __init init_fs_exec_sysctls(void) fs_initcall(init_fs_exec_sysctls); #endif /* CONFIG_SYSCTL */ + +#ifdef CONFIG_EXEC_KUNIT_TEST +#include "tests/exec_kunit.c" +#endif |