From 51f39a1f0cea1cacf8c787f652f26dfee9611874 Mon Sep 17 00:00:00 2001 From: David Drysdale Date: Fri, 12 Dec 2014 16:57:29 -0800 Subject: syscalls: implement execveat() system call This patchset adds execveat(2) for x86, and is derived from Meredydd Luff's patch from Sept 2012 (https://lkml.org/lkml/2012/9/11/528). The primary aim of adding an execveat syscall is to allow an implementation of fexecve(3) that does not rely on the /proc filesystem, at least for executables (rather than scripts). The current glibc version of fexecve(3) is implemented via /proc, which causes problems in sandboxed or otherwise restricted environments. Given the desire for a /proc-free fexecve() implementation, HPA suggested (https://lkml.org/lkml/2006/7/11/556) that an execveat(2) syscall would be an appropriate generalization. Also, having a new syscall means that it can take a flags argument without back-compatibility concerns. The current implementation just defines the AT_EMPTY_PATH and AT_SYMLINK_NOFOLLOW flags, but other flags could be added in future -- for example, flags for new namespaces (as suggested at https://lkml.org/lkml/2006/7/11/474). Related history: - https://lkml.org/lkml/2006/12/27/123 is an example of someone realizing that fexecve() is likely to fail in a chroot environment. - http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=514043 covered documenting the /proc requirement of fexecve(3) in its manpage, to "prevent other people from wasting their time". - https://bugzilla.redhat.com/show_bug.cgi?id=241609 described a problem where a process that did setuid() could not fexecve() because it no longer had access to /proc/self/fd; this has since been fixed. This patch (of 4): Add a new execveat(2) system call. execveat() is to execve() as openat() is to open(): it takes a file descriptor that refers to a directory, and resolves the filename relative to that. In addition, if the filename is empty and AT_EMPTY_PATH is specified, execveat() executes the file to which the file descriptor refers. This replicates the functionality of fexecve(), which is a system call in other UNIXen, but in Linux glibc it depends on opening "/proc/self/fd/" (and so relies on /proc being mounted). The filename fed to the executed program as argv[0] (or the name of the script fed to a script interpreter) will be of the form "/dev/fd/" (for an empty filename) or "/dev/fd//", effectively reflecting how the executable was found. This does however mean that execution of a script in a /proc-less environment won't work; also, script execution via an O_CLOEXEC file descriptor fails (as the file will not be accessible after exec). Based on patches by Meredydd Luff. Signed-off-by: David Drysdale Cc: Meredydd Luff Cc: Shuah Khan Cc: "Eric W. Biederman" Cc: Andy Lutomirski Cc: Alexander Viro Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Kees Cook Cc: Arnd Bergmann Cc: Rich Felker Cc: Christoph Hellwig Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/binfmts.h | 4 ++++ include/linux/compat.h | 3 +++ include/linux/fs.h | 1 + include/linux/sched.h | 4 ++++ include/linux/syscalls.h | 5 +++++ include/uapi/asm-generic/unistd.h | 4 +++- 6 files changed, 20 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 61f29e5ea840..576e4639ca60 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -53,6 +53,10 @@ struct linux_binprm { #define BINPRM_FLAGS_EXECFD_BIT 1 #define BINPRM_FLAGS_EXECFD (1 << BINPRM_FLAGS_EXECFD_BIT) +/* filename of the binary will be inaccessible after exec */ +#define BINPRM_FLAGS_PATH_INACCESSIBLE_BIT 2 +#define BINPRM_FLAGS_PATH_INACCESSIBLE (1 << BINPRM_FLAGS_PATH_INACCESSIBLE_BIT) + /* Function parameter for binfmt->coredump */ struct coredump_params { const siginfo_t *siginfo; diff --git a/include/linux/compat.h b/include/linux/compat.h index e6494261eaff..7450ca2ac1fc 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -357,6 +357,9 @@ asmlinkage long compat_sys_lseek(unsigned int, compat_off_t, unsigned int); asmlinkage long compat_sys_execve(const char __user *filename, const compat_uptr_t __user *argv, const compat_uptr_t __user *envp); +asmlinkage long compat_sys_execveat(int dfd, const char __user *filename, + const compat_uptr_t __user *argv, + const compat_uptr_t __user *envp, int flags); asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, diff --git a/include/linux/fs.h b/include/linux/fs.h index 1d1838de6882..4193a0bd99b0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2096,6 +2096,7 @@ extern int vfs_open(const struct path *, struct file *, const struct cred *); extern struct file * dentry_open(const struct path *, int, const struct cred *); extern int filp_close(struct file *, fl_owner_t id); +extern struct filename *getname_flags(const char __user *, int, int *); extern struct filename *getname(const char __user *); extern struct filename *getname_kernel(const char *); diff --git a/include/linux/sched.h b/include/linux/sched.h index 4cfdbcf8cf56..8db31ef98d2f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2485,6 +2485,10 @@ extern void do_group_exit(int); extern int do_execve(struct filename *, const char __user * const __user *, const char __user * const __user *); +extern int do_execveat(int, struct filename *, + const char __user * const __user *, + const char __user * const __user *, + int); extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *); struct task_struct *fork_idle(int); extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index c9afdc7a7f84..85893d744901 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -877,4 +877,9 @@ asmlinkage long sys_seccomp(unsigned int op, unsigned int flags, asmlinkage long sys_getrandom(char __user *buf, size_t count, unsigned int flags); asmlinkage long sys_bpf(int cmd, union bpf_attr *attr, unsigned int size); + +asmlinkage long sys_execveat(int dfd, const char __user *filename, + const char __user *const __user *argv, + const char __user *const __user *envp, int flags); + #endif diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 22749c134117..e016bd9b1a04 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -707,9 +707,11 @@ __SYSCALL(__NR_getrandom, sys_getrandom) __SYSCALL(__NR_memfd_create, sys_memfd_create) #define __NR_bpf 280 __SYSCALL(__NR_bpf, sys_bpf) +#define __NR_execveat 281 +__SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat) #undef __NR_syscalls -#define __NR_syscalls 281 +#define __NR_syscalls 282 /* * All syscalls below here should go away really, -- cgit v1.2.3