summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/prctl/seccomp_filter.txt163
-rw-r--r--Documentation/security/Yama.txt10
-rw-r--r--MAINTAINERS3
-rw-r--r--arch/Kconfig23
-rw-r--r--arch/microblaze/kernel/ptrace.c2
-rw-r--r--arch/mips/kernel/ptrace.c2
-rw-r--r--arch/powerpc/kernel/ptrace.c2
-rw-r--r--arch/s390/kernel/ptrace.c2
-rw-r--r--arch/sh/kernel/ptrace_32.c2
-rw-r--r--arch/sh/kernel/ptrace_64.c2
-rw-r--r--arch/sparc/kernel/ptrace_64.c2
-rw-r--r--arch/x86/Kconfig1
-rw-r--r--arch/x86/ia32/ia32_signal.c4
-rw-r--r--arch/x86/include/asm/ia32.h6
-rw-r--r--arch/x86/include/asm/syscall.h27
-rw-r--r--arch/x86/kernel/ptrace.c7
-rw-r--r--fs/exec.c10
-rw-r--r--include/asm-generic/siginfo.h22
-rw-r--r--include/asm-generic/syscall.h14
-rw-r--r--include/linux/Kbuild1
-rw-r--r--include/linux/audit.h8
-rw-r--r--include/linux/filter.h12
-rw-r--r--include/linux/prctl.h15
-rw-r--r--include/linux/ptrace.h5
-rw-r--r--include/linux/sched.h4
-rw-r--r--include/linux/seccomp.h107
-rw-r--r--include/linux/security.h1
-rw-r--r--kernel/auditsc.c8
-rw-r--r--kernel/fork.c3
-rw-r--r--kernel/seccomp.c458
-rw-r--r--kernel/signal.c9
-rw-r--r--kernel/sys.c12
-rw-r--r--net/compat.c8
-rw-r--r--net/core/filter.c6
-rw-r--r--samples/Makefile2
-rw-r--r--samples/seccomp/Makefile32
-rw-r--r--samples/seccomp/bpf-direct.c190
-rw-r--r--samples/seccomp/bpf-fancy.c102
-rw-r--r--samples/seccomp/bpf-helper.c89
-rw-r--r--samples/seccomp/bpf-helper.h238
-rw-r--r--samples/seccomp/dropper.c68
-rw-r--r--security/apparmor/domain.c35
-rw-r--r--security/commoncap.c7
-rw-r--r--security/selinux/hooks.c10
-rw-r--r--security/yama/yama_lsm.c63
45 files changed, 1718 insertions, 79 deletions
diff --git a/Documentation/prctl/seccomp_filter.txt b/Documentation/prctl/seccomp_filter.txt
new file mode 100644
index 000000000000..597c3c581375
--- /dev/null
+++ b/Documentation/prctl/seccomp_filter.txt
@@ -0,0 +1,163 @@
+ SECure COMPuting with filters
+ =============================
+
+Introduction
+------------
+
+A large number of system calls are exposed to every userland process
+with many of them going unused for the entire lifetime of the process.
+As system calls change and mature, bugs are found and eradicated. A
+certain subset of userland applications benefit by having a reduced set
+of available system calls. The resulting set reduces the total kernel
+surface exposed to the application. System call filtering is meant for
+use with those applications.
+
+Seccomp filtering provides a means for a process to specify a filter for
+incoming system calls. The filter is expressed as a Berkeley Packet
+Filter (BPF) program, as with socket filters, except that the data
+operated on is related to the system call being made: system call
+number and the system call arguments. This allows for expressive
+filtering of system calls using a filter program language with a long
+history of being exposed to userland and a straightforward data set.
+
+Additionally, BPF makes it impossible for users of seccomp to fall prey
+to time-of-check-time-of-use (TOCTOU) attacks that are common in system
+call interposition frameworks. BPF programs may not dereference
+pointers which constrains all filters to solely evaluating the system
+call arguments directly.
+
+What it isn't
+-------------
+
+System call filtering isn't a sandbox. It provides a clearly defined
+mechanism for minimizing the exposed kernel surface. It is meant to be
+a tool for sandbox developers to use. Beyond that, policy for logical
+behavior and information flow should be managed with a combination of
+other system hardening techniques and, potentially, an LSM of your
+choosing. Expressive, dynamic filters provide further options down this
+path (avoiding pathological sizes or selecting which of the multiplexed
+system calls in socketcall() is allowed, for instance) which could be
+construed, incorrectly, as a more complete sandboxing solution.
+
+Usage
+-----
+
+An additional seccomp mode is added and is enabled using the same
+prctl(2) call as the strict seccomp. If the architecture has
+CONFIG_HAVE_ARCH_SECCOMP_FILTER, then filters may be added as below:
+
+PR_SET_SECCOMP:
+ Now takes an additional argument which specifies a new filter
+ using a BPF program.
+ The BPF program will be executed over struct seccomp_data
+ reflecting the system call number, arguments, and other
+ metadata. The BPF program must then return one of the
+ acceptable values to inform the kernel which action should be
+ taken.
+
+ Usage:
+ prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, prog);
+
+ The 'prog' argument is a pointer to a struct sock_fprog which
+ will contain the filter program. If the program is invalid, the
+ call will return -1 and set errno to EINVAL.
+
+ If fork/clone and execve are allowed by @prog, any child
+ processes will be constrained to the same filters and system
+ call ABI as the parent.
+
+ Prior to use, the task must call prctl(PR_SET_NO_NEW_PRIVS, 1) or
+ run with CAP_SYS_ADMIN privileges in its namespace. If these are not
+ true, -EACCES will be returned. This requirement ensures that filter
+ programs cannot be applied to child processes with greater privileges
+ than the task that installed them.
+
+ Additionally, if prctl(2) is allowed by the attached filter,
+ additional filters may be layered on which will increase evaluation
+ time, but allow for further decreasing the attack surface during
+ execution of a process.
+
+The above call returns 0 on success and non-zero on error.
+
+Return values
+-------------
+A seccomp filter may return any of the following values. If multiple
+filters exist, the return value for the evaluation of a given system
+call will always use the highest precedent value. (For example,
+SECCOMP_RET_KILL will always take precedence.)
+
+In precedence order, they are:
+
+SECCOMP_RET_KILL:
+ Results in the task exiting immediately without executing the
+ system call. The exit status of the task (status & 0x7f) will
+ be SIGSYS, not SIGKILL.
+
+SECCOMP_RET_TRAP:
+ Results in the kernel sending a SIGSYS signal to the triggering
+ task without executing the system call. The kernel will
+ rollback the register state to just before the system call
+ entry such that a signal handler in the task will be able to
+ inspect the ucontext_t->uc_mcontext registers and emulate
+ system call success or failure upon return from the signal
+ handler.
+
+ The SECCOMP_RET_DATA portion of the return value will be passed
+ as si_errno.
+
+ SIGSYS triggered by seccomp will have a si_code of SYS_SECCOMP.
+
+SECCOMP_RET_ERRNO:
+ Results in the lower 16-bits of the return value being passed
+ to userland as the errno without executing the system call.
+
+SECCOMP_RET_TRACE:
+ When returned, this value will cause the kernel to attempt to
+ notify a ptrace()-based tracer prior to executing the system
+ call. If there is no tracer present, -ENOSYS is returned to
+ userland and the system call is not executed.
+
+ A tracer will be notified if it requests PTRACE_O_TRACESECCOMP
+ using ptrace(PTRACE_SETOPTIONS). The tracer will be notified
+ of a PTRACE_EVENT_SECCOMP and the SECCOMP_RET_DATA portion of
+ the BPF program return value will be available to the tracer
+ via PTRACE_GETEVENTMSG.
+
+SECCOMP_RET_ALLOW:
+ Results in the system call being executed.
+
+If multiple filters exist, the return value for the evaluation of a
+given system call will always use the highest precedent value.
+
+Precedence is only determined using the SECCOMP_RET_ACTION mask. When
+multiple filters return values of the same precedence, only the
+SECCOMP_RET_DATA from the most recently installed filter will be
+returned.
+
+Pitfalls
+--------
+
+The biggest pitfall to avoid during use is filtering on system call
+number without checking the architecture value. Why? On any
+architecture that supports multiple system call invocation conventions,
+the system call numbers may vary based on the specific invocation. If
+the numbers in the different calling conventions overlap, then checks in
+the filters may be abused. Always check the arch value!
+
+Example
+-------
+
+The samples/seccomp/ directory contains both an x86-specific example
+and a more generic example of a higher level macro interface for BPF
+program generation.
+
+
+
+Adding architecture support
+-----------------------
+
+See arch/Kconfig for the authoritative requirements. In general, if an
+architecture supports both ptrace_event and seccomp, it will be able to
+support seccomp filter with minor fixup: SIGSYS support and seccomp return
+value checking. Then it must just add CONFIG_HAVE_ARCH_SECCOMP_FILTER
+to its arch-specific Kconfig.
diff --git a/Documentation/security/Yama.txt b/Documentation/security/Yama.txt
index a9511f179069..e369de2d48cd 100644
--- a/Documentation/security/Yama.txt
+++ b/Documentation/security/Yama.txt
@@ -34,7 +34,7 @@ parent to a child process (i.e. direct "gdb EXE" and "strace EXE" still
work), or with CAP_SYS_PTRACE (i.e. "gdb --pid=PID", and "strace -p PID"
still work as root).
-For software that has defined application-specific relationships
+In mode 1, software that has defined application-specific relationships
between a debugging process and its inferior (crash handlers, etc),
prctl(PR_SET_PTRACER, pid, ...) can be used. An inferior can declare which
other process (and its descendents) are allowed to call PTRACE_ATTACH
@@ -46,6 +46,8 @@ restrictions, it can call prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, ...)
so that any otherwise allowed process (even those in external pid namespaces)
may attach.
+These restrictions do not change how ptrace via PTRACE_TRACEME operates.
+
The sysctl settings are:
0 - classic ptrace permissions: a process can PTRACE_ATTACH to any other
@@ -60,6 +62,12 @@ The sysctl settings are:
inferior can call prctl(PR_SET_PTRACER, debugger, ...) to declare
an allowed debugger PID to call PTRACE_ATTACH on the inferior.
+2 - admin-only attach: only processes with CAP_SYS_PTRACE may use ptrace
+ with PTRACE_ATTACH.
+
+3 - no attach: no processes may use ptrace with PTRACE_ATTACH. Once set,
+ this sysctl cannot be changed to a lower value.
+
The original children-only logic was based on the restrictions in grsecurity.
==============================================================
diff --git a/MAINTAINERS b/MAINTAINERS
index bb76fc42fc42..d7ca204a0f8e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1731,6 +1731,7 @@ S: Supported
F: include/linux/capability.h
F: security/capability.c
F: security/commoncap.c
+F: kernel/capability.c
CELL BROADBAND ENGINE ARCHITECTURE
M: Arnd Bergmann <arnd@arndb.de>
@@ -5961,7 +5962,7 @@ SECURITY SUBSYSTEM
M: James Morris <james.l.morris@oracle.com>
L: linux-security-module@vger.kernel.org (suggested Cc:)
T: git git://git.kernel.org/pub/scm/linux/kernel/git/jmorris/linux-security.git
-W: http://security.wiki.kernel.org/
+W: http://kernsec.org/
S: Supported
F: security/
diff --git a/arch/Kconfig b/arch/Kconfig
index 684eb5af439d..c024b3ed6675 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -216,4 +216,27 @@ config HAVE_CMPXCHG_DOUBLE
config ARCH_WANT_OLD_COMPAT_IPC
bool
+config HAVE_ARCH_SECCOMP_FILTER
+ bool
+ help
+ An arch should select this symbol if it provides all of these things:
+ - syscall_get_arch()
+ - syscall_get_arguments()
+ - syscall_rollback()
+ - syscall_set_return_value()
+ - SIGSYS siginfo_t support
+ - secure_computing is called from a ptrace_event()-safe context
+ - secure_computing return value is checked and a return value of -1
+ results in the system call being skipped immediately.
+
+config SECCOMP_FILTER
+ def_bool y
+ depends on HAVE_ARCH_SECCOMP_FILTER && SECCOMP && NET
+ help
+ Enable tasks to build secure computing environments defined
+ in terms of Berkeley Packet Filter programs which implement
+ task-defined system call filtering polices.
+
+ See Documentation/prctl/seccomp_filter.txt for details.
+
source "kernel/gcov/Kconfig"
diff --git a/arch/microblaze/kernel/ptrace.c b/arch/microblaze/kernel/ptrace.c
index 6eb2aa927d89..ab1b9db661f3 100644
--- a/arch/microblaze/kernel/ptrace.c
+++ b/arch/microblaze/kernel/ptrace.c
@@ -136,7 +136,7 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
{
long ret = 0;
- secure_computing(regs->r12);
+ secure_computing_strict(regs->r12);
if (test_thread_flag(TIF_SYSCALL_TRACE) &&
tracehook_report_syscall_entry(regs))
diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c
index 7c24c2973c6d..4812c6d916e4 100644
--- a/arch/mips/kernel/ptrace.c
+++ b/arch/mips/kernel/ptrace.c
@@ -535,7 +535,7 @@ static inline int audit_arch(void)
asmlinkage void syscall_trace_enter(struct pt_regs *regs)
{
/* do the secure computing check first */
- secure_computing(regs->regs[2]);
+ secure_computing_strict(regs->regs[2]);
if (!(current->ptrace & PT_PTRACED))
goto out;
diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index 8d8e028893be..dd5e214cdf21 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -1710,7 +1710,7 @@ long do_syscall_trace_enter(struct pt_regs *regs)
{
long ret = 0;
- secure_computing(regs->gpr[0]);
+ secure_computing_strict(regs->gpr[0]);
if (test_thread_flag(TIF_SYSCALL_TRACE) &&
tracehook_report_syscall_entry(regs))
diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index 02f300fbf070..4993e689b2c2 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -719,7 +719,7 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
long ret = 0;
/* Do the secure computing check first. */
- secure_computing(regs->gprs[2]);
+ secure_computing_strict(regs->gprs[2]);
/*
* The sysc_tracesys code in entry.S stored the system
diff --git a/arch/sh/kernel/ptrace_32.c b/arch/sh/kernel/ptrace_32.c
index 9698671444e6..81f999a672f6 100644
--- a/arch/sh/kernel/ptrace_32.c
+++ b/arch/sh/kernel/ptrace_32.c
@@ -503,7 +503,7 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
{
long ret = 0;
- secure_computing(regs->regs[0]);
+ secure_computing_strict(regs->regs[0]);
if (test_thread_flag(TIF_SYSCALL_TRACE) &&
tracehook_report_syscall_entry(regs))
diff --git a/arch/sh/kernel/ptrace_64.c b/arch/sh/kernel/ptrace_64.c
index bc81e07dc098..af90339dadcd 100644
--- a/arch/sh/kernel/ptrace_64.c
+++ b/arch/sh/kernel/ptrace_64.c
@@ -522,7 +522,7 @@ asmlinkage long long do_syscall_trace_enter(struct pt_regs *regs)
{
long long ret = 0;
- secure_computing(regs->regs[9]);
+ secure_computing_strict(regs->regs[9]);
if (test_thread_flag(TIF_SYSCALL_TRACE) &&
tracehook_report_syscall_entry(regs))
diff --git a/arch/sparc/kernel/ptrace_64.c b/arch/sparc/kernel/ptrace_64.c
index 6f97c0767995..484dabac7045 100644
--- a/arch/sparc/kernel/ptrace_64.c
+++ b/arch/sparc/kernel/ptrace_64.c
@@ -1062,7 +1062,7 @@ asmlinkage int syscall_trace_enter(struct pt_regs *regs)
int ret = 0;
/* do the secure computing check first */
- secure_computing(regs->u_regs[UREG_G1]);
+ secure_computing_strict(regs->u_regs[UREG_G1]);
if (test_thread_flag(TIF_SYSCALL_TRACE))
ret = tracehook_report_syscall_entry(regs);
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 1d14cc6b79ad..3a41c4424a0a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -82,6 +82,7 @@ config X86
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select GENERIC_IOMAP
select DCACHE_WORD_ACCESS if !DEBUG_PAGEALLOC
+ select HAVE_ARCH_SECCOMP_FILTER
config INSTRUCTION_DECODER
def_bool (KPROBES || PERF_EVENTS)
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index a69245ba27e3..0b3f2354f6aa 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -67,6 +67,10 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)
switch (from->si_code >> 16) {
case __SI_FAULT >> 16:
break;
+ case __SI_SYS >> 16:
+ put_user_ex(from->si_syscall, &to->si_syscall);
+ put_user_ex(from->si_arch, &to->si_arch);
+ break;
case __SI_CHLD >> 16:
if (ia32) {
put_user_ex(from->si_utime, &to->si_utime);
diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h
index ee52760549f0..b04cbdb138cd 100644
--- a/arch/x86/include/asm/ia32.h
+++ b/arch/x86/include/asm/ia32.h
@@ -144,6 +144,12 @@ typedef struct compat_siginfo {
int _band; /* POLL_IN, POLL_OUT, POLL_MSG */
int _fd;
} _sigpoll;
+
+ struct {
+ unsigned int _call_addr; /* calling insn */
+ int _syscall; /* triggering system call number */
+ unsigned int _arch; /* AUDIT_ARCH_* of syscall */
+ } _sigsys;
} _sifields;
} compat_siginfo_t;
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index 386b78686c4d..1ace47b62592 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -13,9 +13,11 @@
#ifndef _ASM_X86_SYSCALL_H
#define _ASM_X86_SYSCALL_H
+#include <linux/audit.h>
#include <linux/sched.h>
#include <linux/err.h>
#include <asm/asm-offsets.h> /* For NR_syscalls */
+#include <asm/thread_info.h> /* for TS_COMPAT */
#include <asm/unistd.h>
extern const unsigned long sys_call_table[];
@@ -88,6 +90,12 @@ static inline void syscall_set_arguments(struct task_struct *task,
memcpy(&regs->bx + i, args, n * sizeof(args[0]));
}
+static inline int syscall_get_arch(struct task_struct *task,
+ struct pt_regs *regs)
+{
+ return AUDIT_ARCH_I386;
+}
+
#else /* CONFIG_X86_64 */
static inline void syscall_get_arguments(struct task_struct *task,
@@ -212,6 +220,25 @@ static inline void syscall_set_arguments(struct task_struct *task,
}
}
+static inline int syscall_get_arch(struct task_struct *task,
+ struct pt_regs *regs)
+{
+#ifdef CONFIG_IA32_EMULATION
+ /*
+ * TS_COMPAT is set for 32-bit syscall entry and then
+ * remains set until we return to user mode.
+ *
+ * TIF_IA32 tasks should always have TS_COMPAT set at
+ * system call time.
+ *
+ * x32 tasks should be considered AUDIT_ARCH_X86_64.
+ */
+ if (task_thread_info(task)->status & TS_COMPAT)
+ return AUDIT_ARCH_I386;
+#endif
+ /* Both x32 and x86_64 are considered "64-bit". */
+ return AUDIT_ARCH_X86_64;
+}
#endif /* CONFIG_X86_32 */
#endif /* _ASM_X86_SYSCALL_H */
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 685845cf16e0..13b1990c7c58 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1480,7 +1480,11 @@ long syscall_trace_enter(struct pt_regs *regs)
regs->flags |= X86_EFLAGS_TF;
/* do the secure computing check first */
- secure_computing(regs->orig_ax);
+ if (secure_computing(regs->orig_ax)) {
+ /* seccomp failures shouldn't expose any additional code. */
+ ret = -1L;
+ goto out;
+ }
if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
ret = -1L;
@@ -1505,6 +1509,7 @@ long syscall_trace_enter(struct pt_regs *regs)
regs->dx, regs->r10);
#endif
+out:
return ret ?: regs->orig_ax;
}
diff --git a/fs/exec.c b/fs/exec.c
index b1fd2025e59a..d038968b54b4 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1245,6 +1245,13 @@ static int check_unsafe_exec(struct linux_binprm *bprm)
bprm->unsafe |= LSM_UNSAFE_PTRACE;
}
+ /*
+ * This isn't strictly necessary, but it makes it harder for LSMs to
+ * mess up.
+ */
+ if (current->no_new_privs)
+ bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS;
+
n_fs = 1;
spin_lock(&p->fs->lock);
rcu_read_lock();
@@ -1288,7 +1295,8 @@ int prepare_binprm(struct linux_binprm *bprm)
bprm->cred->euid = current_euid();
bprm->cred->egid = current_egid();
- if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)) {
+ if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) &&
+ !current->no_new_privs) {
/* Set-uid? */
if (mode & S_ISUID) {
bprm->per_clear |= PER_CLEAR_ON_SETID;
diff --git a/include/asm-generic/siginfo.h b/include/asm-generic/siginfo.h
index 5e5e3865f1ed..8ed67779fc09 100644
--- a/include/asm-generic/siginfo.h
+++ b/include/asm-generic/siginfo.h
@@ -98,9 +98,18 @@ typedef struct siginfo {
__ARCH_SI_BAND_T _band; /* POLL_IN, POLL_OUT, POLL_MSG */
int _fd;
} _sigpoll;
+
+ /* SIGSYS */
+ struct {
+ void __user *_call_addr; /* calling user insn */
+ int _syscall; /* triggering system call number */
+ unsigned int _arch; /* AUDIT_ARCH_* of syscall */
+ } _sigsys;
} _sifields;
} __ARCH_SI_ATTRIBUTES siginfo_t;
+/* If the arch shares siginfo, then it has SIGSYS. */
+#define __ARCH_SIGSYS
#endif
/*
@@ -124,6 +133,11 @@ typedef struct siginfo {
#define si_addr_lsb _sifields._sigfault._addr_lsb
#define si_band _sifields._sigpoll._band
#define si_fd _sifields._sigpoll._fd
+#ifdef __ARCH_SIGSYS
+#define si_call_addr _sifields._sigsys._call_addr
+#define si_syscall _sifields._sigsys._syscall
+#define si_arch _sifields._sigsys._arch
+#endif
#ifdef __KERNEL__
#define __SI_MASK 0xffff0000u
@@ -134,6 +148,7 @@ typedef struct siginfo {
#define __SI_CHLD (4 << 16)
#define __SI_RT (5 << 16)
#define __SI_MESGQ (6 << 16)
+#define __SI_SYS (7 << 16)
#define __SI_CODE(T,N) ((T) | ((N) & 0xffff))
#else
#define __SI_KILL 0
@@ -143,6 +158,7 @@ typedef struct siginfo {
#define __SI_CHLD 0
#define __SI_RT 0
#define __SI_MESGQ 0
+#define __SI_SYS 0
#define __SI_CODE(T,N) (N)
#endif
@@ -240,6 +256,12 @@ typedef struct siginfo {
#define NSIGPOLL 6
/*
+ * SIGSYS si_codes
+ */
+#define SYS_SECCOMP (__SI_SYS|1) /* seccomp triggered */
+#define NSIGSYS 1
+
+/*
* sigevent definitions
*
* It seems likely that SIGEV_THREAD will have to be handled from
diff --git a/include/asm-generic/syscall.h b/include/asm-generic/syscall.h
index 5c122ae6bfa6..5b09392db673 100644
--- a/include/asm-generic/syscall.h
+++ b/include/asm-generic/syscall.h
@@ -142,4 +142,18 @@ void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs,
unsigned int i, unsigned int n,
const unsigned long *args);
+/**
+ * syscall_get_arch - return the AUDIT_ARCH for the current system call
+ * @task: task of interest, must be in system call entry tracing
+ * @regs: task_pt_regs() of @task
+ *
+ * Returns the AUDIT_ARCH_* based on the system call convention in use.
+ *
+ * It's only valid to call this when @task is stopped on entry to a system
+ * call, due to %TIF_SYSCALL_TRACE, %TIF_SYSCALL_AUDIT, or %TIF_SECCOMP.
+ *
+ * Architectures which permit CONFIG_HAVE_ARCH_SECCOMP_FILTER must
+ * provide an implementation of this.
+ */
+int syscall_get_arch(struct task_struct *task, struct pt_regs *regs);
#endif /* _ASM_SYSCALL_H */
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index 3c9b616c834a..5c93d6c5d591 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -332,6 +332,7 @@ header-y += scc.h
header-y += sched.h
header-y += screen_info.h
header-y += sdla.h
+header-y += seccomp.h
header-y += securebits.h
header-y += selinux_netlink.h
header-y += sem.h
diff --git a/include/linux/audit.h b/include/linux/audit.h
index ed3ef1972496..22f292a917a3 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -463,7 +463,7 @@ extern void audit_putname(const char *name);
extern void __audit_inode(const char *name, const struct dentry *dentry);
extern void __audit_inode_child(const struct dentry *dentry,
const struct inode *parent);
-extern void __audit_seccomp(unsigned long syscall);
+extern void __audit_seccomp(unsigned long syscall, long signr, int code);
extern void __audit_ptrace(struct task_struct *t);
static inline int audit_dummy_context(void)
@@ -508,10 +508,10 @@ static inline void audit_inode_child(const struct dentry *dentry,
}
void audit_core_dumps(long signr);
-static inline void audit_seccomp(unsigned long syscall)
+static inline void audit_seccomp(unsigned long syscall, long signr, int code)
{
if (unlikely(!audit_dummy_context()))
- __audit_seccomp(syscall);
+ __audit_seccomp(syscall, signr, code);
}
static inline void audit_ptrace(struct task_struct *t)
@@ -634,7 +634,7 @@ extern int audit_signals;
#define audit_inode(n,d) do { (void)(d); } while (0)
#define audit_inode_child(i,p) do { ; } while (0)
#define audit_core_dumps(i) do { ; } while (0)
-#define audit_seccomp(i) do { ; } while (0)
+#define audit_seccomp(i,s,c) do { ; } while (0)
#define auditsc_get_stamp(c,t,s) (0)
#define audit_get_loginuid(t) (-1)
#define audit_get_sessionid(t) (-1)
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 8eeb205f298b..f2e53152e835 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -10,6 +10,7 @@
#ifdef __KERNEL__
#include <linux/atomic.h>
+#include <linux/compat.h>
#endif
/*
@@ -132,6 +133,16 @@ struct sock_fprog { /* Required for SO_ATTACH_FILTER. */
#ifdef __KERNEL__
+#ifdef CONFIG_COMPAT
+/*
+ * A struct sock_filter is architecture independent.
+ */
+struct compat_sock_fprog {
+ u16 len;
+ compat_uptr_t filter; /* struct sock_filter * */
+};
+#endif
+
struct sk_buff;
struct sock;
@@ -228,6 +239,7 @@ enum {
BPF_S_ANC_HATYPE,
BPF_S_ANC_RXHASH,
BPF_S_ANC_CPU,
+ BPF_S_ANC_SECCOMP_LD_W,
};
#endif /* __KERNEL__ */
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index e0cfec2490aa..78b76e24cc7e 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -124,4 +124,19 @@
#define PR_SET_CHILD_SUBREAPER 36
#define PR_GET_CHILD_SUBREAPER 37
+/*
+ * If no_new_privs is set, then operations that grant new privileges (i.e.
+ * execve) will either fail or not grant them. This affects suid/sgid,
+ * file capabilities, and LSMs.
+ *
+ * Operations that merely manipulate or drop existing privileges (setresuid,
+ * capset, etc.) will still work. Drop those privileges if you want them gone.
+ *
+ * Changing LSM security domain is considered a new privilege. So, for example,
+ * asking selinux for a specific new context (e.g. with runcon) will result
+ * in execve returning -EPERM.
+ */
+#define PR_SET_NO_NEW_PRIVS 38
+#define PR_GET_NO_NEW_PRIVS 39
+
#endif /* _LINUX_PRCTL_H */
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 5c719627c2aa..597e4fdb97fe 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -58,6 +58,7 @@
#define PTRACE_EVENT_EXEC 4
#define PTRACE_EVENT_VFORK_DONE 5
#define PTRACE_EVENT_EXIT 6
+#define PTRACE_EVENT_SECCOMP 7
/* Extended result codes which enabled by means other than options. */
#define PTRACE_EVENT_STOP 128
@@ -69,8 +70,9 @@
#define PTRACE_O_TRACEEXEC (1 << PTRACE_EVENT_EXEC)
#define PTRACE_O_TRACEVFORKDONE (1 << PTRACE_EVENT_VFORK_DONE)
#define PTRACE_O_TRACEEXIT (1 << PTRACE_EVENT_EXIT)
+#define PTRACE_O_TRACESECCOMP (1 << PTRACE_EVENT_SECCOMP)
-#define PTRACE_O_MASK 0x0000007f
+#define PTRACE_O_MASK 0x000000ff
#include <asm/ptrace.h>
@@ -98,6 +100,7 @@
#define PT_TRACE_EXEC PT_EVENT_FLAG(PTRACE_EVENT_EXEC)
#define PT_TRACE_VFORK_DONE PT_EVENT_FLAG(PTRACE_EVENT_VFORK_DONE)
#define PT_TRACE_EXIT PT_EVENT_FLAG(PTRACE_EVENT_EXIT)
+#define PT_TRACE_SECCOMP PT_EVENT_FLAG(PTRACE_EVENT_SECCOMP)
/* single stepping state bits (used on ARM and PA-RISC) */
#define PT_SINGLESTEP_BIT 31
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 81a173c0897d..cad15023f458 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1341,6 +1341,8 @@ struct task_struct {
* execve */
unsigned in_iowait:1;
+ /* task may not gain privileges */
+ unsigned no_new_privs:1;
/* Revert to default priority/policy when forking */
unsigned sched_reset_on_fork:1;
@@ -1450,7 +1452,7 @@ struct task_struct {
uid_t loginuid;
unsigned int sessionid;
#endif
- seccomp_t seccomp;
+ struct seccomp seccomp;
/* Thread group tracking */
u32 parent_exec_id;
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index cc7a4e9cc7ad..84f6320da50f 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -1,25 +1,90 @@
#ifndef _LINUX_SECCOMP_H
#define _LINUX_SECCOMP_H
-
+#include <linux/compiler.h>
+#include <linux/types.h>
+
+
+/* Valid values for seccomp.mode and prctl(PR_SET_SECCOMP, <mode>) */
+#define SECCOMP_MODE_DISABLED 0 /* seccomp is not in use. */
+#define SECCOMP_MODE_STRICT 1 /* uses hard-coded filter. */
+#define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */
+
+/*
+ * All BPF programs must return a 32-bit value.
+ * The bottom 16-bits are for optional return data.
+ * The upper 16-bits are ordered from least permissive values to most.
+ *
+ * The ordering ensures that a min_t() over composed return values always
+ * selects the least permissive choice.
+ */
+#define SECCOMP_RET_KILL 0x00000000U /* kill the task immediately */
+#define SECCOMP_RET_TRAP 0x00030000U /* disallow and force a SIGSYS */
+#define SECCOMP_RET_ERRNO 0x00050000U /* returns an errno */
+#define SECCOMP_RET_TRACE 0x7ff00000U /* pass to a tracer or disallow */
+#define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */
+
+/* Masks for the return value sections. */
+#define SECCOMP_RET_ACTION 0x7fff0000U
+#define SECCOMP_RET_DATA 0x0000ffffU
+
+/**
+ * struct seccomp_data - the format the BPF program executes over.
+ * @nr: the system call number
+ * @arch: indicates system call convention as an AUDIT_ARCH_* value
+ * as defined in <linux/audit.h>.
+ * @instruction_pointer: at the time of the system call.
+ * @args: up to 6 system call arguments always stored as 64-bit values
+ * regardless of the architecture.
+ */
+struct seccomp_data {
+ int nr;
+ __u32 arch;
+ __u64 instruction_pointer;
+ __u64 args[6];
+};
+
+#ifdef __KERNEL__
#ifdef CONFIG_SECCOMP
#include <linux/thread_info.h>
#include <asm/seccomp.h>
-typedef struct { int mode; } seccomp_t;
-
-extern void __secure_computing(int);
-static inline void secure_computing(int this_syscall)
+struct seccomp_filter;
+/**
+ * struct seccomp - the state of a seccomp'ed process
+ *
+ * @mode: indicates one of the valid values above for controlled
+ * system calls available to a process.
+ * @filter: The metadata and ruleset for determining what system calls
+ * are allowed for a task.
+ *
+ * @filter must only be accessed from the context of current as there
+ * is no locking.
+ */
+struct seccomp {
+ int mode;
+ struct seccomp_filter *filter;
+};
+
+extern int __secure_computing(int);
+static inline int secure_computing(int this_syscall)
{
if (unlikely(test_thread_flag(TIF_SECCOMP)))
- __secure_computing(this_syscall);
+ return __secure_computing(this_syscall);
+ return 0;
+}
+
+/* A wrapper for architectures supporting only SECCOMP_MODE_STRICT. */
+static inline void secure_computing_strict(int this_syscall)
+{
+ BUG_ON(secure_computing(this_syscall) != 0);
}
extern long prctl_get_seccomp(void);
-extern long prctl_set_seccomp(unsigned long);
+extern long prctl_set_seccomp(unsigned long, char __user *);
-static inline int seccomp_mode(seccomp_t *s)
+static inline int seccomp_mode(struct seccomp *s)
{
return s->mode;
}
@@ -28,25 +93,41 @@ static inline int seccomp_mode(seccomp_t *s)
#include <linux/errno.h>
-typedef struct { } seccomp_t;
+struct seccomp { };
+struct seccomp_filter { };
-#define secure_computing(x) do { } while (0)
+static inline int secure_computing(int this_syscall) { return 0; }
+static inline void secure_computing_strict(int this_syscall) { return; }
static inline long prctl_get_seccomp(void)
{
return -EINVAL;
}
-static inline long prctl_set_seccomp(unsigned long arg2)
+static inline long prctl_set_seccomp(unsigned long arg2, char __user *arg3)
{
return -EINVAL;
}
-static inline int seccomp_mode(seccomp_t *s)
+static inline int seccomp_mode(struct seccomp *s)
{
return 0;
}
-
#endif /* CONFIG_SECCOMP */
+#ifdef CONFIG_SECCOMP_FILTER
+extern void put_seccomp_filter(struct task_struct *tsk);
+extern void get_seccomp_filter(struct task_struct *tsk);
+extern u32 seccomp_bpf_load(int off);
+#else /* CONFIG_SECCOMP_FILTER */
+static inline void put_seccomp_filter(struct task_struct *tsk)
+{
+ return;
+}
+static inline void get_seccomp_filter(struct task_struct *tsk)
+{
+ return;
+}
+#endif /* CONFIG_SECCOMP_FILTER */
+#endif /* __KERNEL__ */
#endif /* _LINUX_SECCOMP_H */
diff --git a/include/linux/security.h b/include/linux/security.h
index 673afbb8238a..6e1dea93907a 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -144,6 +144,7 @@ struct request_sock;
#define LSM_UNSAFE_SHARE 1
#define LSM_UNSAFE_PTRACE 2
#define LSM_UNSAFE_PTRACE_CAP 4
+#define LSM_UNSAFE_NO_NEW_PRIVS 8
#ifdef CONFIG_MMU
extern int mmap_min_addr_handler(struct ctl_table *table, int write,
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index af1de0f34eae..4b96415527b8 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -67,6 +67,7 @@
#include <linux/syscalls.h>
#include <linux/capability.h>
#include <linux/fs_struct.h>
+#include <linux/compat.h>
#include "audit.h"
@@ -2710,13 +2711,16 @@ void audit_core_dumps(long signr)
audit_log_end(ab);
}
-void __audit_seccomp(unsigned long syscall)
+void __audit_seccomp(unsigned long syscall, long signr, int code)
{
struct audit_buffer *ab;
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
- audit_log_abend(ab, "seccomp", SIGKILL);
+ audit_log_abend(ab, "seccomp", signr);
audit_log_format(ab, " syscall=%ld", syscall);
+ audit_log_format(ab, " compat=%d", is_compat_task());
+ audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current));
+ audit_log_format(ab, " code=0x%x", code);
audit_log_end(ab);
}
diff --git a/kernel/fork.c b/kernel/fork.c
index b9372a0bff18..f7cf6fb107ec 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -34,6 +34,7 @@
#include <linux/cgroup.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
+#include <linux/seccomp.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
#include <linux/jiffies.h>
@@ -170,6 +171,7 @@ void free_task(struct task_struct *tsk)
free_thread_info(tsk->stack);
rt_mutex_debug_task_free(tsk);
ftrace_graph_exit_task(tsk);
+ put_seccomp_filter(tsk);
free_task_struct(tsk);
}
EXPORT_SYMBOL(free_task);
@@ -1162,6 +1164,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
goto fork_out;
ftrace_graph_init_task(p);
+ get_seccomp_filter(p);
rt_mutex_init_task(p);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index e8d76c5895ea..ee376beedaf9 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -3,16 +3,357 @@
*
* Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com>
*
- * This defines a simple but solid secure-computing mode.
+ * Copyright (C) 2012 Google, Inc.
+ * Will Drewry <wad@chromium.org>
+ *
+ * This defines a simple but solid secure-computing facility.
+ *
+ * Mode 1 uses a fixed list of allowed system calls.
+ * Mode 2 allows user-defined system call filters in the form
+ * of Berkeley Packet Filters/Linux Socket Filters.
*/
+#include <linux/atomic.h>
#include <linux/audit.h>
-#include <linux/seccomp.h>
-#include <linux/sched.h>
#include <linux/compat.h>
+#include <linux/sched.h>
+#include <linux/seccomp.h>
/* #define SECCOMP_DEBUG 1 */
-#define NR_SECCOMP_MODES 1
+
+#ifdef CONFIG_SECCOMP_FILTER
+#include <asm/syscall.h>
+#include <linux/filter.h>
+#include <linux/ptrace.h>
+#include <linux/security.h>
+#include <linux/slab.h>
+#include <linux/tracehook.h>
+#include <linux/uaccess.h>
+
+/**
+ * struct seccomp_filter - container for seccomp BPF programs
+ *
+ * @usage: reference count to manage the object lifetime.
+ * get/put helpers should be used when accessing an instance
+ * outside of a lifetime-guarded section. In general, this
+ * is only needed for handling filters shared across tasks.
+ * @prev: points to a previously installed, or inherited, filter
+ * @len: the number of instructions in the program
+ * @insns: the BPF program instructions to evaluate
+ *
+ * seccomp_filter objects are organized in a tree linked via the @prev
+ * pointer. For any task, it appears to be a singly-linked list starting
+ * with current->seccomp.filter, the most recently attached or inherited filter.
+ * However, multiple filters may share a @prev node, by way of fork(), which
+ * results in a unidirectional tree existing in memory. This is similar to
+ * how namespaces work.
+ *
+ * seccomp_filter objects should never be modified after being attached
+ * to a task_struct (other than @usage).
+ */
+struct seccomp_filter {
+ atomic_t usage;
+ struct seccomp_filter *prev;
+ unsigned short len; /* Instruction count */
+ struct sock_filter insns[];
+};
+
+/* Limit any path through the tree to 256KB worth of instructions. */
+#define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))
+
+/**
+ * get_u32 - returns a u32 offset into data
+ * @data: a unsigned 64 bit value
+ * @index: 0 or 1 to return the first or second 32-bits
+ *
+ * This inline exists to hide the length of unsigned long. If a 32-bit
+ * unsigned long is passed in, it will be extended and the top 32-bits will be
+ * 0. If it is a 64-bit unsigned long, then whatever data is resident will be
+ * properly returned.
+ *
+ * Endianness is explicitly ignored and left for BPF program authors to manage
+ * as per the specific architecture.
+ */
+static inline u32 get_u32(u64 data, int index)
+{
+ return ((u32 *)&data)[index];
+}
+
+/* Helper for bpf_load below. */
+#define BPF_DATA(_name) offsetof(struct seccomp_data, _name)
+/**
+ * bpf_load: checks and returns a pointer to the requested offset
+ * @off: offset into struct seccomp_data to load from
+ *
+ * Returns the requested 32-bits of data.
+ * seccomp_check_filter() should assure that @off is 32-bit aligned
+ * and not out of bounds. Failure to do so is a BUG.
+ */
+u32 seccomp_bpf_load(int off)
+{
+ struct pt_regs *regs = task_pt_regs(current);
+ if (off == BPF_DATA(nr))
+ return syscall_get_nr(current, regs);
+ if (off == BPF_DATA(arch))
+ return syscall_get_arch(current, regs);
+ if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) {
+ unsigned long value;
+ int arg = (off - BPF_DATA(args[0])) / sizeof(u64);
+ int index = !!(off % sizeof(u64));
+ syscall_get_arguments(current, regs, arg, 1, &value);
+ return get_u32(value, index);
+ }
+ if (off == BPF_DATA(instruction_pointer))
+ return get_u32(KSTK_EIP(current), 0);
+ if (off == BPF_DATA(instruction_pointer) + sizeof(u32))
+ return get_u32(KSTK_EIP(current), 1);
+ /* seccomp_check_filter should make this impossible. */
+ BUG();
+}
+
+/**
+ * seccomp_check_filter - verify seccomp filter code
+ * @filter: filter to verify
+ * @flen: length of filter
+ *
+ * Takes a previously checked filter (by sk_chk_filter) and
+ * redirects all filter code that loads struct sk_buff data
+ * and related data through seccomp_bpf_load. It also
+ * enforces length and alignment checking of those loads.
+ *
+ * Returns 0 if the rule set is legal or -EINVAL if not.
+ */
+static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
+{
+ int pc;
+ for (pc = 0; pc < flen; pc++) {
+ struct sock_filter *ftest = &filter[pc];
+ u16 code = ftest->code;
+ u32 k = ftest->k;
+
+ switch (code) {
+ case BPF_S_LD_W_ABS:
+ ftest->code = BPF_S_ANC_SECCOMP_LD_W;
+ /* 32-bit aligned and not out of bounds. */
+ if (k >= sizeof(struct seccomp_data) || k & 3)
+ return -EINVAL;
+ continue;
+ case BPF_S_LD_W_LEN:
+ ftest->code = BPF_S_LD_IMM;
+ ftest->k = sizeof(struct seccomp_data);
+ continue;
+ case BPF_S_LDX_W_LEN:
+ ftest->code = BPF_S_LDX_IMM;
+ ftest->k = sizeof(struct seccomp_data);
+ continue;
+ /* Explicitly include allowed calls. */
+ case BPF_S_RET_K:
+ case BPF_S_RET_A:
+ case BPF_S_ALU_ADD_K:
+ case BPF_S_ALU_ADD_X:
+ case BPF_S_ALU_SUB_K:
+ case BPF_S_ALU_SUB_X:
+ case BPF_S_ALU_MUL_K:
+ case BPF_S_ALU_MUL_X:
+ case BPF_S_ALU_DIV_X:
+ case BPF_S_ALU_AND_K:
+ case BPF_S_ALU_AND_X:
+ case BPF_S_ALU_OR_K:
+ case BPF_S_ALU_OR_X:
+ case BPF_S_ALU_LSH_K:
+ case BPF_S_ALU_LSH_X:
+ case BPF_S_ALU_RSH_K:
+ case BPF_S_ALU_RSH_X:
+ case BPF_S_ALU_NEG:
+ case BPF_S_LD_IMM:
+ case BPF_S_LDX_IMM:
+ case BPF_S_MISC_TAX:
+ case BPF_S_MISC_TXA:
+ case BPF_S_ALU_DIV_K:
+ case BPF_S_LD_MEM:
+ case BPF_S_LDX_MEM:
+ case BPF_S_ST:
+ case BPF_S_STX:
+ case BPF_S_JMP_JA:
+ case BPF_S_JMP_JEQ_K:
+ case BPF_S_JMP_JEQ_X:
+ case BPF_S_JMP_JGE_K:
+ case BPF_S_JMP_JGE_X:
+ case BPF_S_JMP_JGT_K:
+ case BPF_S_JMP_JGT_X:
+ case BPF_S_JMP_JSET_K:
+ case BPF_S_JMP_JSET_X:
+ continue;
+ default:
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+/**
+ * seccomp_run_filters - evaluates all seccomp filters against @syscall
+ * @syscall: number of the current system call
+ *
+ * Returns valid seccomp BPF response codes.
+ */
+static u32 seccomp_run_filters(int syscall)
+{
+ struct seccomp_filter *f;
+ u32 ret = SECCOMP_RET_ALLOW;
+
+ /* Ensure unexpected behavior doesn't result in failing open. */
+ if (WARN_ON(current->seccomp.filter == NULL))
+ return SECCOMP_RET_KILL;
+
+ /*
+ * All filters in the list are evaluated and the lowest BPF return
+ * value always takes priority (ignoring the DATA).
+ */
+ for (f = current->seccomp.filter; f; f = f->prev) {
+ u32 cur_ret = sk_run_filter(NULL, f->insns);
+ if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
+ ret = cur_ret;
+ }
+ return ret;
+}
+
+/**
+ * seccomp_attach_filter: Attaches a seccomp filter to current.
+ * @fprog: BPF program to install
+ *
+ * Returns 0 on success or an errno on failure.
+ */
+static long seccomp_attach_filter(struct sock_fprog *fprog)
+{
+ struct seccomp_filter *filter;
+ unsigned long fp_size = fprog->len * sizeof(struct sock_filter);
+ unsigned long total_insns = fprog->len;
+ long ret;
+
+ if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
+ return -EINVAL;
+
+ for (filter = current->seccomp.filter; filter; filter = filter->prev)
+ total_insns += filter->len + 4; /* include a 4 instr penalty */
+ if (total_insns > MAX_INSNS_PER_PATH)
+ return -ENOMEM;
+
+ /*
+ * Installing a seccomp filter requires that the task have
+ * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
+ * This avoids scenarios where unprivileged tasks can affect the
+ * behavior of privileged children.
+ */
+ if (!current->no_new_privs &&
+ security_capable_noaudit(current_cred(), current_user_ns(),
+ CAP_SYS_ADMIN) != 0)
+ return -EACCES;
+
+ /* Allocate a new seccomp_filter */
+ filter = kzalloc(sizeof(struct seccomp_filter) + fp_size,
+ GFP_KERNEL|__GFP_NOWARN);
+ if (!filter)
+ return -ENOMEM;
+ atomic_set(&filter->usage, 1);
+ filter->len = fprog->len;
+
+ /* Copy the instructions from fprog. */
+ ret = -EFAULT;
+ if (copy_from_user(filter->insns, fprog->filter, fp_size))
+ goto fail;
+
+ /* Check and rewrite the fprog via the skb checker */
+ ret = sk_chk_filter(filter->insns, filter->len);
+ if (ret)
+ goto fail;
+
+ /* Check and rewrite the fprog for seccomp use */
+ ret = seccomp_check_filter(filter->insns, filter->len);
+ if (ret)
+ goto fail;
+
+ /*
+ * If there is an existing filter, make it the prev and don't drop its
+ * task reference.
+ */
+ filter->prev = current->seccomp.filter;
+ current->seccomp.filter = filter;
+ return 0;
+fail:
+ kfree(filter);
+ return ret;
+}
+
+/**
+ * seccomp_attach_user_filter - attaches a user-supplied sock_fprog
+ * @user_filter: pointer to the user data containing a sock_fprog.
+ *
+ * Returns 0 on success and non-zero otherwise.
+ */
+long seccomp_attach_user_filter(char __user *user_filter)
+{
+ struct sock_fprog fprog;
+ long ret = -EFAULT;
+
+#ifdef CONFIG_COMPAT
+ if (is_compat_task()) {
+ struct compat_sock_fprog fprog32;
+ if (copy_from_user(&fprog32, user_filter, sizeof(fprog32)))
+ goto out;
+ fprog.len = fprog32.len;
+ fprog.filter = compat_ptr(fprog32.filter);
+ } else /* falls through to the if below. */
+#endif
+ if (copy_from_user(&fprog, user_filter, sizeof(fprog)))
+ goto out;
+ ret = seccomp_attach_filter(&fprog);
+out:
+ return ret;
+}
+
+/* get_seccomp_filter - increments the reference count of the filter on @tsk */
+void get_seccomp_filter(struct task_struct *tsk)
+{
+ struct seccomp_filter *orig = tsk->seccomp.filter;
+ if (!orig)
+ return;
+ /* Reference count is bounded by the number of total processes. */
+ atomic_inc(&orig->usage);
+}
+
+/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
+void put_seccomp_filter(struct task_struct *tsk)
+{
+ struct seccomp_filter *orig = tsk->seccomp.filter;
+ /* Clean up single-reference branches iteratively. */
+ while (orig && atomic_dec_and_test(&orig->usage)) {
+ struct seccomp_filter *freeme = orig;
+ orig = orig->prev;
+ kfree(freeme);
+ }
+}
+
+/**
+ * seccomp_send_sigsys - signals the task to allow in-process syscall emulation
+ * @syscall: syscall number to send to userland
+ * @reason: filter-supplied reason code to send to userland (via si_errno)
+ *
+ * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info.
+ */
+static void seccomp_send_sigsys(int syscall, int reason)
+{
+ struct siginfo info;
+ memset(&info, 0, sizeof(info));
+ info.si_signo = SIGSYS;
+ info.si_code = SYS_SECCOMP;
+ info.si_call_addr = (void __user *)KSTK_EIP(current);
+ info.si_errno = reason;
+ info.si_arch = syscall_get_arch(current, task_pt_regs(current));
+ info.si_syscall = syscall;
+ force_sig_info(SIGSYS, &info, current);
+}
+#endif /* CONFIG_SECCOMP_FILTER */
/*
* Secure computing mode 1 allows only read/write/exit/sigreturn.
@@ -31,13 +372,15 @@ static int mode1_syscalls_32[] = {
};
#endif
-void __secure_computing(int this_syscall)
+int __secure_computing(int this_syscall)
{
int mode = current->seccomp.mode;
- int * syscall;
+ int exit_sig = 0;
+ int *syscall;
+ u32 ret;
switch (mode) {
- case 1:
+ case SECCOMP_MODE_STRICT:
syscall = mode1_syscalls;
#ifdef CONFIG_COMPAT
if (is_compat_task())
@@ -45,9 +388,54 @@ void __secure_computing(int this_syscall)
#endif
do {
if (*syscall == this_syscall)
- return;
+ return 0;
} while (*++syscall);
+ exit_sig = SIGKILL;
+ ret = SECCOMP_RET_KILL;
+ break;
+#ifdef CONFIG_SECCOMP_FILTER
+ case SECCOMP_MODE_FILTER: {
+ int data;
+ ret = seccomp_run_filters(this_syscall);
+ data = ret & SECCOMP_RET_DATA;
+ ret &= SECCOMP_RET_ACTION;
+ switch (ret) {
+ case SECCOMP_RET_ERRNO:
+ /* Set the low-order 16-bits as a errno. */
+ syscall_set_return_value(current, task_pt_regs(current),
+ -data, 0);
+ goto skip;
+ case SECCOMP_RET_TRAP:
+ /* Show the handler the original registers. */
+ syscall_rollback(current, task_pt_regs(current));
+ /* Let the filter pass back 16 bits of data. */
+ seccomp_send_sigsys(this_syscall, data);
+ goto skip;
+ case SECCOMP_RET_TRACE:
+ /* Skip these calls if there is no tracer. */
+ if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP))
+ goto skip;
+ /* Allow the BPF to provide the event message */
+ ptrace_event(PTRACE_EVENT_SECCOMP, data);
+ /*
+ * The delivery of a fatal signal during event
+ * notification may silently skip tracer notification.
+ * Terminating the task now avoids executing a system
+ * call that may not be intended.
+ */
+ if (fatal_signal_pending(current))
+ break;
+ return 0;
+ case SECCOMP_RET_ALLOW:
+ return 0;
+ case SECCOMP_RET_KILL:
+ default:
+ break;
+ }
+ exit_sig = SIGSYS;
break;
+ }
+#endif
default:
BUG();
}
@@ -55,8 +443,13 @@ void __secure_computing(int this_syscall)
#ifdef SECCOMP_DEBUG
dump_stack();
#endif
- audit_seccomp(this_syscall);
- do_exit(SIGKILL);
+ audit_seccomp(this_syscall, exit_sig, ret);
+ do_exit(exit_sig);
+#ifdef CONFIG_SECCOMP_FILTER
+skip:
+ audit_seccomp(this_syscall, exit_sig, ret);
+#endif
+ return -1;
}
long prctl_get_seccomp(void)
@@ -64,25 +457,48 @@ long prctl_get_seccomp(void)
return current->seccomp.mode;
}
-long prctl_set_seccomp(unsigned long seccomp_mode)
+/**
+ * prctl_set_seccomp: configures current->seccomp.mode
+ * @seccomp_mode: requested mode to use
+ * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
+ *
+ * This function may be called repeatedly with a @seccomp_mode of
+ * SECCOMP_MODE_FILTER to install additional filters. Every filter
+ * successfully installed will be evaluated (in reverse order) for each system
+ * call the task makes.
+ *
+ * Once current->seccomp.mode is non-zero, it may not be changed.
+ *
+ * Returns 0 on success or -EINVAL on failure.
+ */
+long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
{
- long ret;
+ long ret = -EINVAL;
- /* can set it only once to be even more secure */
- ret = -EPERM;
- if (unlikely(current->seccomp.mode))
+ if (current->seccomp.mode &&
+ current->seccomp.mode != seccomp_mode)
goto out;
- ret = -EINVAL;
- if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
- current->seccomp.mode = seccomp_mode;
- set_thread_flag(TIF_SECCOMP);
+ switch (seccomp_mode) {
+ case SECCOMP_MODE_STRICT:
+ ret = 0;
#ifdef TIF_NOTSC
disable_TSC();
#endif
- ret = 0;
+ break;
+#ifdef CONFIG_SECCOMP_FILTER
+ case SECCOMP_MODE_FILTER:
+ ret = seccomp_attach_user_filter(filter);
+ if (ret)
+ goto out;
+ break;
+#endif
+ default:
+ goto out;
}
- out:
+ current->seccomp.mode = seccomp_mode;
+ set_thread_flag(TIF_SECCOMP);
+out:
return ret;
}
diff --git a/kernel/signal.c b/kernel/signal.c
index 17afcaf582d0..1a006b5d9d9d 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -160,7 +160,7 @@ void recalc_sigpending(void)
#define SYNCHRONOUS_MASK \
(sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \
- sigmask(SIGTRAP) | sigmask(SIGFPE))
+ sigmask(SIGTRAP) | sigmask(SIGFPE) | sigmask(SIGSYS))
int next_signal(struct sigpending *pending, sigset_t *mask)
{
@@ -2706,6 +2706,13 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
err |= __put_user(from->si_uid, &to->si_uid);
err |= __put_user(from->si_ptr, &to->si_ptr);
break;
+#ifdef __ARCH_SIGSYS
+ case __SI_SYS:
+ err |= __put_user(from->si_call_addr, &to->si_call_addr);
+ err |= __put_user(from->si_syscall, &to->si_syscall);
+ err |= __put_user(from->si_arch, &to->si_arch);
+ break;
+#endif
default: /* this is just in case for now ... */
err |= __put_user(from->si_pid, &to->si_pid);
err |= __put_user(from->si_uid, &to->si_uid);
diff --git a/kernel/sys.c b/kernel/sys.c
index e7006eb6c1e4..ba0ae8eea6fb 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1908,7 +1908,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = prctl_get_seccomp();
break;
case PR_SET_SECCOMP:
- error = prctl_set_seccomp(arg2);
+ error = prctl_set_seccomp(arg2, (char __user *)arg3);
break;
case PR_GET_TSC:
error = GET_TSC_CTL(arg2);
@@ -1979,6 +1979,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = put_user(me->signal->is_child_subreaper,
(int __user *) arg2);
break;
+ case PR_SET_NO_NEW_PRIVS:
+ if (arg2 != 1 || arg3 || arg4 || arg5)
+ return -EINVAL;
+
+ current->no_new_privs = 1;
+ break;
+ case PR_GET_NO_NEW_PRIVS:
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
+ return current->no_new_privs ? 1 : 0;
default:
error = -EINVAL;
break;
diff --git a/net/compat.c b/net/compat.c
index e055708b8ec9..242c828810ff 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -328,14 +328,6 @@ void scm_detach_fds_compat(struct msghdr *kmsg, struct scm_cookie *scm)
__scm_destroy(scm);
}
-/*
- * A struct sock_filter is architecture independent.
- */
-struct compat_sock_fprog {
- u16 len;
- compat_uptr_t filter; /* struct sock_filter * */
-};
-
static int do_set_attach_filter(struct socket *sock, int level, int optname,
char __user *optval, unsigned int optlen)
{
diff --git a/net/core/filter.c b/net/core/filter.c
index 6f755cca4520..491e2e1ec277 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -38,6 +38,7 @@
#include <linux/filter.h>
#include <linux/reciprocal_div.h>
#include <linux/ratelimit.h>
+#include <linux/seccomp.h>
/* No hurry in this branch
*
@@ -352,6 +353,11 @@ load_b:
A = 0;
continue;
}
+#ifdef CONFIG_SECCOMP_FILTER
+ case BPF_S_ANC_SECCOMP_LD_W:
+ A = seccomp_bpf_load(fentry->k);
+ continue;
+#endif
default:
WARN_RATELIMIT(1, "Unknown code:%u jt:%u tf:%u k:%u\n",
fentry->code, fentry->jt,
diff --git a/samples/Makefile b/samples/Makefile
index 2f75851ec629..5ef08bba96ce 100644
--- a/samples/Makefile
+++ b/samples/Makefile
@@ -1,4 +1,4 @@
# Makefile for Linux samples code
obj-$(CONFIG_SAMPLES) += kobject/ kprobes/ tracepoints/ trace_events/ \
- hw_breakpoint/ kfifo/ kdb/ hidraw/ rpmsg/
+ hw_breakpoint/ kfifo/ kdb/ hidraw/ rpmsg/ seccomp/
diff --git a/samples/seccomp/Makefile b/samples/seccomp/Makefile
new file mode 100644
index 000000000000..16aa2d424985
--- /dev/null
+++ b/samples/seccomp/Makefile
@@ -0,0 +1,32 @@
+# kbuild trick to avoid linker error. Can be omitted if a module is built.
+obj- := dummy.o
+
+hostprogs-$(CONFIG_SECCOMP_FILTER) := bpf-fancy dropper bpf-direct
+
+HOSTCFLAGS_bpf-fancy.o += -I$(objtree)/usr/include
+HOSTCFLAGS_bpf-fancy.o += -idirafter $(objtree)/include
+HOSTCFLAGS_bpf-helper.o += -I$(objtree)/usr/include
+HOSTCFLAGS_bpf-helper.o += -idirafter $(objtree)/include
+bpf-fancy-objs := bpf-fancy.o bpf-helper.o
+
+HOSTCFLAGS_dropper.o += -I$(objtree)/usr/include
+HOSTCFLAGS_dropper.o += -idirafter $(objtree)/include
+dropper-objs := dropper.o
+
+HOSTCFLAGS_bpf-direct.o += -I$(objtree)/usr/include
+HOSTCFLAGS_bpf-direct.o += -idirafter $(objtree)/include
+bpf-direct-objs := bpf-direct.o
+
+# Try to match the kernel target.
+ifeq ($(CONFIG_64BIT),)
+HOSTCFLAGS_bpf-direct.o += -m32
+HOSTCFLAGS_dropper.o += -m32
+HOSTCFLAGS_bpf-helper.o += -m32
+HOSTCFLAGS_bpf-fancy.o += -m32
+HOSTLOADLIBES_bpf-direct += -m32
+HOSTLOADLIBES_bpf-fancy += -m32
+HOSTLOADLIBES_dropper += -m32
+endif
+
+# Tell kbuild to always build the programs
+always := $(hostprogs-y)
diff --git a/samples/seccomp/bpf-direct.c b/samples/seccomp/bpf-direct.c
new file mode 100644
index 000000000000..151ec3f52189
--- /dev/null
+++ b/samples/seccomp/bpf-direct.c
@@ -0,0 +1,190 @@
+/*
+ * Seccomp filter example for x86 (32-bit and 64-bit) with BPF macros
+ *
+ * Copyright (c) 2012 The Chromium OS Authors <chromium-os-dev@chromium.org>
+ * Author: Will Drewry <wad@chromium.org>
+ *
+ * The code may be used by anyone for any purpose,
+ * and can serve as a starting point for developing
+ * applications using prctl(PR_SET_SECCOMP, 2, ...).
+ */
+#if defined(__i386__) || defined(__x86_64__)
+#define SUPPORTED_ARCH 1
+#endif
+
+#if defined(SUPPORTED_ARCH)
+#define __USE_GNU 1
+#define _GNU_SOURCE 1
+
+#include <linux/types.h>
+#include <linux/filter.h>
+#include <linux/seccomp.h>
+#include <linux/unistd.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stddef.h>
+#include <string.h>
+#include <sys/prctl.h>
+#include <unistd.h>
+
+#define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n]))
+#define syscall_nr (offsetof(struct seccomp_data, nr))
+
+#if defined(__i386__)
+#define REG_RESULT REG_EAX
+#define REG_SYSCALL REG_EAX
+#define REG_ARG0 REG_EBX
+#define REG_ARG1 REG_ECX
+#define REG_ARG2 REG_EDX
+#define REG_ARG3 REG_ESI
+#define REG_ARG4 REG_EDI
+#define REG_ARG5 REG_EBP
+#elif defined(__x86_64__)
+#define REG_RESULT REG_RAX
+#define REG_SYSCALL REG_RAX
+#define REG_ARG0 REG_RDI
+#define REG_ARG1 REG_RSI
+#define REG_ARG2 REG_RDX
+#define REG_ARG3 REG_R10
+#define REG_ARG4 REG_R8
+#define REG_ARG5 REG_R9
+#endif
+
+#ifndef PR_SET_NO_NEW_PRIVS
+#define PR_SET_NO_NEW_PRIVS 38
+#endif
+
+#ifndef SYS_SECCOMP
+#define SYS_SECCOMP 1
+#endif
+
+static void emulator(int nr, siginfo_t *info, void *void_context)
+{
+ ucontext_t *ctx = (ucontext_t *)(void_context);
+ int syscall;
+ char *buf;
+ ssize_t bytes;
+ size_t len;
+ if (info->si_code != SYS_SECCOMP)
+ return;
+ if (!ctx)
+ return;
+ syscall = ctx->uc_mcontext.gregs[REG_SYSCALL];
+ buf = (char *) ctx->uc_mcontext.gregs[REG_ARG1];
+ len = (size_t) ctx->uc_mcontext.gregs[REG_ARG2];
+
+ if (syscall != __NR_write)
+ return;
+ if (ctx->uc_mcontext.gregs[REG_ARG0] != STDERR_FILENO)
+ return;
+ /* Redirect stderr messages to stdout. Doesn't handle EINTR, etc */
+ ctx->uc_mcontext.gregs[REG_RESULT] = -1;
+ if (write(STDOUT_FILENO, "[ERR] ", 6) > 0) {
+ bytes = write(STDOUT_FILENO, buf, len);
+ ctx->uc_mcontext.gregs[REG_RESULT] = bytes;
+ }
+ return;
+}
+
+static int install_emulator(void)
+{
+ struct sigaction act;
+ sigset_t mask;
+ memset(&act, 0, sizeof(act));
+ sigemptyset(&mask);
+ sigaddset(&mask, SIGSYS);
+
+ act.sa_sigaction = &emulator;
+ act.sa_flags = SA_SIGINFO;
+ if (sigaction(SIGSYS, &act, NULL) < 0) {
+ perror("sigaction");
+ return -1;
+ }
+ if (sigprocmask(SIG_UNBLOCK, &mask, NULL)) {
+ perror("sigprocmask");
+ return -1;
+ }
+ return 0;
+}
+
+static int install_filter(void)
+{
+ struct sock_filter filter[] = {
+ /* Grab the system call number */
+ BPF_STMT(BPF_LD+BPF_W+BPF_ABS, syscall_nr),
+ /* Jump table for the allowed syscalls */
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_rt_sigreturn, 0, 1),
+ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
+#ifdef __NR_sigreturn
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_sigreturn, 0, 1),
+ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
+#endif
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_exit_group, 0, 1),
+ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_exit, 0, 1),
+ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_read, 1, 0),
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_write, 3, 2),
+
+ /* Check that read is only using stdin. */
+ BPF_STMT(BPF_LD+BPF_W+BPF_ABS, syscall_arg(0)),
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, STDIN_FILENO, 4, 0),
+ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL),
+
+ /* Check that write is only using stdout */
+ BPF_STMT(BPF_LD+BPF_W+BPF_ABS, syscall_arg(0)),
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, STDOUT_FILENO, 1, 0),
+ /* Trap attempts to write to stderr */
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, STDERR_FILENO, 1, 2),
+
+ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
+ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_TRAP),
+ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])),
+ .filter = filter,
+ };
+
+ if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
+ perror("prctl(NO_NEW_PRIVS)");
+ return 1;
+ }
+
+
+ if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) {
+ perror("prctl");
+ return 1;
+ }
+ return 0;
+}
+
+#define payload(_c) (_c), sizeof((_c))
+int main(int argc, char **argv)
+{
+ char buf[4096];
+ ssize_t bytes = 0;
+ if (install_emulator())
+ return 1;
+ if (install_filter())
+ return 1;
+ syscall(__NR_write, STDOUT_FILENO,
+ payload("OHAI! WHAT IS YOUR NAME? "));
+ bytes = syscall(__NR_read, STDIN_FILENO, buf, sizeof(buf));
+ syscall(__NR_write, STDOUT_FILENO, payload("HELLO, "));
+ syscall(__NR_write, STDOUT_FILENO, buf, bytes);
+ syscall(__NR_write, STDERR_FILENO,
+ payload("Error message going to STDERR\n"));
+ return 0;
+}
+#else /* SUPPORTED_ARCH */
+/*
+ * This sample is x86-only. Since kernel samples are compiled with the
+ * host toolchain, a non-x86 host will result in using only the main()
+ * below.
+ */
+int main(void)
+{
+ return 1;
+}
+#endif /* SUPPORTED_ARCH */
diff --git a/samples/seccomp/bpf-fancy.c b/samples/seccomp/bpf-fancy.c
new file mode 100644
index 000000000000..8eb483aaec46
--- /dev/null
+++ b/samples/seccomp/bpf-fancy.c
@@ -0,0 +1,102 @@
+/*
+ * Seccomp BPF example using a macro-based generator.
+ *
+ * Copyright (c) 2012 The Chromium OS Authors <chromium-os-dev@chromium.org>
+ * Author: Will Drewry <wad@chromium.org>
+ *
+ * The code may be used by anyone for any purpose,
+ * and can serve as a starting point for developing
+ * applications using prctl(PR_ATTACH_SECCOMP_FILTER).
+ */
+
+#include <linux/filter.h>
+#include <linux/seccomp.h>
+#include <linux/unistd.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/prctl.h>
+#include <unistd.h>
+
+#include "bpf-helper.h"
+
+#ifndef PR_SET_NO_NEW_PRIVS
+#define PR_SET_NO_NEW_PRIVS 38
+#endif
+
+int main(int argc, char **argv)
+{
+ struct bpf_labels l;
+ static const char msg1[] = "Please type something: ";
+ static const char msg2[] = "You typed: ";
+ char buf[256];
+ struct sock_filter filter[] = {
+ /* TODO: LOAD_SYSCALL_NR(arch) and enforce an arch */
+ LOAD_SYSCALL_NR,
+ SYSCALL(__NR_exit, ALLOW),
+ SYSCALL(__NR_exit_group, ALLOW),
+ SYSCALL(__NR_write, JUMP(&l, write_fd)),
+ SYSCALL(__NR_read, JUMP(&l, read)),
+ DENY, /* Don't passthrough into a label */
+
+ LABEL(&l, read),
+ ARG(0),
+ JNE(STDIN_FILENO, DENY),
+ ARG(1),
+ JNE((unsigned long)buf, DENY),
+ ARG(2),
+ JGE(sizeof(buf), DENY),
+ ALLOW,
+
+ LABEL(&l, write_fd),
+ ARG(0),
+ JEQ(STDOUT_FILENO, JUMP(&l, write_buf)),
+ JEQ(STDERR_FILENO, JUMP(&l, write_buf)),
+ DENY,
+
+ LABEL(&l, write_buf),
+ ARG(1),
+ JEQ((unsigned long)msg1, JUMP(&l, msg1_len)),
+ JEQ((unsigned long)msg2, JUMP(&l, msg2_len)),
+ JEQ((unsigned long)buf, JUMP(&l, buf_len)),
+ DENY,
+
+ LABEL(&l, msg1_len),
+ ARG(2),
+ JLT(sizeof(msg1), ALLOW),
+ DENY,
+
+ LABEL(&l, msg2_len),
+ ARG(2),
+ JLT(sizeof(msg2), ALLOW),
+ DENY,
+
+ LABEL(&l, buf_len),
+ ARG(2),
+ JLT(sizeof(buf), ALLOW),
+ DENY,
+ };
+ struct sock_fprog prog = {
+ .filter = filter,
+ .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])),
+ };
+ ssize_t bytes;
+ bpf_resolve_jumps(&l, filter, sizeof(filter)/sizeof(*filter));
+
+ if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
+ perror("prctl(NO_NEW_PRIVS)");
+ return 1;
+ }
+
+ if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) {
+ perror("prctl(SECCOMP)");
+ return 1;
+ }
+ syscall(__NR_write, STDOUT_FILENO, msg1, strlen(msg1));
+ bytes = syscall(__NR_read, STDIN_FILENO, buf, sizeof(buf)-1);
+ bytes = (bytes > 0 ? bytes : 0);
+ syscall(__NR_write, STDERR_FILENO, msg2, strlen(msg2));
+ syscall(__NR_write, STDERR_FILENO, buf, bytes);
+ /* Now get killed */
+ syscall(__NR_write, STDERR_FILENO, msg2, strlen(msg2)+2);
+ return 0;
+}
diff --git a/samples/seccomp/bpf-helper.c b/samples/seccomp/bpf-helper.c
new file mode 100644
index 000000000000..579cfe331886
--- /dev/null
+++ b/samples/seccomp/bpf-helper.c
@@ -0,0 +1,89 @@
+/*
+ * Seccomp BPF helper functions
+ *
+ * Copyright (c) 2012 The Chromium OS Authors <chromium-os-dev@chromium.org>
+ * Author: Will Drewry <wad@chromium.org>
+ *
+ * The code may be used by anyone for any purpose,
+ * and can serve as a starting point for developing
+ * applications using prctl(PR_ATTACH_SECCOMP_FILTER).
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include "bpf-helper.h"
+
+int bpf_resolve_jumps(struct bpf_labels *labels,
+ struct sock_filter *filter, size_t count)
+{
+ struct sock_filter *begin = filter;
+ __u8 insn = count - 1;
+
+ if (count < 1)
+ return -1;
+ /*
+ * Walk it once, backwards, to build the label table and do fixups.
+ * Since backward jumps are disallowed by BPF, this is easy.
+ */
+ filter += insn;
+ for (; filter >= begin; --insn, --filter) {
+ if (filter->code != (BPF_JMP+BPF_JA))
+ continue;
+ switch ((filter->jt<<8)|filter->jf) {
+ case (JUMP_JT<<8)|JUMP_JF:
+ if (labels->labels[filter->k].location == 0xffffffff) {
+ fprintf(stderr, "Unresolved label: '%s'\n",
+ labels->labels[filter->k].label);
+ return 1;
+ }
+ filter->k = labels->labels[filter->k].location -
+ (insn + 1);
+ filter->jt = 0;
+ filter->jf = 0;
+ continue;
+ case (LABEL_JT<<8)|LABEL_JF:
+ if (labels->labels[filter->k].location != 0xffffffff) {
+ fprintf(stderr, "Duplicate label use: '%s'\n",
+ labels->labels[filter->k].label);
+ return 1;
+ }
+ labels->labels[filter->k].location = insn;
+ filter->k = 0; /* fall through */
+ filter->jt = 0;
+ filter->jf = 0;
+ continue;
+ }
+ }
+ return 0;
+}
+
+/* Simple lookup table for labels. */
+__u32 seccomp_bpf_label(struct bpf_labels *labels, const char *label)
+{
+ struct __bpf_label *begin = labels->labels, *end;
+ int id;
+ if (labels->count == 0) {
+ begin->label = label;
+ begin->location = 0xffffffff;
+ labels->count++;
+ return 0;
+ }
+ end = begin + labels->count;
+ for (id = 0; begin < end; ++begin, ++id) {
+ if (!strcmp(label, begin->label))
+ return id;
+ }
+ begin->label = label;
+ begin->location = 0xffffffff;
+ labels->count++;
+ return id;
+}
+
+void seccomp_bpf_print(struct sock_filter *filter, size_t count)
+{
+ struct sock_filter *end = filter + count;
+ for ( ; filter < end; ++filter)
+ printf("{ code=%u,jt=%u,jf=%u,k=%u },\n",
+ filter->code, filter->jt, filter->jf, filter->k);
+}
diff --git a/samples/seccomp/bpf-helper.h b/samples/seccomp/bpf-helper.h
new file mode 100644
index 000000000000..643279dd30fb
--- /dev/null
+++ b/samples/seccomp/bpf-helper.h
@@ -0,0 +1,238 @@
+/*
+ * Example wrapper around BPF macros.
+ *
+ * Copyright (c) 2012 The Chromium OS Authors <chromium-os-dev@chromium.org>
+ * Author: Will Drewry <wad@chromium.org>
+ *
+ * The code may be used by anyone for any purpose,
+ * and can serve as a starting point for developing
+ * applications using prctl(PR_SET_SECCOMP, 2, ...).
+ *
+ * No guarantees are provided with respect to the correctness
+ * or functionality of this code.
+ */
+#ifndef __BPF_HELPER_H__
+#define __BPF_HELPER_H__
+
+#include <asm/bitsperlong.h> /* for __BITS_PER_LONG */
+#include <endian.h>
+#include <linux/filter.h>
+#include <linux/seccomp.h> /* for seccomp_data */
+#include <linux/types.h>
+#include <linux/unistd.h>
+#include <stddef.h>
+
+#define BPF_LABELS_MAX 256
+struct bpf_labels {
+ int count;
+ struct __bpf_label {
+ const char *label;
+ __u32 location;
+ } labels[BPF_LABELS_MAX];
+};
+
+int bpf_resolve_jumps(struct bpf_labels *labels,
+ struct sock_filter *filter, size_t count);
+__u32 seccomp_bpf_label(struct bpf_labels *labels, const char *label);
+void seccomp_bpf_print(struct sock_filter *filter, size_t count);
+
+#define JUMP_JT 0xff
+#define JUMP_JF 0xff
+#define LABEL_JT 0xfe
+#define LABEL_JF 0xfe
+
+#define ALLOW \
+ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW)
+#define DENY \
+ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL)
+#define JUMP(labels, label) \
+ BPF_JUMP(BPF_JMP+BPF_JA, FIND_LABEL((labels), (label)), \
+ JUMP_JT, JUMP_JF)
+#define LABEL(labels, label) \
+ BPF_JUMP(BPF_JMP+BPF_JA, FIND_LABEL((labels), (label)), \
+ LABEL_JT, LABEL_JF)
+#define SYSCALL(nr, jt) \
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (nr), 0, 1), \
+ jt
+
+/* Lame, but just an example */
+#define FIND_LABEL(labels, label) seccomp_bpf_label((labels), #label)
+
+#define EXPAND(...) __VA_ARGS__
+/* Map all width-sensitive operations */
+#if __BITS_PER_LONG == 32
+
+#define JEQ(x, jt) JEQ32(x, EXPAND(jt))
+#define JNE(x, jt) JNE32(x, EXPAND(jt))
+#define JGT(x, jt) JGT32(x, EXPAND(jt))
+#define JLT(x, jt) JLT32(x, EXPAND(jt))
+#define JGE(x, jt) JGE32(x, EXPAND(jt))
+#define JLE(x, jt) JLE32(x, EXPAND(jt))
+#define JA(x, jt) JA32(x, EXPAND(jt))
+#define ARG(i) ARG_32(i)
+#define LO_ARG(idx) offsetof(struct seccomp_data, args[(idx)])
+
+#elif __BITS_PER_LONG == 64
+
+/* Ensure that we load the logically correct offset. */
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define ENDIAN(_lo, _hi) _lo, _hi
+#define LO_ARG(idx) offsetof(struct seccomp_data, args[(idx)])
+#define HI_ARG(idx) offsetof(struct seccomp_data, args[(idx)]) + sizeof(__u32)
+#elif __BYTE_ORDER == __BIG_ENDIAN
+#define ENDIAN(_lo, _hi) _hi, _lo
+#define LO_ARG(idx) offsetof(struct seccomp_data, args[(idx)]) + sizeof(__u32)
+#define HI_ARG(idx) offsetof(struct seccomp_data, args[(idx)])
+#else
+#error "Unknown endianness"
+#endif
+
+union arg64 {
+ struct {
+ __u32 ENDIAN(lo32, hi32);
+ };
+ __u64 u64;
+};
+
+#define JEQ(x, jt) \
+ JEQ64(((union arg64){.u64 = (x)}).lo32, \
+ ((union arg64){.u64 = (x)}).hi32, \
+ EXPAND(jt))
+#define JGT(x, jt) \
+ JGT64(((union arg64){.u64 = (x)}).lo32, \
+ ((union arg64){.u64 = (x)}).hi32, \
+ EXPAND(jt))
+#define JGE(x, jt) \
+ JGE64(((union arg64){.u64 = (x)}).lo32, \
+ ((union arg64){.u64 = (x)}).hi32, \
+ EXPAND(jt))
+#define JNE(x, jt) \
+ JNE64(((union arg64){.u64 = (x)}).lo32, \
+ ((union arg64){.u64 = (x)}).hi32, \
+ EXPAND(jt))
+#define JLT(x, jt) \
+ JLT64(((union arg64){.u64 = (x)}).lo32, \
+ ((union arg64){.u64 = (x)}).hi32, \
+ EXPAND(jt))
+#define JLE(x, jt) \
+ JLE64(((union arg64){.u64 = (x)}).lo32, \
+ ((union arg64){.u64 = (x)}).hi32, \
+ EXPAND(jt))
+
+#define JA(x, jt) \
+ JA64(((union arg64){.u64 = (x)}).lo32, \
+ ((union arg64){.u64 = (x)}).hi32, \
+ EXPAND(jt))
+#define ARG(i) ARG_64(i)
+
+#else
+#error __BITS_PER_LONG value unusable.
+#endif
+
+/* Loads the arg into A */
+#define ARG_32(idx) \
+ BPF_STMT(BPF_LD+BPF_W+BPF_ABS, LO_ARG(idx))
+
+/* Loads hi into A and lo in X */
+#define ARG_64(idx) \
+ BPF_STMT(BPF_LD+BPF_W+BPF_ABS, LO_ARG(idx)), \
+ BPF_STMT(BPF_ST, 0), /* lo -> M[0] */ \
+ BPF_STMT(BPF_LD+BPF_W+BPF_ABS, HI_ARG(idx)), \
+ BPF_STMT(BPF_ST, 1) /* hi -> M[1] */
+
+#define JEQ32(value, jt) \
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (value), 0, 1), \
+ jt
+
+#define JNE32(value, jt) \
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (value), 1, 0), \
+ jt
+
+/* Checks the lo, then swaps to check the hi. A=lo,X=hi */
+#define JEQ64(lo, hi, jt) \
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \
+ BPF_STMT(BPF_LD+BPF_MEM, 0), /* swap in lo */ \
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (lo), 0, 2), \
+ BPF_STMT(BPF_LD+BPF_MEM, 1), /* passed: swap hi back in */ \
+ jt, \
+ BPF_STMT(BPF_LD+BPF_MEM, 1) /* failed: swap hi back in */
+
+#define JNE64(lo, hi, jt) \
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 5, 0), \
+ BPF_STMT(BPF_LD+BPF_MEM, 0), /* swap in lo */ \
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (lo), 2, 0), \
+ BPF_STMT(BPF_LD+BPF_MEM, 1), /* passed: swap hi back in */ \
+ jt, \
+ BPF_STMT(BPF_LD+BPF_MEM, 1) /* failed: swap hi back in */
+
+#define JA32(value, jt) \
+ BPF_JUMP(BPF_JMP+BPF_JSET+BPF_K, (value), 0, 1), \
+ jt
+
+#define JA64(lo, hi, jt) \
+ BPF_JUMP(BPF_JMP+BPF_JSET+BPF_K, (hi), 3, 0), \
+ BPF_STMT(BPF_LD+BPF_MEM, 0), /* swap in lo */ \
+ BPF_JUMP(BPF_JMP+BPF_JSET+BPF_K, (lo), 0, 2), \
+ BPF_STMT(BPF_LD+BPF_MEM, 1), /* passed: swap hi back in */ \
+ jt, \
+ BPF_STMT(BPF_LD+BPF_MEM, 1) /* failed: swap hi back in */
+
+#define JGE32(value, jt) \
+ BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (value), 0, 1), \
+ jt
+
+#define JLT32(value, jt) \
+ BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (value), 1, 0), \
+ jt
+
+/* Shortcut checking if hi > arg.hi. */
+#define JGE64(lo, hi, jt) \
+ BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (hi), 4, 0), \
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \
+ BPF_STMT(BPF_LD+BPF_MEM, 0), /* swap in lo */ \
+ BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (lo), 0, 2), \
+ BPF_STMT(BPF_LD+BPF_MEM, 1), /* passed: swap hi back in */ \
+ jt, \
+ BPF_STMT(BPF_LD+BPF_MEM, 1) /* failed: swap hi back in */
+
+#define JLT64(lo, hi, jt) \
+ BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (hi), 0, 4), \
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \
+ BPF_STMT(BPF_LD+BPF_MEM, 0), /* swap in lo */ \
+ BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (lo), 2, 0), \
+ BPF_STMT(BPF_LD+BPF_MEM, 1), /* passed: swap hi back in */ \
+ jt, \
+ BPF_STMT(BPF_LD+BPF_MEM, 1) /* failed: swap hi back in */
+
+#define JGT32(value, jt) \
+ BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (value), 0, 1), \
+ jt
+
+#define JLE32(value, jt) \
+ BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (value), 1, 0), \
+ jt
+
+/* Check hi > args.hi first, then do the GE checking */
+#define JGT64(lo, hi, jt) \
+ BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (hi), 4, 0), \
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \
+ BPF_STMT(BPF_LD+BPF_MEM, 0), /* swap in lo */ \
+ BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (lo), 0, 2), \
+ BPF_STMT(BPF_LD+BPF_MEM, 1), /* passed: swap hi back in */ \
+ jt, \
+ BPF_STMT(BPF_LD+BPF_MEM, 1) /* failed: swap hi back in */
+
+#define JLE64(lo, hi, jt) \
+ BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (hi), 6, 0), \
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 3), \
+ BPF_STMT(BPF_LD+BPF_MEM, 0), /* swap in lo */ \
+ BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (lo), 2, 0), \
+ BPF_STMT(BPF_LD+BPF_MEM, 1), /* passed: swap hi back in */ \
+ jt, \
+ BPF_STMT(BPF_LD+BPF_MEM, 1) /* failed: swap hi back in */
+
+#define LOAD_SYSCALL_NR \
+ BPF_STMT(BPF_LD+BPF_W+BPF_ABS, \
+ offsetof(struct seccomp_data, nr))
+
+#endif /* __BPF_HELPER_H__ */
diff --git a/samples/seccomp/dropper.c b/samples/seccomp/dropper.c
new file mode 100644
index 000000000000..c69c347c7011
--- /dev/null
+++ b/samples/seccomp/dropper.c
@@ -0,0 +1,68 @@
+/*
+ * Naive system call dropper built on seccomp_filter.
+ *
+ * Copyright (c) 2012 The Chromium OS Authors <chromium-os-dev@chromium.org>
+ * Author: Will Drewry <wad@chromium.org>
+ *
+ * The code may be used by anyone for any purpose,
+ * and can serve as a starting point for developing
+ * applications using prctl(PR_SET_SECCOMP, 2, ...).
+ *
+ * When run, returns the specified errno for the specified
+ * system call number against the given architecture.
+ *
+ * Run this one as root as PR_SET_NO_NEW_PRIVS is not called.
+ */
+
+#include <errno.h>
+#include <linux/audit.h>
+#include <linux/filter.h>
+#include <linux/seccomp.h>
+#include <linux/unistd.h>
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <sys/prctl.h>
+#include <unistd.h>
+
+static int install_filter(int nr, int arch, int error)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_LD+BPF_W+BPF_ABS,
+ (offsetof(struct seccomp_data, arch))),
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, arch, 0, 3),
+ BPF_STMT(BPF_LD+BPF_W+BPF_ABS,
+ (offsetof(struct seccomp_data, nr))),
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, nr, 0, 1),
+ BPF_STMT(BPF_RET+BPF_K,
+ SECCOMP_RET_ERRNO|(error & SECCOMP_RET_DATA)),
+ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])),
+ .filter = filter,
+ };
+ if (prctl(PR_SET_SECCOMP, 2, &prog)) {
+ perror("prctl");
+ return 1;
+ }
+ return 0;
+}
+
+int main(int argc, char **argv)
+{
+ if (argc < 5) {
+ fprintf(stderr, "Usage:\n"
+ "dropper <syscall_nr> <arch> <errno> <prog> [<args>]\n"
+ "Hint: AUDIT_ARCH_I386: 0x%X\n"
+ " AUDIT_ARCH_X86_64: 0x%X\n"
+ "\n", AUDIT_ARCH_I386, AUDIT_ARCH_X86_64);
+ return 1;
+ }
+ if (install_filter(strtol(argv[1], NULL, 0), strtol(argv[2], NULL, 0),
+ strtol(argv[3], NULL, 0)))
+ return 1;
+ execv(argv[4], &argv[4]);
+ printf("Failed to execv\n");
+ return 255;
+}
diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c
index 6327685c101e..b81ea10a17a3 100644
--- a/security/apparmor/domain.c
+++ b/security/apparmor/domain.c
@@ -394,6 +394,11 @@ int apparmor_bprm_set_creds(struct linux_binprm *bprm)
new_profile = find_attach(ns, &ns->base.profiles, name);
if (!new_profile)
goto cleanup;
+ /*
+ * NOTE: Domain transitions from unconfined are allowed
+ * even when no_new_privs is set because this aways results
+ * in a further reduction of permissions.
+ */
goto apply;
}
@@ -455,6 +460,16 @@ int apparmor_bprm_set_creds(struct linux_binprm *bprm)
/* fail exec */
error = -EACCES;
+ /*
+ * Policy has specified a domain transition, if no_new_privs then
+ * fail the exec.
+ */
+ if (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS) {
+ aa_put_profile(new_profile);
+ error = -EPERM;
+ goto cleanup;
+ }
+
if (!new_profile)
goto audit;
@@ -609,6 +624,14 @@ int aa_change_hat(const char *hats[], int count, u64 token, bool permtest)
const char *target = NULL, *info = NULL;
int error = 0;
+ /*
+ * Fail explicitly requested domain transitions if no_new_privs.
+ * There is no exception for unconfined as change_hat is not
+ * available.
+ */
+ if (current->no_new_privs)
+ return -EPERM;
+
/* released below */
cred = get_current_cred();
cxt = cred->security;
@@ -750,6 +773,18 @@ int aa_change_profile(const char *ns_name, const char *hname, bool onexec,
cxt = cred->security;
profile = aa_cred_profile(cred);
+ /*
+ * Fail explicitly requested domain transitions if no_new_privs
+ * and not unconfined.
+ * Domain transitions from unconfined are allowed even when
+ * no_new_privs is set because this aways results in a reduction
+ * of permissions.
+ */
+ if (current->no_new_privs && !unconfined(profile)) {
+ put_cred(cred);
+ return -EPERM;
+ }
+
if (ns_name) {
/* released below */
ns = aa_find_namespace(profile->ns, ns_name);
diff --git a/security/commoncap.c b/security/commoncap.c
index 71a166a05975..f80d11609391 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -512,14 +512,17 @@ skip:
/* Don't let someone trace a set[ug]id/setpcap binary with the revised
- * credentials unless they have the appropriate permit
+ * credentials unless they have the appropriate permit.
+ *
+ * In addition, if NO_NEW_PRIVS, then ensure we get no new privs.
*/
if ((new->euid != old->uid ||
new->egid != old->gid ||
!cap_issubset(new->cap_permitted, old->cap_permitted)) &&
bprm->unsafe & ~LSM_UNSAFE_PTRACE_CAP) {
/* downgrade; they get no more than they had, and maybe less */
- if (!capable(CAP_SETUID)) {
+ if (!capable(CAP_SETUID) ||
+ (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS)) {
new->euid = new->uid;
new->egid = new->gid;
}
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index d85b793c9321..0b06685787b9 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2016,6 +2016,13 @@ static int selinux_bprm_set_creds(struct linux_binprm *bprm)
new_tsec->sid = old_tsec->exec_sid;
/* Reset exec SID on execve. */
new_tsec->exec_sid = 0;
+
+ /*
+ * Minimize confusion: if no_new_privs and a transition is
+ * explicitly requested, then fail the exec.
+ */
+ if (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS)
+ return -EPERM;
} else {
/* Check for a default transition on this program. */
rc = security_transition_sid(old_tsec->sid, isec->sid,
@@ -2029,7 +2036,8 @@ static int selinux_bprm_set_creds(struct linux_binprm *bprm)
ad.selinux_audit_data = &sad;
ad.u.path = bprm->file->f_path;
- if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)
+ if ((bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) ||
+ (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS))
new_tsec->sid = old_tsec->sid;
if (new_tsec->sid == old_tsec->sid) {
diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c
index 573723843a04..c852f7472ad0 100644
--- a/security/yama/yama_lsm.c
+++ b/security/yama/yama_lsm.c
@@ -18,7 +18,12 @@
#include <linux/prctl.h>
#include <linux/ratelimit.h>
-static int ptrace_scope = 1;
+#define YAMA_SCOPE_DISABLED 0
+#define YAMA_SCOPE_RELATIONAL 1
+#define YAMA_SCOPE_CAPABILITY 2
+#define YAMA_SCOPE_NO_ATTACH 3
+
+static int ptrace_scope = YAMA_SCOPE_RELATIONAL;
/* describe a ptrace relationship for potential exception */
struct ptrace_relation {
@@ -251,17 +256,32 @@ static int yama_ptrace_access_check(struct task_struct *child,
return rc;
/* require ptrace target be a child of ptracer on attach */
- if (mode == PTRACE_MODE_ATTACH &&
- ptrace_scope &&
- !task_is_descendant(current, child) &&
- !ptracer_exception_found(current, child) &&
- !capable(CAP_SYS_PTRACE))
- rc = -EPERM;
+ if (mode == PTRACE_MODE_ATTACH) {
+ switch (ptrace_scope) {
+ case YAMA_SCOPE_DISABLED:
+ /* No additional restrictions. */
+ break;
+ case YAMA_SCOPE_RELATIONAL:
+ if (!task_is_descendant(current, child) &&
+ !ptracer_exception_found(current, child) &&
+ !capable(CAP_SYS_PTRACE))
+ rc = -EPERM;
+ break;
+ case YAMA_SCOPE_CAPABILITY:
+ if (!capable(CAP_SYS_PTRACE))
+ rc = -EPERM;
+ break;
+ case YAMA_SCOPE_NO_ATTACH:
+ default:
+ rc = -EPERM;
+ break;
+ }
+ }
if (rc) {
char name[sizeof(current->comm)];
- printk_ratelimited(KERN_NOTICE "ptrace of non-child"
- " pid %d was attempted by: %s (pid %d)\n",
+ printk_ratelimited(KERN_NOTICE
+ "ptrace of pid %d was attempted by: %s (pid %d)\n",
child->pid,
get_task_comm(name, current),
current->pid);
@@ -279,8 +299,27 @@ static struct security_operations yama_ops = {
};
#ifdef CONFIG_SYSCTL
+static int yama_dointvec_minmax(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int rc;
+
+ if (write && !capable(CAP_SYS_PTRACE))
+ return -EPERM;
+
+ rc = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (rc)
+ return rc;
+
+ /* Lock the max value if it ever gets set. */
+ if (write && *(int *)table->data == *(int *)table->extra2)
+ table->extra1 = table->extra2;
+
+ return rc;
+}
+
static int zero;
-static int one = 1;
+static int max_scope = YAMA_SCOPE_NO_ATTACH;
struct ctl_path yama_sysctl_path[] = {
{ .procname = "kernel", },
@@ -294,9 +333,9 @@ static struct ctl_table yama_sysctl_table[] = {
.data = &ptrace_scope,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
+ .proc_handler = yama_dointvec_minmax,
.extra1 = &zero,
- .extra2 = &one,
+ .extra2 = &max_scope,
},
{ }
};