From a227960fe0cafcc229a8d6bb8b454a3a0b33719d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 10 Apr 2014 16:15:59 +0200
Subject: locking/mutex: Fix debug_mutexes

debug_mutex_unlock() would bail when !debug_locks and forgets to
actually unlock.

Reported-by: "Michael L. Semon" <mlsemon35@gmail.com>
Reported-by: "Kirill A. Shutemov" <kirill@shutemov.name>
Reported-by: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Fixes: 6f008e72cd11 ("locking/mutex: Fix debug checks")
Tested-by: Dave Jones <davej@redhat.com>
Cc: Jason Low <jason.low2@hp.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20140410141559.GE13658@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/mutex-debug.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index e1191c996c59..5cf6731b98e9 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -71,18 +71,17 @@ void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
 
 void debug_mutex_unlock(struct mutex *lock)
 {
-	if (unlikely(!debug_locks))
-		return;
+	if (likely(debug_locks)) {
+		DEBUG_LOCKS_WARN_ON(lock->magic != lock);
 
-	DEBUG_LOCKS_WARN_ON(lock->magic != lock);
+		if (!lock->owner)
+			DEBUG_LOCKS_WARN_ON(!lock->owner);
+		else
+			DEBUG_LOCKS_WARN_ON(lock->owner != current);
 
-	if (!lock->owner)
-		DEBUG_LOCKS_WARN_ON(!lock->owner);
-	else
-		DEBUG_LOCKS_WARN_ON(lock->owner != current);
-
-	DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
-	mutex_clear_owner(lock);
+		DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
+		mutex_clear_owner(lock);
+	}
 
 	/*
 	 * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug
-- 
cgit v1.2.3


From 2eac7648321f4a08aa4078504d7727af0af7173b Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Mon, 14 Apr 2014 21:02:59 +0200
Subject: seccomp: fix populating a0-a5 syscall args in 32-bit x86 BPF

Linus reports that on 32-bit x86 Chromium throws the following seccomp
resp. audit log messages:

  audit: type=1326 audit(1397359304.356:28108): auid=500 uid=500
gid=500 ses=2 subj=unconfined_u:unconfined_r:chrome_sandbox_t:s0-s0:c0.c1023
pid=3677 comm="chrome" exe="/opt/google/chrome/chrome" sig=0
syscall=172 compat=0 ip=0xb2dd9852 code=0x30000

  audit: type=1326 audit(1397359304.356:28109): auid=500 uid=500
gid=500 ses=2 subj=unconfined_u:unconfined_r:chrome_sandbox_t:s0-s0:c0.c1023
pid=3677 comm="chrome" exe="/opt/google/chrome/chrome" sig=0 syscall=5
compat=0 ip=0xb2dd9852 code=0x50000

These audit messages are being triggered via audit_seccomp() through
__secure_computing() in seccomp mode (BPF) filter with seccomp return
codes 0x30000 (== SECCOMP_RET_TRAP) and 0x50000 (== SECCOMP_RET_ERRNO)
during filter runtime. Moreover, Linus reports that x86_64 Chromium
seems fine.

The underlying issue that explains this is that the implementation of
populate_seccomp_data() is wrong. Our seccomp data structure sd that
is being shared with user ABI is:

  struct seccomp_data {
    int nr;
    __u32 arch;
    __u64 instruction_pointer;
    __u64 args[6];
  };

Therefore, a simple cast to 'unsigned long *' for storing the value of
the syscall argument via syscall_get_arguments() is just wrong as on
32-bit x86 (or any other 32bit arch), it would result in storing a0-a5
at wrong offsets in args[] member, and thus i) could leak stack memory
to user space and ii) tampers with the logic of seccomp BPF programs
that read out and check for syscall arguments:

  syscall_get_arguments(task, regs, 0, 1, (unsigned long *) &sd->args[0]);

Tested on 32-bit x86 with Google Chrome, unfortunately only via remote
test machine through slow ssh X forwarding, but it fixes the issue on
my side. So fix it up by storing args in type correct variables, gcc
is clever and optimizes the copy away in other cases, e.g. x86_64.

Fixes: bd4cf0ed331a ("net: filter: rework/optimize internal BPF interpreter's instruction set")
Reported-and-bisected-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Eric Paris <eparis@redhat.com>
Cc: James Morris <james.l.morris@oracle.com>
Cc: Kees Cook <keescook@chromium.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/seccomp.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index d8d046c0726a..590c37925084 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -69,18 +69,17 @@ static void populate_seccomp_data(struct seccomp_data *sd)
 {
 	struct task_struct *task = current;
 	struct pt_regs *regs = task_pt_regs(task);
+	unsigned long args[6];
 
 	sd->nr = syscall_get_nr(task, regs);
 	sd->arch = syscall_get_arch();
-
-	/* Unroll syscall_get_args to help gcc on arm. */
-	syscall_get_arguments(task, regs, 0, 1, (unsigned long *) &sd->args[0]);
-	syscall_get_arguments(task, regs, 1, 1, (unsigned long *) &sd->args[1]);
-	syscall_get_arguments(task, regs, 2, 1, (unsigned long *) &sd->args[2]);
-	syscall_get_arguments(task, regs, 3, 1, (unsigned long *) &sd->args[3]);
-	syscall_get_arguments(task, regs, 4, 1, (unsigned long *) &sd->args[4]);
-	syscall_get_arguments(task, regs, 5, 1, (unsigned long *) &sd->args[5]);
-
+	syscall_get_arguments(task, regs, 0, 6, args);
+	sd->args[0] = args[0];
+	sd->args[1] = args[1];
+	sd->args[2] = args[2];
+	sd->args[3] = args[3];
+	sd->args[4] = args[4];
+	sd->args[5] = args[5];
 	sd->instruction_pointer = KSTK_EIP(task);
 }
 
-- 
cgit v1.2.3


From e79323bd87808fdfbc68ce6c5371bd224d9672ee Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Mon, 14 Apr 2014 16:58:55 -0400
Subject: user namespace: fix incorrect memory barriers

smp_read_barrier_depends() can be used if there is data dependency between
the readers - i.e. if the read operation after the barrier uses address
that was obtained from the read operation before the barrier.

In this file, there is only control dependency, no data dependecy, so the
use of smp_read_barrier_depends() is incorrect. The code could fail in the
following way:
* the cpu predicts that idx < entries is true and starts executing the
  body of the for loop
* the cpu fetches map->extent[0].first and map->extent[0].count
* the cpu fetches map->nr_extents
* the cpu verifies that idx < extents is true, so it commits the
  instructions in the body of the for loop

The problem is that in this scenario, the cpu read map->extent[0].first
and map->nr_extents in the wrong order. We need a full read memory barrier
to prevent it.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Cc: stable@vger.kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/user_namespace.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 0d8f6023fd8d..bf71b4b2d632 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -152,7 +152,7 @@ static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
 
 	/* Find the matching extent */
 	extents = map->nr_extents;
-	smp_read_barrier_depends();
+	smp_rmb();
 	for (idx = 0; idx < extents; idx++) {
 		first = map->extent[idx].first;
 		last = first + map->extent[idx].count - 1;
@@ -176,7 +176,7 @@ static u32 map_id_down(struct uid_gid_map *map, u32 id)
 
 	/* Find the matching extent */
 	extents = map->nr_extents;
-	smp_read_barrier_depends();
+	smp_rmb();
 	for (idx = 0; idx < extents; idx++) {
 		first = map->extent[idx].first;
 		last = first + map->extent[idx].count - 1;
@@ -199,7 +199,7 @@ static u32 map_id_up(struct uid_gid_map *map, u32 id)
 
 	/* Find the matching extent */
 	extents = map->nr_extents;
-	smp_read_barrier_depends();
+	smp_rmb();
 	for (idx = 0; idx < extents; idx++) {
 		first = map->extent[idx].lower_first;
 		last = first + map->extent[idx].count - 1;
@@ -615,9 +615,8 @@ static ssize_t map_write(struct file *file, const char __user *buf,
 	 * were written before the count of the extents.
 	 *
 	 * To achieve this smp_wmb() is used on guarantee the write
-	 * order and smp_read_barrier_depends() is guaranteed that we
-	 * don't have crazy architectures returning stale data.
-	 *
+	 * order and smp_rmb() is guaranteed that we don't have crazy
+	 * architectures returning stale data.
 	 */
 	mutex_lock(&id_map_mutex);
 
-- 
cgit v1.2.3