From 5facae4f3549b5cf7c0e10ec312a65ffd43b5726 Mon Sep 17 00:00:00 2001
From: Qian Cai <cai@lca.pw>
Date: Thu, 19 Sep 2019 12:09:40 -0400
Subject: locking/lockdep: Remove unused @nested argument from lock_release()

Since the following commit:

  b4adfe8e05f1 ("locking/lockdep: Remove unused argument in __lock_release")

@nested is no longer used in lock_release(), so remove it from all
lock_release() calls and friends.

Signed-off-by: Qian Cai <cai@lca.pw>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Will Deacon <will@kernel.org>
Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: airlied@linux.ie
Cc: akpm@linux-foundation.org
Cc: alexander.levin@microsoft.com
Cc: daniel@iogearbox.net
Cc: davem@davemloft.net
Cc: dri-devel@lists.freedesktop.org
Cc: duyuyang@gmail.com
Cc: gregkh@linuxfoundation.org
Cc: hannes@cmpxchg.org
Cc: intel-gfx@lists.freedesktop.org
Cc: jack@suse.com
Cc: jlbec@evilplan.or
Cc: joonas.lahtinen@linux.intel.com
Cc: joseph.qi@linux.alibaba.com
Cc: jslaby@suse.com
Cc: juri.lelli@redhat.com
Cc: maarten.lankhorst@linux.intel.com
Cc: mark@fasheh.com
Cc: mhocko@kernel.org
Cc: mripard@kernel.org
Cc: ocfs2-devel@oss.oracle.com
Cc: rodrigo.vivi@intel.com
Cc: sean@poorly.run
Cc: st@kernel.org
Cc: tj@kernel.org
Cc: tytso@mit.edu
Cc: vdavydov.dev@gmail.com
Cc: vincent.guittot@linaro.org
Cc: viro@zeniv.linux.org.uk
Link: https://lkml.kernel.org/r/1568909380-32199-1-git-send-email-cai@lca.pw
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/bpf/stackmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/bpf/stackmap.c')
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 052580c33d26..dcfe2d37ad15 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -338,7 +338,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 		 * up_read_non_owner(). The rwsem_release() is called
 		 * here to release the lock from lockdep's perspective.
 		 */
-		rwsem_release(&current->mm->mmap_sem.dep_map, 1, _RET_IP_);
+		rwsem_release(&current->mm->mmap_sem.dep_map, _RET_IP_);
 	}
 }
 
-- 
cgit v1.2.3


From eac9153f2b584c702cea02c1f1a57d85aa9aea42 Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Mon, 14 Oct 2019 10:12:23 -0700
Subject: bpf/stackmap: Fix deadlock with rq_lock in bpf_get_stack()

bpf stackmap with build-id lookup (BPF_F_STACK_BUILD_ID) can trigger A-A
deadlock on rq_lock():

rcu: INFO: rcu_sched detected stalls on CPUs/tasks:
[...]
Call Trace:
 try_to_wake_up+0x1ad/0x590
 wake_up_q+0x54/0x80
 rwsem_wake+0x8a/0xb0
 bpf_get_stack+0x13c/0x150
 bpf_prog_fbdaf42eded9fe46_on_event+0x5e3/0x1000
 bpf_overflow_handler+0x60/0x100
 __perf_event_overflow+0x4f/0xf0
 perf_swevent_overflow+0x99/0xc0
 ___perf_sw_event+0xe7/0x120
 __schedule+0x47d/0x620
 schedule+0x29/0x90
 futex_wait_queue_me+0xb9/0x110
 futex_wait+0x139/0x230
 do_futex+0x2ac/0xa50
 __x64_sys_futex+0x13c/0x180
 do_syscall_64+0x42/0x100
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

This can be reproduced by:
1. Start a multi-thread program that does parallel mmap() and malloc();
2. taskset the program to 2 CPUs;
3. Attach bpf program to trace_sched_switch and gather stackmap with
   build-id, e.g. with trace.py from bcc tools:
   trace.py -U -p <pid> -s <some-bin,some-lib> t:sched:sched_switch

A sample reproducer is attached at the end.

This could also trigger deadlock with other locks that are nested with
rq_lock.

Fix this by checking whether irqs are disabled. Since rq_lock and all
other nested locks are irq safe, it is safe to do up_read() when irqs are
not disable. If the irqs are disabled, postpone up_read() in irq_work.

Fixes: 615755a77b24 ("bpf: extend stackmap to save binary_build_id+offset instead of address")
Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20191014171223.357174-1-songliubraving@fb.com

Reproducer:
============================ 8< ============================

char *filename;

void *worker(void *p)
{
        void *ptr;
        int fd;
        char *pptr;

        fd = open(filename, O_RDONLY);
        if (fd < 0)
                return NULL;
        while (1) {
                struct timespec ts = {0, 1000 + rand() % 2000};

                ptr = mmap(NULL, 4096 * 64, PROT_READ, MAP_PRIVATE, fd, 0);
                usleep(1);
                if (ptr == MAP_FAILED) {
                        printf("failed to mmap\n");
                        break;
                }
                munmap(ptr, 4096 * 64);
                usleep(1);
                pptr = malloc(1);
                usleep(1);
                pptr[0] = 1;
                usleep(1);
                free(pptr);
                usleep(1);
                nanosleep(&ts, NULL);
        }
        close(fd);
        return NULL;
}

int main(int argc, char *argv[])
{
        void *ptr;
        int i;
        pthread_t threads[THREAD_COUNT];

        if (argc < 2)
                return 0;

        filename = argv[1];

        for (i = 0; i < THREAD_COUNT; i++) {
                if (pthread_create(threads + i, NULL, worker, NULL)) {
                        fprintf(stderr, "Error creating thread\n");
                        return 0;
                }
        }

        for (i = 0; i < THREAD_COUNT; i++)
                pthread_join(threads[i], NULL);
        return 0;
}
============================ 8< ============================
---
 kernel/bpf/stackmap.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel/bpf/stackmap.c')

diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 052580c33d26..173e983619d7 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -287,7 +287,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 	bool irq_work_busy = false;
 	struct stack_map_irq_work *work = NULL;
 
-	if (in_nmi()) {
+	if (irqs_disabled()) {
 		work = this_cpu_ptr(&up_read_work);
 		if (work->irq_work.flags & IRQ_WORK_BUSY)
 			/* cannot queue more up_read, fallback */
@@ -295,8 +295,9 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 	}
 
 	/*
-	 * We cannot do up_read() in nmi context. To do build_id lookup
-	 * in nmi context, we need to run up_read() in irq_work. We use
+	 * We cannot do up_read() when the irq is disabled, because of
+	 * risk to deadlock with rq_lock. To do build_id lookup when the
+	 * irqs are disabled, we need to run up_read() in irq_work. We use
 	 * a percpu variable to do the irq_work. If the irq_work is
 	 * already used by another lookup, we fall back to report ips.
 	 *
-- 
cgit v1.2.3


From 153bedbac2ebd475e1c7c2d2fa0c042f5525927d Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 8 Nov 2019 17:08:55 +0100
Subject: irq_work: Convert flags to atomic_t

We need to convert flags to atomic_t in order to later fix an ordering
issue on atomic_cmpxchg() failure. This will allow us to use atomic_fetch_or().

Also clarify the nature of those flags.

[ mingo: Converted two more usage site the original patch missed. ]

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E . McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/20191108160858.31665-2-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/irq_work.h | 10 +++++++---
 kernel/bpf/stackmap.c    |  2 +-
 kernel/irq_work.c        | 18 +++++++++---------
 kernel/printk/printk.c   |  2 +-
 kernel/trace/bpf_trace.c |  2 +-
 5 files changed, 19 insertions(+), 15 deletions(-)

(limited to 'kernel/bpf/stackmap.c')

diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
index b11fcdfd0770..02da997ad12c 100644
--- a/include/linux/irq_work.h
+++ b/include/linux/irq_work.h
@@ -22,7 +22,7 @@
 #define IRQ_WORK_CLAIMED	(IRQ_WORK_PENDING | IRQ_WORK_BUSY)
 
 struct irq_work {
-	unsigned long flags;
+	atomic_t flags;
 	struct llist_node llnode;
 	void (*func)(struct irq_work *);
 };
@@ -30,11 +30,15 @@ struct irq_work {
 static inline
 void init_irq_work(struct irq_work *work, void (*func)(struct irq_work *))
 {
-	work->flags = 0;
+	atomic_set(&work->flags, 0);
 	work->func = func;
 }
 
-#define DEFINE_IRQ_WORK(name, _f) struct irq_work name = { .func = (_f), }
+#define DEFINE_IRQ_WORK(name, _f) struct irq_work name = {	\
+		.flags = ATOMIC_INIT(0),			\
+		.func  = (_f)					\
+}
+
 
 bool irq_work_queue(struct irq_work *work);
 bool irq_work_queue_on(struct irq_work *work, int cpu);
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 052580c33d26..4d31284095e2 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -289,7 +289,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 
 	if (in_nmi()) {
 		work = this_cpu_ptr(&up_read_work);
-		if (work->irq_work.flags & IRQ_WORK_BUSY)
+		if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY)
 			/* cannot queue more up_read, fallback */
 			irq_work_busy = true;
 	}
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index d42acaf81886..df0dbf4d859b 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -29,16 +29,16 @@ static DEFINE_PER_CPU(struct llist_head, lazy_list);
  */
 static bool irq_work_claim(struct irq_work *work)
 {
-	unsigned long flags, oflags, nflags;
+	int flags, oflags, nflags;
 
 	/*
 	 * Start with our best wish as a premise but only trust any
 	 * flag value after cmpxchg() result.
 	 */
-	flags = work->flags & ~IRQ_WORK_PENDING;
+	flags = atomic_read(&work->flags) & ~IRQ_WORK_PENDING;
 	for (;;) {
 		nflags = flags | IRQ_WORK_CLAIMED;
-		oflags = cmpxchg(&work->flags, flags, nflags);
+		oflags = atomic_cmpxchg(&work->flags, flags, nflags);
 		if (oflags == flags)
 			break;
 		if (oflags & IRQ_WORK_PENDING)
@@ -61,7 +61,7 @@ void __weak arch_irq_work_raise(void)
 static void __irq_work_queue_local(struct irq_work *work)
 {
 	/* If the work is "lazy", handle it from next tick if any */
-	if (work->flags & IRQ_WORK_LAZY) {
+	if (atomic_read(&work->flags) & IRQ_WORK_LAZY) {
 		if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
 		    tick_nohz_tick_stopped())
 			arch_irq_work_raise();
@@ -143,7 +143,7 @@ static void irq_work_run_list(struct llist_head *list)
 {
 	struct irq_work *work, *tmp;
 	struct llist_node *llnode;
-	unsigned long flags;
+	int flags;
 
 	BUG_ON(!irqs_disabled());
 
@@ -159,15 +159,15 @@ static void irq_work_run_list(struct llist_head *list)
 		 * to claim that work don't rely on us to handle their data
 		 * while we are in the middle of the func.
 		 */
-		flags = work->flags & ~IRQ_WORK_PENDING;
-		xchg(&work->flags, flags);
+		flags = atomic_read(&work->flags) & ~IRQ_WORK_PENDING;
+		atomic_xchg(&work->flags, flags);
 
 		work->func(work);
 		/*
 		 * Clear the BUSY bit and return to the free state if
 		 * no-one else claimed it meanwhile.
 		 */
-		(void)cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
+		(void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
 	}
 }
 
@@ -199,7 +199,7 @@ void irq_work_sync(struct irq_work *work)
 {
 	lockdep_assert_irqs_enabled();
 
-	while (work->flags & IRQ_WORK_BUSY)
+	while (atomic_read(&work->flags) & IRQ_WORK_BUSY)
 		cpu_relax();
 }
 EXPORT_SYMBOL_GPL(irq_work_sync);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index ca65327a6de8..865727373a3b 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2961,7 +2961,7 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work)
 
 static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
 	.func = wake_up_klogd_work_func,
-	.flags = IRQ_WORK_LAZY,
+	.flags = ATOMIC_INIT(IRQ_WORK_LAZY),
 };
 
 void wake_up_klogd(void)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 44bd08f2443b..ff467a4e2639 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -660,7 +660,7 @@ BPF_CALL_1(bpf_send_signal, u32, sig)
 			return -EINVAL;
 
 		work = this_cpu_ptr(&send_signal_work);
-		if (work->irq_work.flags & IRQ_WORK_BUSY)
+		if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY)
 			return -EBUSY;
 
 		/* Add the current task, which is the target of sending signal,
-- 
cgit v1.2.3