From 153fbd1226fb30b8630802aa5047b8af5ef53c9f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Tue, 31 Oct 2017 11:18:53 +0100
Subject: futex: Fix more put_pi_state() vs. exit_pi_state_list() races

Dmitry (through syzbot) reported being able to trigger the WARN in
get_pi_state() and a use-after-free on:

	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

Both are due to this race:

  exit_pi_state_list()				put_pi_state()
    lock(&curr->pi_lock)
    while() {
	pi_state = list_first_entry(head);
	hb = hash_futex(&pi_state->key);
	unlock(&curr->pi_lock);

						dec_and_test(&pi_state->refcount);

	lock(&hb->lock)
	lock(&pi_state->pi_mutex.wait_lock)	// uaf if pi_state free'd
	lock(&curr->pi_lock);

	....

	unlock(&curr->pi_lock);
	get_pi_state();				// WARN; refcount==0

The problem is that we take the reference count too late and don't allow
it to be 0. Fix it by using inc_not_zero() and simply retrying the loop
when we fail to get a refcount. In that case put_pi_state() should
remove the entry from the list.

Reported-by: Dmitry Vyukov
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Thomas Gleixner
Cc: Gratian Crisan
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: dvhart@infradead.org
Cc: syzbot
Cc: syzkaller-bugs@googlegroups.com
Cc:
Fixes: c74aef2d06a9 ("futex: Fix pi_state->owner serialization")
Link: http://lkml.kernel.org/r/20171031101853.xpfh72y643kdfhjs@hirez.programming.kicks-ass.net
Signed-off-by: Ingo Molnar
---
 kernel/futex.c | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index 0518a0bfc746..ca5bb9cba5cf 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -903,11 +903,27 @@ void exit_pi_state_list(struct task_struct *curr)
 	 */
 	raw_spin_lock_irq(&curr->pi_lock);
 	while (!list_empty(head)) {
-
 		next = head->next;
 		pi_state = list_entry(next, struct futex_pi_state, list);
 		key = pi_state->key;
 		hb = hash_futex(&key);
+
+		/*
+		 * We can race against put_pi_state() removing itself from the
+		 * list (a waiter going away). put_pi_state() will first
+		 * decrement the reference count and then modify the list, so
+		 * it's possible to see the list entry but fail this reference
+		 * acquire.
+		 *
+		 * In that case; drop the locks to let put_pi_state() make
+		 * progress and retry the loop.
+		 */
+		if (!atomic_inc_not_zero(&pi_state->refcount)) {
+			raw_spin_unlock_irq(&curr->pi_lock);
+			cpu_relax();
+			raw_spin_lock_irq(&curr->pi_lock);
+			continue;
+		}
 		raw_spin_unlock_irq(&curr->pi_lock);

 		spin_lock(&hb->lock);
@@ -918,8 +934,10 @@ void exit_pi_state_list(struct task_struct *curr)
 		 * task still owns the PI-state:
 		 */
 		if (head->next != next) {
+			/* retain curr->pi_lock for the loop invariant */
 			raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
 			spin_unlock(&hb->lock);
+			put_pi_state(pi_state);
 			continue;
 		}

@@ -927,9 +945,8 @@ void exit_pi_state_list(struct task_struct *curr)
 		WARN_ON(list_empty(&pi_state->list));
 		list_del_init(&pi_state->list);
 		pi_state->owner = NULL;
-		raw_spin_unlock(&curr->pi_lock);

-		get_pi_state(pi_state);
+		raw_spin_unlock(&curr->pi_lock);
 		raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
 		spin_unlock(&hb->lock);
-- cgit v1.2.3
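Aside: the pattern the fix relies on is the lookup-side refcount rule,
where a lookup may take a reference only while the count is still
non-zero; once it hits zero the object belongs to the free path and the
lookup must back off and retry. A minimal user-space C11 sketch of such
an inc-not-zero primitive (all names are illustrative, not the kernel
implementation):

  #include <stdatomic.h>
  #include <stdbool.h>
  #include <stdio.h>

  struct obj {
          atomic_int refcount;
  };

  /* Take a reference only if the object is still live (count > 0). */
  static bool get_obj_not_zero(struct obj *o)
  {
          int old = atomic_load(&o->refcount);

          while (old != 0) {
                  /* On failure the CAS reloads 'old' and we re-check it. */
                  if (atomic_compare_exchange_weak(&o->refcount, &old, old + 1))
                          return true;
          }
          return false;   /* free path already won; caller must back off */
  }

  int main(void)
  {
          struct obj o = { .refcount = 1 };

          printf("get on live object: %d\n", get_obj_not_zero(&o));  /* 1 */
          atomic_fetch_sub(&o.refcount, 2);  /* drop the count to zero */
          printf("get on dead object: %d\n", get_obj_not_zero(&o));  /* 0 */
          return 0;
  }

This is the same idea as the atomic_inc_not_zero() call in the hunk
above: on failure, exit_pi_state_list() drops its lock, gives
put_pi_state() room to finish removing the entry, and retries the loop.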
From 9c388a5ed1960b2ebbebd3dbe7553092b0c15ec1 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 31 Oct 2017 22:32:00 +0100
Subject: watchdog/hardlockup/perf: Revert a33d44843d45 ("watchdog/hardlockup/perf: Simplify deferred event destroy")

Guenter reported a crash in the watchdog/perf code, which is caused by
cleanup() and enable() running concurrently.

The reason for this is that the watchdog functions are serialized via
the watchdog_mutex and CPU hotplug locking, but the enable of the perf
based watchdog happens in the context of the unpark callback of the
smpboot thread. That unpark function is not synchronous inside the
locking: unparking merely wakes the thread up and returns, so there is
no guarantee about when the thread actually executes.

If it starts running _before_ the cleanup has happened, it will create
an event and overwrite the dead event pointer. The new event is then
cleaned up because the event is marked dead.

  lock(watchdog_mutex);
  lockup_detector_reconfigure();
    cpus_read_lock();
    stop();
      park()
    update();
    start();
      unpark()
    cpus_read_unlock();		thread runs()
				  overwrite dead event ptr
  cleanup();
    free new event, which is active inside perf....
  unlock(watchdog_mutex);

The park side is safe, as it actually waits for the thread to reach
parked state.

Commit a33d44843d45 removed the protection against this kind of
scenario under the stupid assumption that the hotplug serialization and
the watchdog_mutex cover everything. Bring it back.

Reverts: a33d44843d45 ("watchdog/hardlockup/perf: Simplify deferred event destroy")
Reported-and-tested-by: Guenter Roeck
Signed-off-by: Thomas Feels-stupid Gleixner
Cc: Peter Zijlstra
Cc: Don Zickus
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1710312145190.1942@nanos
---
 kernel/watchdog_hld.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 71a62ceacdc8..a7f137c1933a 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -21,6 +21,7 @@
 static DEFINE_PER_CPU(bool, hard_watchdog_warn);
 static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
 static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
+static DEFINE_PER_CPU(struct perf_event *, dead_event);
 static struct cpumask dead_events_mask;

 static unsigned long hardlockup_allcpu_dumped;
@@ -203,6 +204,8 @@ void hardlockup_detector_perf_disable(void)

 	if (event) {
 		perf_event_disable(event);
+		this_cpu_write(watchdog_ev, NULL);
+		this_cpu_write(dead_event, event);
 		cpumask_set_cpu(smp_processor_id(), &dead_events_mask);
 		watchdog_cpus--;
 	}
 }
@@ -218,7 +221,7 @@ void hardlockup_detector_perf_cleanup(void)
 	int cpu;

 	for_each_cpu(cpu, &dead_events_mask) {
-		struct perf_event *event = per_cpu(watchdog_ev, cpu);
+		struct perf_event *event = per_cpu(dead_event, cpu);

 		/*
 		 * Required because for_each_cpu() reports unconditionally
@@ -226,7 +229,7 @@ void hardlockup_detector_perf_cleanup(void)
 		 */
 		if (event)
 			perf_event_release_kernel(event);
-		per_cpu(watchdog_ev, cpu) = NULL;
+		per_cpu(dead_event, cpu) = NULL;
 	}
 	cpumask_clear(&dead_events_mask);
 }
-- cgit v1.2.3
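Aside: the protection being brought back is a two-slot hand-off. The
disable path never frees the event itself; it parks the pointer in a
dedicated dead slot and clears the live slot, so a late-running
unparked thread can install a new event without cleanup() freeing it.
A condensed single-threaded C sketch of that hand-off (plain globals
stand in for the kernel's per-CPU variables; all names illustrative):

  #include <stdio.h>
  #include <stdlib.h>

  struct event { int id; };

  static struct event *live_event;  /* models per-CPU watchdog_ev  */
  static struct event *dead_event;  /* models per-CPU dead_event   */

  static void detector_disable(void)
  {
          struct event *ev = live_event;

          if (ev) {
                  live_event = NULL;  /* a re-enable may install a new event */
                  dead_event = ev;    /* defer the actual free to cleanup()  */
          }
  }

  static void detector_cleanup(void)
  {
          /* runs later, in a context where freeing is safe */
          free(dead_event);
          dead_event = NULL;
  }

  int main(void)
  {
          live_event = calloc(1, sizeof(*live_event));
          detector_disable();
          /* the unparked thread may re-create the live event right here; */
          /* cleanup() only touches the dead slot, so the new one is safe */
          live_event = calloc(1, sizeof(*live_event));
          detector_cleanup();
          printf("live=%p dead=%p\n", (void *)live_event, (void *)dead_event);
          free(live_event);
          return 0;
  }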
From 42f930da7f00c0ab23df4c7aed36137f35988980 Mon Sep 17 00:00:00 2001
From: Don Zickus
Date: Wed, 1 Nov 2017 14:11:27 -0400
Subject: watchdog/hardlockup/perf: Use atomics to track in-use cpu counter

Guenter reported:

  There is still a problem. When running

    echo 6 > /proc/sys/kernel/watchdog_thresh
    echo 5 > /proc/sys/kernel/watchdog_thresh

  repeatedly, the message

    NMI watchdog: Enabled. Permanently consumes one hw-PMU counter.

  stops after a while (after ~10-30 iterations, with fluctuations).
  Maybe watchdog_cpus needs to be atomic?

That's correct, as this again is affected by the asynchronous nature of
the smpboot thread unpark mechanism.

  CPU 0				CPU1			CPU2
  write(watchdog_thresh, 6)
    stop()
      park()
    update()
    start()
      unpark()
				thread->unpark()
				  cnt++;

  write(watchdog_thresh, 5)			thread->unpark()
    stop()
      park()
        thread->park()
          cnt--;				  cnt++;
    update()
    start()
      unpark()

That's not a functional problem, it just affects the informational
message.

Convert watchdog_cpus to atomic_t to prevent the problem.

Reported-and-tested-by: Guenter Roeck
Signed-off-by: Don Zickus
Signed-off-by: Thomas Gleixner
Cc: Peter Zijlstra
Link: https://lkml.kernel.org/r/20171101181126.j727fqjmdthjz4xk@redhat.com
---
 kernel/watchdog_hld.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index a7f137c1933a..a84b205fac9a 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -12,6 +12,7 @@
 #define pr_fmt(fmt) "NMI watchdog: " fmt

 #include <linux/nmi.h>
+#include <linux/atomic.h>
 #include <linux/module.h>
 #include <linux/sched/debug.h>
@@ -25,7 +26,7 @@
 static DEFINE_PER_CPU(struct perf_event *, dead_event);
 static struct cpumask dead_events_mask;

 static unsigned long hardlockup_allcpu_dumped;
-static unsigned int watchdog_cpus;
+static atomic_t watchdog_cpus = ATOMIC_INIT(0);

 void arch_touch_nmi_watchdog(void)
 {
@@ -189,7 +190,8 @@ void hardlockup_detector_perf_enable(void)
 	if (hardlockup_detector_event_create())
 		return;

-	if (!watchdog_cpus++)
+	/* use original value for check */
+	if (!atomic_fetch_inc(&watchdog_cpus))
 		pr_info("Enabled. Permanently consumes one hw-PMU counter.\n");

 	perf_event_enable(this_cpu_read(watchdog_ev));
@@ -207,7 +209,7 @@ void hardlockup_detector_perf_disable(void)
 		this_cpu_write(watchdog_ev, NULL);
 		this_cpu_write(dead_event, event);
 		cpumask_set_cpu(smp_processor_id(), &dead_events_mask);
-		watchdog_cpus--;
+		atomic_dec(&watchdog_cpus);
 	}
 }
-- cgit v1.2.3
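Aside: the conversion works because atomic_fetch_inc() returns the value
from *before* the increment, so the 0 -> 1 transition is detected by the
same atomic operation that performs it; a plain 'cnt++' read-modify-write
lets two unparking threads both observe the same old count. A user-space
C11 sketch of the idiom (hypothetical names; the kernel code uses
atomic_fetch_inc()/atomic_dec() on an atomic_t):

  #include <stdatomic.h>
  #include <stdio.h>

  static atomic_int active_cpus;  /* models watchdog_cpus */

  static void cpu_enable(void)
  {
          /* an old value of 0 means this caller did the 0 -> 1 transition */
          if (atomic_fetch_add(&active_cpus, 1) == 0)
                  printf("Enabled. Permanently consumes one hw-PMU counter.\n");
  }

  static void cpu_disable(void)
  {
          atomic_fetch_sub(&active_cpus, 1);
  }

  int main(void)
  {
          cpu_enable();   /* prints: first CPU up          */
          cpu_enable();   /* silent: count was already 1   */
          cpu_disable();
          cpu_disable();
          cpu_enable();   /* prints again: went through 0  */
          return 0;
  }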
From da0db32bbe816085d7f7f9cebfc825e55eff811f Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf
Date: Fri, 3 Nov 2017 09:31:43 -0500
Subject: objtool: Resync objtool's instruction decoder source code copy with the kernel's latest version

This fixes the following warning:

  warning: objtool: x86 instruction decoder differs from kernel

Reported-by: Stephen Rothwell
Signed-off-by: Josh Poimboeuf
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Link: http://lkml.kernel.org/r/013315a808ccf5580abc293808827c8e2b5e1354.1509719152.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar
---
 tools/objtool/arch/x86/insn/gen-insn-attr-x86.awk | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/objtool/arch/x86/insn/gen-insn-attr-x86.awk b/tools/objtool/arch/x86/insn/gen-insn-attr-x86.awk
index a3d2c62fd805..b02a36b2c14f 100644
--- a/tools/objtool/arch/x86/insn/gen-insn-attr-x86.awk
+++ b/tools/objtool/arch/x86/insn/gen-insn-attr-x86.awk
@@ -1,4 +1,5 @@
 #!/bin/awk -f
+# SPDX-License-Identifier: GPL-2.0
 # gen-insn-attr-x86.awk: Instruction attribute table generator
 # Written by Masami Hiramatsu
 #
-- cgit v1.2.3
From ec1e1b6109171d1890a437481c35b2b56d2327b8 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf
Date: Fri, 3 Nov 2017 17:19:41 -0500
Subject: objtool: Prevent GCC from merging annotate_unreachable(), take 2

This fixes the following warning with GCC 4.6:

  mm/migrate.o: warning: objtool: migrate_misplaced_transhuge_page()+0x71: unreachable instruction

The problem is that the compiler merged identical annotate_unreachable()
inline asm blocks, resulting in a missing 'unreachable' annotation.

This problem happened before, and was partially fixed with:

  3d1e236022cc ("objtool: Prevent GCC from merging annotate_unreachable()")

That commit tried to ensure that each instance of the
annotate_unreachable() inline asm statement has a unique label. It used
the __LINE__ macro to generate the label number. However, even the line
number isn't necessarily unique when used in an inline function with
multiple callers (in this case, __alloc_pages_node()'s use of
VM_BUG_ON).

Reported-by: kbuild test robot
Signed-off-by: Josh Poimboeuf
Cc: Andy Lutomirski
Cc: Borislav Petkov
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: kbuild-all@01.org
Cc: tipbuild@zytor.com
Fixes: 3d1e236022cc ("objtool: Prevent GCC from merging annotate_unreachable()")
Link: http://lkml.kernel.org/r/20171103221941.cajpwszir7ujxyc4@treble
Signed-off-by: Ingo Molnar
---
 include/linux/compiler.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index fd8697aa4f73..202710420d6d 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -191,13 +191,13 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
 	asm("%c0:\n\t"						\
 	    ".pushsection .discard.reachable\n\t"		\
 	    ".long %c0b - .\n\t"				\
-	    ".popsection\n\t" : : "i" (__LINE__));		\
+	    ".popsection\n\t" : : "i" (__COUNTER__));		\
 })
 #define annotate_unreachable() ({				\
 	asm("%c0:\n\t"						\
 	    ".pushsection .discard.unreachable\n\t"		\
 	    ".long %c0b - .\n\t"				\
-	    ".popsection\n\t" : : "i" (__LINE__));		\
+	    ".popsection\n\t" : : "i" (__COUNTER__));		\
 })
 #define ASM_UNREACHABLE						\
 	"999:\n\t"						\
-- cgit v1.2.3
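Aside: __LINE__ repeats whenever a macro expands twice from the same
source line, which is exactly what inlining one function into several
callers produces, while __COUNTER__ (a GCC/Clang preprocessor extension)
yields a fresh integer on every expansion. A small stand-alone
demonstration (the SHOW macro is hypothetical, not kernel code):

  #include <stdio.h>

  /* Each expansion stamps its line number and its counter value. */
  #define SHOW(tag) printf(#tag ": line=%d counter=%d\n", __LINE__, __COUNTER__)

  int main(void)
  {
          SHOW(a); SHOW(b);  /* same __LINE__, distinct __COUNTER__ values */
          return 0;
  }

With __LINE__ the two annotate_unreachable() asm statements could be
byte-for-byte identical and thus mergeable; with __COUNTER__ their "i"
operands always differ, so GCC has to keep each instance.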