From d136122f58458479fd8926020ba2937de61d7f65 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 20 Jul 2020 17:20:21 +0200
Subject: sched: Fix race against ptrace_freeze_trace()

There is apparently one site that violates the rule that only current
and ttwu() will modify task->state, namely ptrace_{,un}freeze_traced()
will change task->state for a remote task.

Oleg explains:

  "TASK_TRACED/TASK_STOPPED was always protected by siglock. In
   particular, ttwu(__TASK_TRACED) must be always called with siglock
   held. That is why ptrace_freeze_traced() assumes it can safely do
   s/TASK_TRACED/__TASK_TRACED/ under spin_lock(siglock)."

This breaks the ordering scheme introduced by commit:

  dbfb089d360b ("sched: Fix loadavg accounting race")

Specifically, the reload not matching no longer implies we don't have
to block.

Simplify things by noting that what we need is a LOAD->STORE ordering
and this can be provided by a control dependency.

So replace:

  prev_state = prev->state;
  raw_spin_lock(&rq->lock);
  smp_mb__after_spinlock(); /* SMP-MB */
  if (... && prev_state && prev_state == prev->state)
          deactivate_task();

with:

  prev_state = prev->state;
  if (... && prev_state) /* CTRL-DEP */
          deactivate_task();

since that already implies the 'prev->state' load must be complete
before allowing the 'prev->on_rq = 0' store to become visible.

Fixes: dbfb089d360b ("sched: Fix loadavg accounting race")
Reported-by: Jiri Slaby
Signed-off-by: Peter Zijlstra (Intel)
Acked-by: Oleg Nesterov
Tested-by: Paul Gortmaker
Tested-by: Christian Brauner
---
 kernel/sched/core.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e15543cb8481..5dece9b34e25 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4119,9 +4119,6 @@ static void __sched notrace __schedule(bool preempt)
 	local_irq_disable();
 	rcu_note_context_switch(preempt);
 
-	/* See deactivate_task() below. */
-	prev_state = prev->state;
-
 	/*
 	 * Make sure that signal_pending_state()->signal_pending() below
 	 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
@@ -4145,11 +4142,16 @@ static void __sched notrace __schedule(bool preempt)
 	update_rq_clock(rq);
 
 	switch_count = &prev->nivcsw;
+
 	/*
-	 * We must re-load prev->state in case ttwu_remote() changed it
-	 * before we acquired rq->lock.
+	 * We must load prev->state once (task_struct::state is volatile), such
+	 * that:
+	 *
+	 *  - we form a control dependency vs deactivate_task() below.
+	 *  - ptrace_{,un}freeze_traced() can change ->state underneath us.
 	 */
-	if (!preempt && prev_state && prev_state == prev->state) {
+	prev_state = prev->state;
+	if (!preempt && prev_state) {
 		if (signal_pending_state(prev_state, prev)) {
 			prev->state = TASK_RUNNING;
 		} else {
@@ -4163,10 +4165,12 @@ static void __sched notrace __schedule(bool preempt)
 			/*
 			 * __schedule()                      ttwu()
-			 *   prev_state = prev->state;         if (READ_ONCE(p->on_rq) && ...)
-			 *   LOCK rq->lock                       goto out;
-			 *   smp_mb__after_spinlock();           smp_acquire__after_ctrl_dep();
-			 *   p->on_rq = 0;                       p->state = TASK_WAKING;
+			 *   prev_state = prev->state;         if (p->on_rq && ...)
+			 *   if (prev_state)                     goto out;
+			 *     p->on_rq = 0;                   smp_acquire__after_ctrl_dep();
+			 *                                     p->state = TASK_WAKING
+			 *
+			 * Where __schedule() and ttwu() have matching control dependencies.
 			 *
 			 * After this, schedule() must not care about p->state any more.
 			 */
-- cgit v1.2.3
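For readers who want to experiment with the before/after shapes described in the
changelog above, here is a minimal userspace sketch. It is not kernel code: the
names task_state, on_rq, deactivate(), schedule_old() and schedule_new() are
illustrative stand-ins, C11 atomics replace the kernel primitives, and the
control-dependency ordering relied on is the one the Linux kernel memory model
assumes, not something the C11 standard guarantees against compiler reordering.

	/* Illustrative only: userspace stand-ins for prev->state / prev->on_rq. */
	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	static _Atomic int task_state = 1;	/* stand-in for prev->state */
	static _Atomic int on_rq = 1;		/* stand-in for prev->on_rq  */

	static void deactivate(void)
	{
		/* The store ttwu() keys off; the question the patch answers is
		 * what orders the task_state load before this store. */
		atomic_store_explicit(&on_rq, 0, memory_order_relaxed);
	}

	/* Old scheme: re-load ->state and rely on the barrier after rq->lock. */
	static void schedule_old(bool preempt)
	{
		int prev_state = atomic_load_explicit(&task_state, memory_order_relaxed);

		atomic_thread_fence(memory_order_seq_cst); /* smp_mb__after_spinlock() */
		if (!preempt && prev_state &&
		    prev_state == atomic_load_explicit(&task_state, memory_order_relaxed))
			deactivate();
	}

	/* New scheme: load ->state once; the branch itself (a control dependency
	 * in the kernel memory model) orders the load before the store. */
	static void schedule_new(bool preempt)
	{
		int prev_state = atomic_load_explicit(&task_state, memory_order_relaxed);

		if (!preempt && prev_state)	/* CTRL-DEP */
			deactivate();
	}

	int main(void)
	{
		schedule_old(false);
		atomic_store_explicit(&on_rq, 1, memory_order_relaxed);
		schedule_new(false);
		printf("on_rq=%d\n", atomic_load_explicit(&on_rq, memory_order_relaxed));
		return 0;
	}
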
From 062d3f95b630113e1156a31f376ad36e25da29a7 Mon Sep 17 00:00:00 2001
From: Chris Wilson
Date: Thu, 23 Jul 2020 21:10:42 +0100
Subject: sched: Warn if garbage is passed to default_wake_function()

Since the default_wake_function() passes its flags onto try_to_wake_up(),
warn if those flags collide with internal values.

Given that the supplied flags are garbage, no repair can be done but at
least alert the user to the damage they are causing.

In the belief that these errors should be picked up during testing, the
warning is only compiled in under CONFIG_SCHED_DEBUG.

Signed-off-by: Chris Wilson
Signed-off-by: Ingo Molnar
Acked-by: Peter Zijlstra
Link: https://lore.kernel.org/r/20200723201042.18861-1-chris@chris-wilson.co.uk
---
 kernel/sched/core.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5dece9b34e25..2142c6767682 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4485,6 +4485,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
 int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
 			  void *key)
 {
+	WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC);
 	return try_to_wake_up(curr->private, mode, wake_flags);
 }
 EXPORT_SYMBOL(default_wake_function);
-- cgit v1.2.3

From fe5ed7ab99c656bd2f5b79b49df0e9ebf2cead8a Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Thu, 23 Jul 2020 17:44:20 +0200
Subject: uprobes: Change handle_swbp() to send SIGTRAP with si_code=SI_KERNEL, to fix GDB regression

If a tracee is uprobed and it hits an int3 inserted by a debugger,
handle_swbp() does send_sig(SIGTRAP, current, 0), which means
si_code == SI_USER. This used to work when this code was written, but
then GDB started to validate si_code and now it simply can't use
breakpoints if the tracee has an active uprobe:

  # cat test.c
  void unused_func(void)
  {
  }
  int main(void)
  {
          return 0;
  }

  # gcc -g test.c -o test
  # perf probe -x ./test -a unused_func
  # perf record -e probe_test:unused_func gdb ./test -ex run

  GNU gdb (GDB) 10.0.50.20200714-git
  ...
  Program received signal SIGTRAP, Trace/breakpoint trap.
  0x00007ffff7ddf909 in dl_main () from /lib64/ld-linux-x86-64.so.2
  (gdb)

The tracee hits the internal breakpoint inserted by GDB to monitor
shared library events, but GDB misinterprets this SIGTRAP and reports a
signal.

Change handle_swbp() to use force_sig(SIGTRAP); this matches
do_int3_user() and fixes the problem.

This is the minimal fix for -stable. arch/x86/kernel/uprobes.c is
equally wrong; it should use send_sigtrap(TRAP_TRACE) instead of
send_sig(SIGTRAP), but this doesn't confuse GDB and needs another
x86-specific patch.

Reported-by: Aaron Merey
Signed-off-by: Oleg Nesterov
Signed-off-by: Ingo Molnar
Reviewed-by: Srikar Dronamraju
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20200723154420.GA32043@redhat.com
---
 kernel/events/uprobes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index bb0862873dba..5f8b0c52fd2e 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -2199,7 +2199,7 @@ static void handle_swbp(struct pt_regs *regs)
 	if (!uprobe) {
 		if (is_swbp > 0) {
 			/* No matching uprobe; signal SIGTRAP. */
-			send_sig(SIGTRAP, current, 0);
+			force_sig(SIGTRAP);
 		} else {
 			/*
 			 * Either we raced with uprobe_unregister() or we can't
-- cgit v1.2.3
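Background on why GDB can tell the two deliveries apart: the si_code field of
the siginfo structure distinguishes a user-sent signal (SI_USER, which is what
send_sig() produced here) from a kernel-generated trap (SI_KERNEL, which is
what force_sig() produces). The small stand-alone program below is not part of
the patch; it just prints the si_code an incoming SIGTRAP carries, using plain
POSIX signal APIs.

	#include <signal.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	/* Print the origin of an incoming SIGTRAP.  A debugger makes the same
	 * distinction: SI_KERNEL (or a TRAP_* code) means "my breakpoint
	 * fired", while SI_USER means "someone sent the process a signal". */
	static void handler(int sig, siginfo_t *info, void *uctx)
	{
		(void)sig; (void)uctx;
		printf("SIGTRAP si_code=%d (%s)\n", info->si_code,
		       info->si_code == SI_USER   ? "SI_USER" :
		       info->si_code == SI_KERNEL ? "SI_KERNEL" : "other");
	}

	int main(void)
	{
		struct sigaction sa;

		memset(&sa, 0, sizeof(sa));
		sa.sa_sigaction = handler;
		sa.sa_flags = SA_SIGINFO;
		sigaction(SIGTRAP, &sa, NULL);

		/* Delivered with si_code == SI_USER, the value GDB rejects. */
		kill(getpid(), SIGTRAP);
		return 0;
	}
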
From 8ac68dc455d9d18241d44b96800d73229029ed34 Mon Sep 17 00:00:00 2001
From: Paul Moore
Date: Tue, 28 Jul 2020 15:33:21 -0400
Subject: revert: 1320a4052ea1 ("audit: trigger accompanying records when no rules present")

Unfortunately the commit listed in the subject line above failed to
ensure that the task's audit_context was properly initialized/set
before enabling the "accompanying records". Depending on the situation,
the resulting audit_context could have invalid values in some of its
fields, which could cause a kernel panic/oops when the task/syscall
exits and the audit records are generated.

We will revisit the original patch, with the necessary fixes, in a
future kernel, but right now we just want to fix the kernel panic with
the least amount of added risk.

Cc: stable@vger.kernel.org
Fixes: 1320a4052ea1 ("audit: trigger accompanying records when no rules present")
Reported-by: j2468h@googlemail.com
Signed-off-by: Paul Moore
---
 kernel/audit.c   | 1 -
 kernel/audit.h   | 8 --------
 kernel/auditsc.c | 3 +++
 3 files changed, 3 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index e33460e01b3b..9bf2b08b051f 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1848,7 +1848,6 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
 	}
 
 	audit_get_stamp(ab->ctx, &t, &serial);
-	audit_clear_dummy(ab->ctx);
 	audit_log_format(ab, "audit(%llu.%03lu:%u): ",
 			 (unsigned long long)t.tv_sec, t.tv_nsec/1000000, serial);
 
diff --git a/kernel/audit.h b/kernel/audit.h
index f0233dc40b17..ddc22878433d 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -290,13 +290,6 @@ extern int audit_signal_info_syscall(struct task_struct *t);
 extern void audit_filter_inodes(struct task_struct *tsk,
 				struct audit_context *ctx);
 extern struct list_head *audit_killed_trees(void);
-
-static inline void audit_clear_dummy(struct audit_context *ctx)
-{
-	if (ctx)
-		ctx->dummy = 0;
-}
-
 #else /* CONFIG_AUDITSYSCALL */
 #define auditsc_get_stamp(c, t, s) 0
 #define audit_put_watch(w) {}
@@ -330,7 +323,6 @@ static inline int audit_signal_info_syscall(struct task_struct *t)
 }
 
 #define audit_filter_inodes(t, c) AUDIT_DISABLED
-#define audit_clear_dummy(c) {}
 #endif /* CONFIG_AUDITSYSCALL */
 
 extern char *audit_unpack_string(void **bufp, size_t *remain, size_t len);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 468a23390457..fd840c40abf7 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1417,6 +1417,9 @@ static void audit_log_proctitle(void)
 	struct audit_context *context = audit_context();
 	struct audit_buffer *ab;
 
+	if (!context || context->dummy)
+		return;
+
 	ab = audit_log_start(context, GFP_KERNEL, AUDIT_PROCTITLE);
 	if (!ab)
 		return;	/* audit_panic or being filtered */
-- cgit v1.2.3
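The guard restored in audit_log_proctitle() by this revert follows a common
defensive pattern: a record tied to a syscall context must not be emitted
unless that context both exists and is marked as live. A simplified
stand-alone sketch of the same pattern follows; the struct and field names
mirror the kernel's, but the code is illustrative only and not the kernel
implementation.

	#include <stdbool.h>
	#include <stdio.h>

	/* Cut-down stand-in for the kernel's struct audit_context. */
	struct audit_context {
		bool dummy;		/* no rules matched: fields below were never filled in */
		unsigned long serial;	/* only valid when the context is live */
	};

	/* Mirror of the restored guard: bail out early rather than format a
	 * record from a context that was never initialized for this syscall. */
	static void log_proctitle(struct audit_context *ctx)
	{
		if (!ctx || ctx->dummy)
			return;

		printf("type=PROCTITLE serial=%lu\n", ctx->serial);
	}

	int main(void)
	{
		struct audit_context live  = { .dummy = false, .serial = 42 };
		struct audit_context dummy = { .dummy = true };

		log_proctitle(&live);	/* emits a record   */
		log_proctitle(&dummy);	/* silently skipped */
		log_proctitle(NULL);	/* silently skipped */
		return 0;
	}
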
From f227e3ec3b5cad859ad15666874405e8c1bbc1d4 Mon Sep 17 00:00:00 2001
From: Willy Tarreau
Date: Fri, 10 Jul 2020 15:23:19 +0200
Subject: random32: update the net random state on interrupt and activity

This modifies the first 32 bits out of the 128 bits of a random CPU's
net_rand_state on interrupt or CPU activity to complicate remote
observations that could lead to guessing the network RNG's internal
state.

Note that depending on some network devices' interrupt rate moderation
or binding, this re-seeding might happen on every packet or even almost
never.

In addition, with NOHZ some CPUs might not even get timer interrupts,
leaving their local state rarely updated, while they are running
networked processes making use of the random state. For this reason, we
also perform this update in update_process_times() in order to at least
update the state when there is user or system activity, since it's the
only case we care about.

Reported-by: Amit Klein
Suggested-by: Linus Torvalds
Cc: Eric Dumazet
Cc: "Jason A. Donenfeld"
Cc: Andy Lutomirski
Cc: Kees Cook
Cc: Thomas Gleixner
Cc: Peter Zijlstra
Cc:
Signed-off-by: Willy Tarreau
Signed-off-by: Linus Torvalds
---
 drivers/char/random.c  | 1 +
 include/linux/random.h | 3 +++
 kernel/time/timer.c    | 8 ++++++++
 lib/random32.c         | 2 +-
 4 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 2a41b21623ae..d20ba1b104ca 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1277,6 +1277,7 @@ void add_interrupt_randomness(int irq, int irq_flags)
 
 	fast_mix(fast_pool);
 	add_interrupt_bench(cycles);
+	this_cpu_add(net_rand_state.s1, fast_pool->pool[cycles & 3]);
 
 	if (unlikely(crng_init == 0)) {
 		if ((fast_pool->count >= 64) &&
diff --git a/include/linux/random.h b/include/linux/random.h
index 45e1f8fa742b..39aaa1f78f9d 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -11,6 +11,7 @@
 #include <linux/kernel.h>
 #include <linux/list.h>
 #include <linux/once.h>
+#include <linux/percpu.h>
 
 #include <uapi/linux/random.h>
 
@@ -119,6 +120,8 @@ struct rnd_state {
 	__u32 s1, s2, s3, s4;
 };
 
+DECLARE_PER_CPU(struct rnd_state, net_rand_state) __latent_entropy;
+
 u32 prandom_u32_state(struct rnd_state *state);
 void prandom_bytes_state(struct rnd_state *state, void *buf, size_t nbytes);
 void prandom_seed_full_state(struct rnd_state __percpu *pcpu_state);
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index df1ff803acc4..026ac01af9da 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -43,6 +43,7 @@
 #include <linux/sched/debug.h>
 #include <linux/slab.h>
 #include <linux/compat.h>
+#include <linux/random.h>
 
 #include <linux/uaccess.h>
 #include <asm/unistd.h>
@@ -1742,6 +1743,13 @@ void update_process_times(int user_tick)
 	scheduler_tick();
 	if (IS_ENABLED(CONFIG_POSIX_TIMERS))
 		run_posix_cpu_timers();
+
+	/* The current CPU might make use of net randoms without receiving IRQs
+	 * to renew them often enough. Let's update the net_rand_state from a
+	 * non-constant value that's not affine to the number of calls to make
+	 * sure it's updated when there's some activity (we don't care in idle).
+	 */
+	this_cpu_add(net_rand_state.s1, rol32(jiffies, 24) + user_tick);
 }
 
 /**
diff --git a/lib/random32.c b/lib/random32.c
index 763b920a6206..c4d317be2997 100644
--- a/lib/random32.c
+++ b/lib/random32.c
@@ -48,7 +48,7 @@ static inline void prandom_state_selftest(void)
 }
 #endif
 
-static DEFINE_PER_CPU(struct rnd_state, net_rand_state) __latent_entropy;
+DEFINE_PER_CPU(struct rnd_state, net_rand_state) __latent_entropy;
 
 /**
  *	prandom_u32_state - seeded pseudo-random number generator.
-- cgit v1.2.3
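To see what the perturbation amounts to, here is a small userspace model. The
generator below is a deliberately simplified stand-in, not the kernel's taus113
prandom implementation, and the jiffies/user_tick inputs are faked; the point
is only the shape of the change: noise is folded into the s1 word of the
per-CPU state from interrupt and timer-tick context, so an outside observer can
no longer extrapolate the sequence from a few leaked outputs.

	#include <stdint.h>
	#include <stdio.h>

	/* Same layout as the kernel's struct rnd_state; the step function below
	 * is a toy stand-in for prandom_u32_state(), not the real taus113. */
	struct rnd_state {
		uint32_t s1, s2, s3, s4;
	};

	static uint32_t rol32(uint32_t v, int n)
	{
		return (v << n) | (v >> (32 - n));
	}

	/* Toy generator step: mixes all four words, returns 32 bits of output. */
	static uint32_t toy_prandom_u32(struct rnd_state *st)
	{
		st->s1 = st->s1 * 2654435761u + 1;
		st->s2 ^= rol32(st->s1, 7);
		st->s3 += st->s2 ^ 0x9e3779b9u;
		st->s4 = rol32(st->s4 ^ st->s3, 13);
		return st->s1 ^ st->s2 ^ st->s3 ^ st->s4;
	}

	/* Shape of the patch: fold a hard-to-observe value into s1 only.  In the
	 * kernel this is this_cpu_add(net_rand_state.s1, ...) called from
	 * add_interrupt_randomness() and update_process_times(). */
	static void perturb(struct rnd_state *st, uint32_t noise)
	{
		st->s1 += noise;
	}

	int main(void)
	{
		struct rnd_state st = { 1, 2, 3, 4 };
		uint32_t fake_jiffies = 100000;

		for (int i = 0; i < 4; i++) {
			printf("%08x\n", toy_prandom_u32(&st));
			perturb(&st, rol32(fake_jiffies++, 24) + (i & 1));
		}
		return 0;
	}
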