From cee4393989333795ae04dc9f3b83a578afe3fca6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 2 Mar 2018 16:35:27 -0800 Subject: rcu: Rename cond_resched_rcu_qs() to cond_resched_tasks_rcu_qs() Commit e31d28b6ab8f ("trace: Eliminate cond_resched_rcu_qs() in favor of cond_resched()") substituted cond_resched() for the earlier call to cond_resched_rcu_qs(). However, the new-age cond_resched() does not do anything to help RCU-tasks grace periods because (1) RCU-tasks is only enabled when CONFIG_PREEMPT=y and (2) cond_resched() is a complete no-op when preemption is enabled. This situation results in hangs when running the trace benchmarks. A number of potential fixes were discussed on LKML (https://lkml.kernel.org/r/20180224151240.0d63a059@vmware.local.home), including making cond_resched() not be a no-op; making cond_resched() not be a no-op, but only when running tracing benchmarks; reverting the aforementioned commit (which works because cond_resched_rcu_qs() does provide an RCU-tasks quiescent state; and adding a call to the scheduler/RCU rcu_note_voluntary_context_switch() function. All were deemed unsatisfactory, either due to added cond_resched() overhead or due to magic functions inviting cargo culting. This commit renames cond_resched_rcu_qs() to cond_resched_tasks_rcu_qs(), which provides a clear hint as to what this function is doing and why and where it should be used, and then replaces the call to cond_resched() with cond_resched_tasks_rcu_qs() in the trace benchmark's benchmark_event_kthread() function. Reported-by: Steven Rostedt Signed-off-by: Paul E. McKenney Tested-by: Nicholas Piggin --- include/linux/rcupdate.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 36360d07f25b..19d235fefdb9 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -188,13 +188,13 @@ static inline void exit_tasks_rcu_finish(void) { } #endif /* #else #ifdef CONFIG_TASKS_RCU */ /** - * cond_resched_rcu_qs - Report potential quiescent states to RCU + * cond_resched_tasks_rcu_qs - Report potential quiescent states to RCU * * This macro resembles cond_resched(), except that it is defined to * report potential quiescent states to RCU-tasks even if the cond_resched() * machinery were to be shut off, as some advocate for PREEMPT kernels. */ -#define cond_resched_rcu_qs() \ +#define cond_resched_tasks_rcu_qs() \ do { \ if (!cond_resched()) \ rcu_note_voluntary_context_switch_lite(current); \ -- cgit v1.2.3 From c3442697c2d73d3cdb9d4135cf630ad36ba8552f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 5 Mar 2018 11:29:40 -0800 Subject: softirq: Eliminate unused cond_resched_softirq() macro The cond_resched_softirq() macro is not used anywhere in mainline, so this commit simplifies the kernel by eliminating it. Suggested-by: Eric Dumazet Signed-off-by: Paul E. McKenney Cc: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Reviewed-by: Eric Dumazet Tested-by: Nicholas Piggin --- include/linux/sched.h | 8 -------- kernel/sched/core.c | 14 -------------- kernel/softirq.c | 3 +-- 3 files changed, 1 insertion(+), 24 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index b3d697f3b573..6fc99045658a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1613,7 +1613,6 @@ static inline int test_tsk_need_resched(struct task_struct *tsk) * explicit rescheduling in places that are safe. The return * value indicates whether a reschedule was done in fact. * cond_resched_lock() will drop the spinlock before scheduling, - * cond_resched_softirq() will enable bhs before scheduling. */ #ifndef CONFIG_PREEMPT extern int _cond_resched(void); @@ -1633,13 +1632,6 @@ extern int __cond_resched_lock(spinlock_t *lock); __cond_resched_lock(lock); \ }) -extern int __cond_resched_softirq(void); - -#define cond_resched_softirq() ({ \ - ___might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \ - __cond_resched_softirq(); \ -}) - static inline void cond_resched_rcu(void) { #if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5e10aaeebfcc..6a09e6af64b9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5012,20 +5012,6 @@ int __cond_resched_lock(spinlock_t *lock) } EXPORT_SYMBOL(__cond_resched_lock); -int __sched __cond_resched_softirq(void) -{ - BUG_ON(!in_softirq()); - - if (should_resched(SOFTIRQ_DISABLE_OFFSET)) { - local_bh_enable(); - preempt_schedule_common(); - local_bh_disable(); - return 1; - } - return 0; -} -EXPORT_SYMBOL(__cond_resched_softirq); - /** * yield - yield the current processor to other threads. * diff --git a/kernel/softirq.c b/kernel/softirq.c index 177de3640c78..03981f1c39ea 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -145,8 +145,7 @@ static void __local_bh_enable(unsigned int cnt) } /* - * Special-case - softirqs can safely be enabled in - * cond_resched_softirq(), or by __do_softirq(), + * Special-case - softirqs can safely be enabled by __do_softirq(), * without processing still-pending softirqs: */ void _local_bh_enable(void) -- cgit v1.2.3 From 17672480fb1e953f999623b598a98130f8aacfbc Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sun, 25 Mar 2018 20:50:03 +0300 Subject: rcu: Declare rcu_eqs_special_set() in public header Because rcu_eqs_special_set() is declared only in internal header kernel/rcu/tree.h and stubbed in include/linux/rcutiny.h, it is inaccessible outside of the RCU implementation. This patch therefore moves the rcu_eqs_special_set() declaration to include/linux/rcutree.h, which allows it to be used in non-rcu kernel code. Signed-off-by: Yury Norov Signed-off-by: Paul E. McKenney Tested-by: Nicholas Piggin --- include/linux/rcutree.h | 1 + kernel/rcu/tree.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index fd996cdf1833..448f20f27396 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -74,6 +74,7 @@ static inline void synchronize_rcu_bh_expedited(void) void rcu_barrier(void); void rcu_barrier_bh(void); void rcu_barrier_sched(void); +bool rcu_eqs_special_set(int cpu); unsigned long get_state_synchronize_rcu(void); void cond_synchronize_rcu(unsigned long oldstate); unsigned long get_state_synchronize_sched(void); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 5fd374c71404..0b3a90ebe225 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -404,7 +404,6 @@ extern struct rcu_state rcu_preempt_state; #endif /* #ifdef CONFIG_PREEMPT_RCU */ int rcu_dynticks_snap(struct rcu_dynticks *rdtp); -bool rcu_eqs_special_set(int cpu); #ifdef CONFIG_RCU_BOOST DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); -- cgit v1.2.3 From f7194ac32ca241d28765a98e42a7fe13debc85a7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 5 Apr 2018 17:19:17 -0700 Subject: srcu: Add cleanup_srcu_struct_quiesced() The current cleanup_srcu_struct() flushes work, which prevents it from being invoked from some workqueue contexts, as well as from atomic (non-blocking) contexts. This patch therefore introduced a cleanup_srcu_struct_quiesced(), which can be invoked only after all activity on the specified srcu_struct has completed. This restriction allows cleanup_srcu_struct_quiesced() to be invoked from workqueue contexts as well as from atomic contexts. Suggested-by: Christoph Hellwig Signed-off-by: Paul E. McKenney Tested-by: Nitzan Carmi Tested-by: Nicholas Piggin --- include/linux/srcu.h | 36 +++++++++++++++++++++++++++++++++++- kernel/rcu/rcutorture.c | 7 ++++++- kernel/rcu/srcutiny.c | 9 ++++++--- kernel/rcu/srcutree.c | 30 +++++++++++++++++------------- 4 files changed, 64 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 33c1c698df09..91494d7e8e41 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -69,11 +69,45 @@ struct srcu_struct { }; void call_srcu(struct srcu_struct *sp, struct rcu_head *head, void (*func)(struct rcu_head *head)); -void cleanup_srcu_struct(struct srcu_struct *sp); +void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced); int __srcu_read_lock(struct srcu_struct *sp) __acquires(sp); void __srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp); void synchronize_srcu(struct srcu_struct *sp); +/** + * cleanup_srcu_struct - deconstruct a sleep-RCU structure + * @sp: structure to clean up. + * + * Must invoke this after you are finished using a given srcu_struct that + * was initialized via init_srcu_struct(), else you leak memory. + */ +static inline void cleanup_srcu_struct(struct srcu_struct *sp) +{ + _cleanup_srcu_struct(sp, false); +} + +/** + * cleanup_srcu_struct_quiesced - deconstruct a quiesced sleep-RCU structure + * @sp: structure to clean up. + * + * Must invoke this after you are finished using a given srcu_struct that + * was initialized via init_srcu_struct(), else you leak memory. Also, + * all grace-period processing must have completed. + * + * "Completed" means that the last synchronize_srcu() and + * synchronize_srcu_expedited() calls must have returned before the call + * to cleanup_srcu_struct_quiesced(). It also means that the callback + * from the last call_srcu() must have been invoked before the call to + * cleanup_srcu_struct_quiesced(), but you can use srcu_barrier() to help + * with this last. Violating these rules will get you a WARN_ON() splat + * (with high probability, anyway), and will also cause the srcu_struct + * to be leaked. + */ +static inline void cleanup_srcu_struct_quiesced(struct srcu_struct *sp) +{ + _cleanup_srcu_struct(sp, true); +} + #ifdef CONFIG_DEBUG_LOCK_ALLOC /** diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 680c96d8c00f..f0e1d44459f8 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -593,7 +593,12 @@ static void srcu_torture_init(void) static void srcu_torture_cleanup(void) { - cleanup_srcu_struct(&srcu_ctld); + static DEFINE_TORTURE_RANDOM(rand); + + if (torture_random(&rand) & 0x800) + cleanup_srcu_struct(&srcu_ctld); + else + cleanup_srcu_struct_quiesced(&srcu_ctld); srcu_ctlp = &srcu_ctl; /* In case of a later rcutorture run. */ } diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 76ac5f50b2c7..622792abe41a 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -86,16 +86,19 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); * Must invoke this after you are finished using a given srcu_struct that * was initialized via init_srcu_struct(), else you leak memory. */ -void cleanup_srcu_struct(struct srcu_struct *sp) +void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced) { WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]); - flush_work(&sp->srcu_work); + if (quiesced) + WARN_ON(work_pending(&sp->srcu_work)); + else + flush_work(&sp->srcu_work); WARN_ON(sp->srcu_gp_running); WARN_ON(sp->srcu_gp_waiting); WARN_ON(sp->srcu_cb_head); WARN_ON(&sp->srcu_cb_head != sp->srcu_cb_tail); } -EXPORT_SYMBOL_GPL(cleanup_srcu_struct); +EXPORT_SYMBOL_GPL(_cleanup_srcu_struct); /* * Removes the count for the old reader from the appropriate element of diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index fb560fca9ef4..b4123d7a2cec 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -366,24 +366,28 @@ static unsigned long srcu_get_delay(struct srcu_struct *sp) return SRCU_INTERVAL; } -/** - * cleanup_srcu_struct - deconstruct a sleep-RCU structure - * @sp: structure to clean up. - * - * Must invoke this after you are finished using a given srcu_struct that - * was initialized via init_srcu_struct(), else you leak memory. - */ -void cleanup_srcu_struct(struct srcu_struct *sp) +/* Helper for cleanup_srcu_struct() and cleanup_srcu_struct_quiesced(). */ +void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced) { int cpu; if (WARN_ON(!srcu_get_delay(sp))) - return; /* Leakage unless caller handles error. */ + return; /* Just leak it! */ if (WARN_ON(srcu_readers_active(sp))) - return; /* Leakage unless caller handles error. */ - flush_delayed_work(&sp->work); + return; /* Just leak it! */ + if (quiesced) { + if (WARN_ON(delayed_work_pending(&sp->work))) + return; /* Just leak it! */ + } else { + flush_delayed_work(&sp->work); + } for_each_possible_cpu(cpu) - flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work); + if (quiesced) { + if (WARN_ON(delayed_work_pending(&per_cpu_ptr(sp->sda, cpu)->work))) + return; /* Just leak it! */ + } else { + flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work); + } if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) || WARN_ON(srcu_readers_active(sp))) { pr_info("%s: Active srcu_struct %p state: %d\n", __func__, sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); @@ -392,7 +396,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp) free_percpu(sp->sda); sp->sda = NULL; } -EXPORT_SYMBOL_GPL(cleanup_srcu_struct); +EXPORT_SYMBOL_GPL(_cleanup_srcu_struct); /* * Counts the new reader in the appropriate per-CPU element of the -- cgit v1.2.3 From c21b8b3ed0ebba0d9fd3cb126d749b653e7b5038 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 19 Apr 2018 16:38:25 -0700 Subject: rcu: Update list of rcu_future_grace_period() trace events Reworking grace-period initiation and funnel locking added new rcu_future_grace_period() trace events, so this commit updates the rcu_future_grace_period() trace event's header comment accordingly. Signed-off-by: Paul E. McKenney Tested-by: Nicholas Piggin --- include/trace/events/rcu.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index d8c33298c153..5936aac357ab 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -84,20 +84,21 @@ TRACE_EVENT(rcu_grace_period, ); /* - * Tracepoint for future grace-period events, including those for no-callbacks - * CPUs. The caller should pull the data from the rcu_node structure, - * other than rcuname, which comes from the rcu_state structure, and event, - * which is one of the following: + * Tracepoint for future grace-period events. The caller should pull + * the data from the rcu_node structure, other than rcuname, which comes + * from the rcu_state structure, and event, which is one of the following: * - * "Startleaf": Request a nocb grace period based on leaf-node data. + * "Startleaf": Request a grace period based on leaf-node data. + * "Prestarted": Someone beat us to the request * "Startedleaf": Leaf-node start proved sufficient. * "Startedleafroot": Leaf-node start proved sufficient after checking root. * "Startedroot": Requested a nocb grace period based on root-node data. + * "NoGPkthread": The RCU grace-period kthread has not yet started. * "StartWait": Start waiting for the requested grace period. * "ResumeWait": Resume waiting after signal. * "EndWait": Complete wait. * "Cleanup": Clean up rcu_node structure after previous GP. - * "CleanupMore": Clean up, and another no-CB GP is needed. + * "CleanupMore": Clean up, and another GP is needed. */ TRACE_EVENT(rcu_future_grace_period, -- cgit v1.2.3 From f64c6013a2029316ea552f99823d116ee5f5f955 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 22 May 2018 09:50:53 -0700 Subject: rcu/x86: Provide early rcu_cpu_starting() callback The x86/mtrr code does horrific things because hardware. It uses stop_machine_from_inactive_cpu(), which does a wakeup (of the stopper thread on another CPU), which uses RCU, all before the CPU is onlined. RCU complains about this, because wakeups use RCU and RCU does (rightfully) not consider offline CPUs for grace-periods. Fix this by initializing RCU way early in the MTRR case. Tested-by: Mike Galbraith Signed-off-by: Peter Zijlstra Signed-off-by: Paul E. McKenney [ paulmck: Add !SMP support, per 0day Test Robot report. ] --- arch/x86/kernel/cpu/mtrr/main.c | 4 ++++ include/linux/rcupdate.h | 1 - include/linux/rcutiny.h | 1 + include/linux/rcutree.h | 1 + kernel/rcu/tree.c | 9 +++++++++ 5 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 7468de429087..3ea0047beb40 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -793,6 +794,9 @@ void mtrr_ap_init(void) if (!use_intel() || mtrr_aps_delayed_init) return; + + rcu_cpu_starting(smp_processor_id()); + /* * Ideally we should hold mtrr_mutex here to avoid mtrr entries * changed, but this routine will be called in cpu boot time, diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 19d235fefdb9..e679b175b411 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -108,7 +108,6 @@ void rcu_sched_qs(void); void rcu_bh_qs(void); void rcu_check_callbacks(int user); void rcu_report_dead(unsigned int cpu); -void rcu_cpu_starting(unsigned int cpu); void rcutree_migrate_callbacks(int cpu); #ifdef CONFIG_RCU_STALL_COMMON diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index ce9beec35e34..7b3c82e8a625 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -132,5 +132,6 @@ static inline void rcu_all_qs(void) { barrier(); } #define rcutree_offline_cpu NULL #define rcutree_dead_cpu NULL #define rcutree_dying_cpu NULL +static inline void rcu_cpu_starting(unsigned int cpu) { } #endif /* __LINUX_RCUTINY_H */ diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 448f20f27396..914655848ef6 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -101,5 +101,6 @@ int rcutree_online_cpu(unsigned int cpu); int rcutree_offline_cpu(unsigned int cpu); int rcutree_dead_cpu(unsigned int cpu); int rcutree_dying_cpu(unsigned int cpu); +void rcu_cpu_starting(unsigned int cpu); #endif /* __LINUX_RCUTREE_H */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4fccdfa25716..aa7cade1b9f3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3665,6 +3665,8 @@ int rcutree_dead_cpu(unsigned int cpu) return 0; } +static DEFINE_PER_CPU(int, rcu_cpu_started); + /* * Mark the specified CPU as being online so that subsequent grace periods * (both expedited and normal) will wait on it. Note that this means that @@ -3686,6 +3688,11 @@ void rcu_cpu_starting(unsigned int cpu) struct rcu_node *rnp; struct rcu_state *rsp; + if (per_cpu(rcu_cpu_started, cpu)) + return; + + per_cpu(rcu_cpu_started, cpu) = 1; + for_each_rcu_flavor(rsp) { rdp = per_cpu_ptr(rsp->rda, cpu); rnp = rdp->mynode; @@ -3742,6 +3749,8 @@ void rcu_report_dead(unsigned int cpu) preempt_enable(); for_each_rcu_flavor(rsp) rcu_cleanup_dying_idle_cpu(cpu, rsp); + + per_cpu(rcu_cpu_started, cpu) = 0; } /* Migrate the dead CPU's callbacks to the current CPU. */ -- cgit v1.2.3