From 8315f42295d2667a7f942f154b73a86fd7cb2227 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Fri, 27 Jun 2014 13:42:20 -0700
Subject: rcu: Add call_rcu_tasks()

This commit adds a new RCU-tasks flavor of RCU, which provides
call_rcu_tasks().  This RCU flavor's quiescent states are voluntary
context switch (not preemption!) and userspace execution (not the idle
loop -- use some sort of schedule_on_each_cpu() if you need to handle
the idle tasks).  Note that unlike other RCU flavors, these quiescent
states occur in tasks, not necessarily CPUs.  Includes fixes from
Steven Rostedt.

This RCU flavor is assumed to have very infrequent latency-tolerant
updaters.  This assumption permits significant simplifications,
including a single global callback list protected by a single global
lock, along with a single task-private linked list containing all tasks
that have not yet passed through a quiescent state.  If experience shows
this assumption to be incorrect, the required additional complexity
will be added.

Suggested-by: Steven Rostedt
Signed-off-by: Paul E. McKenney
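As a usage sketch (illustrative only; the structure, callback, and
pointer names below are invented for the example and are not part of
this patch), an updater that must wait until no task can still be
executing within some retired code sequence, such as a tracing
trampoline, could use the new primitive like this:

	/* Illustrative sketch only -- these names are hypothetical. */
	struct old_tramp {
		struct rcu_head rh;
		/* ... trampoline body ... */
	};

	static void old_tramp_free(struct rcu_head *rhp)
	{
		kfree(container_of(rhp, struct old_tramp, rh));
	}

	/*
	 * Hand off the retired trampoline pointed to by otp.  The
	 * callback is invoked only after every task has passed through
	 * a voluntary context switch or usermode execution, so no task
	 * can still be executing within the trampoline when it is freed.
	 */
	call_rcu_tasks(&otp->rh, old_tramp_free);

The callback signature matches that of call_rcu(), so existing
rcu_head-based cleanup functions can be reused unchanged.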
---
 kernel/rcu/tree.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel/rcu/tree.c')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 1b70cb6fbe3c..8ad91d1e317d 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2410,6 +2410,8 @@ void rcu_check_callbacks(int cpu, int user)
 	rcu_preempt_check_callbacks(cpu);
 	if (rcu_pending(cpu))
 		invoke_rcu_core();
+	if (user)
+		rcu_note_voluntary_context_switch(current);
 	trace_rcu_utilization(TPS("End scheduler-tick"));
 }
--
cgit v1.2.3


From bde6c3aa993066acb0d6ce32ecabe03b9d5df92d Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Tue, 1 Jul 2014 11:26:57 -0700
Subject: rcu: Provide cond_resched_rcu_qs() to force quiescent states in
 long loops

RCU-tasks requires the occasional voluntary context switch from
CPU-bound in-kernel tasks.  In some cases, this requires instrumenting
cond_resched().  However, there is some reluctance to countenance
unconditionally instrumenting cond_resched() (see
http://lwn.net/Articles/603252/), so this commit creates a separate
cond_resched_rcu_qs() that may be used in place of cond_resched() in
locations prone to long-duration in-kernel looping.

This commit currently instruments only RCU-tasks.  Future possibilities
include also instrumenting RCU, RCU-bh, and RCU-sched in order to
reduce IPI usage.

Signed-off-by: Paul E. McKenney
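As a usage sketch (the loop and its helper are hypothetical, not taken
from this patch), converting a CPU-bound in-kernel loop is a one-line
substitution:

	int i;

	/* Hypothetical CPU-bound loop; process_item() is invented. */
	for (i = 0; i < nr_items; i++) {
		process_item(i);
		cond_resched_rcu_qs();	/* Report RCU-tasks quiescent state. */
	}

Because the macro invokes rcu_note_voluntary_context_switch() before
cond_resched(), the quiescent state is reported even in configurations
where cond_resched() itself is effectively a no-op.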
 				 */
-				cond_resched();
+				cond_resched_rcu_qs();
 				flush_signals(current);
 				trace_rcu_grace_period(rsp->name,
 						       ACCESS_ONCE(rsp->gpnum),
@@ -2434,7 +2434,7 @@ static void force_qs_rnp(struct rcu_state *rsp,
 	struct rcu_node *rnp;
 
 	rcu_for_each_leaf_node(rsp, rnp) {
-		cond_resched();
+		cond_resched_rcu_qs();
 		mask = 0;
 		raw_spin_lock_irqsave(&rnp->lock, flags);
 		smp_mb__after_unlock_lock();
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index a7997e272564..7672586d3920 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1848,7 +1848,7 @@ static int rcu_oom_notify(struct notifier_block *self,
 	get_online_cpus();
 	for_each_online_cpu(cpu) {
 		smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1);
-		cond_resched();
+		cond_resched_rcu_qs();
 	}
 	put_online_cpus();
diff --git a/mm/mlock.c b/mm/mlock.c
index ce84cb0b83ef..ab3150c26711 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -789,7 +789,7 @@ static int do_mlockall(int flags)
 
 		/* Ignore errors */
 		mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
-		cond_resched();
+		cond_resched_rcu_qs();
 	}
 out:
 	return 0;
--
cgit v1.2.3


From 176f8f7a52cc6d09d686f0d900abda6942a52fbb Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Mon, 4 Aug 2014 17:43:50 -0700
Subject: rcu: Make TASKS_RCU handle nohz_full= CPUs

Currently TASKS_RCU would ignore a CPU running a task in nohz_full=
usermode execution.  There would be neither a context switch nor a
scheduling-clock interrupt to tell TASKS_RCU that the task in question
had passed through a quiescent state.  The grace period would therefore
extend indefinitely.

This commit therefore makes RCU's dyntick-idle subsystem record the
task_struct structure of the task that is running in dyntick-idle mode
on each CPU.  The TASKS_RCU grace period can then access this
information and record a quiescent state on behalf of any CPU running
in dyntick-idle usermode.

Signed-off-by: Paul E. McKenney
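Concretely (a paraphrase of the check_holdout_task() change shown in
the update.c hunk below, with the ACCESS_ONCE() wrappers omitted for
readability), a task t now stops blocking the RCU-tasks grace period
when any of the following holds:

	if (!t->rcu_tasks_holdout ||		/* Already released, or */
	    t->rcu_tasks_nvcsw != t->nvcsw ||	/* voluntary switch, or */
	    !t->on_rq ||			/* no longer queued, or */
	    (IS_ENABLED(CONFIG_NO_HZ_FULL) &&	/* nohz_full dyntick-idle */
	     !is_idle_task(t) &&		/* usermode, as recorded by */
	     t->rcu_tasks_idle_cpu >= 0))	/* rcu_dynticks_task_enter(). */
		/* ... then remove t from the holdout list. */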
---
 include/linux/init_task.h |  3 ++-
 include/linux/sched.h     |  2 ++
 kernel/rcu/tree.c         |  2 ++
 kernel/rcu/tree.h         |  2 ++
 kernel/rcu/tree_plugin.h  | 16 ++++++++++++++++
 kernel/rcu/update.c       |  4 +++-
 6 files changed, 27 insertions(+), 2 deletions(-)

(limited to 'kernel/rcu/tree.c')

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index dffd9258ee60..03b274873b06 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -121,7 +121,8 @@ extern struct group_info init_groups;
 #define INIT_TASK_RCU_TASKS(tsk)				\
 	.rcu_tasks_holdout = false,				\
 	.rcu_tasks_holdout_list =				\
-		LIST_HEAD_INIT(tsk.rcu_tasks_holdout_list),
+		LIST_HEAD_INIT(tsk.rcu_tasks_holdout_list),	\
+	.rcu_tasks_idle_cpu = -1,
 #else
 #define INIT_TASK_RCU_TASKS(tsk)
 #endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index eaacac4ae77d..ec8b34722bcc 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1274,6 +1274,7 @@ struct task_struct {
 	unsigned long rcu_tasks_nvcsw;
 	bool rcu_tasks_holdout;
 	struct list_head rcu_tasks_holdout_list;
+	int rcu_tasks_idle_cpu;
 #endif /* #ifdef CONFIG_TASKS_RCU */
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
@@ -2020,6 +2021,7 @@ static inline void rcu_copy_process(struct task_struct *p)
 #ifdef CONFIG_TASKS_RCU
 	p->rcu_tasks_holdout = false;
 	INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
+	p->rcu_tasks_idle_cpu = -1;
 #endif /* #ifdef CONFIG_TASKS_RCU */
 }
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index e23dad0661e2..c880f5387b1f 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -526,6 +526,7 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
 	atomic_inc(&rdtp->dynticks);
 	smp_mb__after_atomic();  /* Force ordering with next sojourn. */
 	WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
+	rcu_dynticks_task_enter();
 
 	/*
 	 * It is illegal to enter an extended quiescent state while
@@ -642,6 +643,7 @@ void rcu_irq_exit(void)
 static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval,
 				int user)
 {
+	rcu_dynticks_task_exit();
 	smp_mb__before_atomic();  /* Force ordering w/previous sojourn. */
 	atomic_inc(&rdtp->dynticks);
 	/* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 6a86eb7bac45..3a92000c354f 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -605,6 +605,8 @@ static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
 static void rcu_bind_gp_kthread(void);
 static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp);
 static bool rcu_nohz_full_cpu(struct rcu_state *rsp);
+static void rcu_dynticks_task_enter(void);
+static void rcu_dynticks_task_exit(void);
 
 #endif /* #ifndef RCU_TREE_NONCORE */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 7672586d3920..e466b40052a7 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -3036,3 +3036,19 @@ static void rcu_bind_gp_kthread(void)
 	housekeeping_affine(current);
 #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
 }
+
+/* Record the current task on dyntick-idle entry. */
+static void rcu_dynticks_task_enter(void)
+{
+#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
+	ACCESS_ONCE(current->rcu_tasks_idle_cpu) = smp_processor_id();
+#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
+}
+
+/* Record no current task on dyntick-idle exit. */
+static void rcu_dynticks_task_exit(void)
+{
+#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
+	ACCESS_ONCE(current->rcu_tasks_idle_cpu) = -1;
+#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
+}
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index e1d71741958f..2658de4a5975 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -463,7 +463,9 @@ static void check_holdout_task(struct task_struct *t,
 	if (!ACCESS_ONCE(t->rcu_tasks_holdout) ||
 	    t->rcu_tasks_nvcsw != ACCESS_ONCE(t->nvcsw) ||
-	    !ACCESS_ONCE(t->on_rq)) {
+	    !ACCESS_ONCE(t->on_rq) ||
+	    (IS_ENABLED(CONFIG_NO_HZ_FULL) &&
+	     !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) {
 		ACCESS_ONCE(t->rcu_tasks_holdout) = false;
 		list_del_rcu(&t->rcu_tasks_holdout_list);
 		put_task_struct(t);
--
cgit v1.2.3

From 284a8c93af47306beed967a303d84730b32bab39 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Thu, 14 Aug 2014 16:38:46 -0700
Subject: rcu: Per-CPU operation cleanups to rcu_*_qs() functions

The rcu_bh_qs(), rcu_preempt_qs(), and rcu_sched_qs() functions use
old-style per-CPU variable access and write to ->passed_quiesce even
if it is already set.  This commit therefore updates these functions
to use the new-style per-CPU variable access functions and to avoid
the spurious writes.  This commit also eliminates the "cpu" argument
to these functions because they are always invoked on the indicated
CPU.

Reported-by: Peter Zijlstra
Signed-off-by: Paul E. McKenney
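To show the conversion pattern in isolation (a sketch distilled from
the tree.c hunk below, with the tracing calls elided):

	/* Old style: compute a per-CPU pointer, then store unconditionally. */
	struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu);
	rdp->passed_quiesce = 1;

	/*
	 * New style: operate directly on this CPU's instance and skip
	 * both the store and the tracing when the flag is already set.
	 */
	if (!__this_cpu_read(rcu_sched_data.passed_quiesce))
		__this_cpu_write(rcu_sched_data.passed_quiesce, 1);

These functions are documented to run with preemption disabled, which
is what makes the raw __this_cpu_*() accessors safe here.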
---
 include/linux/rcupdate.h |  4 ++--
 include/linux/rcutiny.h  |  2 +-
 kernel/rcu/tiny.c        | 10 +++++-----
 kernel/rcu/tree.c        | 34 ++++++++++++++++++----------------
 kernel/rcu/tree_plugin.h | 27 +++++++++++++++------------
 kernel/softirq.c         |  2 +-
 6 files changed, 42 insertions(+), 37 deletions(-)

(limited to 'kernel/rcu/tree.c')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 132e1e34cdca..2fab0e37afe0 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -261,8 +261,8 @@ static inline int rcu_preempt_depth(void)
 
 /* Internal to kernel */
 void rcu_init(void);
-void rcu_sched_qs(int cpu);
-void rcu_bh_qs(int cpu);
+void rcu_sched_qs(void);
+void rcu_bh_qs(void);
 void rcu_check_callbacks(int cpu, int user);
 struct notifier_block;
 void rcu_idle_enter(void);
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index d40a6a451330..38cc5b1e252d 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -80,7 +80,7 @@ static inline void kfree_call_rcu(struct rcu_head *head,
 
 static inline void rcu_note_context_switch(int cpu)
 {
-	rcu_sched_qs(cpu);
+	rcu_sched_qs();
 }
 
 /*
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 717f00854fc0..61b8d2ccc2cb 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -72,7 +72,7 @@ static void rcu_idle_enter_common(long long newval)
 			  current->pid, current->comm,
 			  idle->pid, idle->comm); /* must be idle task! */
 	}
-	rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */
+	rcu_sched_qs(); /* implies rcu_bh_qs() */
 	barrier();
 	rcu_dynticks_nesting = newval;
 }
@@ -217,7 +217,7 @@ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
  * are at it, given that any rcu quiescent state is also an rcu_bh
  * quiescent state.  Use "+" instead of "||" to defeat short circuiting.
  */
-void rcu_sched_qs(int cpu)
+void rcu_sched_qs(void)
 {
 	unsigned long flags;
 
@@ -231,7 +231,7 @@ void rcu_sched_qs(int cpu)
 /*
  * Record an rcu_bh quiescent state.
  */
-void rcu_bh_qs(int cpu)
+void rcu_bh_qs(void)
 {
 	unsigned long flags;
 
@@ -251,9 +251,9 @@ void rcu_check_callbacks(int cpu, int user)
 {
 	RCU_TRACE(check_cpu_stalls());
 	if (user || rcu_is_cpu_rrupt_from_idle())
-		rcu_sched_qs(cpu);
+		rcu_sched_qs();
 	else if (!in_softirq())
-		rcu_bh_qs(cpu);
+		rcu_bh_qs();
 	if (user)
 		rcu_note_voluntary_context_switch(current);
 }
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index c880f5387b1f..4c340625ffd4 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -188,22 +188,24 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
  * one since the start of the grace period, this just sets a flag.
  * The caller must have disabled preemption.
  */
-void rcu_sched_qs(int cpu)
+void rcu_sched_qs(void)
 {
-	struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu);
-
-	if (rdp->passed_quiesce == 0)
-		trace_rcu_grace_period(TPS("rcu_sched"), rdp->gpnum, TPS("cpuqs"));
-	rdp->passed_quiesce = 1;
+	if (!__this_cpu_read(rcu_sched_data.passed_quiesce)) {
+		trace_rcu_grace_period(TPS("rcu_sched"),
+				       __this_cpu_read(rcu_sched_data.gpnum),
+				       TPS("cpuqs"));
+		__this_cpu_write(rcu_sched_data.passed_quiesce, 1);
+	}
 }
 
-void rcu_bh_qs(int cpu)
+void rcu_bh_qs(void)
 {
-	struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
-
-	if (rdp->passed_quiesce == 0)
-		trace_rcu_grace_period(TPS("rcu_bh"), rdp->gpnum, TPS("cpuqs"));
-	rdp->passed_quiesce = 1;
+	if (!__this_cpu_read(rcu_bh_data.passed_quiesce)) {
+		trace_rcu_grace_period(TPS("rcu_bh"),
+				       __this_cpu_read(rcu_bh_data.gpnum),
+				       TPS("cpuqs"));
+		__this_cpu_write(rcu_bh_data.passed_quiesce, 1);
+	}
 }
 
 static DEFINE_PER_CPU(int, rcu_sched_qs_mask);
@@ -278,7 +280,7 @@ static void rcu_momentary_dyntick_idle(void)
 void rcu_note_context_switch(int cpu)
 {
 	trace_rcu_utilization(TPS("Start context switch"));
-	rcu_sched_qs(cpu);
+	rcu_sched_qs();
 	rcu_preempt_note_context_switch(cpu);
 	if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
 		rcu_momentary_dyntick_idle();
@@ -2395,8 +2397,8 @@ void rcu_check_callbacks(int cpu, int user)
 		 * at least not while the corresponding CPU is online.
 		 */
 
-		rcu_sched_qs(cpu);
-		rcu_bh_qs(cpu);
+		rcu_sched_qs();
+		rcu_bh_qs();
 
 	} else if (!in_softirq()) {
 
@@ -2407,7 +2409,7 @@ void rcu_check_callbacks(int cpu, int user)
 		 * critical section, so note it.
 		 */
 
-		rcu_bh_qs(cpu);
+		rcu_bh_qs();
 	}
 	rcu_preempt_check_callbacks(cpu);
 	if (rcu_pending(cpu))
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 0981c0cd70fe..25e692a36280 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -158,14 +158,16 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed);
  * As with the other rcu_*_qs() functions, callers to this function
  * must disable preemption.
  */
-static void rcu_preempt_qs(int cpu)
-{
-	struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
-
-	if (rdp->passed_quiesce == 0)
-		trace_rcu_grace_period(TPS("rcu_preempt"), rdp->gpnum, TPS("cpuqs"));
-	rdp->passed_quiesce = 1;
-	current->rcu_read_unlock_special.b.need_qs = false;
+static void rcu_preempt_qs(void)
+{
+	if (!__this_cpu_read(rcu_preempt_data.passed_quiesce)) {
+		trace_rcu_grace_period(TPS("rcu_preempt"),
+				       __this_cpu_read(rcu_preempt_data.gpnum),
+				       TPS("cpuqs"));
+		__this_cpu_write(rcu_preempt_data.passed_quiesce, 1);
+		barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */
+		current->rcu_read_unlock_special.b.need_qs = false;
+	}
 }
 
 /*
@@ -256,7 +258,7 @@ static void rcu_preempt_note_context_switch(int cpu)
 		 * grace period, then the fact that the task has been enqueued
 		 * means that we continue to block the current grace period.
 		 */
-		rcu_preempt_qs(cpu);
+		rcu_preempt_qs();
 	}
 
 	/*
@@ -352,7 +354,7 @@ void rcu_read_unlock_special(struct task_struct *t)
 	 */
 	special = t->rcu_read_unlock_special;
 	if (special.b.need_qs) {
-		rcu_preempt_qs(smp_processor_id());
+		rcu_preempt_qs();
 		if (!t->rcu_read_unlock_special.s) {
 			local_irq_restore(flags);
 			return;
@@ -651,11 +653,12 @@ static void rcu_preempt_check_callbacks(int cpu)
 	struct task_struct *t = current;
 
 	if (t->rcu_read_lock_nesting == 0) {
-		rcu_preempt_qs(cpu);
+		rcu_preempt_qs();
 		return;
 	}
 	if (t->rcu_read_lock_nesting > 0 &&
-	    per_cpu(rcu_preempt_data, cpu).qs_pending)
+	    per_cpu(rcu_preempt_data, cpu).qs_pending &&
+	    !per_cpu(rcu_preempt_data, cpu).passed_quiesce)
 		t->rcu_read_unlock_special.b.need_qs = true;
 }
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 5918d227730f..348ec763b104 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -278,7 +278,7 @@ restart:
 		pending >>= softirq_bit;
 	}
 
-	rcu_bh_qs(smp_processor_id());
+	rcu_bh_qs();
 	local_irq_disable();
 
 	pending = local_softirq_pending();
--
cgit v1.2.3