summaryrefslogtreecommitdiff
path: root/kernel/rcutree.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/rcutree.c')
-rw-r--r--kernel/rcutree.c493
1 files changed, 317 insertions, 176 deletions
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 3b0f1337f75b..f280e542e3e9 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -60,36 +60,44 @@
/* Data structures. */
-static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
-
-#define RCU_STATE_INITIALIZER(structname) { \
- .level = { &structname##_state.node[0] }, \
- .levelcnt = { \
- NUM_RCU_LVL_0, /* root of hierarchy. */ \
- NUM_RCU_LVL_1, \
- NUM_RCU_LVL_2, \
- NUM_RCU_LVL_3, \
- NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \
- }, \
+static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
+
+#define RCU_STATE_INITIALIZER(sname, cr) { \
+ .level = { &sname##_state.node[0] }, \
+ .call = cr, \
.fqs_state = RCU_GP_IDLE, \
.gpnum = -300, \
.completed = -300, \
- .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \
- .orphan_nxttail = &structname##_state.orphan_nxtlist, \
- .orphan_donetail = &structname##_state.orphan_donelist, \
- .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \
- .n_force_qs = 0, \
- .n_force_qs_ngp = 0, \
- .name = #structname, \
+ .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.onofflock), \
+ .orphan_nxttail = &sname##_state.orphan_nxtlist, \
+ .orphan_donetail = &sname##_state.orphan_donelist, \
+ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
+ .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.fqslock), \
+ .name = #sname, \
}
-struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched);
+struct rcu_state rcu_sched_state =
+ RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched);
DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
-struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh);
+struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh);
DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
static struct rcu_state *rcu_state;
+LIST_HEAD(rcu_struct_flavors);
+
+/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */
+static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF;
+module_param(rcu_fanout_leaf, int, 0);
+int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
+static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */
+ NUM_RCU_LVL_0,
+ NUM_RCU_LVL_1,
+ NUM_RCU_LVL_2,
+ NUM_RCU_LVL_3,
+ NUM_RCU_LVL_4,
+};
+int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
/*
* The rcu_scheduler_active variable transitions from zero to one just
@@ -147,13 +155,6 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
unsigned long rcutorture_testseq;
unsigned long rcutorture_vernum;
-/* State information for rcu_barrier() and friends. */
-
-static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
-static atomic_t rcu_barrier_cpu_count;
-static DEFINE_MUTEX(rcu_barrier_mutex);
-static struct completion rcu_barrier_completion;
-
/*
* Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
* permit this function to be invoked without holding the root rcu_node
@@ -201,6 +202,7 @@ void rcu_note_context_switch(int cpu)
{
trace_rcu_utilization("Start context switch");
rcu_sched_qs(cpu);
+ rcu_preempt_note_context_switch(cpu);
trace_rcu_utilization("End context switch");
}
EXPORT_SYMBOL_GPL(rcu_note_context_switch);
@@ -357,7 +359,7 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval)
struct task_struct *idle = idle_task(smp_processor_id());
trace_rcu_dyntick("Error on entry: not idle task", oldval, 0);
- ftrace_dump(DUMP_ALL);
+ ftrace_dump(DUMP_ORIG);
WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
current->pid, current->comm,
idle->pid, idle->comm); /* must be idle task! */
@@ -467,7 +469,7 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval)
trace_rcu_dyntick("Error on exit: not idle task",
oldval, rdtp->dynticks_nesting);
- ftrace_dump(DUMP_ALL);
+ ftrace_dump(DUMP_ORIG);
WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
current->pid, current->comm,
idle->pid, idle->comm); /* must be idle task! */
@@ -584,8 +586,6 @@ void rcu_nmi_exit(void)
WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
}
-#ifdef CONFIG_PROVE_RCU
-
/**
* rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle
*
@@ -603,7 +603,7 @@ int rcu_is_cpu_idle(void)
}
EXPORT_SYMBOL(rcu_is_cpu_idle);
-#ifdef CONFIG_HOTPLUG_CPU
+#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
/*
* Is the current CPU online? Disable preemption to avoid false positives
@@ -644,9 +644,7 @@ bool rcu_lockdep_current_cpu_online(void)
}
EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-
-#endif /* #ifdef CONFIG_PROVE_RCU */
+#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */
/**
* rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle
@@ -732,7 +730,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
int cpu;
long delta;
unsigned long flags;
- int ndetected;
+ int ndetected = 0;
struct rcu_node *rnp = rcu_get_root(rsp);
/* Only let one CPU complain about others per time interval. */
@@ -773,7 +771,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
*/
rnp = rcu_get_root(rsp);
raw_spin_lock_irqsave(&rnp->lock, flags);
- ndetected = rcu_print_task_stall(rnp);
+ ndetected += rcu_print_task_stall(rnp);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
print_cpu_stall_info_end();
@@ -859,9 +857,10 @@ static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
*/
void rcu_cpu_stall_reset(void)
{
- rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2;
- rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2;
- rcu_preempt_stall_reset();
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp)
+ rsp->jiffies_stall = jiffies + ULONG_MAX / 2;
}
static struct notifier_block rcu_panic_block = {
@@ -893,8 +892,9 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct
if (rnp->qsmask & rdp->grpmask) {
rdp->qs_pending = 1;
rdp->passed_quiesce = 0;
- } else
+ } else {
rdp->qs_pending = 0;
+ }
zero_cpu_stall_ticks(rdp);
}
}
@@ -936,6 +936,18 @@ check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
}
/*
+ * Initialize the specified rcu_data structure's callback list to empty.
+ */
+static void init_callback_list(struct rcu_data *rdp)
+{
+ int i;
+
+ rdp->nxtlist = NULL;
+ for (i = 0; i < RCU_NEXT_SIZE; i++)
+ rdp->nxttail[i] = &rdp->nxtlist;
+}
+
+/*
* Advance this CPU's callbacks, but only if the current grace period
* has ended. This may be called only from the CPU to whom the rdp
* belongs. In addition, the corresponding leaf rcu_node structure's
@@ -1327,8 +1339,6 @@ static void
rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
struct rcu_node *rnp, struct rcu_data *rdp)
{
- int i;
-
/*
* Orphan the callbacks. First adjust the counts. This is safe
* because ->onofflock excludes _rcu_barrier()'s adoption of
@@ -1339,7 +1349,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
rsp->qlen += rdp->qlen;
rdp->n_cbs_orphaned += rdp->qlen;
rdp->qlen_lazy = 0;
- rdp->qlen = 0;
+ ACCESS_ONCE(rdp->qlen) = 0;
}
/*
@@ -1368,9 +1378,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
}
/* Finally, initialize the rcu_data structure's list to empty. */
- rdp->nxtlist = NULL;
- for (i = 0; i < RCU_NEXT_SIZE; i++)
- rdp->nxttail[i] = &rdp->nxtlist;
+ init_callback_list(rdp);
}
/*
@@ -1504,6 +1512,9 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
raw_spin_unlock_irqrestore(&rnp->lock, flags);
if (need_report & RCU_OFL_TASKS_EXP_GP)
rcu_report_exp_rnp(rsp, rnp, true);
+ WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
+ "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
+ cpu, rdp->qlen, rdp->nxtlist);
}
#else /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -1530,7 +1541,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
{
unsigned long flags;
struct rcu_head *next, *list, **tail;
- int bl, count, count_lazy;
+ int bl, count, count_lazy, i;
/* If no callbacks are ready, just return.*/
if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
@@ -1553,9 +1564,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
*rdp->nxttail[RCU_DONE_TAIL] = NULL;
tail = rdp->nxttail[RCU_DONE_TAIL];
- for (count = RCU_NEXT_SIZE - 1; count >= 0; count--)
- if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL])
- rdp->nxttail[count] = &rdp->nxtlist;
+ for (i = RCU_NEXT_SIZE - 1; i >= 0; i--)
+ if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL])
+ rdp->nxttail[i] = &rdp->nxtlist;
local_irq_restore(flags);
/* Invoke callbacks. */
@@ -1583,15 +1594,15 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
if (list != NULL) {
*tail = rdp->nxtlist;
rdp->nxtlist = list;
- for (count = 0; count < RCU_NEXT_SIZE; count++)
- if (&rdp->nxtlist == rdp->nxttail[count])
- rdp->nxttail[count] = tail;
+ for (i = 0; i < RCU_NEXT_SIZE; i++)
+ if (&rdp->nxtlist == rdp->nxttail[i])
+ rdp->nxttail[i] = tail;
else
break;
}
smp_mb(); /* List handling before counting for rcu_barrier(). */
rdp->qlen_lazy -= count_lazy;
- rdp->qlen -= count;
+ ACCESS_ONCE(rdp->qlen) -= count;
rdp->n_cbs_invoked += count;
/* Reinstate batch limit if we have worked down the excess. */
@@ -1604,6 +1615,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
rdp->n_force_qs_snap = rsp->n_force_qs;
} else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark)
rdp->qlen_last_fqs_check = rdp->qlen;
+ WARN_ON_ONCE((rdp->nxtlist == NULL) != (rdp->qlen == 0));
local_irq_restore(flags);
@@ -1744,8 +1756,6 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
break; /* grace period idle or initializing, ignore. */
case RCU_SAVE_DYNTICK:
- if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
- break; /* So gcc recognizes the dead code. */
raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
@@ -1787,9 +1797,10 @@ unlock_fqs_ret:
* whom the rdp belongs.
*/
static void
-__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
+__rcu_process_callbacks(struct rcu_state *rsp)
{
unsigned long flags;
+ struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
WARN_ON_ONCE(rdp->beenonline == 0);
@@ -1825,11 +1836,11 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
*/
static void rcu_process_callbacks(struct softirq_action *unused)
{
+ struct rcu_state *rsp;
+
trace_rcu_utilization("Start RCU core");
- __rcu_process_callbacks(&rcu_sched_state,
- &__get_cpu_var(rcu_sched_data));
- __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
- rcu_preempt_process_callbacks();
+ for_each_rcu_flavor(rsp)
+ __rcu_process_callbacks(rsp);
trace_rcu_utilization("End RCU core");
}
@@ -1856,6 +1867,56 @@ static void invoke_rcu_core(void)
raise_softirq(RCU_SOFTIRQ);
}
+/*
+ * Handle any core-RCU processing required by a call_rcu() invocation.
+ */
+static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
+ struct rcu_head *head, unsigned long flags)
+{
+ /*
+ * If called from an extended quiescent state, invoke the RCU
+ * core in order to force a re-evaluation of RCU's idleness.
+ */
+ if (rcu_is_cpu_idle() && cpu_online(smp_processor_id()))
+ invoke_rcu_core();
+
+ /* If interrupts were disabled or CPU offline, don't invoke RCU core. */
+ if (irqs_disabled_flags(flags) || cpu_is_offline(smp_processor_id()))
+ return;
+
+ /*
+ * Force the grace period if too many callbacks or too long waiting.
+ * Enforce hysteresis, and don't invoke force_quiescent_state()
+ * if some other CPU has recently done so. Also, don't bother
+ * invoking force_quiescent_state() if the newly enqueued callback
+ * is the only one waiting for a grace period to complete.
+ */
+ if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
+
+ /* Are we ignoring a completed grace period? */
+ rcu_process_gp_end(rsp, rdp);
+ check_for_new_grace_period(rsp, rdp);
+
+ /* Start a new grace period if one not already started. */
+ if (!rcu_gp_in_progress(rsp)) {
+ unsigned long nestflag;
+ struct rcu_node *rnp_root = rcu_get_root(rsp);
+
+ raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
+ rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */
+ } else {
+ /* Give the grace period a kick. */
+ rdp->blimit = LONG_MAX;
+ if (rsp->n_force_qs == rdp->n_force_qs_snap &&
+ *rdp->nxttail[RCU_DONE_TAIL] != head)
+ force_quiescent_state(rsp, 0);
+ rdp->n_force_qs_snap = rsp->n_force_qs;
+ rdp->qlen_last_fqs_check = rdp->qlen;
+ }
+ } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
+ force_quiescent_state(rsp, 1);
+}
+
static void
__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
struct rcu_state *rsp, bool lazy)
@@ -1880,7 +1941,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
rdp = this_cpu_ptr(rsp->rda);
/* Add the callback to our list. */
- rdp->qlen++;
+ ACCESS_ONCE(rdp->qlen)++;
if (lazy)
rdp->qlen_lazy++;
else
@@ -1895,43 +1956,8 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
else
trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen);
- /* If interrupts were disabled, don't dive into RCU core. */
- if (irqs_disabled_flags(flags)) {
- local_irq_restore(flags);
- return;
- }
-
- /*
- * Force the grace period if too many callbacks or too long waiting.
- * Enforce hysteresis, and don't invoke force_quiescent_state()
- * if some other CPU has recently done so. Also, don't bother
- * invoking force_quiescent_state() if the newly enqueued callback
- * is the only one waiting for a grace period to complete.
- */
- if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
-
- /* Are we ignoring a completed grace period? */
- rcu_process_gp_end(rsp, rdp);
- check_for_new_grace_period(rsp, rdp);
-
- /* Start a new grace period if one not already started. */
- if (!rcu_gp_in_progress(rsp)) {
- unsigned long nestflag;
- struct rcu_node *rnp_root = rcu_get_root(rsp);
-
- raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
- rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */
- } else {
- /* Give the grace period a kick. */
- rdp->blimit = LONG_MAX;
- if (rsp->n_force_qs == rdp->n_force_qs_snap &&
- *rdp->nxttail[RCU_DONE_TAIL] != head)
- force_quiescent_state(rsp, 0);
- rdp->n_force_qs_snap = rsp->n_force_qs;
- rdp->qlen_last_fqs_check = rdp->qlen;
- }
- } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
- force_quiescent_state(rsp, 1);
+ /* Go handle any RCU core processing required. */
+ __call_rcu_core(rsp, rdp, head, flags);
local_irq_restore(flags);
}
@@ -1961,28 +1987,16 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
* occasionally incorrectly indicate that there are multiple CPUs online
* when there was in fact only one the whole time, as this just adds
* some overhead: RCU still operates correctly.
- *
- * Of course, sampling num_online_cpus() with preemption enabled can
- * give erroneous results if there are concurrent CPU-hotplug operations.
- * For example, given a demonic sequence of preemptions in num_online_cpus()
- * and CPU-hotplug operations, there could be two or more CPUs online at
- * all times, but num_online_cpus() might well return one (or even zero).
- *
- * However, all such demonic sequences require at least one CPU-offline
- * operation. Furthermore, rcu_blocking_is_gp() giving the wrong answer
- * is only a problem if there is an RCU read-side critical section executing
- * throughout. But RCU-sched and RCU-bh read-side critical sections
- * disable either preemption or bh, which prevents a CPU from going offline.
- * Therefore, the only way that rcu_blocking_is_gp() can incorrectly return
- * that there is only one CPU when in fact there was more than one throughout
- * is when there were no RCU readers in the system. If there are no
- * RCU readers, the grace period by definition can be of zero length,
- * regardless of the number of online CPUs.
*/
static inline int rcu_blocking_is_gp(void)
{
+ int ret;
+
might_sleep(); /* Check for RCU read-side critical section. */
- return num_online_cpus() <= 1;
+ preempt_disable();
+ ret = num_online_cpus() <= 1;
+ preempt_enable();
+ return ret;
}
/**
@@ -2117,9 +2131,9 @@ void synchronize_sched_expedited(void)
put_online_cpus();
/* No joy, try again later. Or just synchronize_sched(). */
- if (trycount++ < 10)
+ if (trycount++ < 10) {
udelay(trycount * num_online_cpus());
- else {
+ } else {
synchronize_sched();
return;
}
@@ -2240,9 +2254,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
*/
static int rcu_pending(int cpu)
{
- return __rcu_pending(&rcu_sched_state, &per_cpu(rcu_sched_data, cpu)) ||
- __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)) ||
- rcu_preempt_pending(cpu);
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp)
+ if (__rcu_pending(rsp, per_cpu_ptr(rsp->rda, cpu)))
+ return 1;
+ return 0;
}
/*
@@ -2252,20 +2269,41 @@ static int rcu_pending(int cpu)
*/
static int rcu_cpu_has_callbacks(int cpu)
{
+ struct rcu_state *rsp;
+
/* RCU callbacks either ready or pending? */
- return per_cpu(rcu_sched_data, cpu).nxtlist ||
- per_cpu(rcu_bh_data, cpu).nxtlist ||
- rcu_preempt_cpu_has_callbacks(cpu);
+ for_each_rcu_flavor(rsp)
+ if (per_cpu_ptr(rsp->rda, cpu)->nxtlist)
+ return 1;
+ return 0;
+}
+
+/*
+ * Helper function for _rcu_barrier() tracing. If tracing is disabled,
+ * the compiler is expected to optimize this away.
+ */
+static void _rcu_barrier_trace(struct rcu_state *rsp, char *s,
+ int cpu, unsigned long done)
+{
+ trace_rcu_barrier(rsp->name, s, cpu,
+ atomic_read(&rsp->barrier_cpu_count), done);
}
/*
* RCU callback function for _rcu_barrier(). If we are last, wake
* up the task executing _rcu_barrier().
*/
-static void rcu_barrier_callback(struct rcu_head *notused)
+static void rcu_barrier_callback(struct rcu_head *rhp)
{
- if (atomic_dec_and_test(&rcu_barrier_cpu_count))
- complete(&rcu_barrier_completion);
+ struct rcu_data *rdp = container_of(rhp, struct rcu_data, barrier_head);
+ struct rcu_state *rsp = rdp->rsp;
+
+ if (atomic_dec_and_test(&rsp->barrier_cpu_count)) {
+ _rcu_barrier_trace(rsp, "LastCB", -1, rsp->n_barrier_done);
+ complete(&rsp->barrier_completion);
+ } else {
+ _rcu_barrier_trace(rsp, "CB", -1, rsp->n_barrier_done);
+ }
}
/*
@@ -2273,35 +2311,63 @@ static void rcu_barrier_callback(struct rcu_head *notused)
*/
static void rcu_barrier_func(void *type)
{
- int cpu = smp_processor_id();
- struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu);
- void (*call_rcu_func)(struct rcu_head *head,
- void (*func)(struct rcu_head *head));
+ struct rcu_state *rsp = type;
+ struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
- atomic_inc(&rcu_barrier_cpu_count);
- call_rcu_func = type;
- call_rcu_func(head, rcu_barrier_callback);
+ _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done);
+ atomic_inc(&rsp->barrier_cpu_count);
+ rsp->call(&rdp->barrier_head, rcu_barrier_callback);
}
/*
* Orchestrate the specified type of RCU barrier, waiting for all
* RCU callbacks of the specified type to complete.
*/
-static void _rcu_barrier(struct rcu_state *rsp,
- void (*call_rcu_func)(struct rcu_head *head,
- void (*func)(struct rcu_head *head)))
+static void _rcu_barrier(struct rcu_state *rsp)
{
int cpu;
unsigned long flags;
struct rcu_data *rdp;
- struct rcu_head rh;
+ struct rcu_data rd;
+ unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done);
+ unsigned long snap_done;
- init_rcu_head_on_stack(&rh);
+ init_rcu_head_on_stack(&rd.barrier_head);
+ _rcu_barrier_trace(rsp, "Begin", -1, snap);
/* Take mutex to serialize concurrent rcu_barrier() requests. */
- mutex_lock(&rcu_barrier_mutex);
+ mutex_lock(&rsp->barrier_mutex);
+
+ /*
+ * Ensure that all prior references, including to ->n_barrier_done,
+ * are ordered before the _rcu_barrier() machinery.
+ */
+ smp_mb(); /* See above block comment. */
+
+ /*
+ * Recheck ->n_barrier_done to see if others did our work for us.
+ * This means checking ->n_barrier_done for an even-to-odd-to-even
+ * transition. The "if" expression below therefore rounds the old
+ * value up to the next even number and adds two before comparing.
+ */
+ snap_done = ACCESS_ONCE(rsp->n_barrier_done);
+ _rcu_barrier_trace(rsp, "Check", -1, snap_done);
+ if (ULONG_CMP_GE(snap_done, ((snap + 1) & ~0x1) + 2)) {
+ _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done);
+ smp_mb(); /* caller's subsequent code after above check. */
+ mutex_unlock(&rsp->barrier_mutex);
+ return;
+ }
- smp_mb(); /* Prevent any prior operations from leaking in. */
+ /*
+ * Increment ->n_barrier_done to avoid duplicate work. Use
+ * ACCESS_ONCE() to prevent the compiler from speculating
+ * the increment to precede the early-exit check.
+ */
+ ACCESS_ONCE(rsp->n_barrier_done)++;
+ WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1);
+ _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done);
+ smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */
/*
* Initialize the count to one rather than to zero in order to
@@ -2320,8 +2386,8 @@ static void _rcu_barrier(struct rcu_state *rsp,
* 6. Both rcu_barrier_callback() callbacks are invoked, awakening
* us -- but before CPU 1's orphaned callbacks are invoked!!!
*/
- init_completion(&rcu_barrier_completion);
- atomic_set(&rcu_barrier_cpu_count, 1);
+ init_completion(&rsp->barrier_completion);
+ atomic_set(&rsp->barrier_cpu_count, 1);
raw_spin_lock_irqsave(&rsp->onofflock, flags);
rsp->rcu_barrier_in_progress = current;
raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
@@ -2337,14 +2403,19 @@ static void _rcu_barrier(struct rcu_state *rsp,
preempt_disable();
rdp = per_cpu_ptr(rsp->rda, cpu);
if (cpu_is_offline(cpu)) {
+ _rcu_barrier_trace(rsp, "Offline", cpu,
+ rsp->n_barrier_done);
preempt_enable();
while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen))
schedule_timeout_interruptible(1);
} else if (ACCESS_ONCE(rdp->qlen)) {
- smp_call_function_single(cpu, rcu_barrier_func,
- (void *)call_rcu_func, 1);
+ _rcu_barrier_trace(rsp, "OnlineQ", cpu,
+ rsp->n_barrier_done);
+ smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
preempt_enable();
} else {
+ _rcu_barrier_trace(rsp, "OnlineNQ", cpu,
+ rsp->n_barrier_done);
preempt_enable();
}
}
@@ -2361,24 +2432,32 @@ static void _rcu_barrier(struct rcu_state *rsp,
rcu_adopt_orphan_cbs(rsp);
rsp->rcu_barrier_in_progress = NULL;
raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
- atomic_inc(&rcu_barrier_cpu_count);
+ atomic_inc(&rsp->barrier_cpu_count);
smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */
- call_rcu_func(&rh, rcu_barrier_callback);
+ rd.rsp = rsp;
+ rsp->call(&rd.barrier_head, rcu_barrier_callback);
/*
* Now that we have an rcu_barrier_callback() callback on each
* CPU, and thus each counted, remove the initial count.
*/
- if (atomic_dec_and_test(&rcu_barrier_cpu_count))
- complete(&rcu_barrier_completion);
+ if (atomic_dec_and_test(&rsp->barrier_cpu_count))
+ complete(&rsp->barrier_completion);
+
+ /* Increment ->n_barrier_done to prevent duplicate work. */
+ smp_mb(); /* Keep increment after above mechanism. */
+ ACCESS_ONCE(rsp->n_barrier_done)++;
+ WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0);
+ _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done);
+ smp_mb(); /* Keep increment before caller's subsequent code. */
/* Wait for all rcu_barrier_callback() callbacks to be invoked. */
- wait_for_completion(&rcu_barrier_completion);
+ wait_for_completion(&rsp->barrier_completion);
/* Other rcu_barrier() invocations can now safely proceed. */
- mutex_unlock(&rcu_barrier_mutex);
+ mutex_unlock(&rsp->barrier_mutex);
- destroy_rcu_head_on_stack(&rh);
+ destroy_rcu_head_on_stack(&rd.barrier_head);
}
/**
@@ -2386,7 +2465,7 @@ static void _rcu_barrier(struct rcu_state *rsp,
*/
void rcu_barrier_bh(void)
{
- _rcu_barrier(&rcu_bh_state, call_rcu_bh);
+ _rcu_barrier(&rcu_bh_state);
}
EXPORT_SYMBOL_GPL(rcu_barrier_bh);
@@ -2395,7 +2474,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier_bh);
*/
void rcu_barrier_sched(void)
{
- _rcu_barrier(&rcu_sched_state, call_rcu_sched);
+ _rcu_barrier(&rcu_sched_state);
}
EXPORT_SYMBOL_GPL(rcu_barrier_sched);
@@ -2406,18 +2485,15 @@ static void __init
rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
{
unsigned long flags;
- int i;
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp = rcu_get_root(rsp);
/* Set up local state, ensuring consistent view of global state. */
raw_spin_lock_irqsave(&rnp->lock, flags);
rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
- rdp->nxtlist = NULL;
- for (i = 0; i < RCU_NEXT_SIZE; i++)
- rdp->nxttail[i] = &rdp->nxtlist;
+ init_callback_list(rdp);
rdp->qlen_lazy = 0;
- rdp->qlen = 0;
+ ACCESS_ONCE(rdp->qlen) = 0;
rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
@@ -2491,9 +2567,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
static void __cpuinit rcu_prepare_cpu(int cpu)
{
- rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
- rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
- rcu_preempt_init_percpu_data(cpu);
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp)
+ rcu_init_percpu_data(cpu, rsp,
+ strcmp(rsp->name, "rcu_preempt") == 0);
}
/*
@@ -2505,6 +2583,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
long cpu = (long)hcpu;
struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
struct rcu_node *rnp = rdp->mynode;
+ struct rcu_state *rsp;
trace_rcu_utilization("Start CPU hotplug");
switch (action) {
@@ -2529,18 +2608,16 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
* touch any data without introducing corruption. We send the
* dying CPU's callbacks to an arbitrarily chosen online CPU.
*/
- rcu_cleanup_dying_cpu(&rcu_bh_state);
- rcu_cleanup_dying_cpu(&rcu_sched_state);
- rcu_preempt_cleanup_dying_cpu();
+ for_each_rcu_flavor(rsp)
+ rcu_cleanup_dying_cpu(rsp);
rcu_cleanup_after_idle(cpu);
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
- rcu_cleanup_dead_cpu(cpu, &rcu_bh_state);
- rcu_cleanup_dead_cpu(cpu, &rcu_sched_state);
- rcu_preempt_cleanup_dead_cpu(cpu);
+ for_each_rcu_flavor(rsp)
+ rcu_cleanup_dead_cpu(cpu, rsp);
break;
default:
break;
@@ -2573,9 +2650,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
{
int i;
- for (i = NUM_RCU_LVLS - 1; i > 0; i--)
+ for (i = rcu_num_lvls - 1; i > 0; i--)
rsp->levelspread[i] = CONFIG_RCU_FANOUT;
- rsp->levelspread[0] = CONFIG_RCU_FANOUT_LEAF;
+ rsp->levelspread[0] = rcu_fanout_leaf;
}
#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
static void __init rcu_init_levelspread(struct rcu_state *rsp)
@@ -2585,7 +2662,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
int i;
cprv = NR_CPUS;
- for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
+ for (i = rcu_num_lvls - 1; i >= 0; i--) {
ccur = rsp->levelcnt[i];
rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
cprv = ccur;
@@ -2612,13 +2689,15 @@ static void __init rcu_init_one(struct rcu_state *rsp,
/* Initialize the level-tracking arrays. */
- for (i = 1; i < NUM_RCU_LVLS; i++)
+ for (i = 0; i < rcu_num_lvls; i++)
+ rsp->levelcnt[i] = num_rcu_lvl[i];
+ for (i = 1; i < rcu_num_lvls; i++)
rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
rcu_init_levelspread(rsp);
/* Initialize the elements themselves, starting from the leaves. */
- for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
+ for (i = rcu_num_lvls - 1; i >= 0; i--) {
cpustride *= rsp->levelspread[i];
rnp = rsp->level[i];
for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
@@ -2648,13 +2727,74 @@ static void __init rcu_init_one(struct rcu_state *rsp,
}
rsp->rda = rda;
- rnp = rsp->level[NUM_RCU_LVLS - 1];
+ rnp = rsp->level[rcu_num_lvls - 1];
for_each_possible_cpu(i) {
while (i > rnp->grphi)
rnp++;
per_cpu_ptr(rsp->rda, i)->mynode = rnp;
rcu_boot_init_percpu_data(i, rsp);
}
+ list_add(&rsp->flavors, &rcu_struct_flavors);
+}
+
+/*
+ * Compute the rcu_node tree geometry from kernel parameters. This cannot
+ * replace the definitions in rcutree.h because those are needed to size
+ * the ->node array in the rcu_state structure.
+ */
+static void __init rcu_init_geometry(void)
+{
+ int i;
+ int j;
+ int n = nr_cpu_ids;
+ int rcu_capacity[MAX_RCU_LVLS + 1];
+
+ /* If the compile-time values are accurate, just leave. */
+ if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF)
+ return;
+
+ /*
+ * Compute number of nodes that can be handled an rcu_node tree
+ * with the given number of levels. Setting rcu_capacity[0] makes
+ * some of the arithmetic easier.
+ */
+ rcu_capacity[0] = 1;
+ rcu_capacity[1] = rcu_fanout_leaf;
+ for (i = 2; i <= MAX_RCU_LVLS; i++)
+ rcu_capacity[i] = rcu_capacity[i - 1] * CONFIG_RCU_FANOUT;
+
+ /*
+ * The boot-time rcu_fanout_leaf parameter is only permitted
+ * to increase the leaf-level fanout, not decrease it. Of course,
+ * the leaf-level fanout cannot exceed the number of bits in
+ * the rcu_node masks. Finally, the tree must be able to accommodate
+ * the configured number of CPUs. Complain and fall back to the
+ * compile-time values if these limits are exceeded.
+ */
+ if (rcu_fanout_leaf < CONFIG_RCU_FANOUT_LEAF ||
+ rcu_fanout_leaf > sizeof(unsigned long) * 8 ||
+ n > rcu_capacity[MAX_RCU_LVLS]) {
+ WARN_ON(1);
+ return;
+ }
+
+ /* Calculate the number of rcu_nodes at each level of the tree. */
+ for (i = 1; i <= MAX_RCU_LVLS; i++)
+ if (n <= rcu_capacity[i]) {
+ for (j = 0; j <= i; j++)
+ num_rcu_lvl[j] =
+ DIV_ROUND_UP(n, rcu_capacity[i - j]);
+ rcu_num_lvls = i;
+ for (j = i + 1; j <= MAX_RCU_LVLS; j++)
+ num_rcu_lvl[j] = 0;
+ break;
+ }
+
+ /* Calculate the total number of rcu_node structures. */
+ rcu_num_nodes = 0;
+ for (i = 0; i <= MAX_RCU_LVLS; i++)
+ rcu_num_nodes += num_rcu_lvl[i];
+ rcu_num_nodes -= n;
}
void __init rcu_init(void)
@@ -2662,6 +2802,7 @@ void __init rcu_init(void)
int cpu;
rcu_bootup_announce();
+ rcu_init_geometry();
rcu_init_one(&rcu_sched_state, &rcu_sched_data);
rcu_init_one(&rcu_bh_state, &rcu_bh_data);
__rcu_init_preempt();