From 102c9323c35a83789ad5ebd3c45fa8fb389add88 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)"
Date: Fri, 12 Jul 2013 17:07:27 -0400
Subject: tracing: Add __tracepoint_string() to export string pointers

There are several tracepoints (mostly in RCU) that reference a string pointer and use the print format of "%s" to display the string that exists in the kernel, instead of copying the actual string into the ring buffer (this saves time and ring buffer space). But this is a problem for userspace tools that read the binary buffers: they have the address of the string but no access to the string itself. The end result is output that looks like:

 rcu_dyntick: ffffffff818adeaa 1 0
 rcu_dyntick: ffffffff818adeb5 0 140000000000000
 rcu_dyntick: ffffffff818adeb5 0 140000000000000
 rcu_utilization: ffffffff8184333b
 rcu_utilization: ffffffff8184333b

The above is pretty useless when read by userspace tools. Ideally we would want something that looks like this:

 rcu_dyntick: Start 1 0
 rcu_dyntick: End 0 140000000000000
 rcu_dyntick: Start 140000000000000 0
 rcu_callback: rcu_preempt rhp=0xffff880037aff710 func=put_cred_rcu 0/4
 rcu_callback: rcu_preempt rhp=0xffff880078961980 func=file_free_rcu 0/5
 rcu_dyntick: End 0 1

trace_printk(), which likewise stores only the address of its format string instead of recording the string into the buffer itself, exports the mapping of kernel addresses to format strings via the printk_formats file in the debugfs tracing directory. The tracepoint strings can use this same method, outputting their strings to the same file, and the userspace tools will then be able to decipher the addresses without any modification (a usage sketch appears further below). The tracepoint strings need their own section to hold the strings, because anything placed in the trace_printk section causes the trace_printk() buffers to be allocated. Since trace_printk() is only for debugging and should never remain in a production kernel, we can not reuse the trace_printk sections. Add a new tracepoint_str section that is also examined when producing the printk_formats output.

Cc: Paul E.
McKenney Signed-off-by: Steven Rostedt --- include/asm-generic/vmlinux.lds.h | 7 ++++++- include/linux/ftrace_event.h | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 69732d279e8b..83e2c31e8b00 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -122,8 +122,12 @@ #define TRACE_PRINTKS() VMLINUX_SYMBOL(__start___trace_bprintk_fmt) = .; \ *(__trace_printk_fmt) /* Trace_printk fmt' pointer */ \ VMLINUX_SYMBOL(__stop___trace_bprintk_fmt) = .; +#define TRACEPOINT_STR() VMLINUX_SYMBOL(__start___tracepoint_str) = .; \ + *(__tracepoint_str) /* Trace_printk fmt' pointer */ \ + VMLINUX_SYMBOL(__stop___tracepoint_str) = .; #else #define TRACE_PRINTKS() +#define TRACEPOINT_STR() #endif #ifdef CONFIG_FTRACE_SYSCALLS @@ -190,7 +194,8 @@ VMLINUX_SYMBOL(__stop___verbose) = .; \ LIKELY_PROFILE() \ BRANCH_PROFILE() \ - TRACE_PRINTKS() + TRACE_PRINTKS() \ + TRACEPOINT_STR() /* * Data section helpers diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 4372658c73ae..81af18a75f4d 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -357,6 +357,40 @@ do { \ __trace_printk(ip, fmt, ##args); \ } while (0) +/** + * tracepoint_string - register constant persistent string to trace system + * @str - a constant persistent string that will be referenced in tracepoints + * + * If constant strings are being used in tracepoints, it is faster and + * more efficient to just save the pointer to the string and reference + * that with a printf "%s" instead of saving the string in the ring buffer + * and wasting space and time. + * + * The problem with the above approach is that userspace tools that read + * the binary output of the trace buffers do not have access to the string. + * Instead they just show the address of the string which is not very + * useful to users. + * + * With tracepoint_string(), the string will be registered to the tracing + * system and exported to userspace via the debugfs/tracing/printk_formats + * file that maps the string address to the string text. This way userspace + * tools that read the binary buffers have a way to map the pointers to + * the ASCII strings they represent. + * + * The @str used must be a constant string and persistent as it would not + * make sense to show a string that no longer exists. But it is still fine + * to be used with modules, because when modules are unloaded, if they + * had tracepoints, the ring buffers are cleared too. As long as the string + * does not change during the life of the module, it is fine to use + * tracepoint_string() within a module. + */ +#define tracepoint_string(str) \ + ({ \ + static const char *___tp_str __tracepoint_string = str; \ + ___tp_str; \ + }) +#define __tracepoint_string __attribute__((section("__tracepoint_str"))) + #ifdef CONFIG_PERF_EVENTS struct perf_event; -- cgit v1.2.3 From e66c33d579ea566d10e8c8695a7168aae3e02992 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 12 Jul 2013 16:50:28 -0400 Subject: rcu: Add const annotation to char * for RCU tracepoints and functions All the RCU tracepoints and functions that reference char pointers do so with just 'char *' even though they do not modify the contents of the string itself. This will cause warnings if a const char * is used in one of these functions. 
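As an aside, here is the usage sketch promised in the first patch above. It is a minimal, hypothetical illustration rather than code from these patches: the caller rcu_example_enter() is made up, while tracepoint_string() and the const-annotated trace_rcu_dyntick() are the real interfaces.

	/* Hypothetical caller; only the tracepoint_string() pattern matters. */
	static void rcu_example_enter(long long oldnesting, long long newnesting)
	{
		/*
		 * tracepoint_string() records a pointer to "Start" in the
		 * __tracepoint_str section and evaluates to that pointer,
		 * so only an address goes into the ring buffer; userspace
		 * resolves the text through the printk_formats file.
		 */
		trace_rcu_dyntick(tracepoint_string("Start"),
				  oldnesting, newnesting);
	}

A follow-up RCU patch, not shown here, wraps this in a short TPS() macro so that call sites stay compact.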
The RCU tracepoints store the pointer to the string to refer back to them when the trace output is displayed. As this can be minutes, hours or even days later, those strings had better be constant. This change also opens the door to allow the RCU tracepoint strings and their addresses to be exported so that userspace tracing tools can translate the contents of the pointers of the RCU tracepoints. Signed-off-by: Steven Rostedt --- include/linux/rcupdate.h | 4 +-- include/trace/events/rcu.h | 82 +++++++++++++++++++++++----------------------- kernel/rcu.h | 2 +- kernel/rcupdate.c | 2 +- kernel/rcutiny.c | 2 +- kernel/rcutiny_plugin.h | 2 +- kernel/rcutorture.c | 8 ++--- kernel/rcutree.c | 4 +-- kernel/rcutree.h | 2 +- 9 files changed, 54 insertions(+), 54 deletions(-) (limited to 'include') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 4b14bdc911d7..0c38abbe6e35 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -52,7 +52,7 @@ extern int rcutorture_runnable; /* for sysctl */ #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) extern void rcutorture_record_test_transition(void); extern void rcutorture_record_progress(unsigned long vernum); -extern void do_trace_rcu_torture_read(char *rcutorturename, +extern void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, unsigned long secs, unsigned long c_old, @@ -65,7 +65,7 @@ static inline void rcutorture_record_progress(unsigned long vernum) { } #ifdef CONFIG_RCU_TRACE -extern void do_trace_rcu_torture_read(char *rcutorturename, +extern void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, unsigned long secs, unsigned long c_old, diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 59ebcc89f148..ee2376cfaab3 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -19,12 +19,12 @@ */ TRACE_EVENT(rcu_utilization, - TP_PROTO(char *s), + TP_PROTO(const char *s), TP_ARGS(s), TP_STRUCT__entry( - __field(char *, s) + __field(const char *, s) ), TP_fast_assign( @@ -51,14 +51,14 @@ TRACE_EVENT(rcu_utilization, */ TRACE_EVENT(rcu_grace_period, - TP_PROTO(char *rcuname, unsigned long gpnum, char *gpevent), + TP_PROTO(const char *rcuname, unsigned long gpnum, const char *gpevent), TP_ARGS(rcuname, gpnum, gpevent), TP_STRUCT__entry( - __field(char *, rcuname) + __field(const char *, rcuname) __field(unsigned long, gpnum) - __field(char *, gpevent) + __field(const char *, gpevent) ), TP_fast_assign( @@ -89,21 +89,21 @@ TRACE_EVENT(rcu_grace_period, */ TRACE_EVENT(rcu_future_grace_period, - TP_PROTO(char *rcuname, unsigned long gpnum, unsigned long completed, + TP_PROTO(const char *rcuname, unsigned long gpnum, unsigned long completed, unsigned long c, u8 level, int grplo, int grphi, - char *gpevent), + const char *gpevent), TP_ARGS(rcuname, gpnum, completed, c, level, grplo, grphi, gpevent), TP_STRUCT__entry( - __field(char *, rcuname) + __field(const char *, rcuname) __field(unsigned long, gpnum) __field(unsigned long, completed) __field(unsigned long, c) __field(u8, level) __field(int, grplo) __field(int, grphi) - __field(char *, gpevent) + __field(const char *, gpevent) ), TP_fast_assign( @@ -132,13 +132,13 @@ TRACE_EVENT(rcu_future_grace_period, */ TRACE_EVENT(rcu_grace_period_init, - TP_PROTO(char *rcuname, unsigned long gpnum, u8 level, + TP_PROTO(const char *rcuname, unsigned long gpnum, u8 level, int grplo, int grphi, unsigned long qsmask), TP_ARGS(rcuname, gpnum, level, grplo, grphi, qsmask), 
TP_STRUCT__entry( - __field(char *, rcuname) + __field(const char *, rcuname) __field(unsigned long, gpnum) __field(u8, level) __field(int, grplo) @@ -168,12 +168,12 @@ TRACE_EVENT(rcu_grace_period_init, */ TRACE_EVENT(rcu_preempt_task, - TP_PROTO(char *rcuname, int pid, unsigned long gpnum), + TP_PROTO(const char *rcuname, int pid, unsigned long gpnum), TP_ARGS(rcuname, pid, gpnum), TP_STRUCT__entry( - __field(char *, rcuname) + __field(const char *, rcuname) __field(unsigned long, gpnum) __field(int, pid) ), @@ -195,12 +195,12 @@ TRACE_EVENT(rcu_preempt_task, */ TRACE_EVENT(rcu_unlock_preempted_task, - TP_PROTO(char *rcuname, unsigned long gpnum, int pid), + TP_PROTO(const char *rcuname, unsigned long gpnum, int pid), TP_ARGS(rcuname, gpnum, pid), TP_STRUCT__entry( - __field(char *, rcuname) + __field(const char *, rcuname) __field(unsigned long, gpnum) __field(int, pid) ), @@ -224,14 +224,14 @@ TRACE_EVENT(rcu_unlock_preempted_task, */ TRACE_EVENT(rcu_quiescent_state_report, - TP_PROTO(char *rcuname, unsigned long gpnum, + TP_PROTO(const char *rcuname, unsigned long gpnum, unsigned long mask, unsigned long qsmask, u8 level, int grplo, int grphi, int gp_tasks), TP_ARGS(rcuname, gpnum, mask, qsmask, level, grplo, grphi, gp_tasks), TP_STRUCT__entry( - __field(char *, rcuname) + __field(const char *, rcuname) __field(unsigned long, gpnum) __field(unsigned long, mask) __field(unsigned long, qsmask) @@ -268,15 +268,15 @@ TRACE_EVENT(rcu_quiescent_state_report, */ TRACE_EVENT(rcu_fqs, - TP_PROTO(char *rcuname, unsigned long gpnum, int cpu, char *qsevent), + TP_PROTO(const char *rcuname, unsigned long gpnum, int cpu, const char *qsevent), TP_ARGS(rcuname, gpnum, cpu, qsevent), TP_STRUCT__entry( - __field(char *, rcuname) + __field(const char *, rcuname) __field(unsigned long, gpnum) __field(int, cpu) - __field(char *, qsevent) + __field(const char *, qsevent) ), TP_fast_assign( @@ -308,12 +308,12 @@ TRACE_EVENT(rcu_fqs, */ TRACE_EVENT(rcu_dyntick, - TP_PROTO(char *polarity, long long oldnesting, long long newnesting), + TP_PROTO(const char *polarity, long long oldnesting, long long newnesting), TP_ARGS(polarity, oldnesting, newnesting), TP_STRUCT__entry( - __field(char *, polarity) + __field(const char *, polarity) __field(long long, oldnesting) __field(long long, newnesting) ), @@ -352,12 +352,12 @@ TRACE_EVENT(rcu_dyntick, */ TRACE_EVENT(rcu_prep_idle, - TP_PROTO(char *reason), + TP_PROTO(const char *reason), TP_ARGS(reason), TP_STRUCT__entry( - __field(char *, reason) + __field(const char *, reason) ), TP_fast_assign( @@ -376,13 +376,13 @@ TRACE_EVENT(rcu_prep_idle, */ TRACE_EVENT(rcu_callback, - TP_PROTO(char *rcuname, struct rcu_head *rhp, long qlen_lazy, + TP_PROTO(const char *rcuname, struct rcu_head *rhp, long qlen_lazy, long qlen), TP_ARGS(rcuname, rhp, qlen_lazy, qlen), TP_STRUCT__entry( - __field(char *, rcuname) + __field(const char *, rcuname) __field(void *, rhp) __field(void *, func) __field(long, qlen_lazy) @@ -412,13 +412,13 @@ TRACE_EVENT(rcu_callback, */ TRACE_EVENT(rcu_kfree_callback, - TP_PROTO(char *rcuname, struct rcu_head *rhp, unsigned long offset, + TP_PROTO(const char *rcuname, struct rcu_head *rhp, unsigned long offset, long qlen_lazy, long qlen), TP_ARGS(rcuname, rhp, offset, qlen_lazy, qlen), TP_STRUCT__entry( - __field(char *, rcuname) + __field(const char *, rcuname) __field(void *, rhp) __field(unsigned long, offset) __field(long, qlen_lazy) @@ -447,12 +447,12 @@ TRACE_EVENT(rcu_kfree_callback, */ TRACE_EVENT(rcu_batch_start, - TP_PROTO(char *rcuname, long 
qlen_lazy, long qlen, long blimit), + TP_PROTO(const char *rcuname, long qlen_lazy, long qlen, long blimit), TP_ARGS(rcuname, qlen_lazy, qlen, blimit), TP_STRUCT__entry( - __field(char *, rcuname) + __field(const char *, rcuname) __field(long, qlen_lazy) __field(long, qlen) __field(long, blimit) @@ -477,12 +477,12 @@ TRACE_EVENT(rcu_batch_start, */ TRACE_EVENT(rcu_invoke_callback, - TP_PROTO(char *rcuname, struct rcu_head *rhp), + TP_PROTO(const char *rcuname, struct rcu_head *rhp), TP_ARGS(rcuname, rhp), TP_STRUCT__entry( - __field(char *, rcuname) + __field(const char *, rcuname) __field(void *, rhp) __field(void *, func) ), @@ -506,12 +506,12 @@ TRACE_EVENT(rcu_invoke_callback, */ TRACE_EVENT(rcu_invoke_kfree_callback, - TP_PROTO(char *rcuname, struct rcu_head *rhp, unsigned long offset), + TP_PROTO(const char *rcuname, struct rcu_head *rhp, unsigned long offset), TP_ARGS(rcuname, rhp, offset), TP_STRUCT__entry( - __field(char *, rcuname) + __field(const char *, rcuname) __field(void *, rhp) __field(unsigned long, offset) ), @@ -539,13 +539,13 @@ TRACE_EVENT(rcu_invoke_kfree_callback, */ TRACE_EVENT(rcu_batch_end, - TP_PROTO(char *rcuname, int callbacks_invoked, + TP_PROTO(const char *rcuname, int callbacks_invoked, bool cb, bool nr, bool iit, bool risk), TP_ARGS(rcuname, callbacks_invoked, cb, nr, iit, risk), TP_STRUCT__entry( - __field(char *, rcuname) + __field(const char *, rcuname) __field(int, callbacks_invoked) __field(bool, cb) __field(bool, nr) @@ -577,13 +577,13 @@ TRACE_EVENT(rcu_batch_end, */ TRACE_EVENT(rcu_torture_read, - TP_PROTO(char *rcutorturename, struct rcu_head *rhp, + TP_PROTO(const char *rcutorturename, struct rcu_head *rhp, unsigned long secs, unsigned long c_old, unsigned long c), TP_ARGS(rcutorturename, rhp, secs, c_old, c), TP_STRUCT__entry( - __field(char *, rcutorturename) + __field(const char *, rcutorturename) __field(struct rcu_head *, rhp) __field(unsigned long, secs) __field(unsigned long, c_old) @@ -623,13 +623,13 @@ TRACE_EVENT(rcu_torture_read, */ TRACE_EVENT(rcu_barrier, - TP_PROTO(char *rcuname, char *s, int cpu, int cnt, unsigned long done), + TP_PROTO(const char *rcuname, const char *s, int cpu, int cnt, unsigned long done), TP_ARGS(rcuname, s, cpu, cnt, done), TP_STRUCT__entry( - __field(char *, rcuname) - __field(char *, s) + __field(const char *, rcuname) + __field(const char *, s) __field(int, cpu) __field(int, cnt) __field(unsigned long, done) diff --git a/kernel/rcu.h b/kernel/rcu.h index 7f8e7590e3e5..0a90ccc65bfb 100644 --- a/kernel/rcu.h +++ b/kernel/rcu.h @@ -94,7 +94,7 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) extern void kfree(const void *); -static inline bool __rcu_reclaim(char *rn, struct rcu_head *head) +static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) { unsigned long offset = (unsigned long)head->func; diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index cce6ba8bbace..14994d4e1a54 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -377,7 +377,7 @@ EXPORT_SYMBOL_GPL(rcuhead_debug_descr); #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) -void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp, +void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, unsigned long secs, unsigned long c_old, unsigned long c) { diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index aa344111de3e..9ed6075dc562 100644 --- a/kernel/rcutiny.c +++ 
b/kernel/rcutiny.c @@ -264,7 +264,7 @@ void rcu_check_callbacks(int cpu, int user) */ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) { - char *rn = NULL; + const char *rn = NULL; struct rcu_head *next, *list; unsigned long flags; RCU_TRACE(int cb_count = 0); diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 0cd385acccfa..280d06cae352 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -36,7 +36,7 @@ struct rcu_ctrlblk { RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */ RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */ RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */ - RCU_TRACE(char *name); /* Name of RCU type. */ + RCU_TRACE(const char *name); /* Name of RCU type. */ }; /* Definition for rcupdate control block. */ diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index f4871e52c546..3d936f0fbcd8 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -267,7 +267,7 @@ rcutorture_shutdown_notify(struct notifier_block *unused1, * Absorb kthreads into a kernel function that won't return, so that * they won't ever access module text or data again. */ -static void rcutorture_shutdown_absorb(char *title) +static void rcutorture_shutdown_absorb(const char *title) { if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { pr_notice( @@ -337,7 +337,7 @@ rcu_random(struct rcu_random_state *rrsp) } static void -rcu_stutter_wait(char *title) +rcu_stutter_wait(const char *title) { while (stutter_pause_test || !rcutorture_runnable) { if (rcutorture_runnable) @@ -366,7 +366,7 @@ struct rcu_torture_ops { int (*stats)(char *page); int irq_capable; int can_boost; - char *name; + const char *name; }; static struct rcu_torture_ops *cur_ops; @@ -1364,7 +1364,7 @@ rcu_torture_stutter(void *arg) } static inline void -rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) +rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) { pr_alert("%s" TORTURE_FLAG "--- %s: nreaders=%d nfakewriters=%d " diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 068de3a93606..30201494560b 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1032,7 +1032,7 @@ static unsigned long rcu_cbs_completed(struct rcu_state *rsp, * rcu_nocb_wait_gp(). */ static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, - unsigned long c, char *s) + unsigned long c, const char *s) { trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, rnp->completed, c, rnp->level, @@ -2720,7 +2720,7 @@ static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy) * Helper function for _rcu_barrier() tracing. If tracing is disabled, * the compiler is expected to optimize this away. */ -static void _rcu_barrier_trace(struct rcu_state *rsp, char *s, +static void _rcu_barrier_trace(struct rcu_state *rsp, const char *s, int cpu, unsigned long done) { trace_rcu_barrier(rsp->name, s, cpu, diff --git a/kernel/rcutree.h b/kernel/rcutree.h index b3832581043c..cbdeac6cea9e 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -445,7 +445,7 @@ struct rcu_state { /* for CPU stalls. */ unsigned long gp_max; /* Maximum GP duration in */ /* jiffies. */ - char *name; /* Name of structure. */ + const char *name; /* Name of structure. */ char abbr; /* Abbreviated name. */ struct list_head flavors; /* List of RCU flavors. */ struct irq_work wakeup_work; /* Postponed wakeups */ -- cgit v1.2.3 From b778ae25366e6f3891fe51306f56a3bca211975d Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 23 Apr 2013 12:51:11 -0700 Subject: debugobjects: Make debug_object_activate() return status In order to better respond to things like duplicate invocations of call_rcu(), RCU needs to see the status of a call to debug_object_activate(). This would allow RCU to leak the callback in order to avoid adding freelist-reuse mischief to the duplicate invoations. This commit therefore makes debug_object_activate() return status, zero for success and -EINVAL for failure. Signed-off-by: Paul E. McKenney Cc: Mathieu Desnoyers Cc: Sedat Dilek Cc: Davidlohr Bueso Cc: Rik van Riel Cc: Thomas Gleixner Cc: Linus Torvalds Tested-by: Sedat Dilek Reviewed-by: Josh Triplett --- include/linux/debugobjects.h | 6 +++--- lib/debugobjects.c | 20 ++++++++++++++------ 2 files changed, 17 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/debugobjects.h b/include/linux/debugobjects.h index 0e5f5785d9f2..98ffcbd4888e 100644 --- a/include/linux/debugobjects.h +++ b/include/linux/debugobjects.h @@ -63,7 +63,7 @@ struct debug_obj_descr { extern void debug_object_init (void *addr, struct debug_obj_descr *descr); extern void debug_object_init_on_stack(void *addr, struct debug_obj_descr *descr); -extern void debug_object_activate (void *addr, struct debug_obj_descr *descr); +extern int debug_object_activate (void *addr, struct debug_obj_descr *descr); extern void debug_object_deactivate(void *addr, struct debug_obj_descr *descr); extern void debug_object_destroy (void *addr, struct debug_obj_descr *descr); extern void debug_object_free (void *addr, struct debug_obj_descr *descr); @@ -85,8 +85,8 @@ static inline void debug_object_init (void *addr, struct debug_obj_descr *descr) { } static inline void debug_object_init_on_stack(void *addr, struct debug_obj_descr *descr) { } -static inline void -debug_object_activate (void *addr, struct debug_obj_descr *descr) { } +static inline int +debug_object_activate (void *addr, struct debug_obj_descr *descr) { return 0; } static inline void debug_object_deactivate(void *addr, struct debug_obj_descr *descr) { } static inline void diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 37061ede8b81..bf2c8b1043d8 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -381,19 +381,21 @@ void debug_object_init_on_stack(void *addr, struct debug_obj_descr *descr) * debug_object_activate - debug checks when an object is activated * @addr: address of the object * @descr: pointer to an object specific debug description structure + * Returns 0 for success, -EINVAL for check failed. */ -void debug_object_activate(void *addr, struct debug_obj_descr *descr) +int debug_object_activate(void *addr, struct debug_obj_descr *descr) { enum debug_obj_state state; struct debug_bucket *db; struct debug_obj *obj; unsigned long flags; + int ret; struct debug_obj o = { .object = addr, .state = ODEBUG_STATE_NOTAVAILABLE, .descr = descr }; if (!debug_objects_enabled) - return; + return 0; db = get_bucket((unsigned long) addr); @@ -405,23 +407,26 @@ void debug_object_activate(void *addr, struct debug_obj_descr *descr) case ODEBUG_STATE_INIT: case ODEBUG_STATE_INACTIVE: obj->state = ODEBUG_STATE_ACTIVE; + ret = 0; break; case ODEBUG_STATE_ACTIVE: debug_print_object(obj, "activate"); state = obj->state; raw_spin_unlock_irqrestore(&db->lock, flags); - debug_object_fixup(descr->fixup_activate, addr, state); - return; + ret = debug_object_fixup(descr->fixup_activate, addr, state); + return ret ? 
-EINVAL : 0; case ODEBUG_STATE_DESTROYED: debug_print_object(obj, "activate"); + ret = -EINVAL; break; default: + ret = 0; break; } raw_spin_unlock_irqrestore(&db->lock, flags); - return; + return ret; } raw_spin_unlock_irqrestore(&db->lock, flags); @@ -431,8 +436,11 @@ void debug_object_activate(void *addr, struct debug_obj_descr *descr) * true or not. */ if (debug_object_fixup(descr->fixup_activate, addr, - ODEBUG_STATE_NOTAVAILABLE)) + ODEBUG_STATE_NOTAVAILABLE)) { debug_print_object(&o, "activate"); + return -EINVAL; + } + return 0; } /** -- cgit v1.2.3 From c34ac00caefbe49d40058ae7200bd58725cebb45 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Jun 2013 10:34:48 -0700 Subject: rculist: list_first_or_null_rcu() should use list_entry_rcu() list_first_or_null_rcu() should test whether the list is empty and, if not, return a pointer to the first entry in an RCU-safe manner. It's broken in several ways. * It compares __kernel @__ptr with __rcu @__next triggering the following sparse warning. net/core/dev.c:4331:17: error: incompatible types in comparison expression (different address spaces) * It doesn't perform rcu_dereference*() and computes the entry address using container_of() directly from the __rcu pointer, which is inconsistent with the other rculist interfaces. As a result, all three in-kernel users - net/core/dev.c, macvlan, cgroup - are buggy. They dereference the pointer w/o going through a read barrier. * While ->next dereference passes through list_next_rcu(), the compiler is still free to fetch ->next more than once and thus nullify the "__ptr != __next" condition check. Fix it by making list_first_or_null_rcu() dereference ->next directly using ACCESS_ONCE() and then use list_entry_rcu() on it like other rculist accessors. v2: Paul pointed out that the compiler may fetch the pointer more than once nullifying the condition check. ACCESS_ONCE() added on ->next dereference. v3: Restored () around macro param which was accidentally removed. Spotted by Paul. Signed-off-by: Tejun Heo Reported-by: Fengguang Wu Cc: Dipankar Sarma Cc: "Paul E. McKenney" Cc: "David S. Miller" Cc: Li Zefan Cc: Patrick McHardy Cc: stable@vger.kernel.org Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rculist.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/rculist.h b/include/linux/rculist.h index f4b1001a4676..4106721c4e5e 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -267,8 +267,9 @@ static inline void list_splice_init_rcu(struct list_head *list, */ #define list_first_or_null_rcu(ptr, type, member) \ ({struct list_head *__ptr = (ptr); \ - struct list_head __rcu *__next = list_next_rcu(__ptr); \ - likely(__ptr != __next) ? container_of(__next, type, member) : NULL; \ + struct list_head *__next = ACCESS_ONCE(__ptr->next); \ + likely(__ptr != __next) ? \ + list_entry_rcu(__next, type, member) : NULL; \ }) /** -- cgit v1.2.3 From feed66ed26a53e700ca02ce1744fed7d0c647292 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 9 May 2013 08:55:54 -0700 Subject: rcu: Eliminate unused APIs intended for adaptive ticks The rcu_user_enter_after_irq() and rcu_user_exit_after_irq() functions were intended for use by adaptive ticks, but changes in implementation have rendered them unnecessary. This commit therefore removes them. Reported-by: Frederic Weisbecker Signed-off-by: Paul E.
McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 4 ---- kernel/rcutree.c | 43 ------------------------------------------- 2 files changed, 47 deletions(-) (limited to 'include') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 0c38abbe6e35..30bea9c25735 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -229,13 +229,9 @@ extern void rcu_irq_exit(void); #ifdef CONFIG_RCU_USER_QS extern void rcu_user_enter(void); extern void rcu_user_exit(void); -extern void rcu_user_enter_after_irq(void); -extern void rcu_user_exit_after_irq(void); #else static inline void rcu_user_enter(void) { } static inline void rcu_user_exit(void) { } -static inline void rcu_user_enter_after_irq(void) { } -static inline void rcu_user_exit_after_irq(void) { } static inline void rcu_user_hooks_switch(struct task_struct *prev, struct task_struct *next) { } #endif /* CONFIG_RCU_USER_QS */ diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 338f1d1c1c66..8807019138c6 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -444,27 +444,6 @@ void rcu_user_enter(void) { rcu_eqs_enter(1); } - -/** - * rcu_user_enter_after_irq - inform RCU that we are going to resume userspace - * after the current irq returns. - * - * This is similar to rcu_user_enter() but in the context of a non-nesting - * irq. After this call, RCU enters into idle mode when the interrupt - * returns. - */ -void rcu_user_enter_after_irq(void) -{ - unsigned long flags; - struct rcu_dynticks *rdtp; - - local_irq_save(flags); - rdtp = &__get_cpu_var(rcu_dynticks); - /* Ensure this irq is interrupting a non-idle RCU state. */ - WARN_ON_ONCE(!(rdtp->dynticks_nesting & DYNTICK_TASK_MASK)); - rdtp->dynticks_nesting = 1; - local_irq_restore(flags); -} #endif /* CONFIG_RCU_USER_QS */ /** @@ -581,28 +560,6 @@ void rcu_user_exit(void) { rcu_eqs_exit(1); } - -/** - * rcu_user_exit_after_irq - inform RCU that we won't resume to userspace - * idle mode after the current non-nesting irq returns. - * - * This is similar to rcu_user_exit() but in the context of an irq. - * This is called when the irq has interrupted a userspace RCU idle mode - * context. When the current non-nesting interrupt returns after this call, - * the CPU won't restore the RCU idle mode. - */ -void rcu_user_exit_after_irq(void) -{ - unsigned long flags; - struct rcu_dynticks *rdtp; - - local_irq_save(flags); - rdtp = &__get_cpu_var(rcu_dynticks); - /* Ensure we are interrupting an RCU idle mode. */ - WARN_ON_ONCE(rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK); - rdtp->dynticks_nesting += DYNTICK_TASK_EXIT_IDLE; - local_irq_restore(flags); -} #endif /* CONFIG_RCU_USER_QS */ /** -- cgit v1.2.3 From 5a581b367b5df0531265311fc681c2abd377e5e6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 27 Jul 2013 03:53:54 -0700 Subject: jiffies: Avoid undefined behavior from signed overflow According to the C standard 3.4.3p3, overflow of a signed integer results in undefined behavior. This commit therefore changes the definitions of time_after(), time_after_eq(), time_after64(), and time_after_eq64() to avoid this undefined behavior. The trick is that the subtraction is done using unsigned arithmetic, which according to 6.2.5p9 cannot overflow because it is defined as modulo arithmetic. This has the added (though admittedly quite small) benefit of shortening four lines of code by four characters each. Note that the C standard considers the cast from unsigned to signed to be implementation-defined, see 6.3.1.3p3. 
However, on a two's-complement system, an implementation that defines anything other than a reinterpretation of the bits is free to come to me, and I will be happy to act as a witness for its being committed to an insane asylum. (Although I have nothing against saturating arithmetic or signals in some cases, these things really should not be the default when compiling an operating-system kernel.) Signed-off-by: Paul E. McKenney Cc: John Stultz Cc: "David S. Miller" Cc: Arnd Bergmann Cc: Ingo Molnar Cc: Linus Torvalds Cc: Eric Dumazet Cc: Kevin Easton [ paulmck: Included time_after64() and time_after_eq64(), as suggested by Eric Dumazet, also fixed commit message.] Reviewed-by: Josh Triplett --- include/linux/jiffies.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 97ba4e78a37e..d235e88cfd7c 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -101,13 +101,13 @@ static inline u64 get_jiffies_64(void) #define time_after(a,b) \ (typecheck(unsigned long, a) && \ typecheck(unsigned long, b) && \ - ((long)(b) - (long)(a) < 0)) + ((long)((b) - (a)) < 0)) #define time_before(a,b) time_after(b,a) #define time_after_eq(a,b) \ (typecheck(unsigned long, a) && \ typecheck(unsigned long, b) && \ - ((long)(a) - (long)(b) >= 0)) + ((long)((a) - (b)) >= 0)) #define time_before_eq(a,b) time_after_eq(b,a) /* @@ -130,13 +130,13 @@ static inline u64 get_jiffies_64(void) #define time_after64(a,b) \ (typecheck(__u64, a) && \ typecheck(__u64, b) && \ - ((__s64)(b) - (__s64)(a) < 0)) + ((__s64)((b) - (a)) < 0)) #define time_before64(a,b) time_after64(b,a) #define time_after_eq64(a,b) \ (typecheck(__u64, a) && \ typecheck(__u64, b) && \ - ((__s64)(a) - (__s64)(b) >= 0)) + ((__s64)((a) - (b)) >= 0)) #define time_before_eq64(a,b) time_after_eq64(b,a) #define time_in_range64(a, b, c) \ -- cgit v1.2.3 From 0edd1b1784cbdad55aca2c1293be018f53c0ab1d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Jun 2013 16:37:22 -0700 Subject: nohz_full: Add full-system-idle state machine This commit adds the state machine that takes the per-CPU idle data as input and produces a full-system-idle indication as output. This state machine is driven out of RCU's quiescent-state-forcing mechanism, which invokes rcu_sysidle_check_cpu() to collect per-CPU idle state and then rcu_sysidle_report() to drive the state machine. The full-system-idle state is sampled using rcu_sys_is_idle(), which also drives the state machine if RCU is idle (and does so by forcing RCU to become non-idle). This function returns true if all but the timekeeping CPU (tick_do_timer_cpu) are idle and have been idle long enough to avoid memory contention on the full_sysidle_state state variable. The rcu_sysidle_force_exit() may be called externally to reset the state machine back into non-idle state. For large systems the state machine is driven out of RCU's force-quiescent-state logic, which provides good scalability at the price of millisecond-scale latencies on the transition to full-system-idle state. This is not so good for battery-powered systems, which are usually small enough that they don't need to care about scalability, but which do care deeply about energy efficiency. Small systems therefore drive the state machine directly out of the idle-entry code. The number of CPUs in a "small" system is defined by a new NO_HZ_FULL_SYSIDLE_SMALL Kconfig parameter, which defaults to 8. Note that this is a build-time definition. 
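Returning briefly to the jiffies patch above: a small sketch of why doing the subtraction in unsigned arithmetic matters at the wrap point. The function and its values are hypothetical; only the subtraction idiom comes from that patch.

	/* Assume a 32-bit unsigned long for concreteness. */
	static int example_wrap(void)
	{
		unsigned long b = 0x7fffffffUL;	/* LONG_MAX */
		unsigned long a = b + 2;	/* counter wrapped past LONG_MAX */

		/*
		 * Old form "(long)(b) - (long)(a)": the subtraction is done
		 * on signed longs and overflows here, which is undefined
		 * behavior.
		 *
		 * New form "(long)((b) - (a))": the subtraction is unsigned
		 * and thus well defined (modulo 2^32); only the final cast
		 * is implementation-defined, and on two's-complement
		 * machines it simply reinterprets the bits.
		 */
		return (long)(b - a) < 0;	/* 0xfffffffeUL -> -2: a is after b */
	}

The same reasoning applies to the 64-bit time_after64() and time_after_eq64() variants.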
Signed-off-by: Paul E. McKenney Cc: Frederic Weisbecker Cc: Steven Rostedt Cc: Lai Jiangshan [ paulmck: Use true and false for boolean constants per Lai Jiangshan. ] Reviewed-by: Josh Triplett [ paulmck: Simplify logic and provide better comments for memory barriers, based on review comments and questions by Lai Jiangshan. ] --- include/linux/rcupdate.h | 18 +++ kernel/rcutree.c | 16 ++- kernel/rcutree.h | 5 + kernel/rcutree_plugin.h | 296 ++++++++++++++++++++++++++++++++++++++++++++++- kernel/time/Kconfig | 27 +++++ 5 files changed, 355 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 30bea9c25735..f1f1bc39346b 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -1011,4 +1011,22 @@ static inline bool rcu_is_nocb_cpu(int cpu) { return false; } #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ +/* Only for use by adaptive-ticks code. */ +#ifdef CONFIG_NO_HZ_FULL_SYSIDLE +extern bool rcu_sys_is_idle(void); +extern void rcu_sysidle_force_exit(void); +#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ + +static inline bool rcu_sys_is_idle(void) +{ + return false; +} + +static inline void rcu_sysidle_force_exit(void) +{ +} + +#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ + + #endif /* __LINUX_RCUPDATE_H */ diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 7b5be56d95ae..eca70f4469c1 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -734,6 +734,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, bool *isidle, unsigned long *maxj) { rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); + rcu_sysidle_check_cpu(rdp, isidle, maxj); return (rdp->dynticks_snap & 0x1) == 0; } @@ -1373,11 +1374,17 @@ int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) rsp->n_force_qs++; if (fqs_state == RCU_SAVE_DYNTICK) { /* Collect dyntick-idle snapshots. */ + if (is_sysidle_rcu_state(rsp)) { + isidle = 1; + maxj = jiffies - ULONG_MAX / 4; + } force_qs_rnp(rsp, dyntick_save_progress_counter, &isidle, &maxj); + rcu_sysidle_report_gp(rsp, isidle, maxj); fqs_state = RCU_FORCE_QS; } else { /* Handle dyntick-idle and offline CPUs. */ + isidle = 0; force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); } /* Clear flag to prevent immediate re-entry. 
*/ @@ -2103,9 +2110,12 @@ static void force_qs_rnp(struct rcu_state *rsp, cpu = rnp->grplo; bit = 1; for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { - if ((rnp->qsmask & bit) != 0 && - f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) - mask |= bit; + if ((rnp->qsmask & bit) != 0) { + if ((rnp->qsmaskinit & bit) != 0) + *isidle = 0; + if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) + mask |= bit; + } } if (mask != 0) { diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 9dd8b177f1ac..6fd3659cf01a 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -555,6 +555,11 @@ static void rcu_kick_nohz_cpu(int cpu); static bool init_nocb_callback_list(struct rcu_data *rdp); static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); +static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, + unsigned long *maxj); +static bool is_sysidle_rcu_state(struct rcu_state *rsp); +static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, + unsigned long maxj); static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index a7419ceb19ad..45ebba747af4 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -28,7 +28,7 @@ #include #include #include -#include +#include "time/tick-internal.h" #define RCU_KTHREAD_PRIO 1 @@ -2382,12 +2382,12 @@ static void rcu_kick_nohz_cpu(int cpu) * most active flavor of RCU. */ #ifdef CONFIG_PREEMPT_RCU -static struct rcu_state __maybe_unused *rcu_sysidle_state = &rcu_preempt_state; +static struct rcu_state *rcu_sysidle_state = &rcu_preempt_state; #else /* #ifdef CONFIG_PREEMPT_RCU */ -static struct rcu_state __maybe_unused *rcu_sysidle_state = &rcu_sched_state; +static struct rcu_state *rcu_sysidle_state = &rcu_sched_state; #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ -static int __maybe_unused full_sysidle_state; /* Current system-idle state. */ +static int full_sysidle_state; /* Current system-idle state. */ #define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */ #define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */ #define RCU_SYSIDLE_LONG 2 /* All CPUs idle for long enough. */ @@ -2430,6 +2430,38 @@ static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1); } +/* + * Unconditionally force exit from full system-idle state. This is + * invoked when a normal CPU exits idle, but must be called separately + * for the timekeeping CPU (tick_do_timer_cpu). The reason for this + * is that the timekeeping CPU is permitted to take scheduling-clock + * interrupts while the system is in system-idle state, and of course + * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock + * interrupt from any other type of interrupt. + */ +void rcu_sysidle_force_exit(void) +{ + int oldstate = ACCESS_ONCE(full_sysidle_state); + int newoldstate; + + /* + * Each pass through the following loop attempts to exit full + * system-idle state. If contention proves to be a problem, + * a trylock-based contention tree could be used here. + */ + while (oldstate > RCU_SYSIDLE_SHORT) { + newoldstate = cmpxchg(&full_sysidle_state, + oldstate, RCU_SYSIDLE_NOT); + if (oldstate == newoldstate && + oldstate == RCU_SYSIDLE_FULL_NOTED) { + rcu_kick_nohz_cpu(tick_do_timer_cpu); + return; /* We cleared it, done! */ + } + oldstate = newoldstate; + } + smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. 
*/ +} + /* * Invoked to note entry to irq or task transition from idle. Note that * usermode execution does -not- count as idle here! The caller must @@ -2463,6 +2495,247 @@ static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) atomic_inc(&rdtp->dynticks_idle); smp_mb__after_atomic_inc(); WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1)); + + /* + * If we are the timekeeping CPU, we are permitted to be non-idle + * during a system-idle state. This must be the case, because + * the timekeeping CPU has to take scheduling-clock interrupts + * during the time that the system is transitioning to full + * system-idle state. This means that the timekeeping CPU must + * invoke rcu_sysidle_force_exit() directly if it does anything + * more than take a scheduling-clock interrupt. + */ + if (smp_processor_id() == tick_do_timer_cpu) + return; + + /* Update system-idle state: We are clearly no longer fully idle! */ + rcu_sysidle_force_exit(); +} + +/* + * Check to see if the current CPU is idle. Note that usermode execution + * does not count as idle. The caller must have disabled interrupts. + */ +static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, + unsigned long *maxj) +{ + int cur; + unsigned long j; + struct rcu_dynticks *rdtp = rdp->dynticks; + + /* + * If some other CPU has already reported non-idle, if this is + * not the flavor of RCU that tracks sysidle state, or if this + * is an offline or the timekeeping CPU, nothing to do. + */ + if (!*isidle || rdp->rsp != rcu_sysidle_state || + cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) + return; + /* WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); */ + + /* Pick up current idle and NMI-nesting counter and check. */ + cur = atomic_read(&rdtp->dynticks_idle); + if (cur & 0x1) { + *isidle = false; /* We are not idle! */ + return; + } + smp_mb(); /* Read counters before timestamps. */ + + /* Pick up timestamps. */ + j = ACCESS_ONCE(rdtp->dynticks_idle_jiffies); + /* If this CPU entered idle more recently, update maxj timestamp. */ + if (ULONG_CMP_LT(*maxj, j)) + *maxj = j; +} + +/* + * Is this the flavor of RCU that is handling full-system idle? + */ +static bool is_sysidle_rcu_state(struct rcu_state *rsp) +{ + return rsp == rcu_sysidle_state; +} + +/* + * Return a delay in jiffies based on the number of CPUs, rcu_node + * leaf fanout, and jiffies tick rate. The idea is to allow larger + * systems more time to transition to full-idle state in order to + * avoid the cache thrashing that otherwise occur on the state variable. + * Really small systems (less than a couple of tens of CPUs) should + * instead use a single global atomically incremented counter, and later + * versions of this will automatically reconfigure themselves accordingly. + */ +static unsigned long rcu_sysidle_delay(void) +{ + if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) + return 0; + return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000); +} + +/* + * Advance the full-system-idle state. This is invoked when all of + * the non-timekeeping CPUs are idle. + */ +static void rcu_sysidle(unsigned long j) +{ + /* Check the current state. */ + switch (ACCESS_ONCE(full_sysidle_state)) { + case RCU_SYSIDLE_NOT: + + /* First time all are idle, so note a short idle period. */ + ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT; + break; + + case RCU_SYSIDLE_SHORT: + + /* + * Idle for a bit, time to advance to next state? + * cmpxchg failure means race with non-idle, let them win. 
+ */ + if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) + (void)cmpxchg(&full_sysidle_state, + RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG); + break; + + case RCU_SYSIDLE_LONG: + + /* + * Do an additional check pass before advancing to full. + * cmpxchg failure means race with non-idle, let them win. + */ + if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) + (void)cmpxchg(&full_sysidle_state, + RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL); + break; + + default: + break; + } +} + +/* + * Found a non-idle non-timekeeping CPU, so kick the system-idle state + * back to the beginning. + */ +static void rcu_sysidle_cancel(void) +{ + smp_mb(); + ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT; +} + +/* + * Update the sysidle state based on the results of a force-quiescent-state + * scan of the CPUs' dyntick-idle state. + */ +static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, + unsigned long maxj, bool gpkt) +{ + if (rsp != rcu_sysidle_state) + return; /* Wrong flavor, ignore. */ + if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) + return; /* Running state machine from timekeeping CPU. */ + if (isidle) + rcu_sysidle(maxj); /* More idle! */ + else + rcu_sysidle_cancel(); /* Idle is over. */ +} + +/* + * Wrapper for rcu_sysidle_report() when called from the grace-period + * kthread's context. + */ +static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, + unsigned long maxj) +{ + rcu_sysidle_report(rsp, isidle, maxj, true); +} + +/* Callback and function for forcing an RCU grace period. */ +struct rcu_sysidle_head { + struct rcu_head rh; + int inuse; +}; + +static void rcu_sysidle_cb(struct rcu_head *rhp) +{ + struct rcu_sysidle_head *rshp; + + /* + * The following memory barrier is needed to replace the + * memory barriers that would normally be in the memory + * allocator. + */ + smp_mb(); /* grace period precedes setting inuse. */ + + rshp = container_of(rhp, struct rcu_sysidle_head, rh); + ACCESS_ONCE(rshp->inuse) = 0; +} + +/* + * Check to see if the system is fully idle, other than the timekeeping CPU. + * The caller must have disabled interrupts. + */ +bool rcu_sys_is_idle(void) +{ + static struct rcu_sysidle_head rsh; + int rss = ACCESS_ONCE(full_sysidle_state); + + if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu)) + return false; + + /* Handle small-system case by doing a full scan of CPUs. */ + if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) { + int oldrss = rss - 1; + + /* + * One pass to advance to each state up to _FULL. + * Give up if any pass fails to advance the state. + */ + while (rss < RCU_SYSIDLE_FULL && oldrss < rss) { + int cpu; + bool isidle = true; + unsigned long maxj = jiffies - ULONG_MAX / 4; + struct rcu_data *rdp; + + /* Scan all the CPUs looking for nonidle CPUs. */ + for_each_possible_cpu(cpu) { + rdp = per_cpu_ptr(rcu_sysidle_state->rda, cpu); + rcu_sysidle_check_cpu(rdp, &isidle, &maxj); + if (!isidle) + break; + } + rcu_sysidle_report(rcu_sysidle_state, + isidle, maxj, false); + oldrss = rss; + rss = ACCESS_ONCE(full_sysidle_state); + } + } + + /* If this is the first observation of an idle period, record it. */ + if (rss == RCU_SYSIDLE_FULL) { + rss = cmpxchg(&full_sysidle_state, + RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED); + return rss == RCU_SYSIDLE_FULL; + } + + smp_mb(); /* ensure rss load happens before later caller actions. */ + + /* If already fully idle, tell the caller (in case of races). 
*/ + if (rss == RCU_SYSIDLE_FULL_NOTED) + return true; + + /* + * If we aren't there yet, and a grace period is not in flight, + * initiate a grace period. Either way, tell the caller that + * we are not there yet. We use an xchg() rather than an assignment + * to make up for the memory barriers that would otherwise be + * provided by the memory allocator. + */ + if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL && + !rcu_gp_in_progress(rcu_sysidle_state) && + !rsh.inuse && xchg(&rsh.inuse, 1) == 0) + call_rcu(&rsh.rh, rcu_sysidle_cb); + return false; } /* @@ -2483,6 +2756,21 @@ static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) { } +static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, + unsigned long *maxj) +{ +} + +static bool is_sysidle_rcu_state(struct rcu_state *rsp) +{ + return false; +} + +static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, + unsigned long maxj) +{ +} + static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) { } diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index c7d2fd67799e..3381f098070f 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -157,6 +157,33 @@ config NO_HZ_FULL_SYSIDLE Say N if you are unsure. +config NO_HZ_FULL_SYSIDLE_SMALL + int "Number of CPUs above which large-system approach is used" + depends on NO_HZ_FULL_SYSIDLE + range 1 NR_CPUS + default 8 + help + The full-system idle detection mechanism takes a lazy approach + on large systems, as is required to attain decent scalability. + However, on smaller systems, scalability is not anywhere near as + large a concern as is energy efficiency. The sysidle subsystem + therefore uses a fast but non-scalable algorithm for small + systems and a lazier but scalable algorithm for large systems. + This Kconfig parameter defines the number of CPUs in the largest + system that will be considered to be "small". + + The default value will be fine in most cases. Battery-powered + systems that (1) enable NO_HZ_FULL_SYSIDLE, (2) have larger + numbers of CPUs, and (3) are suffering from battery-lifetime + problems due to long sysidle latencies might wish to experiment + with larger values for this Kconfig parameter. On the other + hand, they might be even better served by disabling NO_HZ_FULL + entirely, given that NO_HZ_FULL is intended for HPC and + real-time workloads that at present do not tend to be run on + battery-powered systems. + + Take the default if you are unsure. + config NO_HZ bool "Old Idle dynticks config" depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS -- cgit v1.2.3
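As a closing illustration, a usage sketch for the repaired list_first_or_null_rcu() from the rculist patch above. The struct foo, foo_list, and first_foo_val() names are hypothetical; list_first_or_null_rcu() and the rcu_read_lock()/rcu_read_unlock() pairing are the real interfaces.

	struct foo {
		int val;
		struct list_head list;
	};

	static LIST_HEAD(foo_list);	/* updated elsewhere with list_add_rcu() */

	static int first_foo_val(void)
	{
		struct foo *f;
		int ret = -1;

		rcu_read_lock();
		/*
		 * With the fix, ->next is fetched exactly once via
		 * ACCESS_ONCE() and converted with list_entry_rcu(), so the
		 * lookup is properly ordered against concurrent
		 * list_add_rcu() updates.
		 */
		f = list_first_or_null_rcu(&foo_list, struct foo, list);
		if (f)
			ret = f->val;
		rcu_read_unlock();
		return ret;
	}

Readers must hold rcu_read_lock() across both the macro and any use of the returned entry; updaters serialize list_add_rcu()/list_del_rcu() under their own lock.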