From 0f67f04ffcb592d065a20862a82d4539e0f8e909 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 3 Feb 2015 11:56:20 -0500 Subject: tracing: Only create tracer options files if directory exists Do not bother creating tracer options if no tracing directory exists. If a tracer is enabled via the command line, and is started before the tracing directory is created, then it won't have its tracer specific options created. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 38c613ede10d..d4627f15407a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4172,8 +4172,11 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) free_snapshot(tr); } #endif - /* Currently, only the top instance has options */ - if (tr->flags & TRACE_ARRAY_FL_GLOBAL) { + /* + * Only enable if the directory has been created already. + * Currently, only the top instance has options + */ + if (tr->dir && tr->flags & TRACE_ARRAY_FL_GLOBAL) { destroy_trace_option_files(topts); topts = create_trace_option_files(tr, t); } -- cgit v1.2.3 From 09d23a1d8a82e814bd56a4f121b80ea8214ac49d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 3 Feb 2015 12:45:53 -0500 Subject: tracing: Create cmdline tracer options on tracing fs init The options for cmdline tracers are not created if the debugfs system is not ready yet. If tracing has started before debugfs is up, then the option files for the tracer are not created. Create them when creating the tracing directory if the current tracer requires option files. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d4627f15407a..05e0e50539fc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4105,9 +4105,24 @@ static void tracing_set_nop(struct trace_array *tr) tr->current_trace = &nop_trace; } -static int tracing_set_tracer(struct trace_array *tr, const char *buf) +static void update_tracer_options(struct trace_array *tr, struct tracer *t) { static struct trace_option_dentry *topts; + + /* Only enable if the directory has been created already. */ + if (!tr->dir) + return; + + /* Currently, only the top instance has options */ + if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL)) + return; + + destroy_trace_option_files(topts); + topts = create_trace_option_files(tr, t); +} + +static int tracing_set_tracer(struct trace_array *tr, const char *buf) +{ struct tracer *t; #ifdef CONFIG_TRACER_MAX_TRACE bool had_max_tr; @@ -4172,14 +4187,7 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) free_snapshot(tr); } #endif - /* - * Only enable if the directory has been created already.
- * Currently, only the top instance has options - */ - if (tr->dir && tr->flags & TRACE_ARRAY_FL_GLOBAL) { - destroy_trace_option_files(topts); - topts = create_trace_option_files(tr, t); - } + update_tracer_options(tr, t); #ifdef CONFIG_TRACER_MAX_TRACE if (t->use_max_tr && !had_max_tr) { @@ -6578,6 +6586,10 @@ static __init int tracer_init_debugfs(void) create_trace_options_dir(&global_trace); + /* If the tracer was started via cmdline, create options for it here */ + if (global_trace.current_trace != &nop_trace) + update_tracer_options(&global_trace, global_trace.current_trace); + return 0; } -- cgit v1.2.3 From 8434dc9340cd2e117fc944cf7526263bf490a52a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 20 Jan 2015 12:13:40 -0500 Subject: tracing: Convert the tracing facility over to use tracefs debugfs was fine for the tracing facility as a quick way to get an interface. Now that tracing has matured, it should separate itself from debugfs such that it can be mounted separately without needing to mount all of debugfs with it. That is, users resist using tracing because it requires mounting debugfs. Having tracing have its own file system lets users get the features of tracing without needing to bring in the rest of the kernel's debug infrastructure. Another reason for tracefs is that debugfs does not support mkdir. Currently, to create instances, one does a mkdir in the tracing/instance directory. This is implemented via a hack that forces debugfs to do something it is not intended to do. By converting over to tracefs, this hack can be removed and mkdir can be properly implemented. This patch does not address this yet, but it lays the groundwork for that to be done. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 22 +++++++-------- kernel/trace/trace.c | 55 +++++++++++++++++++++--------------- kernel/trace/trace.h | 2 +- kernel/trace/trace_events.c | 32 ++++++++++----------- kernel/trace/trace_functions_graph.c | 7 ++--- kernel/trace/trace_kprobe.c | 10 +++---- kernel/trace/trace_probe.h | 2 +- kernel/trace/trace_stat.c | 10 +++---- 8 files changed, 74 insertions(+), 66 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 45e5cb143d17..fcc0e7052a79 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include @@ -1008,7 +1008,7 @@ static struct tracer_stat function_stats __initdata = { .stat_show = function_stat_show }; -static __init void ftrace_profile_debugfs(struct dentry *d_tracer) +static __init void ftrace_profile_tracefs(struct dentry *d_tracer) { struct ftrace_profile_stat *stat; struct dentry *entry; @@ -1044,15 +1044,15 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer) } } - entry = debugfs_create_file("function_profile_enabled", 0644, + entry = tracefs_create_file("function_profile_enabled", 0644, d_tracer, NULL, &ftrace_profile_fops); if (!entry) - pr_warning("Could not create debugfs " + pr_warning("Could not create tracefs " "'function_profile_enabled' entry\n"); } #else /* CONFIG_FUNCTION_PROFILER */ -static __init void ftrace_profile_debugfs(struct dentry *d_tracer) +static __init void ftrace_profile_tracefs(struct dentry *d_tracer) { } #endif /* CONFIG_FUNCTION_PROFILER */ @@ -4690,7 +4690,7 @@ void ftrace_destroy_filter_files(struct ftrace_ops *ops) mutex_unlock(&ftrace_lock); } -static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) +static __init int
ftrace_init_dyn_tracefs(struct dentry *d_tracer) { trace_create_file("available_filter_functions", 0444, @@ -4998,7 +4998,7 @@ static int __init ftrace_nodyn_init(void) } core_initcall(ftrace_nodyn_init); -static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } +static inline int ftrace_init_dyn_tracefs(struct dentry *d_tracer) { return 0; } static inline void ftrace_startup_enable(int command) { } static inline void ftrace_startup_all(int command) { } /* Keep as macros so we do not need to define the commands */ @@ -5451,7 +5451,7 @@ static const struct file_operations ftrace_pid_fops = { .release = ftrace_pid_release, }; -static __init int ftrace_init_debugfs(void) +static __init int ftrace_init_tracefs(void) { struct dentry *d_tracer; @@ -5459,16 +5459,16 @@ static __init int ftrace_init_debugfs(void) if (IS_ERR(d_tracer)) return 0; - ftrace_init_dyn_debugfs(d_tracer); + ftrace_init_dyn_tracefs(d_tracer); trace_create_file("set_ftrace_pid", 0644, d_tracer, NULL, &ftrace_pid_fops); - ftrace_profile_debugfs(d_tracer); + ftrace_profile_tracefs(d_tracer); return 0; } -fs_initcall(ftrace_init_debugfs); +fs_initcall(ftrace_init_tracefs); /** * ftrace_kill - kill ftrace diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 05e0e50539fc..6c4739bee4bb 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -5828,6 +5829,14 @@ static inline __init int register_snapshot_cmd(void) { return 0; } static struct dentry *tracing_get_dentry(struct trace_array *tr) { + if (WARN_ON(!tr->dir)) + return ERR_PTR(-ENODEV); + + /* Top directory uses NULL as the parent */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) + return NULL; + + /* All sub buffers have a descriptor */ return tr->dir; } @@ -5842,10 +5851,10 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) if (IS_ERR(d_tracer)) return NULL; - tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer); + tr->percpu_dir = tracefs_create_dir("per_cpu", d_tracer); WARN_ONCE(!tr->percpu_dir, - "Could not create debugfs directory 'per_cpu/%d'\n", cpu); + "Could not create tracefs directory 'per_cpu/%d'\n", cpu); return tr->percpu_dir; } @@ -5862,7 +5871,7 @@ trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent, } static void -tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) +tracing_init_tracefs_percpu(struct trace_array *tr, long cpu) { struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu); struct dentry *d_cpu; @@ -5872,9 +5881,9 @@ tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) return; snprintf(cpu_dir, 30, "cpu%ld", cpu); - d_cpu = debugfs_create_dir(cpu_dir, d_percpu); + d_cpu = tracefs_create_dir(cpu_dir, d_percpu); if (!d_cpu) { - pr_warning("Could not create debugfs '%s' entry\n", cpu_dir); + pr_warning("Could not create tracefs '%s' entry\n", cpu_dir); return; } @@ -6026,9 +6035,9 @@ struct dentry *trace_create_file(const char *name, { struct dentry *ret; - ret = debugfs_create_file(name, mode, parent, data, fops); + ret = tracefs_create_file(name, mode, parent, data, fops); if (!ret) - pr_warning("Could not create debugfs '%s' entry\n", name); + pr_warning("Could not create tracefs '%s' entry\n", name); return ret; } @@ -6045,9 +6054,9 @@ static struct dentry *trace_options_init_dentry(struct trace_array *tr) if (IS_ERR(d_tracer)) return NULL; - tr->options = debugfs_create_dir("options", d_tracer); + tr->options = tracefs_create_dir("options", d_tracer); 
if (!tr->options) { - pr_warning("Could not create debugfs directory 'options'\n"); + pr_warning("Could not create tracefs directory 'options'\n"); return NULL; } @@ -6116,7 +6125,7 @@ destroy_trace_option_files(struct trace_option_dentry *topts) return; for (cnt = 0; topts[cnt].opt; cnt++) - debugfs_remove(topts[cnt].entry); + tracefs_remove(topts[cnt].entry); kfree(topts); } @@ -6205,7 +6214,7 @@ static const struct file_operations rb_simple_fops = { struct dentry *trace_instance_dir; static void -init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer); +init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer); static int allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size) @@ -6321,17 +6330,17 @@ static int new_instance_create(const char *name) if (allocate_trace_buffers(tr, trace_buf_size) < 0) goto out_free_tr; - tr->dir = debugfs_create_dir(name, trace_instance_dir); + tr->dir = tracefs_create_dir(name, trace_instance_dir); if (!tr->dir) goto out_free_tr; ret = event_trace_add_tracer(tr->dir, tr); if (ret) { - debugfs_remove_recursive(tr->dir); + tracefs_remove_recursive(tr->dir); goto out_free_tr; } - init_tracer_debugfs(tr, tr->dir); + init_tracer_tracefs(tr, tr->dir); list_add(&tr->list, &ftrace_trace_arrays); @@ -6404,7 +6413,7 @@ static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t m return -ENOENT; /* - * The inode mutex is locked, but debugfs_create_dir() will also + * The inode mutex is locked, but tracefs_create_dir() will also * take the mutex. As the instances directory can not be destroyed * or changed in any other way, it is safe to unlock it, and * let the dentry try. If two users try to make the same dir at @@ -6434,7 +6443,7 @@ static int instance_rmdir(struct inode *inode, struct dentry *dentry) mutex_unlock(&dentry->d_inode->i_mutex); /* - * The inode mutex is locked, but debugfs_create_dir() will also + * The inode mutex is locked, but tracefs_create_dir() will also * take the mutex. As the instances directory can not be destroyed * or changed in any other way, it is safe to unlock it, and * let the dentry try. 
If two users try to make the same dir at @@ -6459,7 +6468,7 @@ static const struct inode_operations instance_dir_inode_operations = { static __init void create_trace_instances(struct dentry *d_tracer) { - trace_instance_dir = debugfs_create_dir("instances", d_tracer); + trace_instance_dir = tracefs_create_dir("instances", d_tracer); if (WARN_ON(!trace_instance_dir)) return; @@ -6468,7 +6477,7 @@ static __init void create_trace_instances(struct dentry *d_tracer) } static void -init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) +init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) { int cpu; @@ -6522,7 +6531,7 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) #endif for_each_tracing_cpu(cpu) - tracing_init_debugfs_percpu(tr, cpu); + tracing_init_tracefs_percpu(tr, cpu); } @@ -6550,10 +6559,10 @@ struct dentry *tracing_init_dentry(void) return ERR_PTR(-ENOMEM); } - return tr->dir; + return NULL; } -static __init int tracer_init_debugfs(void) +static __init int tracer_init_tracefs(void) { struct dentry *d_tracer; @@ -6563,7 +6572,7 @@ static __init int tracer_init_debugfs(void) if (IS_ERR(d_tracer)) return 0; - init_tracer_debugfs(&global_trace, d_tracer); + init_tracer_tracefs(&global_trace, d_tracer); trace_create_file("tracing_thresh", 0644, d_tracer, &global_trace, &tracing_thresh_fops); @@ -6925,5 +6934,5 @@ __init static int clear_boot_tracer(void) return 0; } -fs_initcall(tracer_init_debugfs); +fs_initcall(tracer_init_tracefs); late_initcall(clear_boot_tracer); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index dd8205a35760..d951deddec89 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -334,7 +334,7 @@ struct tracer_flags { /** - * struct tracer - a specific tracer and its callbacks to interact with debugfs + * struct tracer - a specific tracer and its callbacks to interact with tracefs * @name: the name chosen to select it on the available_tracers file * @init: called when one switches to this tracer (echo name > current_tracer) * @reset: called when one switches to another tracer diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index db54dda10ccc..0d2e47370ee7 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include @@ -480,7 +480,7 @@ static void remove_subsystem(struct ftrace_subsystem_dir *dir) return; if (!--dir->nr_events) { - debugfs_remove_recursive(dir->entry); + tracefs_remove_recursive(dir->entry); list_del(&dir->list); __put_system_dir(dir); } @@ -499,7 +499,7 @@ static void remove_event_file_dir(struct ftrace_event_file *file) } spin_unlock(&dir->d_lock); - debugfs_remove_recursive(dir); + tracefs_remove_recursive(dir); } list_del(&file->list); @@ -1526,7 +1526,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name, } else __get_system(system); - dir->entry = debugfs_create_dir(name, parent); + dir->entry = tracefs_create_dir(name, parent); if (!dir->entry) { pr_warn("Failed to create system directory %s\n", name); __put_system(system); @@ -1539,12 +1539,12 @@ event_subsystem_dir(struct trace_array *tr, const char *name, dir->subsystem = system; file->system = dir; - entry = debugfs_create_file("filter", 0644, dir->entry, dir, + entry = tracefs_create_file("filter", 0644, dir->entry, dir, &ftrace_subsystem_filter_fops); if (!entry) { kfree(system->filter); system->filter = NULL; - pr_warn("Could not create debugfs '%s/filter' entry\n", name); + 
pr_warn("Could not create tracefs '%s/filter' entry\n", name); } trace_create_file("enable", 0644, dir->entry, dir, @@ -1585,9 +1585,9 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file) d_events = parent; name = ftrace_event_name(call); - file->dir = debugfs_create_dir(name, d_events); + file->dir = tracefs_create_dir(name, d_events); if (!file->dir) { - pr_warn("Could not create debugfs '%s' directory\n", name); + pr_warn("Could not create tracefs '%s' directory\n", name); return -1; } @@ -2228,7 +2228,7 @@ static inline int register_event_cmds(void) { return 0; } /* * The top level array has already had its ftrace_event_file * descriptors created in order to allow for early events to - * be recorded. This function is called after the debugfs has been + * be recorded. This function is called after the tracefs has been * initialized, and we now have to create the files associated * to the events. */ @@ -2311,16 +2311,16 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) struct dentry *d_events; struct dentry *entry; - entry = debugfs_create_file("set_event", 0644, parent, + entry = tracefs_create_file("set_event", 0644, parent, tr, &ftrace_set_event_fops); if (!entry) { - pr_warn("Could not create debugfs 'set_event' entry\n"); + pr_warn("Could not create tracefs 'set_event' entry\n"); return -ENOMEM; } - d_events = debugfs_create_dir("events", parent); + d_events = tracefs_create_dir("events", parent); if (!d_events) { - pr_warn("Could not create debugfs 'events' directory\n"); + pr_warn("Could not create tracefs 'events' directory\n"); return -ENOMEM; } @@ -2412,7 +2412,7 @@ int event_trace_del_tracer(struct trace_array *tr) down_write(&trace_event_sem); __trace_remove_event_dirs(tr); - debugfs_remove_recursive(tr->event_dir); + tracefs_remove_recursive(tr->event_dir); up_write(&trace_event_sem); tr->event_dir = NULL; @@ -2534,10 +2534,10 @@ static __init int event_trace_init(void) if (IS_ERR(d_tracer)) return 0; - entry = debugfs_create_file("available_events", 0444, d_tracer, + entry = tracefs_create_file("available_events", 0444, d_tracer, tr, &ftrace_avail_fops); if (!entry) - pr_warn("Could not create debugfs 'available_events' entry\n"); + pr_warn("Could not create tracefs 'available_events' entry\n"); if (trace_define_common_fields()) pr_warn("tracing: Failed to allocate common fields"); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 2d25ad1526bb..9cfea4c6d314 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -6,7 +6,6 @@ * is Copyright (c) Steven Rostedt * */ -#include #include #include #include @@ -151,7 +150,7 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, * The curr_ret_stack is initialized to -1 and get increased * in this function. So it can be less than -1 only if it was * filtered out via ftrace_graph_notrace_addr() which can be - * set from set_graph_notrace file in debugfs by user. + * set from set_graph_notrace file in tracefs by user. 
*/ if (current->curr_ret_stack < -1) return -EBUSY; @@ -1432,7 +1431,7 @@ static const struct file_operations graph_depth_fops = { .llseek = generic_file_llseek, }; -static __init int init_graph_debugfs(void) +static __init int init_graph_tracefs(void) { struct dentry *d_tracer; @@ -1445,7 +1444,7 @@ static __init int init_graph_debugfs(void) return 0; } -fs_initcall(init_graph_debugfs); +fs_initcall(init_graph_tracefs); static __init int init_graph_trace(void) { diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index b4a00def88f5..c1c6655847c8 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1310,7 +1310,7 @@ static int unregister_kprobe_event(struct trace_kprobe *tk) return ret; } -/* Make a debugfs interface for controlling probe points */ +/* Make a tracefs interface for controlling probe points */ static __init int init_kprobe_trace(void) { struct dentry *d_tracer; @@ -1323,20 +1323,20 @@ static __init int init_kprobe_trace(void) if (IS_ERR(d_tracer)) return 0; - entry = debugfs_create_file("kprobe_events", 0644, d_tracer, + entry = tracefs_create_file("kprobe_events", 0644, d_tracer, NULL, &kprobe_events_ops); /* Event list interface */ if (!entry) - pr_warning("Could not create debugfs " + pr_warning("Could not create tracefs " "'kprobe_events' entry\n"); /* Profile interface */ - entry = debugfs_create_file("kprobe_profile", 0444, d_tracer, + entry = tracefs_create_file("kprobe_profile", 0444, d_tracer, NULL, &kprobe_profile_ops); if (!entry) - pr_warning("Could not create debugfs " + pr_warning("Could not create tracefs " "'kprobe_profile' entry\n"); return 0; } diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 4f815fbce16d..19aff635841a 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 75e19e86c954..6cf935316769 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include "trace_stat.h" #include "trace.h" @@ -65,7 +65,7 @@ static void reset_stat_session(struct stat_session *session) static void destroy_session(struct stat_session *session) { - debugfs_remove(session->file); + tracefs_remove(session->file); __reset_stat_session(session); mutex_destroy(&session->stat_mutex); kfree(session); @@ -279,9 +279,9 @@ static int tracing_stat_init(void) if (IS_ERR(d_tracing)) return 0; - stat_dir = debugfs_create_dir("trace_stat", d_tracing); + stat_dir = tracefs_create_dir("trace_stat", d_tracing); if (!stat_dir) - pr_warning("Could not create debugfs " + pr_warning("Could not create tracefs " "'trace_stat' entry\n"); return 0; } @@ -291,7 +291,7 @@ static int init_stat_file(struct stat_session *session) if (!stat_dir && tracing_stat_init()) return -ENODEV; - session->file = debugfs_create_file(session->ts->name, 0644, + session->file = tracefs_create_file(session->ts->name, 0644, stat_dir, session, &tracing_stat_fops); if (!session->file) -- cgit v1.2.3 From f76180bc07abc399977bfbe8c43bf58c4570e893 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 20 Jan 2015 15:48:46 -0500 Subject: tracing: Automatically mount tracefs on debugfs/tracing As tools currently rely on the tracing directory in debugfs, we cannot just create a tracefs infrastructure and expect sysadmins to mount the new tracefs to have their old tools work.
Instead, the debugfs tracing directory is still created and the tracefs file system is mounted there when the debugfs filesystem is mounted. No longer does the tracing infrastructure update the debugfs file system, but instead interacts with the tracefs file system. But now, it still appears to the user like nothing changed, except you also have the feature of mounting just the tracing system without needing all of debugfs! Cc: Al Viro Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6c4739bee4bb..b4aa936509d2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -6535,6 +6536,28 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) } +static struct vfsmount *trace_automount(void *ingore) +{ + struct vfsmount *mnt; + struct file_system_type *type; + + /* + * To maintain backward compatibility for tools that mount + * debugfs to get to the tracing facility, tracefs is automatically + * mounted to the debugfs/tracing directory. + */ + type = get_fs_type("tracefs"); + if (!type) + return NULL; + mnt = vfs_kern_mount(type, 0, "tracefs", NULL); + put_filesystem(type); + if (IS_ERR(mnt)) + return NULL; + mntget(mnt); + + return mnt; +} + /** * tracing_init_dentry - initialize top level trace array * @@ -6546,14 +6569,21 @@ struct dentry *tracing_init_dentry(void) { struct trace_array *tr = &global_trace; + /* The top level trace array uses NULL as parent */ if (tr->dir) - return tr->dir; + return NULL; if (WARN_ON(!debugfs_initialized())) return ERR_PTR(-ENODEV); - tr->dir = debugfs_create_dir("tracing", NULL); - + /* + * As there may still be users that expect the tracing + * files to exist in debugfs/tracing, we must automount + * the tracefs file system there, so older tools still + * work with the newer kerenl. + */ + tr->dir = debugfs_create_automount("tracing", NULL, + trace_automount, NULL); if (!tr->dir) { pr_warn_once("Could not create debugfs directory 'tracing'\n"); return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From eae473581cf93dad94ca833aa961c033c6a43924 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 21 Jan 2015 10:01:39 -0500 Subject: tracing: Have mkdir and rmdir be part of tracefs The tracing "instances" directory can create sub tracing buffers with mkdir, and remove them with rmdir. As a mkdir will also create all the files and directories that control the sub buffer the inode mutexes need to be released before this is done, to avoid deadlocks. It is better to let the tracing system unlock the inode mutexes before calling the functions that create the files within the new directory (or deletes the files from the one being destroyed). Now that tracing has been converted over to tracefs, the tracefs file system can be modified to accommodate this feature. It still releases the locks, but the filesystem itself can take care of the ugly business and let the user just do what it needs. The tracing system now attaches a descriptor to the directory dentry that can have userspace create or remove sub directories. If this descriptor does not exist for a dentry, then that dentry can not be used to create other directories. This descriptor holds a mkdir and rmdir method that only takes a character string as an argument. 
The tracefs file system will first make a copy of the dentry name before releasing the locks. Then it will pass the copied name to the methods. It is up to the tracing system that supplied the methods to handle races with duplicate names and such as all the inode mutexes would be released when the functions are called. Cc: Al Viro Signed-off-by: Steven Rostedt --- fs/tracefs/inode.c | 151 +++++++++++++++++++++++++++++++++++++++++++----- include/linux/tracefs.h | 4 ++ kernel/trace/trace.c | 75 ++---------------------- 3 files changed, 145 insertions(+), 85 deletions(-) (limited to 'kernel') diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 0b9cf5cf24c9..d92bdf3b079a 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -50,6 +50,84 @@ static const struct file_operations tracefs_file_operations = { .llseek = noop_llseek, }; +static struct tracefs_dir_ops { + int (*mkdir)(const char *name); + int (*rmdir)(const char *name); +} tracefs_ops; + +static char *get_dname(struct dentry *dentry) +{ + const char *dname; + char *name; + int len = dentry->d_name.len; + + dname = dentry->d_name.name; + name = kmalloc(len + 1, GFP_KERNEL); + if (!name) + return NULL; + memcpy(name, dname, len); + name[len] = 0; + return name; +} + +static int tracefs_syscall_mkdir(struct inode *inode, struct dentry *dentry, umode_t mode) +{ + char *name; + int ret; + + name = get_dname(dentry); + if (!name) + return -ENOMEM; + + /* + * The mkdir call can call the generic functions that create + * the files within the tracefs system. It is up to the individual + * mkdir routine to handle races. + */ + mutex_unlock(&inode->i_mutex); + ret = tracefs_ops.mkdir(name); + mutex_lock(&inode->i_mutex); + + kfree(name); + + return ret; +} + +static int tracefs_syscall_rmdir(struct inode *inode, struct dentry *dentry) +{ + char *name; + int ret; + + name = get_dname(dentry); + if (!name) + return -ENOMEM; + + /* + * The rmdir call can call the generic functions that create + * the files within the tracefs system. It is up to the individual + * rmdir routine to handle races. + * This time we need to unlock not only the parent (inode) but + * also the directory that is being deleted. + */ + mutex_unlock(&inode->i_mutex); + mutex_unlock(&dentry->d_inode->i_mutex); + + ret = tracefs_ops.rmdir(name); + + mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); + mutex_lock(&dentry->d_inode->i_mutex); + + kfree(name); + + return ret; +} + +static const struct inode_operations tracefs_dir_inode_operations = { + .lookup = simple_lookup, + .mkdir = tracefs_syscall_mkdir, + .rmdir = tracefs_syscall_rmdir, +}; + static struct inode *tracefs_get_inode(struct super_block *sb) { struct inode *inode = new_inode(sb); @@ -334,6 +412,31 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode, return end_creating(dentry); } +static struct dentry *__create_dir(const char *name, struct dentry *parent, + const struct inode_operations *ops) +{ + struct dentry *dentry = start_creating(name, parent); + struct inode *inode; + + if (IS_ERR(dentry)) + return NULL; + + inode = tracefs_get_inode(dentry->d_sb); + if (unlikely(!inode)) + return failed_creating(dentry); + + inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; + inode->i_op = ops; + inode->i_fop = &simple_dir_operations; + + /* directory inodes start off with i_nlink == 2 (for "." 
entry) */ + inc_nlink(inode); + d_instantiate(dentry, inode); + inc_nlink(dentry->d_parent->d_inode); + fsnotify_mkdir(dentry->d_parent->d_inode, dentry); + return end_creating(dentry); +} + /** * tracefs_create_dir - create a directory in the tracefs filesystem * @name: a pointer to a string containing the name of the directory to @@ -353,26 +456,44 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode, */ struct dentry *tracefs_create_dir(const char *name, struct dentry *parent) { - struct dentry *dentry = start_creating(name, parent); - struct inode *inode; + return __create_dir(name, parent, &simple_dir_inode_operations); +} - if (IS_ERR(dentry)) +/** + * tracefs_create_instance_dir - create the tracing instances directory + * @name: The name of the instances directory to create + * @parent: The parent directory that the instances directory will exist + * @mkdir: The function to call when a mkdir is performed. + * @rmdir: The function to call when a rmdir is performed. + * + * Only one instances directory is allowed. + * + * The instances directory is special as it allows for mkdir and rmdir to + * to be done by userspace. When a mkdir or rmdir is performed, the inode + * locks are released and the methhods passed in (@mkdir and @rmdir) are + * called without locks and with the name of the directory being created + * within the instances directory. + * + * Returns the dentry of the instances directory. + */ +struct dentry *tracefs_create_instance_dir(const char *name, struct dentry *parent, + int (*mkdir)(const char *name), + int (*rmdir)(const char *name)) +{ + struct dentry *dentry; + + /* Only allow one instance of the instances directory. */ + if (WARN_ON(tracefs_ops.mkdir || tracefs_ops.rmdir)) return NULL; - inode = tracefs_get_inode(dentry->d_sb); - if (unlikely(!inode)) - return failed_creating(dentry); + dentry = __create_dir(name, parent, &tracefs_dir_inode_operations); + if (!dentry) + return NULL; - inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; - inode->i_op = &simple_dir_inode_operations; - inode->i_fop = &simple_dir_operations; + tracefs_ops.mkdir = mkdir; + tracefs_ops.rmdir = rmdir; - /* directory inodes start off with i_nlink == 2 (for "." 
entry) */ - inc_nlink(inode); - d_instantiate(dentry, inode); - inc_nlink(dentry->d_parent->d_inode); - fsnotify_mkdir(dentry->d_parent->d_inode, dentry); - return end_creating(dentry); + return dentry; } static inline int tracefs_positive(struct dentry *dentry) diff --git a/include/linux/tracefs.h b/include/linux/tracefs.h index 23e04ce21749..5b727a17beee 100644 --- a/include/linux/tracefs.h +++ b/include/linux/tracefs.h @@ -34,6 +34,10 @@ struct dentry *tracefs_create_dir(const char *name, struct dentry *parent); void tracefs_remove(struct dentry *dentry); void tracefs_remove_recursive(struct dentry *dentry); +struct dentry *tracefs_create_instance_dir(const char *name, struct dentry *parent, + int (*mkdir)(const char *name), + int (*rmdir)(const char *name)); + bool tracefs_initialized(void); #endif /* CONFIG_TRACING */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b4aa936509d2..3c8913bac204 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6292,7 +6292,7 @@ static void free_trace_buffers(struct trace_array *tr) #endif } -static int new_instance_create(const char *name) +static int instance_mkdir(const char *name) { struct trace_array *tr; int ret; @@ -6362,7 +6362,7 @@ static int new_instance_create(const char *name) } -static int instance_delete(const char *name) +static int instance_rmdir(const char *name) { struct trace_array *tr; int found = 0; @@ -6403,78 +6403,13 @@ static int instance_delete(const char *name) return ret; } -static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t mode) -{ - struct dentry *parent; - int ret; - - /* Paranoid: Make sure the parent is the "instances" directory */ - parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); - if (WARN_ON_ONCE(parent != trace_instance_dir)) - return -ENOENT; - - /* - * The inode mutex is locked, but tracefs_create_dir() will also - * take the mutex. As the instances directory can not be destroyed - * or changed in any other way, it is safe to unlock it, and - * let the dentry try. If two users try to make the same dir at - * the same time, then the new_instance_create() will determine the - * winner. - */ - mutex_unlock(&inode->i_mutex); - - ret = new_instance_create(dentry->d_iname); - - mutex_lock(&inode->i_mutex); - - return ret; -} - -static int instance_rmdir(struct inode *inode, struct dentry *dentry) -{ - struct dentry *parent; - int ret; - - /* Paranoid: Make sure the parent is the "instances" directory */ - parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); - if (WARN_ON_ONCE(parent != trace_instance_dir)) - return -ENOENT; - - /* The caller did a dget() on dentry */ - mutex_unlock(&dentry->d_inode->i_mutex); - - /* - * The inode mutex is locked, but tracefs_create_dir() will also - * take the mutex. As the instances directory can not be destroyed - * or changed in any other way, it is safe to unlock it, and - * let the dentry try. If two users try to make the same dir at - * the same time, then the instance_delete() will determine the - * winner. 
- */ - mutex_unlock(&inode->i_mutex); - - ret = instance_delete(dentry->d_iname); - - mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); - mutex_lock(&dentry->d_inode->i_mutex); - - return ret; -} - -static const struct inode_operations instance_dir_inode_operations = { - .lookup = simple_lookup, - .mkdir = instance_mkdir, - .rmdir = instance_rmdir, -}; - static __init void create_trace_instances(struct dentry *d_tracer) { - trace_instance_dir = tracefs_create_dir("instances", d_tracer); + trace_instance_dir = tracefs_create_instance_dir("instances", d_tracer, + instance_mkdir, + instance_rmdir); if (WARN_ON(!trace_instance_dir)) return; - - /* Hijack the dir inode operations, to allow mkdir */ - trace_instance_dir->d_inode->i_op = &instance_dir_inode_operations; } static void -- cgit v1.2.3 From 74390aa5567827add5058a3b26eff0ed06a629ba Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Tue, 27 Jan 2015 17:55:12 +0800 Subject: perf: Remove the extra validity check on nr_pages The function is_power_of_2() also does the check on nr_pages, so the first check performed is unnecessary. On the other hand, the key point is to ensure @nr_pages is a power-of-two number, and @nr_pages is usually a nonzero value, so in most cases the function is_power_of_2() will be called. Signed-off-by: Kaixu Xia Cc: Peter Zijlstra Cc: Paul Mackerras Link: http://lkml.kernel.org/r/1422352512-75150-1-git-send-email-xiakaixu@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 7f2fbb8b5069..0969c9b67eec 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4420,7 +4420,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) * If we have rb pages ensure they're a power-of-two number, so we * can do bitmasks instead of modulo. */ - if (nr_pages != 0 && !is_power_of_2(nr_pages)) + if (!is_power_of_2(nr_pages)) return -EINVAL; if (vma_size != PAGE_SIZE * (1 + nr_pages)) -- cgit v1.2.3 From 72f669c0086fbbbbebc92ce7390125722c4c0ec5 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 5 Feb 2015 15:55:31 -0800 Subject: perf: Update shadow timestamp before add event Update the shadow timestamp before starting the event, because .add might use the timestamp.
Signed-off-by: Shaohua Li Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Paul Mackerras Link: http://lkml.kernel.org/r/9cd0276d6a047cb7c2885994f25e3a1f7c8c28af.1423180257.git.shli@fb.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 13209a90b751..e580e0f41ac6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1881,6 +1881,10 @@ event_sched_in(struct perf_event *event, perf_pmu_disable(event->pmu); + event->tstamp_running += tstamp - event->tstamp_stopped; + + perf_set_shadow_time(event, ctx, tstamp); + if (event->pmu->add(event, PERF_EF_START)) { event->state = PERF_EVENT_STATE_INACTIVE; event->oncpu = -1; @@ -1888,10 +1892,6 @@ event_sched_in(struct perf_event *event, goto out; } - event->tstamp_running += tstamp - event->tstamp_stopped; - - perf_set_shadow_time(event, ctx, tstamp); - if (!is_software_event(event)) cpuctx->active_oncpu++; if (!ctx->nr_active++) -- cgit v1.2.3 From 6a694a607a97d58c042fb7fbd60ef1caea26950c Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 5 Feb 2015 15:55:32 -0800 Subject: perf: Update userspace page info for software event For hardware events, the userspace page of the event gets updated in context switches, so if we read the timestamp in the page, we get fresh info. For software events, this is missing currently. This patch makes the behavior consistent. With this patch, we can implement clock_gettime(THREAD_CPUTIME) with PERF_COUNT_SW_DUMMY in userspace as suggested by Andy and Peter. Code like this: if (pc->cap_user_time) { do { seq = pc->lock; barrier(); running = pc->time_running; cyc = rdtsc(); time_mult = pc->time_mult; time_shift = pc->time_shift; time_offset = pc->time_offset; barrier(); } while (pc->lock != seq); quot = (cyc >> time_shift); rem = cyc & ((1 << time_shift) - 1); delta = time_offset + quot * time_mult + ((rem * time_mult) >> time_shift); running += delta; return running; } I tried it on a busy system, the userspace page updating doesn't have noticeable overhead. Signed-off-by: Shaohua Li Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Linus Torvalds Link: http://lkml.kernel.org/r/aa2dd2e4f1e9f2225758be5ba00f14d6909a8ce1.1423180257.git.shli@fb.com [ Improved the changelog. 
] Signed-off-by: Ingo Molnar --- kernel/events/core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index e580e0f41ac6..fef45b4bb5f8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6123,6 +6123,7 @@ static int perf_swevent_add(struct perf_event *event, int flags) } hlist_add_head_rcu(&event->hlist_entry, head); + perf_event_update_userpage(event); return 0; } @@ -6592,6 +6593,7 @@ static int cpu_clock_event_add(struct perf_event *event, int flags) { if (flags & PERF_EF_START) cpu_clock_event_start(event, flags); + perf_event_update_userpage(event); return 0; } @@ -6666,6 +6668,7 @@ static int task_clock_event_add(struct perf_event *event, int flags) { if (flags & PERF_EF_START) task_clock_event_start(event, flags); + perf_event_update_userpage(event); return 0; } -- cgit v1.2.3 From ba532500c5651a4be4108acc64ed99a95cb005b3 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 4 Nov 2014 21:55:58 -0500 Subject: perf: Introduce pmu context switch callback The callback is invoked when a process is scheduled in or out. It provides a mechanism for later patches to save/store the LBR stack. For the schedule-in case, the callback is invoked at the same place that the flush branch stack callback is invoked. So it can also replace the flush branch stack callback. To avoid unnecessary overhead, the callback is enabled only when there are events that use the LBR stack. Signed-off-by: Yan, Zheng Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Paul Mackerras Cc: Vince Weaver Cc: eranian@google.com Cc: jolsa@redhat.com Link: http://lkml.kernel.org/r/1415156173-10035-3-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 7 +++++ arch/x86/kernel/cpu/perf_event.h | 2 ++ include/linux/perf_event.h | 9 +++++++ kernel/events/core.c | 57 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 75 insertions(+) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index b71a7f86d68a..0efbd6cc2966 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1914,6 +1914,12 @@ static const struct attribute_group *x86_pmu_attr_groups[] = { NULL, }; +static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) +{ + if (x86_pmu.sched_task) + x86_pmu.sched_task(ctx, sched_in); +} + static void x86_pmu_flush_branch_stack(void) { if (x86_pmu.flush_branch_stack) @@ -1950,6 +1956,7 @@ static struct pmu pmu = { .event_idx = x86_pmu_event_idx, .flush_branch_stack = x86_pmu_flush_branch_stack, + .sched_task = x86_pmu_sched_task, }; void arch_perf_update_userpage(struct perf_event *event, diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 0c45b22495dc..211b54c0c00c 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -473,6 +473,8 @@ struct x86_pmu { void (*check_microcode)(void); void (*flush_branch_stack)(void); + void (*sched_task)(struct perf_event_context *ctx, + bool sched_in); /* * Intel Arch Perfmon v2+ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 33262004c310..fbab6235d053 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -265,6 +265,13 @@ struct pmu { * flush branch stack on context-switches (needed in cpu-wide mode) */ void (*flush_branch_stack) (void); + + /* + * context-switches
callback + */ + void (*sched_task) (struct perf_event_context *ctx, + bool sched_in); + }; /** @@ -558,6 +565,8 @@ extern void perf_event_delayed_put(struct task_struct *task); extern void perf_event_print_debug(void); extern void perf_pmu_disable(struct pmu *pmu); extern void perf_pmu_enable(struct pmu *pmu); +extern void perf_sched_cb_dec(struct pmu *pmu); +extern void perf_sched_cb_inc(struct pmu *pmu); extern int perf_event_task_disable(void); extern int perf_event_task_enable(void); extern int perf_event_refresh(struct perf_event *event, int refresh); diff --git a/kernel/events/core.c b/kernel/events/core.c index fef45b4bb5f8..6c8b31b7efb6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -154,6 +154,7 @@ enum event_type_t { struct static_key_deferred perf_sched_events __read_mostly; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events); +static DEFINE_PER_CPU(int, perf_sched_cb_usages); static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; @@ -2577,6 +2578,56 @@ unlock: } } +void perf_sched_cb_dec(struct pmu *pmu) +{ + this_cpu_dec(perf_sched_cb_usages); +} + +void perf_sched_cb_inc(struct pmu *pmu) +{ + this_cpu_inc(perf_sched_cb_usages); +} + +/* + * This function provides the context switch callback to the lower code + * layer. It is invoked ONLY when the context switch callback is enabled. + */ +static void perf_pmu_sched_task(struct task_struct *prev, + struct task_struct *next, + bool sched_in) +{ + struct perf_cpu_context *cpuctx; + struct pmu *pmu; + unsigned long flags; + + if (prev == next) + return; + + local_irq_save(flags); + + rcu_read_lock(); + + list_for_each_entry_rcu(pmu, &pmus, entry) { + if (pmu->sched_task) { + cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + + perf_ctx_lock(cpuctx, cpuctx->task_ctx); + + perf_pmu_disable(pmu); + + pmu->sched_task(cpuctx->task_ctx, sched_in); + + perf_pmu_enable(pmu); + + perf_ctx_unlock(cpuctx, cpuctx->task_ctx); + } + } + + rcu_read_unlock(); + + local_irq_restore(flags); +} + #define for_each_task_context_nr(ctxn) \ for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) @@ -2596,6 +2647,9 @@ void __perf_event_task_sched_out(struct task_struct *task, { int ctxn; + if (__this_cpu_read(perf_sched_cb_usages)) + perf_pmu_sched_task(task, next, false); + for_each_task_context_nr(ctxn) perf_event_context_sched_out(task, ctxn, next); @@ -2847,6 +2901,9 @@ void __perf_event_task_sched_in(struct task_struct *prev, /* check for system-wide branch_stack events */ if (atomic_read(this_cpu_ptr(&perf_branch_stack_events))) perf_branch_stack_sched_in(prev, task); + + if (__this_cpu_read(perf_sched_cb_usages)) + perf_pmu_sched_task(prev, task, true); } static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) -- cgit v1.2.3 From 2a0ad3b326a9024ba86dca4028499d31fa0c6c4d Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 4 Nov 2014 21:55:59 -0500 Subject: perf/x86/intel: Use context switch callback to flush LBR stack Previous commit introduces context switch callback, its function overlaps with the flush branch stack callback. So we can use the context switch callback to flush LBR stack. This patch adds code that uses the flush branch callback to flush the LBR stack when task is being scheduled in. The callback is enabled only when there are events use the LBR hardware. This patch also removes all old flush branch stack code. 
Signed-off-by: Yan, Zheng Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Paul Mackerras Cc: Vince Weaver Cc: eranian@google.com Cc: jolsa@redhat.com Link: http://lkml.kernel.org/r/1415156173-10035-4-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 7 --- arch/x86/kernel/cpu/perf_event.h | 3 +- arch/x86/kernel/cpu/perf_event_intel.c | 14 +----- arch/x86/kernel/cpu/perf_event_intel_lbr.c | 27 +++++++++++ include/linux/perf_event.h | 1 - kernel/events/core.c | 77 ------------------------------ 6 files changed, 30 insertions(+), 99 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 0efbd6cc2966..6b1fd26a37cf 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1920,12 +1920,6 @@ static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) x86_pmu.sched_task(ctx, sched_in); } -static void x86_pmu_flush_branch_stack(void) -{ - if (x86_pmu.flush_branch_stack) - x86_pmu.flush_branch_stack(); -} - void perf_check_microcode(void) { if (x86_pmu.check_microcode) @@ -1955,7 +1949,6 @@ static struct pmu pmu = { .commit_txn = x86_pmu_commit_txn, .event_idx = x86_pmu_event_idx, - .flush_branch_stack = x86_pmu_flush_branch_stack, .sched_task = x86_pmu_sched_task, }; diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 211b54c0c00c..949d0083a29e 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -472,7 +472,6 @@ struct x86_pmu { void (*cpu_dead)(int cpu); void (*check_microcode)(void); - void (*flush_branch_stack)(void); void (*sched_task)(struct perf_event_context *ctx, bool sched_in); @@ -733,6 +732,8 @@ void intel_pmu_pebs_disable_all(void); void intel_ds_init(void); +void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in); + void intel_pmu_lbr_reset(void); void intel_pmu_lbr_enable(struct perf_event *event); diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 498b6d967138..424fbf74dee7 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -2044,18 +2044,6 @@ static void intel_pmu_cpu_dying(int cpu) fini_debug_store_on_cpu(cpu); } -static void intel_pmu_flush_branch_stack(void) -{ - /* - * Intel LBR does not tag entries with the - * PID of the current task, then we need to - * flush it on ctxsw - * For now, we simply reset it - */ - if (x86_pmu.lbr_nr) - intel_pmu_lbr_reset(); -} - PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63"); PMU_FORMAT_ATTR(ldlat, "config1:0-15"); @@ -2107,7 +2095,7 @@ static __initconst const struct x86_pmu intel_pmu = { .cpu_starting = intel_pmu_cpu_starting, .cpu_dying = intel_pmu_cpu_dying, .guest_get_msrs = intel_guest_get_msrs, - .flush_branch_stack = intel_pmu_flush_branch_stack, + .sched_task = intel_pmu_lbr_sched_task, }; static __init void intel_clovertown_quirk(void) diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 8bc078f43a82..c0e23c5f85bb 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -177,6 +177,31 @@ void intel_pmu_lbr_reset(void) intel_pmu_lbr_reset_64(); } +void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if 
(!x86_pmu.lbr_nr) + return; + + /* + * When sampling the branck stack in system-wide, it may be + * necessary to flush the stack on context switch. This happens + * when the branch stack does not tag its entries with the pid + * of the current task. Otherwise it becomes impossible to + * associate a branch entry with a task. This ambiguity is more + * likely to appear when the branch stack supports priv level + * filtering and the user sets it to monitor only at the user + * level (which could be a useful measurement in system-wide + * mode). In that case, the risk is high of having a branch + * stack with branch from multiple tasks. + */ + if (sched_in) { + intel_pmu_lbr_reset(); + cpuc->lbr_context = ctx; + } +} + void intel_pmu_lbr_enable(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -195,6 +220,7 @@ void intel_pmu_lbr_enable(struct perf_event *event) cpuc->br_sel = event->hw.branch_reg.reg; cpuc->lbr_users++; + perf_sched_cb_inc(event->ctx->pmu); } void intel_pmu_lbr_disable(struct perf_event *event) @@ -206,6 +232,7 @@ void intel_pmu_lbr_disable(struct perf_event *event) cpuc->lbr_users--; WARN_ON_ONCE(cpuc->lbr_users < 0); + perf_sched_cb_dec(event->ctx->pmu); if (cpuc->enabled && !cpuc->lbr_users) { __intel_pmu_lbr_disable(); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index fbab6235d053..c7007a564440 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -511,7 +511,6 @@ struct perf_event_context { u64 generation; int pin_count; int nr_cgroups; /* cgroup evts */ - int nr_branch_stack; /* branch_stack evt */ struct rcu_head rcu_head; struct delayed_work orphans_remove; diff --git a/kernel/events/core.c b/kernel/events/core.c index 6c8b31b7efb6..f563ce767f93 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -153,7 +153,6 @@ enum event_type_t { */ struct static_key_deferred perf_sched_events __read_mostly; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); -static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events); static DEFINE_PER_CPU(int, perf_sched_cb_usages); static atomic_t nr_mmap_events __read_mostly; @@ -1240,9 +1239,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) if (is_cgroup_event(event)) ctx->nr_cgroups++; - if (has_branch_stack(event)) - ctx->nr_branch_stack++; - list_add_rcu(&event->event_entry, &ctx->event_list); ctx->nr_events++; if (event->attr.inherit_stat) @@ -1409,9 +1405,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) cpuctx->cgrp = NULL; } - if (has_branch_stack(event)) - ctx->nr_branch_stack--; - ctx->nr_events--; if (event->attr.inherit_stat) ctx->nr_stat--; @@ -2808,64 +2801,6 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, perf_ctx_unlock(cpuctx, ctx); } -/* - * When sampling the branck stack in system-wide, it may be necessary - * to flush the stack on context switch. This happens when the branch - * stack does not tag its entries with the pid of the current task. - * Otherwise it becomes impossible to associate a branch entry with a - * task. This ambiguity is more likely to appear when the branch stack - * supports priv level filtering and the user sets it to monitor only - * at the user level (which could be a useful measurement in system-wide - * mode). In that case, the risk is high of having a branch stack with - * branch from multiple tasks. Flushing may mean dropping the existing - * entries or stashing them somewhere in the PMU specific code layer. 
- * - * This function provides the context switch callback to the lower code - * layer. It is invoked ONLY when there is at least one system-wide context - * with at least one active event using taken branch sampling. - */ -static void perf_branch_stack_sched_in(struct task_struct *prev, - struct task_struct *task) -{ - struct perf_cpu_context *cpuctx; - struct pmu *pmu; - unsigned long flags; - - /* no need to flush branch stack if not changing task */ - if (prev == task) - return; - - local_irq_save(flags); - - rcu_read_lock(); - - list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - - /* - * check if the context has at least one - * event using PERF_SAMPLE_BRANCH_STACK - */ - if (cpuctx->ctx.nr_branch_stack > 0 - && pmu->flush_branch_stack) { - - perf_ctx_lock(cpuctx, cpuctx->task_ctx); - - perf_pmu_disable(pmu); - - pmu->flush_branch_stack(); - - perf_pmu_enable(pmu); - - perf_ctx_unlock(cpuctx, cpuctx->task_ctx); - } - } - - rcu_read_unlock(); - - local_irq_restore(flags); -} - /* * Called from scheduler to add the events of the current task * with interrupts disabled. @@ -2898,10 +2833,6 @@ void __perf_event_task_sched_in(struct task_struct *prev, if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) perf_cgroup_sched_in(prev, task); - /* check for system-wide branch_stack events */ - if (atomic_read(this_cpu_ptr(&perf_branch_stack_events))) - perf_branch_stack_sched_in(prev, task); - if (__this_cpu_read(perf_sched_cb_usages)) perf_pmu_sched_task(prev, task, true); } @@ -3480,10 +3411,6 @@ static void unaccount_event_cpu(struct perf_event *event, int cpu) if (event->parent) return; - if (has_branch_stack(event)) { - if (!(event->attach_state & PERF_ATTACH_TASK)) - atomic_dec(&per_cpu(perf_branch_stack_events, cpu)); - } if (is_cgroup_event(event)) atomic_dec(&per_cpu(perf_cgroup_events, cpu)); } @@ -7139,10 +7066,6 @@ static void account_event_cpu(struct perf_event *event, int cpu) if (event->parent) return; - if (has_branch_stack(event)) { - if (!(event->attach_state & PERF_ATTACH_TASK)) - atomic_inc(&per_cpu(perf_branch_stack_events, cpu)); - } if (is_cgroup_event(event)) atomic_inc(&per_cpu(perf_cgroup_events, cpu)); } -- cgit v1.2.3 From 4af57ef28c2c1047fda9e1a5be02aa7a6a69cf9e Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 4 Nov 2014 21:56:01 -0500 Subject: perf: Add pmu specific data for perf task context Introduce a new flag PERF_ATTACH_TASK_DATA for perf event's attach state. The flag is set by the PMU's event_init() callback; it indicates that the perf event needs PMU specific data. The PMU specific data are initialized to zeros. Later patches will use PMU specific data to save LBR stack.
Signed-off-by: Yan, Zheng Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Paul Mackerras Cc: eranian@google.com Cc: jolsa@redhat.com Link: http://lkml.kernel.org/r/1415156173-10035-6-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 6 ++++++ kernel/events/core.c | 40 ++++++++++++++++++++++++++++++++++++---- 2 files changed, 42 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index c7007a564440..270cd0173e61 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -271,6 +271,10 @@ struct pmu { */ void (*sched_task) (struct perf_event_context *ctx, bool sched_in); + /* + * PMU specific data size + */ + size_t task_ctx_size; }; @@ -307,6 +311,7 @@ struct swevent_hlist { #define PERF_ATTACH_CONTEXT 0x01 #define PERF_ATTACH_GROUP 0x02 #define PERF_ATTACH_TASK 0x04 +#define PERF_ATTACH_TASK_DATA 0x08 struct perf_cgroup; struct ring_buffer; @@ -511,6 +516,7 @@ struct perf_event_context { u64 generation; int pin_count; int nr_cgroups; /* cgroup evts */ + void *task_ctx_data; /* pmu specific data */ struct rcu_head rcu_head; struct delayed_work orphans_remove; diff --git a/kernel/events/core.c b/kernel/events/core.c index f563ce767f93..688086bb7144 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -905,6 +905,15 @@ static void get_ctx(struct perf_event_context *ctx) WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); } +static void free_ctx(struct rcu_head *head) +{ + struct perf_event_context *ctx; + + ctx = container_of(head, struct perf_event_context, rcu_head); + kfree(ctx->task_ctx_data); + kfree(ctx); +} + static void put_ctx(struct perf_event_context *ctx) { if (atomic_dec_and_test(&ctx->refcount)) { @@ -912,7 +921,7 @@ static void put_ctx(struct perf_event_context *ctx) put_ctx(ctx->parent_ctx); if (ctx->task) put_task_struct(ctx->task); - kfree_rcu(ctx, rcu_head); + call_rcu(&ctx->rcu_head, free_ctx); } } @@ -3309,12 +3318,15 @@ errout: * Returns a matching context with refcount and pincount. 
*/ static struct perf_event_context * -find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) +find_get_context(struct pmu *pmu, struct task_struct *task, + struct perf_event *event) { struct perf_event_context *ctx, *clone_ctx = NULL; struct perf_cpu_context *cpuctx; + void *task_ctx_data = NULL; unsigned long flags; int ctxn, err; + int cpu = event->cpu; if (!task) { /* Must be root to operate on a CPU event: */ @@ -3342,11 +3354,24 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) if (ctxn < 0) goto errout; + if (event->attach_state & PERF_ATTACH_TASK_DATA) { + task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL); + if (!task_ctx_data) { + err = -ENOMEM; + goto errout; + } + } + retry: ctx = perf_lock_task_context(task, ctxn, &flags); if (ctx) { clone_ctx = unclone_ctx(ctx); ++ctx->pin_count; + + if (task_ctx_data && !ctx->task_ctx_data) { + ctx->task_ctx_data = task_ctx_data; + task_ctx_data = NULL; + } raw_spin_unlock_irqrestore(&ctx->lock, flags); if (clone_ctx) @@ -3357,6 +3382,11 @@ retry: if (!ctx) goto errout; + if (task_ctx_data) { + ctx->task_ctx_data = task_ctx_data; + task_ctx_data = NULL; + } + err = 0; mutex_lock(&task->perf_event_mutex); /* @@ -3383,9 +3413,11 @@ retry: } } + kfree(task_ctx_data); return ctx; errout: + kfree(task_ctx_data); return ERR_PTR(err); } @@ -7559,7 +7591,7 @@ SYSCALL_DEFINE5(perf_event_open, /* * Get the target context (task or percpu): */ - ctx = find_get_context(pmu, task, event->cpu); + ctx = find_get_context(pmu, task, event); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); goto err_alloc; @@ -7765,7 +7797,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, account_event(event); - ctx = find_get_context(event->pmu, task, cpu); + ctx = find_get_context(event->pmu, task, event); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); goto err_free; -- cgit v1.2.3 From 5a158c3ccd2183a7b0866be6685d001fe653430f Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 4 Nov 2014 21:56:02 -0500 Subject: perf: Always switch pmu specific data during context switch If two tasks were both forked from the same parent task, Events in their perf task contexts can be the same. Perf core may leave out switching the perf event contexts. Previous patch inroduces pmu specific data. The data is for saving the LBR stack, it is task specific. So we need to switch the data even when context switch is optimized out. 
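Condensed from the hunk below for illustration (comments added; not the verbatim kernel code), the optimized path of perf_event_context_sched_out() now reads:

        /*
         * prev's and next's contexts are equivalent (e.g. both inherited
         * from the same parent): instead of a full PMU sched-out/sched-in,
         * just exchange which task owns which context ...
         */
        next->perf_event_ctxp[ctxn] = ctx;
        ctx->task = next;
        next_ctx->task = task;

        /*
         * ... but the PMU-specific data (the saved LBR stack) is per-task,
         * so it must follow the task even though the context switch itself
         * is optimized out.
         */
        swap(ctx->task_ctx_data, next_ctx->task_ctx_data);

        do_switch = 0;
        perf_event_sync_stat(ctx, next_ctx);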
Signed-off-by: Yan, Zheng Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Paul Mackerras Cc: eranian@google.com Cc: jolsa@redhat.com Link: http://lkml.kernel.org/r/1415156173-10035-7-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 688086bb7144..84451c0debba 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2562,6 +2562,9 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, next->perf_event_ctxp[ctxn] = ctx; ctx->task = next; next_ctx->task = task; + + swap(ctx->task_ctx_data, next_ctx->task_ctx_data); + do_switch = 0; perf_event_sync_stat(ctx, next_ctx); -- cgit v1.2.3 From a46a23000198d929391aa9dac8de68734efa2703 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 4 Nov 2014 21:56:06 -0500 Subject: perf: Simplify the branch stack check Use event->attr.branch_sample_type to replace intel_pmu_needs_lbr_smpl() for avoiding duplicated code that implicitly enables the LBR. Currently, branch stack can be enabled by user explicitly requesting branch sampling or implicit branch sampling to correct PEBS skid. For user explicitly requested branch sampling, the branch_sample_type is explicitly set by user. For PEBS case, the branch_sample_type is also implicitly set to PERF_SAMPLE_BRANCH_ANY in x86_pmu_hw_config. Signed-off-by: Yan, Zheng Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Paul Mackerras Cc: eranian@google.com Cc: jolsa@redhat.com Link: http://lkml.kernel.org/r/1415156173-10035-11-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 20 +++----------------- include/linux/perf_event.h | 5 +++++ kernel/events/core.c | 3 +++ 3 files changed, 11 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index a485ba124476..9f1dd18fa395 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1029,20 +1029,6 @@ static __initconst const u64 slm_hw_cache_event_ids }, }; -static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event) -{ - /* user explicitly requested branch sampling */ - if (has_branch_stack(event)) - return true; - - /* implicit branch sampling to correct PEBS skid */ - if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 && - x86_pmu.intel_cap.pebs_format < 2) - return true; - - return false; -} - static void intel_pmu_disable_all(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1207,7 +1193,7 @@ static void intel_pmu_disable_event(struct perf_event *event) * must disable before any actual event * because any event may be combined with LBR */ - if (intel_pmu_needs_lbr_smpl(event)) + if (needs_branch_stack(event)) intel_pmu_lbr_disable(event); if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { @@ -1268,7 +1254,7 @@ static void intel_pmu_enable_event(struct perf_event *event) * must enabled before any actual event * because any event may be combined with LBR */ - if (intel_pmu_needs_lbr_smpl(event)) + if (needs_branch_stack(event)) intel_pmu_lbr_enable(event); if (event->attr.exclude_host) @@ -1747,7 +1733,7 @@ static int intel_pmu_hw_config(struct perf_event *event) if (event->attr.precise_ip && 
x86_pmu.pebs_aliases) x86_pmu.pebs_aliases(event); - if (intel_pmu_needs_lbr_smpl(event)) { + if (needs_branch_stack(event)) { ret = intel_pmu_setup_lbr_filter(event); if (ret) return ret; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 270cd0173e61..43cc158487e6 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -814,6 +814,11 @@ static inline bool has_branch_stack(struct perf_event *event) return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK; } +static inline bool needs_branch_stack(struct perf_event *event) +{ + return event->attr.branch_sample_type != 0; +} + extern int perf_output_begin(struct perf_output_handle *handle, struct perf_event *event, unsigned int size); extern void perf_output_end(struct perf_output_handle *handle); diff --git a/kernel/events/core.c b/kernel/events/core.c index 84451c0debba..257eccf9afd4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7232,6 +7232,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) goto err_ns; + if (!has_branch_stack(event)) + event->attr.branch_sample_type = 0; + pmu = perf_init_event(event); if (!pmu) goto err_ns; -- cgit v1.2.3 From 0937e3b025f70e33f018aa55ee8d32b8731730a7 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 9 Feb 2015 11:31:13 -0600 Subject: livepatch: simplify disable error path If registering the function with ftrace has previously succeeded, unregistering will almost never fail. Even if it does, it's not a fatal error. We can still carry on and disable the klp_func from being used by removing it from the klp_ops func stack. Signed-off-by: Josh Poimboeuf Reviewed-by: Miroslav Benes Reviewed-by: Petr Mladek Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 67 +++++++++++++------------------------------------ 1 file changed, 17 insertions(+), 50 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index ff7f47d026ac..26df09d56f7c 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -322,32 +322,20 @@ static void notrace klp_ftrace_handler(unsigned long ip, klp_arch_set_pc(regs, (unsigned long)func->new_func); } -static int klp_disable_func(struct klp_func *func) +static void klp_disable_func(struct klp_func *func) { struct klp_ops *ops; - int ret; - - if (WARN_ON(func->state != KLP_ENABLED)) - return -EINVAL; - if (WARN_ON(!func->old_addr)) - return -EINVAL; + WARN_ON(func->state != KLP_ENABLED); + WARN_ON(!func->old_addr); ops = klp_find_ops(func->old_addr); if (WARN_ON(!ops)) - return -EINVAL; + return; if (list_is_singular(&ops->func_stack)) { - ret = unregister_ftrace_function(&ops->fops); - if (ret) { - pr_err("failed to unregister ftrace handler for function '%s' (%d)\n", - func->old_name, ret); - return ret; - } - - ret = ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0); - if (ret) - pr_warn("function unregister succeeded but failed to clear the filter\n"); + WARN_ON(unregister_ftrace_function(&ops->fops)); + WARN_ON(ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0)); list_del_rcu(&func->stack_node); list_del(&ops->node); @@ -357,8 +345,6 @@ static int klp_disable_func(struct klp_func *func) } func->state = KLP_DISABLED; - - return 0; } static int klp_enable_func(struct klp_func *func) @@ -419,23 +405,15 @@ err: return ret; } -static int klp_disable_object(struct klp_object *obj) +static void klp_disable_object(struct klp_object *obj) { struct klp_func *func; - int ret; - for (func = 
obj->funcs; func->old_name; func++) { - if (func->state != KLP_ENABLED) - continue; - - ret = klp_disable_func(func); - if (ret) - return ret; - } + for (func = obj->funcs; func->old_name; func++) + if (func->state == KLP_ENABLED) + klp_disable_func(func); obj->state = KLP_DISABLED; - - return 0; } static int klp_enable_object(struct klp_object *obj) @@ -451,22 +429,19 @@ static int klp_enable_object(struct klp_object *obj) for (func = obj->funcs; func->old_name; func++) { ret = klp_enable_func(func); - if (ret) - goto unregister; + if (ret) { + klp_disable_object(obj); + return ret; + } } obj->state = KLP_ENABLED; return 0; - -unregister: - WARN_ON(klp_disable_object(obj)); - return ret; } static int __klp_disable_patch(struct klp_patch *patch) { struct klp_object *obj; - int ret; /* enforce stacking: only the last enabled patch can be disabled */ if (!list_is_last(&patch->list, &klp_patches) && @@ -476,12 +451,8 @@ static int __klp_disable_patch(struct klp_patch *patch) pr_notice("disabling patch '%s'\n", patch->mod->name); for (obj = patch->objs; obj->funcs; obj++) { - if (obj->state != KLP_ENABLED) - continue; - - ret = klp_disable_object(obj); - if (ret) - return ret; + if (obj->state == KLP_ENABLED) + klp_disable_object(obj); } patch->state = KLP_DISABLED; @@ -931,7 +902,6 @@ static void klp_module_notify_going(struct klp_patch *patch, { struct module *pmod = patch->mod; struct module *mod = obj->mod; - int ret; if (patch->state == KLP_DISABLED) goto disabled; @@ -939,10 +909,7 @@ static void klp_module_notify_going(struct klp_patch *patch, pr_notice("reverting patch '%s' on unloading module '%s'\n", pmod->name, mod->name); - ret = klp_disable_object(obj); - if (ret) - pr_warn("failed to revert patch '%s' on module '%s' (%d)\n", - pmod->name, mod->name, ret); + klp_disable_object(obj); disabled: klp_free_object_loaded(obj); -- cgit v1.2.3 From f1aaf26224bee779012aab136e5373ce3487982c Mon Sep 17 00:00:00 2001 From: Imre Palik Date: Mon, 23 Feb 2015 15:37:59 -0500 Subject: audit: move the tree pruning to a dedicated thread When file auditing is enabled, during a low memory situation, a memory allocation with __GFP_FS can lead to pruning the inode cache. Which can, in turn lead to audit_tree_freeing_mark() being called. This can call audit_schedule_prune(), that tries to fork a pruning thread, and waits until the thread is created. But forking needs memory, and the memory allocations there are done with __GFP_FS. So we are waiting merrily for some __GFP_FS memory allocations to complete, while holding some filesystem locks. This can take a while ... This patch creates a single thread for pruning the tree from audit_add_tree_rule(), and thus avoids the deadlock that the on-demand thread creation can cause. Reported-by: Matt Wilson Cc: Matt Wilson Signed-off-by: Imre Palik Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit_tree.c | 88 ++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 60 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 80f29e015570..415072c8e875 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -37,6 +37,7 @@ struct audit_chunk { static LIST_HEAD(tree_list); static LIST_HEAD(prune_list); +static struct task_struct *prune_thread; /* * One struct chunk is attached to each inode of interest. 
@@ -651,6 +652,57 @@ static int tag_mount(struct vfsmount *mnt, void *arg) return tag_chunk(mnt->mnt_root->d_inode, arg); } +/* + * That gets run when evict_chunk() ends up needing to kill audit_tree. + * Runs from a separate thread. + */ +static int prune_tree_thread(void *unused) +{ + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (list_empty(&prune_list)) + schedule(); + __set_current_state(TASK_RUNNING); + + mutex_lock(&audit_cmd_mutex); + mutex_lock(&audit_filter_mutex); + + while (!list_empty(&prune_list)) { + struct audit_tree *victim; + + victim = list_entry(prune_list.next, + struct audit_tree, list); + list_del_init(&victim->list); + + mutex_unlock(&audit_filter_mutex); + + prune_one(victim); + + mutex_lock(&audit_filter_mutex); + } + + mutex_unlock(&audit_filter_mutex); + mutex_unlock(&audit_cmd_mutex); + } + return 0; +} + +static int audit_launch_prune(void) +{ + if (prune_thread) + return 0; + prune_thread = kthread_create(prune_tree_thread, NULL, + "audit_prune_tree"); + if (IS_ERR(prune_thread)) { + pr_err("cannot start thread audit_prune_tree"); + prune_thread = NULL; + return -ENOMEM; + } else { + wake_up_process(prune_thread); + return 0; + } +} + /* called with audit_filter_mutex */ int audit_add_tree_rule(struct audit_krule *rule) { @@ -674,6 +726,12 @@ int audit_add_tree_rule(struct audit_krule *rule) /* do not set rule->tree yet */ mutex_unlock(&audit_filter_mutex); + if (unlikely(!prune_thread)) { + err = audit_launch_prune(); + if (err) + goto Err; + } + err = kern_path(tree->pathname, 0, &path); if (err) goto Err; @@ -811,36 +869,10 @@ int audit_tag_tree(char *old, char *new) return failed; } -/* - * That gets run when evict_chunk() ends up needing to kill audit_tree. - * Runs from a separate thread. - */ -static int prune_tree_thread(void *unused) -{ - mutex_lock(&audit_cmd_mutex); - mutex_lock(&audit_filter_mutex); - - while (!list_empty(&prune_list)) { - struct audit_tree *victim; - - victim = list_entry(prune_list.next, struct audit_tree, list); - list_del_init(&victim->list); - - mutex_unlock(&audit_filter_mutex); - - prune_one(victim); - - mutex_lock(&audit_filter_mutex); - } - - mutex_unlock(&audit_filter_mutex); - mutex_unlock(&audit_cmd_mutex); - return 0; -} static void audit_schedule_prune(void) { - kthread_run(prune_tree_thread, NULL, "audit_prune_tree"); + wake_up_process(prune_thread); } /* @@ -907,9 +939,9 @@ static void evict_chunk(struct audit_chunk *chunk) for (n = 0; n < chunk->count; n++) list_del_init(&chunk->owners[n].list); spin_unlock(&hash_lock); + mutex_unlock(&audit_filter_mutex); if (need_prune) audit_schedule_prune(); - mutex_unlock(&audit_filter_mutex); } static int audit_tree_handle_event(struct fsnotify_group *group, -- cgit v1.2.3 From a77ed4e5689627b0e9ac0a3532521b31c6daa99c Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Mon, 23 Feb 2015 15:37:59 -0500 Subject: audit: don't lose set wait time on first successful call to audit_log_start() Copy the set wait time to a working value to avoid losing the set value if the queue overflows. Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index c7e097a0d7af..2125cc40e921 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -107,6 +107,7 @@ static u32 audit_rate_limit; * When set to zero, this means unlimited. 
*/ static u32 audit_backlog_limit = 64; #define AUDIT_BACKLOG_WAIT_TIME (60 * HZ) +static u32 audit_backlog_wait_time_master = AUDIT_BACKLOG_WAIT_TIME; static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME; static u32 audit_backlog_wait_overflow = 0; @@ -338,7 +339,7 @@ static int audit_set_backlog_limit(u32 limit) static int audit_set_backlog_wait_time(u32 timeout) { return audit_do_config_change("audit_backlog_wait_time", - &audit_backlog_wait_time, timeout); + &audit_backlog_wait_time_master, timeout); } static int audit_set_enabled(u32 state) @@ -843,7 +844,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) s.lost = atomic_read(&audit_lost); s.backlog = skb_queue_len(&audit_skb_queue); s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL; - s.backlog_wait_time = audit_backlog_wait_time; + s.backlog_wait_time = audit_backlog_wait_time_master; audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s)); break; } @@ -1394,7 +1395,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, return NULL; } - audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME; + audit_backlog_wait_time = audit_backlog_wait_time_master; ab = audit_buffer_alloc(ctx, gfp_mask, type); if (!ab) { -- cgit v1.2.3 From efef73a1a206c4b5e37e5c63a361243ed1603eff Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Mon, 23 Feb 2015 15:38:00 -0500 Subject: audit: don't reset working wait time accidentally with auditd During a queue overflow condition while we are waiting for auditd to drain the queue to make room for regular messages, we don't want a successful auditd that has bypassed the queue check to reset the backlog wait time. Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 2125cc40e921..59d4ceb3b2a6 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1395,7 +1395,8 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, return NULL; } - audit_backlog_wait_time = audit_backlog_wait_time_master; + if (!reserve) + audit_backlog_wait_time = audit_backlog_wait_time_master; ab = audit_buffer_alloc(ctx, gfp_mask, type); if (!ab) { -- cgit v1.2.3 From 5985de6754a6fc22fbf7e4b6033d4bfa0240a63a Mon Sep 17 00:00:00 2001 From: Ameen Ali Date: Mon, 23 Feb 2015 15:38:00 -0500 Subject: audit: code clean up Fixed a coding style issue (unnecessary parentheses , unnecessary braces) Signed-off-by: Ameen-Ali [PM: tweaked subject line] Signed-off-by: Paul Moore --- kernel/audit.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 59d4ceb3b2a6..d219bb03a364 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -673,7 +673,7 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) case AUDIT_MAKE_EQUIV: /* Only support auditd and auditctl in initial pid namespace * for now. 
*/ - if ((task_active_pid_ns(current) != &init_pid_ns)) + if (task_active_pid_ns(current) != &init_pid_ns) return -EPERM; if (!netlink_capable(skb, CAP_AUDIT_CONTROL)) @@ -1770,7 +1770,7 @@ void audit_log_name(struct audit_context *context, struct audit_names *n, } else audit_log_format(ab, " name=(null)"); - if (n->ino != (unsigned long)-1) { + if (n->ino != (unsigned long)-1) audit_log_format(ab, " inode=%lu" " dev=%02x:%02x mode=%#ho" " ouid=%u ogid=%u rdev=%02x:%02x", @@ -1782,7 +1782,6 @@ void audit_log_name(struct audit_context *context, struct audit_names *n, from_kgid(&init_user_ns, n->gid), MAJOR(n->rdev), MINOR(n->rdev)); - } if (n->osid != 0) { char *ctx = NULL; u32 len; -- cgit v1.2.3 From 4766b199ef9e1ca6316ee4f8f9d80c2ba1ed0290 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Sun, 22 Feb 2015 18:20:00 -0800 Subject: audit: consolidate handling of mm->exe_file This patch adds a audit_log_d_path_exe() helper function to share how we handle auditing of the exe_file's path. Used by both audit and auditsc. No functionality is changed. Signed-off-by: Davidlohr Bueso [PM: tweaked subject line] Signed-off-by: Paul Moore --- kernel/audit.c | 23 +++++++++++++++-------- kernel/audit.h | 3 +++ kernel/auditsc.c | 9 +-------- 3 files changed, 19 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index d219bb03a364..684b51d612a3 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1848,11 +1848,24 @@ error_path: } EXPORT_SYMBOL(audit_log_task_context); +void audit_log_d_path_exe(struct audit_buffer *ab, + struct mm_struct *mm) +{ + if (!mm) { + audit_log_format(ab, " exe=(null)"); + return; + } + + down_read(&mm->mmap_sem); + if (mm->exe_file) + audit_log_d_path(ab, " exe=", &mm->exe_file->f_path); + up_read(&mm->mmap_sem); +} + void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) { const struct cred *cred; char comm[sizeof(tsk->comm)]; - struct mm_struct *mm = tsk->mm; char *tty; if (!ab) @@ -1888,13 +1901,7 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) audit_log_format(ab, " comm="); audit_log_untrustedstring(ab, get_task_comm(comm, tsk)); - if (mm) { - down_read(&mm->mmap_sem); - if (mm->exe_file) - audit_log_d_path(ab, " exe=", &mm->exe_file->f_path); - up_read(&mm->mmap_sem); - } else - audit_log_format(ab, " exe=(null)"); + audit_log_d_path_exe(ab, tsk->mm); audit_log_task_context(ab); } EXPORT_SYMBOL(audit_log_task_info); diff --git a/kernel/audit.h b/kernel/audit.h index 3cdffad5a1d9..28067c50fd04 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -270,6 +270,9 @@ extern struct list_head audit_filter_list[]; extern struct audit_entry *audit_dupe_rule(struct audit_krule *old); +extern void audit_log_d_path_exe(struct audit_buffer *ab, + struct mm_struct *mm); + /* audit watch functions */ #ifdef CONFIG_AUDIT_WATCH extern void audit_put_watch(struct audit_watch *watch); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 793e9e98f7f8..4b89f7f95d84 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2460,7 +2460,6 @@ static void audit_log_task(struct audit_buffer *ab) kuid_t auid, uid; kgid_t gid; unsigned int sessionid; - struct mm_struct *mm = current->mm; char comm[sizeof(current->comm)]; auid = audit_get_loginuid(current); @@ -2475,13 +2474,7 @@ static void audit_log_task(struct audit_buffer *ab) audit_log_task_context(ab); audit_log_format(ab, " pid=%d comm=", task_pid_nr(current)); audit_log_untrustedstring(ab, get_task_comm(comm, current)); - if (mm) { - 
down_read(&mm->mmap_sem); - if (mm->exe_file) - audit_log_d_path(ab, " exe=", &mm->exe_file->f_path); - up_read(&mm->mmap_sem); - } else - audit_log_format(ab, " exe=(null)"); + audit_log_d_path_exe(ab, current->mm); } /** -- cgit v1.2.3 From 5b28255278dd7e594c8dde317c2498b7dcbf900d Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Sun, 22 Feb 2015 18:20:09 -0800 Subject: audit: reduce mmap_sem hold for mm->exe_file The mm->exe_file is currently serialized with mmap_sem (shared) in order to both safely (1) read the file and (2) audit it via audit_log_d_path(). Good users will, on the other hand, make use of the more standard get_mm_exe_file(), requiring only holding the mmap_sem to read the value, and relying on reference counting to make sure that the exe file won't dissapear underneath us. Additionally, upon NULL return of get_mm_exe_file, we also call audit_log_format(ab, " exe=(null)"). Signed-off-by: Davidlohr Bueso [PM: tweaked subject line] Signed-off-by: Paul Moore --- kernel/audit.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 684b51d612a3..52ee8eee0e07 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -43,6 +43,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include @@ -1851,15 +1852,20 @@ EXPORT_SYMBOL(audit_log_task_context); void audit_log_d_path_exe(struct audit_buffer *ab, struct mm_struct *mm) { - if (!mm) { - audit_log_format(ab, " exe=(null)"); - return; - } + struct file *exe_file; + + if (!mm) + goto out_null; - down_read(&mm->mmap_sem); - if (mm->exe_file) - audit_log_d_path(ab, " exe=", &mm->exe_file->f_path); - up_read(&mm->mmap_sem); + exe_file = get_mm_exe_file(mm); + if (!exe_file) + goto out_null; + + audit_log_d_path(ab, " exe=", &exe_file->f_path); + fput(exe_file); + return; +out_null: + audit_log_format(ab, " exe=(null)"); } void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) -- cgit v1.2.3 From 39bed6cbb842d8edf5a26b01122b391d36775b5e Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Fri, 23 Jan 2015 18:45:40 +0000 Subject: perf: Make perf_cgroup_from_task() global Move perf_cgroup_from_task() from kernel/events/ to include/linux/ along with the necessary struct definitions, so that it can be used by the PMU code. When the upcoming Intel Cache Monitoring PMU driver assigns monitoring IDs to perf events, it needs to be able to check whether any two monitoring events overlap (say, a cgroup and task event), which means we need to be able to lookup the cgroup associated with a task (if any). Signed-off-by: Matt Fleming Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: H. 
Peter Anvin Cc: Jiri Olsa Cc: Kanaka Juvva Cc: Linus Torvalds Cc: Paul Mackerras Cc: Vikas Shivappa Link: http://lkml.kernel.org/r/1422038748-21397-2-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 30 ++++++++++++++++++++++++++++++ kernel/events/core.c | 28 +--------------------------- 2 files changed, 31 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 724d3720c9b1..cae4a9481777 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -53,6 +53,7 @@ struct perf_guest_info_callbacks { #include #include #include +#include #include struct perf_callchain_entry { @@ -547,6 +548,35 @@ struct perf_output_handle { int page; }; +#ifdef CONFIG_CGROUP_PERF + +/* + * perf_cgroup_info keeps track of time_enabled for a cgroup. + * This is a per-cpu dynamically allocated data structure. + */ +struct perf_cgroup_info { + u64 time; + u64 timestamp; +}; + +struct perf_cgroup { + struct cgroup_subsys_state css; + struct perf_cgroup_info __percpu *info; +}; + +/* + * Must ensure cgroup is pinned (css_get) before calling + * this function. In other words, we cannot call this function + * if there is no cgroup event for the current CPU context. + */ +static inline struct perf_cgroup * +perf_cgroup_from_task(struct task_struct *task) +{ + return container_of(task_css(task, perf_event_cgrp_id), + struct perf_cgroup, css); +} +#endif /* CONFIG_CGROUP_PERF */ + #ifdef CONFIG_PERF_EVENTS extern int perf_pmu_register(struct pmu *pmu, const char *name, int type); diff --git a/kernel/events/core.c b/kernel/events/core.c index 20cece0a7aea..072de3143244 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -34,11 +34,11 @@ #include #include #include +#include #include #include #include #include -#include #include #include #include @@ -351,32 +351,6 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, #ifdef CONFIG_CGROUP_PERF -/* - * perf_cgroup_info keeps track of time_enabled for a cgroup. - * This is a per-cpu dynamically allocated data structure. - */ -struct perf_cgroup_info { - u64 time; - u64 timestamp; -}; - -struct perf_cgroup { - struct cgroup_subsys_state css; - struct perf_cgroup_info __percpu *info; -}; - -/* - * Must ensure cgroup is pinned (css_get) before calling - * this function. In other words, we cannot call this function - * if there is no cgroup event for the current CPU context. - */ -static inline struct perf_cgroup * -perf_cgroup_from_task(struct task_struct *task) -{ - return container_of(task_css(task, perf_event_cgrp_id), - struct perf_cgroup, css); -} - static inline bool perf_cgroup_match(struct perf_event *event) { -- cgit v1.2.3 From eacd3ecc34472ce3751eedfc94e44c7cc6eb6305 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Fri, 23 Jan 2015 18:45:41 +0000 Subject: perf: Add ->count() function to read per-package counters For PMU drivers that record per-package counters, the ->count variable cannot be used to record an accurate aggregated value, since it's not possible to perform SMP cross-calls to cpus on other packages from the context in which we update ->count. Introduce a new optional ->count() accessor function that can be used to customize how values are collected. If a PMU driver doesn't provide a ->count() function, we fallback to the existing code. There is necessarily a window of staleness with this approach because the task that generated the counter value may not have been scheduled by the cpu recently. 
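A rough sketch of the driver side (hypothetical PMU, with the package-reading helper stubbed out as an assumption): the driver supplies ->count() to aggregate its per-package values, and the core only uses the callback when it is present, otherwise summing event->count and the child counts as before.

        #include <linux/perf_event.h>

        static int my_pmu_nr_packages;  /* assumed to be set at probe time */

        /* hypothetical: read this event's hardware value on one package */
        static u64 my_pmu_read_pkg(struct perf_event *event, int pkg)
        {
                return 0;       /* stub; a real driver would IPI/poll the package here */
        }

        static u64 my_pmu_count(struct perf_event *event)
        {
                u64 sum = 0;
                int pkg;

                for (pkg = 0; pkg < my_pmu_nr_packages; pkg++)
                        sum += my_pmu_read_pkg(event, pkg);

                return sum;
        }

        static struct pmu my_pmu = {
                /* optional; when NULL the core falls back to __perf_event_count() */
                .count  = my_pmu_count,
        };

The value reported this way is only as fresh as the driver's last read of the hardware.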
An alternative and more complex approach would be to use a hrtimer to periodically refresh the values from a more permissive scheduling context. So, we're trading off complexity for accuracy. Signed-off-by: Matt Fleming Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Kanaka Juvva Cc: Linus Torvalds Cc: Vikas Shivappa Link: http://lkml.kernel.org/r/1422038748-21397-3-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 10 ++++++++++ kernel/events/core.c | 5 ++++- 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index cae4a9481777..9fc9b0d31442 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -272,6 +272,11 @@ struct pmu { */ size_t task_ctx_size; + + /* + * Return the count value for a counter. + */ + u64 (*count) (struct perf_event *event); /*optional*/ }; /** @@ -770,6 +775,11 @@ static inline void perf_event_task_sched_out(struct task_struct *prev, __perf_event_task_sched_out(prev, next); } +static inline u64 __perf_event_count(struct perf_event *event) +{ + return local64_read(&event->count) + atomic64_read(&event->child_count); +} + extern void perf_event_mmap(struct vm_area_struct *vma); extern struct perf_guest_info_callbacks *perf_guest_cbs; extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); diff --git a/kernel/events/core.c b/kernel/events/core.c index 072de3143244..4e8dc596f101 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3194,7 +3194,10 @@ static void __perf_event_read(void *info) static inline u64 perf_event_count(struct perf_event *event) { - return local64_read(&event->count) + atomic64_read(&event->child_count); + if (event->pmu->count) + return event->pmu->count(event); + + return __perf_event_count(event); } static u64 perf_event_read(struct perf_event *event) -- cgit v1.2.3 From 79dff51e900fd26a073be8b23acfbd8c15edb181 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Fri, 23 Jan 2015 18:45:42 +0000 Subject: perf: Move cgroup init before PMU ->event_init() The Intel QoS PMU needs to know whether an event is part of a cgroup during ->event_init(), because tasks in the same cgroup share a monitoring ID. Move the cgroup initialisation before calling into the PMU driver. Signed-off-by: Matt Fleming Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: H. 
Peter Anvin Cc: Jiri Olsa Cc: Kanaka Juvva Cc: Linus Torvalds Cc: Vikas Shivappa Link: http://lkml.kernel.org/r/1422038748-21397-4-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- kernel/events/core.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 4e8dc596f101..1fc3bae5904a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7116,7 +7116,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, struct perf_event *group_leader, struct perf_event *parent_event, perf_overflow_handler_t overflow_handler, - void *context) + void *context, int cgroup_fd) { struct pmu *pmu; struct perf_event *event; @@ -7212,6 +7212,12 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (!has_branch_stack(event)) event->attr.branch_sample_type = 0; + if (cgroup_fd != -1) { + err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader); + if (err) + goto err_ns; + } + pmu = perf_init_event(event); if (!pmu) goto err_ns; @@ -7235,6 +7241,8 @@ err_pmu: event->destroy(event); module_put(pmu->module); err_ns: + if (is_cgroup_event(event)) + perf_detach_cgroup(event); if (event->ns) put_pid_ns(event->ns); kfree(event); @@ -7453,6 +7461,7 @@ SYSCALL_DEFINE5(perf_event_open, int move_group = 0; int err; int f_flags = O_RDWR; + int cgroup_fd = -1; /* for future expandability... */ if (flags & ~PERF_FLAG_ALL) @@ -7518,21 +7527,16 @@ SYSCALL_DEFINE5(perf_event_open, get_online_cpus(); + if (flags & PERF_FLAG_PID_CGROUP) + cgroup_fd = pid; + event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, - NULL, NULL); + NULL, NULL, cgroup_fd); if (IS_ERR(event)) { err = PTR_ERR(event); goto err_cpus; } - if (flags & PERF_FLAG_PID_CGROUP) { - err = perf_cgroup_connect(pid, event, &attr, group_leader); - if (err) { - __free_event(event); - goto err_cpus; - } - } - if (is_sampling_event(event)) { if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { err = -ENOTSUPP; @@ -7769,7 +7773,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, */ event = perf_event_alloc(attr, cpu, task, NULL, NULL, - overflow_handler, context); + overflow_handler, context, -1); if (IS_ERR(event)) { err = PTR_ERR(event); goto err; @@ -8130,7 +8134,7 @@ inherit_event(struct perf_event *parent_event, parent_event->cpu, child, group_leader, parent_event, - NULL, NULL); + NULL, NULL, -1); if (IS_ERR(child_event)) return child_event; -- cgit v1.2.3 From bfe1fcd2688f557a6b6a88f59ea7619228728bd7 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Fri, 23 Jan 2015 18:45:46 +0000 Subject: perf/x86/intel: Support task events with Intel CQM Add support for task events as well as system-wide events. This change has a big impact on the way that we gather LLC occupancy values in intel_cqm_event_read(). Currently, for system-wide (per-cpu) events we defer processing to userspace which knows how to discard all but one cpu result per package. Things aren't so simple for task events because we need to do the value aggregation ourselves. To do this, we defer updating the LLC occupancy value in event->count from intel_cqm_event_read() and do an SMP cross-call to read values for all packages in intel_cqm_event_count(). We need to ensure that we only do this for one task event per cache group, otherwise we'll report duplicate values. If we're a system-wide event we want to fallback to the default perf_event_count() implementation. 
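From userspace, the new task events then look like any other per-task counter. A hedged sketch (the type value 6 is PERF_TYPE_INTEL_CQM from the uapi hunk below; config=1 for llc_occupancy is an assumption, so check the driver's sysfs event list on a real system):

        #include <stdio.h>
        #include <stdint.h>
        #include <string.h>
        #include <unistd.h>
        #include <sys/syscall.h>
        #include <linux/perf_event.h>

        int main(void)
        {
                struct perf_event_attr attr;
                uint64_t occupancy;
                int fd;

                memset(&attr, 0, sizeof(attr));
                attr.size = sizeof(attr);
                attr.type = 6;          /* PERF_TYPE_INTEL_CQM, added below */
                attr.config = 1;        /* llc_occupancy (assumed encoding) */

                /* pid = 0, cpu = -1: a task event for the calling process,
                 * exactly the case this patch enables */
                fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
                if (fd < 0) {
                        perror("perf_event_open");
                        return 1;
                }

                sleep(1);
                if (read(fd, &occupancy, sizeof(occupancy)) == sizeof(occupancy))
                        printf("llc_occupancy: %llu\n",
                               (unsigned long long)occupancy);

                close(fd);
                return 0;
        }

Behind that single read(), the group leader's intel_cqm_event_count() performs the cross-package aggregation, while system-wide events keep the default __perf_event_count() fallback.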
Refactor this into a common function so that we don't duplicate the code. Also, introduce PERF_TYPE_INTEL_CQM, since we need a way to track an event's task (if the event isn't per-cpu) inside of the Intel CQM PMU driver. This task information is only availble in the upper layers of the perf infrastructure. Other perf backends stash the target task in event->hw.*target so we need to do something similar. The task is used to determine whether events should share a cache group and an RMID. Signed-off-by: Matt Fleming Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Kanaka Juvva Cc: Linus Torvalds Cc: Vikas Shivappa Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/1422038748-21397-8-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_cqm.c | 195 +++++++++++++++++++++++++---- include/linux/perf_event.h | 1 + include/uapi/linux/perf_event.h | 1 + kernel/events/core.c | 2 + 4 files changed, 178 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c index b5d9d746dbc0..8003d87afd89 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c @@ -182,23 +182,124 @@ fail: /* * Determine if @a and @b measure the same set of tasks. + * + * If @a and @b measure the same set of tasks then we want to share a + * single RMID. */ static bool __match_event(struct perf_event *a, struct perf_event *b) { + /* Per-cpu and task events don't mix */ if ((a->attach_state & PERF_ATTACH_TASK) != (b->attach_state & PERF_ATTACH_TASK)) return false; - /* not task */ +#ifdef CONFIG_CGROUP_PERF + if (a->cgrp != b->cgrp) + return false; +#endif + + /* If not task event, we're machine wide */ + if (!(b->attach_state & PERF_ATTACH_TASK)) + return true; + + /* + * Events that target same task are placed into the same cache group. + */ + if (a->hw.cqm_target == b->hw.cqm_target) + return true; + + /* + * Are we an inherited event? + */ + if (b->parent == a) + return true; + + return false; +} + +#ifdef CONFIG_CGROUP_PERF +static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event) +{ + if (event->attach_state & PERF_ATTACH_TASK) + return perf_cgroup_from_task(event->hw.cqm_target); - return true; /* if not task, we're machine wide */ + return event->cgrp; } +#endif /* * Determine if @a's tasks intersect with @b's tasks + * + * There are combinations of events that we explicitly prohibit, + * + * PROHIBITS + * system-wide -> cgroup and task + * cgroup -> system-wide + * -> task in cgroup + * task -> system-wide + * -> task in cgroup + * + * Call this function before allocating an RMID. */ static bool __conflict_event(struct perf_event *a, struct perf_event *b) { +#ifdef CONFIG_CGROUP_PERF + /* + * We can have any number of cgroups but only one system-wide + * event at a time. + */ + if (a->cgrp && b->cgrp) { + struct perf_cgroup *ac = a->cgrp; + struct perf_cgroup *bc = b->cgrp; + + /* + * This condition should have been caught in + * __match_event() and we should be sharing an RMID. 
+ */ + WARN_ON_ONCE(ac == bc); + + if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) || + cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup)) + return true; + + return false; + } + + if (a->cgrp || b->cgrp) { + struct perf_cgroup *ac, *bc; + + /* + * cgroup and system-wide events are mutually exclusive + */ + if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) || + (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK))) + return true; + + /* + * Ensure neither event is part of the other's cgroup + */ + ac = event_to_cgroup(a); + bc = event_to_cgroup(b); + if (ac == bc) + return true; + + /* + * Must have cgroup and non-intersecting task events. + */ + if (!ac || !bc) + return false; + + /* + * We have cgroup and task events, and the task belongs + * to a cgroup. Check for for overlap. + */ + if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) || + cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup)) + return true; + + return false; + } +#endif /* * If one of them is not a task, same story as above with cgroups. */ @@ -245,9 +346,16 @@ static int intel_cqm_setup_event(struct perf_event *event, static void intel_cqm_event_read(struct perf_event *event) { - unsigned long rmid = event->hw.cqm_rmid; + unsigned long rmid; u64 val; + /* + * Task events are handled by intel_cqm_event_count(). + */ + if (event->cpu == -1) + return; + + rmid = event->hw.cqm_rmid; val = __rmid_read(rmid); /* @@ -259,6 +367,63 @@ static void intel_cqm_event_read(struct perf_event *event) local64_set(&event->count, val); } +struct rmid_read { + unsigned int rmid; + atomic64_t value; +}; + +static void __intel_cqm_event_count(void *info) +{ + struct rmid_read *rr = info; + u64 val; + + val = __rmid_read(rr->rmid); + + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) + return; + + atomic64_add(val, &rr->value); +} + +static inline bool cqm_group_leader(struct perf_event *event) +{ + return !list_empty(&event->hw.cqm_groups_entry); +} + +static u64 intel_cqm_event_count(struct perf_event *event) +{ + struct rmid_read rr = { + .rmid = event->hw.cqm_rmid, + .value = ATOMIC64_INIT(0), + }; + + /* + * We only need to worry about task events. System-wide events + * are handled like usual, i.e. entirely with + * intel_cqm_event_read(). + */ + if (event->cpu != -1) + return __perf_event_count(event); + + /* + * Only the group leader gets to report values. This stops us + * reporting duplicate values to userspace, and gives us a clear + * rule for which task gets to report the values. + * + * Note that it is impossible to attribute these values to + * specific packages - we forfeit that ability when we create + * task events. + */ + if (!cqm_group_leader(event)) + return 0; + + on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1); + + local64_set(&event->count, atomic64_read(&rr.value)); + + return __perf_event_count(event); +} + static void intel_cqm_event_start(struct perf_event *event, int mode) { struct intel_cqm_state *state = this_cpu_ptr(&cqm_state); @@ -344,7 +509,7 @@ static void intel_cqm_event_destroy(struct perf_event *event) /* * And we're the group leader.. */ - if (!list_empty(&event->hw.cqm_groups_entry)) { + if (cqm_group_leader(event)) { /* * If there was a group_other, make that leader, otherwise * destroy the group and return the RMID. 
@@ -365,17 +530,6 @@ static void intel_cqm_event_destroy(struct perf_event *event) static struct pmu intel_cqm_pmu; -/* - * XXX there's a bit of a problem in that we cannot simply do the one - * event per node as one would want, since that one event would one get - * scheduled on the one cpu. But we want to 'schedule' the RMID on all - * CPUs. - * - * This means we want events for each CPU, however, that generates a lot - * of duplicate values out to userspace -- this is not to be helped - * unless we want to change the core code in some way. Fore more info, - * see intel_cqm_event_read(). - */ static int intel_cqm_event_init(struct perf_event *event) { struct perf_event *group = NULL; @@ -387,9 +541,6 @@ static int intel_cqm_event_init(struct perf_event *event) if (event->attr.config & ~QOS_EVENT_MASK) return -EINVAL; - if (event->cpu == -1) - return -EINVAL; - /* unsupported modes and filters */ if (event->attr.exclude_user || event->attr.exclude_kernel || @@ -407,7 +558,8 @@ static int intel_cqm_event_init(struct perf_event *event) mutex_lock(&cache_mutex); - err = intel_cqm_setup_event(event, &group); /* will also set rmid */ + /* Will also set rmid */ + err = intel_cqm_setup_event(event, &group); if (err) goto out; @@ -470,6 +622,7 @@ static struct pmu intel_cqm_pmu = { .start = intel_cqm_event_start, .stop = intel_cqm_event_stop, .read = intel_cqm_event_read, + .count = intel_cqm_event_count, }; static inline void cqm_pick_event_reader(int cpu) @@ -599,8 +752,8 @@ static int __init intel_cqm_init(void) __perf_cpu_notifier(intel_cqm_cpu_notifier); - ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1); - + ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", + PERF_TYPE_INTEL_CQM); if (ret) pr_err("Intel CQM perf registration failed: %d\n", ret); else diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ca5504c48f4f..dac4c2831d82 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -129,6 +129,7 @@ struct hw_perf_event { struct list_head cqm_events_entry; struct list_head cqm_groups_entry; struct list_head cqm_group_entry; + struct task_struct *cqm_target; }; #ifdef CONFIG_HAVE_HW_BREAKPOINT struct { /* breakpoint */ diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 1e3cd07cf76e..3c8b45de57ec 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -32,6 +32,7 @@ enum perf_type_id { PERF_TYPE_HW_CACHE = 3, PERF_TYPE_RAW = 4, PERF_TYPE_BREAKPOINT = 5, + PERF_TYPE_INTEL_CQM = 6, PERF_TYPE_MAX, /* non-ABI */ }; diff --git a/kernel/events/core.c b/kernel/events/core.c index 1fc3bae5904a..71109a045450 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7181,6 +7181,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, else if (attr->type == PERF_TYPE_BREAKPOINT) event->hw.bp_target = task; #endif + else if (attr->type == PERF_TYPE_INTEL_CQM) + event->hw.cqm_target = task; } if (!overflow_handler && parent_event) { -- cgit v1.2.3 From 1d4a9c17d4d204a159139361e8d4db7f9f267879 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Sun, 22 Feb 2015 21:16:49 -0800 Subject: PM / sleep: add configurable delay for pm_test When CONFIG_PM_DEBUG=y, we provide a sysfs file (/sys/power/pm_test) for selecting one of a few suspend test modes, where rather than entering a full suspend state, the kernel will perform some subset of suspend steps, wait 5 seconds, and then resume back to normal operation. 
This mode is useful for (among other things) observing the state of the system just before entering a sleep mode, for debugging or analysis purposes. However, a constant 5 second wait is not sufficient for some sorts of analysis; for example, on an SoC, one might want to use external tools to probe the power states of various on-chip controllers or clocks. This patch turns this 5 second delay into a configurable module parameter, so users can determine how long to wait in this pseudo-suspend state before resuming the system. Example (wait 30 seconds); # echo 30 > /sys/module/suspend/parameters/pm_test_delay # echo core > /sys/power/pm_test # time echo mem > /sys/power/state ... [ 17.583625] suspend debug: Waiting for 30 second(s). ... real 0m30.381s user 0m0.017s sys 0m0.080s Signed-off-by: Brian Norris Acked-by: Pavel Machek Reviewed-by: Kevin Cernekee Acked-by: Florian Fainelli Signed-off-by: Rafael J. Wysocki --- Documentation/kernel-parameters.txt | 7 +++++++ Documentation/power/basic-pm-debugging.txt | 10 ++++++---- kernel/power/suspend.c | 13 +++++++++++-- 3 files changed, 24 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index bfcb1a62a7b4..8b1fa5e129ac 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3462,6 +3462,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted. improve throughput, but will also increase the amount of memory reserved for use by the client. + suspend.pm_test_delay= + [SUSPEND] + Sets the number of seconds to remain in a suspend test + mode before resuming the system (see + /sys/power/pm_test). Only available when CONFIG_PM_DEBUG + is set. Default value is 5. + swapaccount=[0|1] [KNL] Enable accounting of swap in memory resource controller if no parameter or 1 is given or disable diff --git a/Documentation/power/basic-pm-debugging.txt b/Documentation/power/basic-pm-debugging.txt index edeecd447d23..b96098ccfe69 100644 --- a/Documentation/power/basic-pm-debugging.txt +++ b/Documentation/power/basic-pm-debugging.txt @@ -75,12 +75,14 @@ you should do the following: # echo platform > /sys/power/disk # echo disk > /sys/power/state -Then, the kernel will try to freeze processes, suspend devices, wait 5 seconds, -resume devices and thaw processes. If "platform" is written to +Then, the kernel will try to freeze processes, suspend devices, wait a few +seconds (5 by default, but configurable by the suspend.pm_test_delay module +parameter), resume devices and thaw processes. If "platform" is written to /sys/power/pm_test , then after suspending devices the kernel will additionally invoke the global control methods (eg. ACPI global control methods) used to -prepare the platform firmware for hibernation. Next, it will wait 5 seconds and -invoke the platform (eg. ACPI) global methods used to cancel hibernation etc. +prepare the platform firmware for hibernation. Next, it will wait a +configurable number of seconds and invoke the platform (eg. ACPI) global +methods used to cancel hibernation etc. Writing "none" to /sys/power/pm_test causes the kernel to switch to the normal hibernation/suspend operations. 
Also, when open for reading, /sys/power/pm_test diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index b7d6b3a721b1..8d7a1ef72758 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -28,6 +28,7 @@ #include #include #include +#include #include "power.h" @@ -233,12 +234,20 @@ static bool platform_suspend_again(suspend_state_t state) suspend_ops->suspend_again() : false; } +#ifdef CONFIG_PM_DEBUG +static unsigned int pm_test_delay = 5; +module_param(pm_test_delay, uint, 0644); +MODULE_PARM_DESC(pm_test_delay, + "Number of seconds to wait before resuming from suspend test"); +#endif + static int suspend_test(int level) { #ifdef CONFIG_PM_DEBUG if (pm_test_level == level) { - printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n"); - mdelay(5000); + printk(KERN_INFO "suspend debug: Waiting for %d second(s).\n", + pm_test_delay); + mdelay(pm_test_delay * 1000); return 1; } #endif /* !CONFIG_PM_DEBUG */ -- cgit v1.2.3 From ee376dbdf27728a2f3d30e2ba10fa387cc4c645b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 10 Jan 2015 19:47:10 -0800 Subject: rcu: Consolidate rcu_synchronize and wakeme_after_rcu() There are currently duplicate identical definitions of the rcu_synchronize() structure and the wakeme_after_rcu() function. Thie commit therefore consolidates them. Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 9 +++++++++ kernel/rcu/srcu.c | 17 ----------------- kernel/rcu/update.c | 15 ++++++--------- 3 files changed, 15 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 78097491cd99..3e6afed51051 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -195,6 +195,15 @@ void call_rcu_sched(struct rcu_head *head, void synchronize_sched(void); +/* + * Structure allowing asynchronous waiting on RCU. + */ +struct rcu_synchronize { + struct rcu_head head; + struct completion completion; +}; +void wakeme_after_rcu(struct rcu_head *head); + /** * call_rcu_tasks() - Queue an RCU for invocation task-based grace period * @head: structure to be used for queueing the RCU updates. diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 445bf8ffe3fb..81f53b504c18 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -402,23 +402,6 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head, } EXPORT_SYMBOL_GPL(call_srcu); -struct rcu_synchronize { - struct rcu_head head; - struct completion completion; -}; - -/* - * Awaken the corresponding synchronize_srcu() instance now that a - * grace period has elapsed. - */ -static void wakeme_after_rcu(struct rcu_head *head) -{ - struct rcu_synchronize *rcu; - - rcu = container_of(head, struct rcu_synchronize, head); - complete(&rcu->completion); -} - static void srcu_advance_batches(struct srcu_struct *sp, int trycount); static void srcu_reschedule(struct srcu_struct *sp); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index e0d31a345ee6..8864ed90f0d7 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -199,16 +199,13 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ -struct rcu_synchronize { - struct rcu_head head; - struct completion completion; -}; - -/* - * Awaken the corresponding synchronize_rcu() instance now that a - * grace period has elapsed. 
+/** + * wakeme_after_rcu() - Callback function to awaken a task after grace period + * @head: Pointer to rcu_head member within rcu_synchronize structure + * + * Awaken the corresponding task now that a grace period has elapsed. */ -static void wakeme_after_rcu(struct rcu_head *head) +void wakeme_after_rcu(struct rcu_head *head) { struct rcu_synchronize *rcu; -- cgit v1.2.3 From 3f47da0f32f5e43e6ae901129d5b9c2600011a2c Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 13 Jan 2015 15:30:34 +0800 Subject: rcu_tree: Avoid touching rnp->completed when a new GP is started In rcu_gp_init(), rnp->completed equals to rsp->completed in THEORY, we don't need to touch it normally. If something goes wrong, it will complain and fixup rnp->completed and avoid oops. This commit thus avoids the normal needless store to rnp->completed. Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 48d640ca1a05..077d0b700f74 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1757,8 +1757,8 @@ static int rcu_gp_init(struct rcu_state *rsp) rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; ACCESS_ONCE(rnp->gpnum) = rsp->gpnum; - WARN_ON_ONCE(rnp->completed != rsp->completed); - ACCESS_ONCE(rnp->completed) = rsp->completed; + if (WARN_ON_ONCE(rnp->completed != rsp->completed)) + ACCESS_ONCE(rnp->completed) = rsp->completed; if (rnp == rdp->mynode) (void)__note_gp_changes(rsp, rnp, rdp); rcu_preempt_boost_start_gp(rnp); -- cgit v1.2.3 From d3f3f3f25b1d4ee152f3f19a812c3a282da4c120 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 18 Jan 2015 18:21:09 -0800 Subject: rcu: Abstract default callback-list initialization from init_callback_list() In preparation for early-boot posting of callbacks, this commit abstracts initialization of the default (non-no-CB) callbacks list from the init_callback_list() function into a new init_default_callback_list() function. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 48d640ca1a05..f8cdb92da10b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1328,19 +1328,29 @@ void rcu_cpu_stall_reset(void) } /* - * Initialize the specified rcu_data structure's callback list to empty. + * Initialize the specified rcu_data structure's default callback list + * to empty. The default callback list is the one that is not used by + * no-callbacks CPUs. */ -static void init_callback_list(struct rcu_data *rdp) +static void init_default_callback_list(struct rcu_data *rdp) { int i; - if (init_nocb_callback_list(rdp)) - return; rdp->nxtlist = NULL; for (i = 0; i < RCU_NEXT_SIZE; i++) rdp->nxttail[i] = &rdp->nxtlist; } +/* + * Initialize the specified rcu_data structure's callback list to empty. + */ +static void init_callback_list(struct rcu_data *rdp) +{ + if (init_nocb_callback_list(rdp)) + return; + init_default_callback_list(rdp); +} + /* * Determine the value that ->completed will have at the end of the * next subsequent grace period. This is used to tag callbacks so that -- cgit v1.2.3 From 2723249a31a68ccc0ec8ac59a905d7f9430bf8f6 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 20 Jan 2015 22:44:13 -0800 Subject: rcu: Wire ->rda pointers at compile time This commit wires up the rcu_state structures' ->rda pointers to the per-CPU rcu_data structures at compile time, thus ensuring that this linkage is present at early boot, in turn allowing posting of callbacks before rcu_init() is executed. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f8cdb92da10b..d2fa95e4a268 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -91,8 +91,10 @@ static const char *tp_##sname##_varname __used __tracepoint_string = sname##_var #define RCU_STATE_INITIALIZER(sname, sabbr, cr) \ DEFINE_RCU_TPS(sname) \ +DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data); \ struct rcu_state sname##_state = { \ .level = { &sname##_state.node[0] }, \ + .rda = &sname##_data, \ .call = cr, \ .fqs_state = RCU_GP_IDLE, \ .gpnum = 0UL - 300UL, \ @@ -104,8 +106,7 @@ struct rcu_state sname##_state = { \ .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ .name = RCU_STATE_NAME(sname), \ .abbr = sabbr, \ -}; \ -DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data) +} RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); @@ -3843,7 +3844,6 @@ static void __init rcu_init_one(struct rcu_state *rsp, } } - rsp->rda = rda; init_waitqueue_head(&rsp->gp_wq); rnp = rsp->level[rcu_num_lvls - 1]; for_each_possible_cpu(i) { -- cgit v1.2.3 From 143da9c2fc030a5774674f2ebc2f934fab3dcd9a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 19 Jan 2015 19:57:32 -0800 Subject: rcu: Prevent early-boot RCU callbacks from splatting Currently, a call_rcu() that precedes rcu_init() will splat due to the callback lists not having yet been initialized. This commit causes the first such callback to initialize the boot CPU's RCU callback list. Note that this commit does not change rcu_init()-time initialization, which means that the callback will be discarded at rcu_init() time. Fixing this is the job of later commits. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d2fa95e4a268..fcfdbe53bb70 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2838,11 +2838,21 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), if (cpu != -1) rdp = per_cpu_ptr(rsp->rda, cpu); - offline = !__call_rcu_nocb(rdp, head, lazy, flags); - WARN_ON_ONCE(offline); - /* _call_rcu() is illegal on offline CPU; leak the callback. */ - local_irq_restore(flags); - return; + if (likely(rdp->mynode)) { + /* Post-boot, so this should be for a no-CBs CPU. */ + offline = !__call_rcu_nocb(rdp, head, lazy, flags); + WARN_ON_ONCE(offline); + /* Offline CPU, _call_rcu() illegal, leak callback. */ + local_irq_restore(flags); + return; + } + /* + * Very early boot, before rcu_init(). Initialize if needed + * and then drop through to queue the callback. + */ + BUG_ON(cpu != -1); + if (!likely(rdp->nxtlist)) + init_default_callback_list(rdp); } ACCESS_ONCE(rdp->qlen) = rdp->qlen + 1; if (lazy) -- cgit v1.2.3 From 59f792d1ef214592ae9b86238fa8fd00f5929b76 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Mon, 19 Jan 2015 21:43:40 -0800 Subject: rcu: Refine diagnostics for lacking kthread for no-CBs callbacks Some diagnostics under CONFIG_PROVE_RCU in rcu_nocb_cpu_needs_barrier() assume that there can be no early-boot callbacks. This commit therefore qualifies the diagnostic with rcu_scheduler_fully_active to permit early boot callbacks to avoid this splat. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0a571e9a0f1d..75d5f096bcb0 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1945,7 +1945,8 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) rhp = ACCESS_ONCE(rdp->nocb_follower_head); /* Having no rcuo kthread but CBs after scheduler starts is bad! */ - if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp) { + if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp && + rcu_scheduler_fully_active) { /* RCU callback enqueued before CPU first came online??? */ pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n", cpu, rhp->func); -- cgit v1.2.3 From 39c8d313c3c546a414cc51b4f6571c2f8cc06407 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 20 Jan 2015 23:42:38 -0800 Subject: rcu: Avoid clobbering early boot callbacks When a CPU comes online, it initializes its callback list. This is a bad thing if this is the first time that the CPU has come online and if that CPU has early boot callbacks. This commit therefore avoid initializing the callback list if there are callbacks present, in which case the initial call_rcu() did the initialization for us. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index fcfdbe53bb70..92fd3eab5823 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3583,7 +3583,8 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; - init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ + if (!rdp->nxtlist) + init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; rcu_sysidle_init_percpu_data(rdp->dynticks); atomic_set(&rdp->dynticks->dynticks, -- cgit v1.2.3 From 1925d1967c93a1c421271aade7953f6857e9f579 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 18 Jan 2015 17:45:05 -0800 Subject: rcu: Fix a couple of typos in rcu_all_qs() comment header Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 077d0b700f74..4e37c7fd9e29 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -292,10 +292,10 @@ void rcu_note_context_switch(void) EXPORT_SYMBOL_GPL(rcu_note_context_switch); /* - * Register a quiesecent state for all RCU flavors. If there is an + * Register a quiescent state for all RCU flavors. If there is an * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight * dyntick-idle quiescent state visible to other CPUs (but only for those - * RCU flavors in desparate need of a quiescent state, which will normally + * RCU flavors in desperate need of a quiescent state, which will normally * be none of them). Either way, do a lightweight quiescent state for * all RCU flavors. 
*/ -- cgit v1.2.3 From 0d39482c3db13aae1db143d340816108dd53e443 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 18 Feb 2015 12:24:30 -0800 Subject: rcu: Provide rcu_expedite_gp() and rcu_unexpedite_gp() Currently, expediting of normal synchronous grace-period primitives (synchronize_rcu() and friends) is controlled by the rcu_expedited() boot/sysfs parameter. This works well, but does not handle nesting. This commit therefore provides rcu_expedite_gp() to enable expediting and rcu_unexpedite_gp() to cancel a prior rcu_expedite_gp(), both of which support nesting. Reported-by: Arjan van de Ven Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 20 ++++++++++++++++++++ kernel/rcu/update.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 78097491cd99..57a4d1f73a00 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -48,6 +48,26 @@ extern int rcu_expedited; /* for sysctl */ +#ifdef CONFIG_TINY_RCU +/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */ +static inline bool rcu_gp_is_expedited(void) /* Internal RCU use. */ +{ + return false; +} + +static inline void rcu_expedite_gp(void) +{ +} + +static inline void rcu_unexpedite_gp(void) +{ +} +#else /* #ifdef CONFIG_TINY_RCU */ +bool rcu_gp_is_expedited(void); /* Internal RCU use. */ +void rcu_expedite_gp(void); +void rcu_unexpedite_gp(void); +#endif /* #else #ifdef CONFIG_TINY_RCU */ + enum rcutorture_type { RCU_FLAVOR, RCU_BH_FLAVOR, diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index e0d31a345ee6..5f850823c187 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -62,6 +62,54 @@ MODULE_ALIAS("rcupdate"); module_param(rcu_expedited, int, 0); +#ifndef CONFIG_TINY_RCU + +static atomic_t rcu_expedited_nesting; + +/* + * Should normal grace-period primitives be expedited? Intended for + * use within RCU. Note that this function takes the rcu_expedited + * sysfs/boot variable into account as well as the rcu_expedite_gp() + * nesting. So looping on rcu_unexpedite_gp() until rcu_gp_is_expedited() + * returns false is a -really- bad idea. + */ +bool rcu_gp_is_expedited(void) +{ + return rcu_expedited || atomic_read(&rcu_expedited_nesting); +} +EXPORT_SYMBOL_GPL(rcu_gp_is_expedited); + +/** + * rcu_expedite_gp - Expedite future RCU grace periods + * + * After a call to this function, future calls to synchronize_rcu() and + * friends act as the corresponding synchronize_rcu_expedited() function + * had instead been called. + */ +void rcu_expedite_gp(void) +{ + atomic_inc(&rcu_expedited_nesting); +} +EXPORT_SYMBOL_GPL(rcu_expedite_gp); + +/** + * rcu_unexpedite_gp - Cancel prior rcu_expedite_gp() invocation + * + * Undo a prior call to rcu_expedite_gp(). If all prior calls to + * rcu_expedite_gp() are undone by a subsequent call to rcu_unexpedite_gp(), + * and if the rcu_expedited sysfs/boot parameter is not set, then all + * subsequent calls to synchronize_rcu() and friends will return to + * their normal non-expedited behavior. + */ +void rcu_unexpedite_gp(void) +{ + atomic_dec(&rcu_expedited_nesting); +} +EXPORT_SYMBOL_GPL(rcu_unexpedite_gp); + +#endif /* #ifndef CONFIG_TINY_RCU */ + + #ifdef CONFIG_PREEMPT_RCU /* -- cgit v1.2.3 From 4bb3c5f4142a359de46cf14ebab64c4c903d6773 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Wed, 18 Feb 2015 16:31:29 -0800 Subject: rcu: Add rcu_expedite_gp() and rcu_unexpedite_gp() to rcutorture Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 30d42aa55d83..3833aa611ae7 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -853,6 +853,8 @@ rcu_torture_fqs(void *arg) static int rcu_torture_writer(void *arg) { + bool can_expedite = !rcu_gp_is_expedited(); + int expediting = 0; unsigned long gp_snap; bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal; bool gp_sync1 = gp_sync; @@ -865,6 +867,12 @@ rcu_torture_writer(void *arg) int nsynctypes = 0; VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); + pr_alert("%s" TORTURE_FLAG + " Grace periods expedited from boot/sysfs for %s,\n", + torture_type, cur_ops->name); + pr_alert("%s" TORTURE_FLAG + " Testing of dynamic grace-period expediting diabled.\n", + torture_type); /* Initialize synctype[] array. If none set, take default. */ if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync) @@ -949,9 +957,26 @@ rcu_torture_writer(void *arg) } } rcutorture_record_progress(++rcu_torture_current_version); + /* Cycle through nesting levels of rcu_expedite_gp() calls. */ + if (can_expedite && + !(torture_random(&rand) & 0xff & (!!expediting - 1))) { + WARN_ON_ONCE(expediting == 0 && rcu_gp_is_expedited()); + if (expediting >= 0) + rcu_expedite_gp(); + else + rcu_unexpedite_gp(); + if (++expediting > 3) + expediting = -expediting; + } rcu_torture_writer_state = RTWS_STUTTER; stutter_wait("rcu_torture_writer"); } while (!torture_must_stop()); + /* Reset expediting back to unexpedited. */ + if (expediting > 0) + expediting = -expediting; + while (can_expedite && expediting++ < 0) + rcu_unexpedite_gp(); + WARN_ON_ONCE(can_expedite && rcu_gp_is_expedited()); rcu_torture_writer_state = RTWS_STOPPING; torture_kthread_stopping("rcu_torture_writer"); return 0; -- cgit v1.2.3 From 5afff48bdf7481570c9385a8a674a81ffb8f09ee Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 18 Feb 2015 16:39:09 -0800 Subject: rcu: Update from rcu_expedited variable to rcu_gp_is_expedited() This commit updates open-coded tests of the rcu_expedited variable to instead use rcu_gp_is_expedited(). Signed-off-by: Paul E. McKenney --- kernel/rcu/srcu.c | 2 +- kernel/rcu/tree.c | 9 +++++---- kernel/rcu/tree_plugin.h | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 445bf8ffe3fb..c871f07eff69 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -507,7 +507,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) */ void synchronize_srcu(struct srcu_struct *sp) { - __synchronize_srcu(sp, rcu_expedited + __synchronize_srcu(sp, rcu_gp_is_expedited() ? 
SYNCHRONIZE_SRCU_EXP_TRYCOUNT : SYNCHRONIZE_SRCU_TRYCOUNT); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 48d640ca1a05..4325fbe79d84 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2954,7 +2954,7 @@ void synchronize_sched(void) "Illegal synchronize_sched() in RCU-sched read-side critical section"); if (rcu_blocking_is_gp()) return; - if (rcu_expedited) + if (rcu_gp_is_expedited()) synchronize_sched_expedited(); else wait_rcu_gp(call_rcu_sched); @@ -2981,7 +2981,7 @@ void synchronize_rcu_bh(void) "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); if (rcu_blocking_is_gp()) return; - if (rcu_expedited) + if (rcu_gp_is_expedited()) synchronize_rcu_bh_expedited(); else wait_rcu_gp(call_rcu_bh); @@ -3660,11 +3660,12 @@ static int rcu_pm_notify(struct notifier_block *self, case PM_HIBERNATION_PREPARE: case PM_SUSPEND_PREPARE: if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ - rcu_expedited = 1; + rcu_expedite_gp(); break; case PM_POST_HIBERNATION: case PM_POST_SUSPEND: - rcu_expedited = 0; + if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ + rcu_unexpedite_gp(); break; default: break; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0a571e9a0f1d..63726b734d34 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -585,7 +585,7 @@ void synchronize_rcu(void) "Illegal synchronize_rcu() in RCU read-side critical section"); if (!rcu_scheduler_active) return; - if (rcu_expedited) + if (rcu_gp_is_expedited()) synchronize_rcu_expedited(); else wait_rcu_gp(call_rcu); -- cgit v1.2.3 From ee42571f4381f184e2672dd34ab411e5bf5bd5e0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 19 Feb 2015 10:51:32 -0800 Subject: rcu: Add Kconfig option to expedite grace periods during boot This commit adds a CONFIG_RCU_EXPEDITE_BOOT Kconfig parameter that emulates a very early boot rcu_expedite_gp(). A late-boot call to rcu_end_inkernel_boot() will provide the corresponding rcu_unexpedite_gp(). The late-boot call to rcu_end_inkernel_boot() should be made just before init is spawned. According to Arjan: > To show the boot time, I'm using the timestamp of the "Write protecting" > line, that's pretty much the last thing we print prior to ring 3 execution. > > A kernel with default RCU behavior (inside KVM, only virtual devices) > looks like this: > > [ 0.038724] Write protecting the kernel read-only data: 10240k > > a kernel with expedited RCU (using the command line option, so that I > don't have to recompile between measurements and thus am completely > oranges-to-oranges) > > [ 0.031768] Write protecting the kernel read-only data: 10240k > > which, in percentage, is an 18% improvement. Reported-by: Arjan van de Ven Signed-off-by: Paul E. 
McKenney Tested-by: Arjan van de Ven --- include/linux/rcupdate.h | 1 + init/Kconfig | 13 +++++++++++++ kernel/rcu/update.c | 11 ++++++++++- 3 files changed, 24 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 57a4d1f73a00..b9f039b11d31 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -278,6 +278,7 @@ static inline int rcu_preempt_depth(void) /* Internal to kernel */ void rcu_init(void); +void rcu_end_inkernel_boot(void); void rcu_sched_qs(void); void rcu_bh_qs(void); void rcu_check_callbacks(int user); diff --git a/init/Kconfig b/init/Kconfig index f5dbc6d4261b..9a0592516f48 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -791,6 +791,19 @@ config RCU_NOCB_CPU_ALL endchoice +config RCU_EXPEDITE_BOOT + bool + default n + help + This option enables expedited grace periods at boot time, + as if rcu_expedite_gp() had been invoked early in boot. + The corresponding rcu_unexpedite_gp() is invoked from + rcu_end_inkernel_boot(), which is intended to be invoked + at the end of the kernel-only boot sequence, just before + init is exec'ed. + + Accept the default if unsure. + endmenu # "RCU Subsystem" config BUILD_BIN2C diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 5f850823c187..7b12466f90bc 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -64,7 +64,8 @@ module_param(rcu_expedited, int, 0); #ifndef CONFIG_TINY_RCU -static atomic_t rcu_expedited_nesting; +static atomic_t rcu_expedited_nesting = + ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0); /* * Should normal grace-period primitives be expedited? Intended for @@ -109,6 +110,14 @@ EXPORT_SYMBOL_GPL(rcu_unexpedite_gp); #endif /* #ifndef CONFIG_TINY_RCU */ +/* + * Inform RCU of the end of the in-kernel boot sequence. + */ +void rcu_end_inkernel_boot(void) +{ + if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT)) + rcu_unexpedite_gp(); +} #ifdef CONFIG_PREEMPT_RCU -- cgit v1.2.3 From c136f991049f51212e3d837a9f41708158591869 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 19 Feb 2015 12:15:19 -0800 Subject: rcutorture: Make consistent use of variables The "if" statement at the beginning of rcu_torture_writer() should use the same set of variables. In theory, this does not matter because the corresponding variables (gp_sync and gp_sync1) have the same value at this point in the code, but in practice such puzzles should be removed. This commit therefore makes the use of variables consistent. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 3833aa611ae7..8dbe27611ec3 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -875,7 +875,7 @@ rcu_torture_writer(void *arg) torture_type); /* Initialize synctype[] array. If none set, take default. */ - if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync) + if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1) gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true; if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) synctype[nsynctypes++] = RTWS_COND_GET; -- cgit v1.2.3 From 675da67f24e2d6d8df0cedf12e59085ed8bbf4e7 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Mon, 23 Feb 2015 15:57:07 -0800 Subject: rcu: Fixes to NO_HZ_FULL sysidle accounting On second and subsequent passes through quiescent-state forcing, the isidle variable was initialized to false, which would prevent full sysidle state from being reached if a grace period needed more than one round of quiescent-state forcing (which most should not). However, the check for offline CPUs in the quiescent-state forcing main loop had the wrong sense, which could prevent CPUs from ever entering full sysidle state. This commit fixes both of these bugs. Given that sysidle is not yet wired up, this has no effect in old kernels, but might have proven frustrating had anyone attempted to wire it up. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 48d640ca1a05..735bd7ee749a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1798,7 +1798,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) fqs_state = RCU_FORCE_QS; } else { /* Handle dyntick-idle and offline CPUs. */ - isidle = false; + isidle = true; force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); } /* Clear flag to prevent immediate re-entry. */ @@ -2596,8 +2596,8 @@ static void force_qs_rnp(struct rcu_state *rsp, bit = 1; for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { if ((rnp->qsmask & bit) != 0) { - if ((rnp->qsmaskinit & bit) != 0) - *isidle = false; + if ((rnp->qsmaskinit & bit) == 0) + *isidle = false; /* Pending hotplug. */ if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) mask |= bit; } -- cgit v1.2.3 From 27153acbe1141ceecf098ca5d24c2ae2714c1a5f Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 11 Feb 2015 15:42:37 +0100 Subject: rcu: Remove unnecessary condition check in rcu_qsctr_help() When the ->curtail and ->donetail pointers differ, ->rcucblist always points to the beginning of the current list and thus cannot be NULL. Therefore, the check ->rcucblist != NULL is redundant and this commit removes it. Cc: "Paul E. McKenney" Signed-off-by: Alexander Gordeev Signed-off-by: Paul E. McKenney --- kernel/rcu/tiny.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index cc9ceca7bde1..d4e7fe5f3baf 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -103,8 +103,7 @@ EXPORT_SYMBOL(__rcu_is_watching); static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) { RCU_TRACE(reset_cpu_stall_ticks(rcp)); - if (rcp->rcucblist != NULL && - rcp->donetail != rcp->curtail) { + if (rcp->donetail != rcp->curtail) { rcp->donetail = rcp->curtail; return 1; } -- cgit v1.2.3 From 915e8a4fe45eab871a862f6467ec7e59864735b2 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 11 Feb 2015 15:42:38 +0100 Subject: rcu: Remove fastpath from __rcu_process_callbacks() The standard code path accommodates a condition when no RCU callbacks are ready to invoke. Since size of the code is a priority for tiny RCU, remove the fast path. Cc: "Paul E. McKenney" Signed-off-by: Alexander Gordeev Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tiny.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index d4e7fe5f3baf..069742d61c68 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -168,17 +168,6 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) unsigned long flags; RCU_TRACE(int cb_count = 0); - /* If no RCU callbacks ready to invoke, just return. */ - if (&rcp->rcucblist == rcp->donetail) { - RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1)); - RCU_TRACE(trace_rcu_batch_end(rcp->name, 0, - !!ACCESS_ONCE(rcp->rcucblist), - need_resched(), - is_idle_task(current), - false)); - return; - } - /* Move the ready-to-invoke callbacks to a local list. */ local_irq_save(flags); RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1)); -- cgit v1.2.3 From 5871968d531f39c23a8e6c69525bb705bca52e04 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 24 Feb 2015 11:05:36 -0800 Subject: rcu: Tighten up affinity and check for sysidle If the RCU grace-period kthread invoking rcu_sysidle_check_cpu() happens to be running on the tick_do_timer_cpu initially, then rcu_bind_gp_kthread() won't bind it. This kthread might then migrate before invoking rcu_gp_fqs(), which will trigger the WARN_ON_ONCE() in rcu_sysidle_check_cpu(). This commit therefore makes rcu_bind_gp_kthread() do the binding even if the kthread is currently on the same CPU. Because this incurs added overhead, this commit also causes each RCU grace-period kthread to invoke rcu_bind_gp_kthread() once at boot rather than at the beginning of each grace period. And as long as rcu_bind_gp_kthread() is being modified, this commit eliminates its #ifdef. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- kernel/rcu/tree_plugin.h | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 735bd7ee749a..a6972c20eaa5 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1707,7 +1707,6 @@ static int rcu_gp_init(struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); ACCESS_ONCE(rsp->gp_activity) = jiffies; - rcu_bind_gp_kthread(); raw_spin_lock_irq(&rnp->lock); smp_mb__after_unlock_lock(); if (!ACCESS_ONCE(rsp->gp_flags)) { @@ -1895,6 +1894,7 @@ static int __noreturn rcu_gp_kthread(void *arg) struct rcu_state *rsp = arg; struct rcu_node *rnp = rcu_get_root(rsp); + rcu_bind_gp_kthread(); for (;;) { /* Handle grace-period start. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0a571e9a0f1d..b46c92824db1 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2763,7 +2763,8 @@ static void rcu_sysidle_exit(int irq) /* * Check to see if the current CPU is idle. Note that usermode execution - * does not count as idle. The caller must have disabled interrupts. + * does not count as idle. The caller must have disabled interrupts, + * and must be running on tick_do_timer_cpu. */ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, unsigned long *maxj) @@ -2784,8 +2785,8 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, if (!*isidle || rdp->rsp != rcu_state_p || cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) return; - if (rcu_gp_in_progress(rdp->rsp)) - WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); + /* Verify affinity of current kthread. */ + WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); /* Pick up current idle and NMI-nesting counter and check. 
*/ cur = atomic_read(&rdtp->dynticks_idle); @@ -3068,11 +3069,10 @@ static void rcu_bind_gp_kthread(void) return; #ifdef CONFIG_NO_HZ_FULL_SYSIDLE cpu = tick_do_timer_cpu; - if (cpu >= 0 && cpu < nr_cpu_ids && raw_smp_processor_id() != cpu) + if (cpu >= 0 && cpu < nr_cpu_ids) set_cpus_allowed_ptr(current, cpumask_of(cpu)); #else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ - if (!is_housekeeping_cpu(raw_smp_processor_id())) - housekeeping_affine(current); + housekeeping_affine(current); #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ } -- cgit v1.2.3 From f91fe17e243d1f279d425071a35e3d41290758a0 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 1 Mar 2015 12:31:41 +0100 Subject: ebpf: remove kernel test stubs Now that we have BPF_PROG_TYPE_SOCKET_FILTER up and running, we can remove the test stubs which were added to get the verifier suite up. We can just let the test cases probe under socket filter type instead. In the fill/spill test case, we cannot (yet) access fields from the context (skb), but we may adapt that test case in future. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/Makefile | 3 -- kernel/bpf/test_stub.c | 78 --------------------------------------------- samples/bpf/test_verifier.c | 5 +-- 3 files changed, 3 insertions(+), 83 deletions(-) delete mode 100644 kernel/bpf/test_stub.c (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index a5ae60f0b0a2..e6983be12bd3 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,5 +1,2 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o -ifdef CONFIG_TEST_BPF -obj-$(CONFIG_BPF_SYSCALL) += test_stub.o -endif diff --git a/kernel/bpf/test_stub.c b/kernel/bpf/test_stub.c deleted file mode 100644 index 0ceae1e6e8b5..000000000000 --- a/kernel/bpf/test_stub.c +++ /dev/null @@ -1,78 +0,0 @@ -/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. 
- */ -#include -#include -#include -#include -#include - -/* test stubs for BPF_MAP_TYPE_UNSPEC and for BPF_PROG_TYPE_UNSPEC - * to be used by user space verifier testsuite - */ -struct bpf_context { - u64 arg1; - u64 arg2; -}; - -static const struct bpf_func_proto *test_func_proto(enum bpf_func_id func_id) -{ - switch (func_id) { - case BPF_FUNC_map_lookup_elem: - return &bpf_map_lookup_elem_proto; - case BPF_FUNC_map_update_elem: - return &bpf_map_update_elem_proto; - case BPF_FUNC_map_delete_elem: - return &bpf_map_delete_elem_proto; - default: - return NULL; - } -} - -static const struct bpf_context_access { - int size; - enum bpf_access_type type; -} test_ctx_access[] = { - [offsetof(struct bpf_context, arg1)] = { - FIELD_SIZEOF(struct bpf_context, arg1), - BPF_READ - }, - [offsetof(struct bpf_context, arg2)] = { - FIELD_SIZEOF(struct bpf_context, arg2), - BPF_READ - }, -}; - -static bool test_is_valid_access(int off, int size, enum bpf_access_type type) -{ - const struct bpf_context_access *access; - - if (off < 0 || off >= ARRAY_SIZE(test_ctx_access)) - return false; - - access = &test_ctx_access[off]; - if (access->size == size && (access->type & type)) - return true; - - return false; -} - -static struct bpf_verifier_ops test_ops = { - .get_func_proto = test_func_proto, - .is_valid_access = test_is_valid_access, -}; - -static struct bpf_prog_type_list tl_prog = { - .ops = &test_ops, - .type = BPF_PROG_TYPE_UNSPEC, -}; - -static int __init register_test_ops(void) -{ - bpf_register_prog_type(&tl_prog); - return 0; -} -late_initcall(register_test_ops); diff --git a/samples/bpf/test_verifier.c b/samples/bpf/test_verifier.c index b96175e90363..7b56b59fad8e 100644 --- a/samples/bpf/test_verifier.c +++ b/samples/bpf/test_verifier.c @@ -288,7 +288,8 @@ static struct bpf_test tests[] = { BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -8), /* should be able to access R0 = *(R2 + 8) */ - BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 8), + /* BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 8), */ + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), BPF_EXIT_INSN(), }, .result = ACCEPT, @@ -687,7 +688,7 @@ static int test(void) } printf("#%d %s ", i, tests[i].descr); - prog_fd = bpf_prog_load(BPF_PROG_TYPE_UNSPEC, prog, + prog_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, prog, prog_len * sizeof(struct bpf_insn), "GPL"); -- cgit v1.2.3 From a2c83fff582ae133d9f5bb187404ea9ce4da1f96 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 1 Mar 2015 12:31:42 +0100 Subject: ebpf: constify various function pointer structs We can move bpf_map_ops and bpf_verifier_ops and other structs into ro section, bpf_map_type_list and bpf_prog_type_list into read mostly. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 14 +++++++------- kernel/bpf/arraymap.c | 6 +++--- kernel/bpf/hashtab.c | 6 +++--- kernel/bpf/helpers.c | 6 +++--- net/core/filter.c | 6 +++--- 5 files changed, 19 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index bbfceb756452..78446860f796 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -32,13 +32,13 @@ struct bpf_map { u32 key_size; u32 value_size; u32 max_entries; - struct bpf_map_ops *ops; + const struct bpf_map_ops *ops; struct work_struct work; }; struct bpf_map_type_list { struct list_head list_node; - struct bpf_map_ops *ops; + const struct bpf_map_ops *ops; enum bpf_map_type type; }; @@ -109,7 +109,7 @@ struct bpf_verifier_ops { struct bpf_prog_type_list { struct list_head list_node; - struct bpf_verifier_ops *ops; + const struct bpf_verifier_ops *ops; enum bpf_prog_type type; }; @@ -121,7 +121,7 @@ struct bpf_prog_aux { atomic_t refcnt; bool is_gpl_compatible; enum bpf_prog_type prog_type; - struct bpf_verifier_ops *ops; + const struct bpf_verifier_ops *ops; struct bpf_map **used_maps; u32 used_map_cnt; struct bpf_prog *prog; @@ -138,8 +138,8 @@ struct bpf_prog *bpf_prog_get(u32 ufd); int bpf_check(struct bpf_prog *fp, union bpf_attr *attr); /* verifier prototypes for helper functions called from eBPF programs */ -extern struct bpf_func_proto bpf_map_lookup_elem_proto; -extern struct bpf_func_proto bpf_map_update_elem_proto; -extern struct bpf_func_proto bpf_map_delete_elem_proto; +extern const struct bpf_func_proto bpf_map_lookup_elem_proto; +extern const struct bpf_func_proto bpf_map_update_elem_proto; +extern const struct bpf_func_proto bpf_map_delete_elem_proto; #endif /* _LINUX_BPF_H */ diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 9eb4d8a7cd87..8a6616583f38 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -134,7 +134,7 @@ static void array_map_free(struct bpf_map *map) kvfree(array); } -static struct bpf_map_ops array_ops = { +static const struct bpf_map_ops array_ops = { .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, @@ -143,14 +143,14 @@ static struct bpf_map_ops array_ops = { .map_delete_elem = array_map_delete_elem, }; -static struct bpf_map_type_list tl = { +static struct bpf_map_type_list array_type __read_mostly = { .ops = &array_ops, .type = BPF_MAP_TYPE_ARRAY, }; static int __init register_array_map(void) { - bpf_register_map_type(&tl); + bpf_register_map_type(&array_type); return 0; } late_initcall(register_array_map); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index b3ba43674310..83c209d9b17a 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -345,7 +345,7 @@ static void htab_map_free(struct bpf_map *map) kfree(htab); } -static struct bpf_map_ops htab_ops = { +static const struct bpf_map_ops htab_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, @@ -354,14 +354,14 @@ static struct bpf_map_ops htab_ops = { .map_delete_elem = htab_map_delete_elem, }; -static struct bpf_map_type_list tl = { +static struct bpf_map_type_list htab_type __read_mostly = { .ops = &htab_ops, .type = BPF_MAP_TYPE_HASH, }; static int __init register_htab_map(void) { - bpf_register_map_type(&tl); + bpf_register_map_type(&htab_type); return 0; } late_initcall(register_htab_map); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 9e3414d85459..a3c7701a8b5e 100644 --- a/kernel/bpf/helpers.c +++ 
b/kernel/bpf/helpers.c @@ -41,7 +41,7 @@ static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) return (unsigned long) value; } -struct bpf_func_proto bpf_map_lookup_elem_proto = { +const struct bpf_func_proto bpf_map_lookup_elem_proto = { .func = bpf_map_lookup_elem, .gpl_only = false, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, @@ -60,7 +60,7 @@ static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) return map->ops->map_update_elem(map, key, value, r4); } -struct bpf_func_proto bpf_map_update_elem_proto = { +const struct bpf_func_proto bpf_map_update_elem_proto = { .func = bpf_map_update_elem, .gpl_only = false, .ret_type = RET_INTEGER, @@ -80,7 +80,7 @@ static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) return map->ops->map_delete_elem(map, key); } -struct bpf_func_proto bpf_map_delete_elem_proto = { +const struct bpf_func_proto bpf_map_delete_elem_proto = { .func = bpf_map_delete_elem, .gpl_only = false, .ret_type = RET_INTEGER, diff --git a/net/core/filter.c b/net/core/filter.c index f6bdc2b1ba01..6fe09e36dad9 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1159,19 +1159,19 @@ static bool sock_filter_is_valid_access(int off, int size, enum bpf_access_type return false; } -static struct bpf_verifier_ops sock_filter_ops = { +static const struct bpf_verifier_ops sock_filter_ops = { .get_func_proto = sock_filter_func_proto, .is_valid_access = sock_filter_is_valid_access, }; -static struct bpf_prog_type_list tl = { +static struct bpf_prog_type_list sock_filter_type __read_mostly = { .ops = &sock_filter_ops, .type = BPF_PROG_TYPE_SOCKET_FILTER, }; static int __init register_sock_filter_ops(void) { - bpf_register_prog_type(&tl); + bpf_register_prog_type(&sock_filter_type); return 0; } late_initcall(register_sock_filter_ops); -- cgit v1.2.3 From 96be4325f443dbbfeb37d2a157675ac0736531a1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 1 Mar 2015 12:31:46 +0100 Subject: ebpf: add sched_cls_type and map it to sk_filter's verifier ops As discussed recently and at netconf/netdev01, we want to prevent making bpf_verifier_ops registration available for modules, but have them at a controlled place inside the kernel instead. The reason for this is that out-of-tree modules can go crazy and define and register any verifier ops they want, doing all sorts of crap, even bypassing available GPLed eBPF helper functions. We don't want to offer such a shiny playground, of course, but keep strict control to ourselves inside the core kernel. This also encourages us to design eBPF user helpers carefully and generically, so they can be shared among various subsystems using eBPF. For the eBPF traffic classifier (cls_bpf), it's a good start to share the same helper facilities as we currently do in eBPF for socket filters. That way, we have BPF_PROG_TYPE_SCHED_CLS look like its own type, so that if one day there is a good reason to diverge the set of helper functions from the set available to socket filters, we keep ABI compatibility. In future, we could place all bpf_prog_type_list at a central place, perhaps. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Signed-off-by: David S.
Miller --- include/uapi/linux/bpf.h | 1 + kernel/bpf/verifier.c | 15 +++++++++++++-- net/core/filter.c | 7 +++++++ 3 files changed, 21 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0248180bf2e2..3fa1af8a58d7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -118,6 +118,7 @@ enum bpf_map_type { enum bpf_prog_type { BPF_PROG_TYPE_UNSPEC, BPF_PROG_TYPE_SOCKET_FILTER, + BPF_PROG_TYPE_SCHED_CLS, }; #define BPF_PSEUDO_MAP_FD 1 diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a28e09c7825d..594d341f04db 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1172,6 +1172,17 @@ static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn) return 0; } +static bool may_access_skb(enum bpf_prog_type type) +{ + switch (type) { + case BPF_PROG_TYPE_SOCKET_FILTER: + case BPF_PROG_TYPE_SCHED_CLS: + return true; + default: + return false; + } +} + /* verify safety of LD_ABS|LD_IND instructions: * - they can only appear in the programs where ctx == skb * - since they are wrappers of function calls, they scratch R1-R5 registers, @@ -1194,8 +1205,8 @@ static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn) struct reg_state *reg; int i, err; - if (env->prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) { - verbose("BPF_LD_ABS|IND instructions are only allowed in socket filters\n"); + if (!may_access_skb(env->prog->aux->prog_type)) { + verbose("BPF_LD_ABS|IND instructions not allowed for this program type\n"); return -EINVAL; } diff --git a/net/core/filter.c b/net/core/filter.c index 741721233166..514d4082f326 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1166,9 +1166,16 @@ static struct bpf_prog_type_list sk_filter_type __read_mostly = { .type = BPF_PROG_TYPE_SOCKET_FILTER, }; +static struct bpf_prog_type_list sched_cls_type __read_mostly = { + .ops = &sk_filter_ops, + .type = BPF_PROG_TYPE_SCHED_CLS, +}; + static int __init register_sk_filter_ops(void) { bpf_register_prog_type(&sk_filter_type); + bpf_register_prog_type(&sched_cls_type); + return 0; } late_initcall(register_sk_filter_ops); -- cgit v1.2.3 From 24701ecea76b0b93bd9667486934ec310825f558 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 1 Mar 2015 12:31:47 +0100 Subject: ebpf: move read-only fields to bpf_prog and shrink bpf_prog_aux is_gpl_compatible and prog_type should be moved directly into bpf_prog as they stay immutable during bpf_prog's lifetime, are core attributes and they can be locked as read-only later on via bpf_prog_select_runtime(). With a bit of rearranging, this also allows us to shrink bpf_prog_aux to exactly 1 cacheline. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 4 +--- include/linux/filter.h | 4 +++- kernel/bpf/syscall.c | 7 +++---- kernel/bpf/verifier.c | 4 ++-- net/core/filter.c | 4 ++-- 5 files changed, 11 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9c458144cdb4..a1a7ff2df328 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -117,11 +117,9 @@ struct bpf_prog; struct bpf_prog_aux { atomic_t refcnt; - bool is_gpl_compatible; - enum bpf_prog_type prog_type; + u32 used_map_cnt; const struct bpf_verifier_ops *ops; struct bpf_map **used_maps; - u32 used_map_cnt; struct bpf_prog *prog; struct work_struct work; }; diff --git a/include/linux/filter.h b/include/linux/filter.h index 5e3863d5f666..9ee8c67ea249 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -308,9 +308,11 @@ struct bpf_binary_header { struct bpf_prog { u16 pages; /* Number of allocated pages */ bool jited; /* Is our filter JIT'ed? */ + bool gpl_compatible; /* Is our filter GPL compatible? */ u32 len; /* Number of filter blocks */ - struct sock_fprog_kern *orig_prog; /* Original BPF program */ + enum bpf_prog_type type; /* Type of BPF program */ struct bpf_prog_aux *aux; /* Auxiliary fields */ + struct sock_fprog_kern *orig_prog; /* Original BPF program */ unsigned int (*bpf_func)(const struct sk_buff *skb, const struct bpf_insn *filter); /* Instructions for interpreter */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 536edc2be307..0d69449acbd0 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -354,10 +354,11 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) list_for_each_entry(tl, &bpf_prog_types, list_node) { if (tl->type == type) { prog->aux->ops = tl->ops; - prog->aux->prog_type = type; + prog->type = type; return 0; } } + return -EINVAL; } @@ -508,7 +509,7 @@ static int bpf_prog_load(union bpf_attr *attr) prog->jited = false; atomic_set(&prog->aux->refcnt, 1); - prog->aux->is_gpl_compatible = is_gpl; + prog->gpl_compatible = is_gpl; /* find program type: socket_filter vs tracing_filter */ err = find_prog_type(type, prog); @@ -517,7 +518,6 @@ static int bpf_prog_load(union bpf_attr *attr) /* run eBPF verifier */ err = bpf_check(prog, attr); - if (err < 0) goto free_used_maps; @@ -528,7 +528,6 @@ static int bpf_prog_load(union bpf_attr *attr) bpf_prog_select_runtime(prog); err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC); - if (err < 0) /* failed to allocate fd */ goto free_used_maps; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 594d341f04db..bdf4192a889b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -852,7 +852,7 @@ static int check_call(struct verifier_env *env, int func_id) } /* eBPF programs must be GPL compatible to use GPL-ed functions */ - if (!env->prog->aux->is_gpl_compatible && fn->gpl_only) { + if (!env->prog->gpl_compatible && fn->gpl_only) { verbose("cannot call GPL only function from proprietary program\n"); return -EINVAL; } @@ -1205,7 +1205,7 @@ static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn) struct reg_state *reg; int i, err; - if (!may_access_skb(env->prog->aux->prog_type)) { + if (!may_access_skb(env->prog->type)) { verbose("BPF_LD_ABS|IND instructions not allowed for this program type\n"); return -EINVAL; } diff --git a/net/core/filter.c b/net/core/filter.c index 514d4082f326..ff000cb25e0a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -814,7 +814,7 @@ static void 
bpf_release_orig_filter(struct bpf_prog *fp) static void __bpf_prog_release(struct bpf_prog *prog) { - if (prog->aux->prog_type == BPF_PROG_TYPE_SOCKET_FILTER) { + if (prog->type == BPF_PROG_TYPE_SOCKET_FILTER) { bpf_prog_put(prog); } else { bpf_release_orig_filter(prog); @@ -1105,7 +1105,7 @@ int sk_attach_bpf(u32 ufd, struct sock *sk) if (IS_ERR(prog)) return PTR_ERR(prog); - if (prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) { + if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) { bpf_prog_put(prog); return -EINVAL; } -- cgit v1.2.3 From e2e9b6541dd4b31848079da80fe2253daaafb549 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 1 Mar 2015 12:31:48 +0100 Subject: cls_bpf: add initial eBPF support for programmable classifiers This work extends the "classic" BPF programmable tc classifier by extending its scope also to native eBPF code! This allows for user space to implement own custom, 'safe' C like classifiers (or whatever other frontend language LLVM et al may provide in future), that can then be compiled with the LLVM eBPF backend to an eBPF elf file. The result of this can be loaded into the kernel via iproute2's tc. In the kernel, they can be JITed on major archs and thus run in native performance. Simple, minimal toy example to demonstrate the workflow: #include #include #include #include "tc_bpf_api.h" __section("classify") int cls_main(struct sk_buff *skb) { return (0x800 << 16) | load_byte(skb, ETH_HLEN + __builtin_offsetof(struct iphdr, tos)); } char __license[] __section("license") = "GPL"; The classifier can then be compiled into eBPF opcodes and loaded via tc, for example: clang -O2 -emit-llvm -c cls.c -o - | llc -march=bpf -filetype=obj -o cls.o tc filter add dev em1 parent 1: bpf cls.o [...] As it has been demonstrated, the scope can even reach up to a fully fledged flow dissector (similarly as in samples/bpf/sockex2_kern.c). For tc, maps are allowed to be used, but from kernel context only, in other words, eBPF code can keep state across filter invocations. In future, we perhaps may reattach from a different application to those maps e.g., to read out collected statistics/state. Similarly as in socket filters, we may extend functionality for eBPF classifiers over time depending on the use cases. For that purpose, cls_bpf programs are using BPF_PROG_TYPE_SCHED_CLS program type, so we can allow additional functions/accessors (e.g. an ABI compatible offset translation to skb fields/metadata). For an initial cls_bpf support, we allow the same set of helper functions as eBPF socket filters, but we could diverge at some point in time w/o problem. I was wondering whether cls_bpf and act_bpf could share C programs, I can imagine that at some point, we introduce i) further common handlers for both (or even beyond their scope), and/or if truly needed ii) some restricted function space for each of them. Both can be abstracted easily through struct bpf_verifier_ops in future. The context of cls_bpf versus act_bpf is slightly different though: a cls_bpf program will return a specific classid whereas act_bpf a drop/non-drop return code, latter may also in future mangle skbs. That said, we can surely have a "classify" and "action" section in a single object file, or considered mentioned constraint add a possibility of a shared section. 
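As an illustrative sketch only (not part of this patch), a single LLVM-compiled object could in principle carry both entry points side by side, reusing the tc_bpf_api.h helpers and section conventions of the toy classifier above; the "action" section name, the act_main() symbol, and its drop/non-drop return convention are assumptions here:

  #include <linux/if_ether.h>
  #include <linux/ip.h>
  #include "tc_bpf_api.h"

  /* Classifier entry point: map the IPv4 TOS byte into a classid,
   * exactly as in the toy example above. */
  __section("classify")
  int cls_main(struct sk_buff *skb)
  {
  	return (0x800 << 16) |
  	       load_byte(skb, ETH_HLEN + __builtin_offsetof(struct iphdr, tos));
  }

  /* Hypothetical action entry point: an act_bpf-style decision where a
   * return value of 0 is assumed to mean "do not drop". */
  __section("action")
  int act_main(struct sk_buff *skb)
  {
  	return 0;
  }

  char __license[] __section("license") = "GPL";
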
The workflow for getting native eBPF running from tc [1] is as follows: for f_bpf, I've added a slightly modified ELF parser code from Alexei's kernel sample, which reads out the LLVM compiled object, sets up maps (and dynamically fixes up map fds) if any, and loads the eBPF instructions all centrally through the bpf syscall. The resulting fd from the loaded program itself is being passed down to cls_bpf, which looks up struct bpf_prog from the fd store, and holds reference, so that it stays available also after tc program lifetime. On tc filter destruction, it will then drop its reference. Moreover, I've also added the optional possibility to annotate an eBPF filter with a name (e.g. path to object file, or something else if preferred) so that when tc dumps currently installed filters, some more context can be given to an admin for a given instance (as opposed to just the file descriptor number). Last but not least, bpf_prog_get() and bpf_prog_put() needed to be exported, so that eBPF can be used from cls_bpf built as a module. Thanks to 60a3b2253c41 ("net: bpf: make eBPF interpreter images read-only") I think this is of no concern since anything wanting to alter eBPF opcode after verification stage would crash the kernel. [1] http://git.breakpoint.cc/cgit/dborkman/iproute2.git/log/?h=ebpf Signed-off-by: Daniel Borkmann Cc: Jamal Hadi Salim Cc: Jiri Pirko Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 2 + kernel/bpf/syscall.c | 2 + net/sched/cls_bpf.c | 206 ++++++++++++++++++++++++++++++++----------- 3 files changed, 158 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 25731dfb3fcc..bf08e76bf505 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -397,6 +397,8 @@ enum { TCA_BPF_CLASSID, TCA_BPF_OPS_LEN, TCA_BPF_OPS, + TCA_BPF_FD, + TCA_BPF_NAME, __TCA_BPF_MAX, }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0d69449acbd0..669719ccc9ee 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -419,6 +419,7 @@ void bpf_prog_put(struct bpf_prog *prog) bpf_prog_free(prog); } } +EXPORT_SYMBOL_GPL(bpf_prog_put); static int bpf_prog_release(struct inode *inode, struct file *filp) { @@ -466,6 +467,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd) fdput(f); return prog; } +EXPORT_SYMBOL_GPL(bpf_prog_get); /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD log_buf diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 5f3ee9e4b5bf..6f7ed8f8e6ee 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -16,6 +16,8 @@ #include #include #include +#include + #include #include #include @@ -24,6 +26,8 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Daniel Borkmann "); MODULE_DESCRIPTION("TC BPF based classifier"); +#define CLS_BPF_NAME_LEN 256 + struct cls_bpf_head { struct list_head plist; u32 hgen; @@ -32,18 +36,24 @@ struct cls_bpf_head { struct cls_bpf_prog { struct bpf_prog *filter; - struct sock_filter *bpf_ops; - struct tcf_exts exts; - struct tcf_result res; struct list_head link; + struct tcf_result res; + struct tcf_exts exts; u32 handle; - u16 bpf_num_ops; + union { + u32 bpf_fd; + u16 bpf_num_ops; + }; + struct sock_filter *bpf_ops; + const char *bpf_name; struct tcf_proto *tp; struct rcu_head rcu; }; static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = { [TCA_BPF_CLASSID] = { .type = NLA_U32 }, + [TCA_BPF_FD] = { .type = NLA_U32 }, + [TCA_BPF_NAME] = { .type = 
NLA_NUL_STRING, .len = CLS_BPF_NAME_LEN }, [TCA_BPF_OPS_LEN] = { .type = NLA_U16 }, [TCA_BPF_OPS] = { .type = NLA_BINARY, .len = sizeof(struct sock_filter) * BPF_MAXINSNS }, @@ -76,6 +86,11 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, return -1; } +static bool cls_bpf_is_ebpf(const struct cls_bpf_prog *prog) +{ + return !prog->bpf_ops; +} + static int cls_bpf_init(struct tcf_proto *tp) { struct cls_bpf_head *head; @@ -94,8 +109,12 @@ static void cls_bpf_delete_prog(struct tcf_proto *tp, struct cls_bpf_prog *prog) { tcf_exts_destroy(&prog->exts); - bpf_prog_destroy(prog->filter); + if (cls_bpf_is_ebpf(prog)) + bpf_prog_put(prog->filter); + else + bpf_prog_destroy(prog->filter); + kfree(prog->bpf_name); kfree(prog->bpf_ops); kfree(prog); } @@ -114,6 +133,7 @@ static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg) list_del_rcu(&prog->link); tcf_unbind_filter(tp, &prog->res); call_rcu(&prog->rcu, __cls_bpf_delete_prog); + return 0; } @@ -151,69 +171,121 @@ static unsigned long cls_bpf_get(struct tcf_proto *tp, u32 handle) return ret; } -static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, - struct cls_bpf_prog *prog, - unsigned long base, struct nlattr **tb, - struct nlattr *est, bool ovr) +static int cls_bpf_prog_from_ops(struct nlattr **tb, + struct cls_bpf_prog *prog, u32 classid) { struct sock_filter *bpf_ops; - struct tcf_exts exts; - struct sock_fprog_kern tmp; + struct sock_fprog_kern fprog_tmp; struct bpf_prog *fp; u16 bpf_size, bpf_num_ops; - u32 classid; int ret; - if (!tb[TCA_BPF_OPS_LEN] || !tb[TCA_BPF_OPS] || !tb[TCA_BPF_CLASSID]) - return -EINVAL; - - tcf_exts_init(&exts, TCA_BPF_ACT, TCA_BPF_POLICE); - ret = tcf_exts_validate(net, tp, tb, est, &exts, ovr); - if (ret < 0) - return ret; - - classid = nla_get_u32(tb[TCA_BPF_CLASSID]); bpf_num_ops = nla_get_u16(tb[TCA_BPF_OPS_LEN]); - if (bpf_num_ops > BPF_MAXINSNS || bpf_num_ops == 0) { - ret = -EINVAL; - goto errout; - } + if (bpf_num_ops > BPF_MAXINSNS || bpf_num_ops == 0) + return -EINVAL; bpf_size = bpf_num_ops * sizeof(*bpf_ops); - if (bpf_size != nla_len(tb[TCA_BPF_OPS])) { - ret = -EINVAL; - goto errout; - } + if (bpf_size != nla_len(tb[TCA_BPF_OPS])) + return -EINVAL; bpf_ops = kzalloc(bpf_size, GFP_KERNEL); - if (bpf_ops == NULL) { - ret = -ENOMEM; - goto errout; - } + if (bpf_ops == NULL) + return -ENOMEM; memcpy(bpf_ops, nla_data(tb[TCA_BPF_OPS]), bpf_size); - tmp.len = bpf_num_ops; - tmp.filter = bpf_ops; + fprog_tmp.len = bpf_num_ops; + fprog_tmp.filter = bpf_ops; - ret = bpf_prog_create(&fp, &tmp); - if (ret) - goto errout_free; + ret = bpf_prog_create(&fp, &fprog_tmp); + if (ret < 0) { + kfree(bpf_ops); + return ret; + } - prog->bpf_num_ops = bpf_num_ops; prog->bpf_ops = bpf_ops; + prog->bpf_num_ops = bpf_num_ops; + prog->bpf_name = NULL; + prog->filter = fp; prog->res.classid = classid; + return 0; +} + +static int cls_bpf_prog_from_efd(struct nlattr **tb, + struct cls_bpf_prog *prog, u32 classid) +{ + struct bpf_prog *fp; + char *name = NULL; + u32 bpf_fd; + + bpf_fd = nla_get_u32(tb[TCA_BPF_FD]); + + fp = bpf_prog_get(bpf_fd); + if (IS_ERR(fp)) + return PTR_ERR(fp); + + if (fp->type != BPF_PROG_TYPE_SCHED_CLS) { + bpf_prog_put(fp); + return -EINVAL; + } + + if (tb[TCA_BPF_NAME]) { + name = kmemdup(nla_data(tb[TCA_BPF_NAME]), + nla_len(tb[TCA_BPF_NAME]), + GFP_KERNEL); + if (!name) { + bpf_prog_put(fp); + return -ENOMEM; + } + } + + prog->bpf_ops = NULL; + prog->bpf_fd = bpf_fd; + prog->bpf_name = name; + + prog->filter = fp; + prog->res.classid = 
classid; + + return 0; +} + +static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, + struct cls_bpf_prog *prog, + unsigned long base, struct nlattr **tb, + struct nlattr *est, bool ovr) +{ + struct tcf_exts exts; + bool is_bpf, is_ebpf; + u32 classid; + int ret; + + is_bpf = tb[TCA_BPF_OPS_LEN] && tb[TCA_BPF_OPS]; + is_ebpf = tb[TCA_BPF_FD]; + + if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf) || + !tb[TCA_BPF_CLASSID]) + return -EINVAL; + + tcf_exts_init(&exts, TCA_BPF_ACT, TCA_BPF_POLICE); + ret = tcf_exts_validate(net, tp, tb, est, &exts, ovr); + if (ret < 0) + return ret; + + classid = nla_get_u32(tb[TCA_BPF_CLASSID]); + + ret = is_bpf ? cls_bpf_prog_from_ops(tb, prog, classid) : + cls_bpf_prog_from_efd(tb, prog, classid); + if (ret < 0) { + tcf_exts_destroy(&exts); + return ret; + } + tcf_bind_filter(tp, &prog->res, base); tcf_exts_change(tp, &prog->exts, &exts); return 0; -errout_free: - kfree(bpf_ops); -errout: - tcf_exts_destroy(&exts); - return ret; } static u32 cls_bpf_grab_new_handle(struct tcf_proto *tp, @@ -297,11 +369,43 @@ errout: return ret; } +static int cls_bpf_dump_bpf_info(const struct cls_bpf_prog *prog, + struct sk_buff *skb) +{ + struct nlattr *nla; + + if (nla_put_u16(skb, TCA_BPF_OPS_LEN, prog->bpf_num_ops)) + return -EMSGSIZE; + + nla = nla_reserve(skb, TCA_BPF_OPS, prog->bpf_num_ops * + sizeof(struct sock_filter)); + if (nla == NULL) + return -EMSGSIZE; + + memcpy(nla_data(nla), prog->bpf_ops, nla_len(nla)); + + return 0; +} + +static int cls_bpf_dump_ebpf_info(const struct cls_bpf_prog *prog, + struct sk_buff *skb) +{ + if (nla_put_u32(skb, TCA_BPF_FD, prog->bpf_fd)) + return -EMSGSIZE; + + if (prog->bpf_name && + nla_put_string(skb, TCA_BPF_NAME, prog->bpf_name)) + return -EMSGSIZE; + + return 0; +} + static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *tm) { struct cls_bpf_prog *prog = (struct cls_bpf_prog *) fh; - struct nlattr *nest, *nla; + struct nlattr *nest; + int ret; if (prog == NULL) return skb->len; @@ -314,16 +418,14 @@ static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, if (nla_put_u32(skb, TCA_BPF_CLASSID, prog->res.classid)) goto nla_put_failure; - if (nla_put_u16(skb, TCA_BPF_OPS_LEN, prog->bpf_num_ops)) - goto nla_put_failure; - nla = nla_reserve(skb, TCA_BPF_OPS, prog->bpf_num_ops * - sizeof(struct sock_filter)); - if (nla == NULL) + if (cls_bpf_is_ebpf(prog)) + ret = cls_bpf_dump_ebpf_info(prog, skb); + else + ret = cls_bpf_dump_bpf_info(prog, skb); + if (ret) goto nla_put_failure; - memcpy(nla_data(nla), prog->bpf_ops, nla_len(nla)); - if (tcf_exts_dump(skb, &prog->exts) < 0) goto nla_put_failure; -- cgit v1.2.3 From 32a158325acf12842764b1681f53903673f2f22e Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 27 Feb 2015 11:25:56 -0800 Subject: clockevents: export clockevents_unbind_device instead of clockevents_unbind It looks like clockevents_unbind is being exported by mistake as: - it is static; - it is not listed in include/linux/clockchips.h; - EXPORT_SYMBOL_GPL(clockevents_unbind) follows clockevents_unbind_device() implementation. I think clockevents_unbind_device should be exported instead. This is going to be used to teardown Hyper-V clockevent devices on module unload. Signed-off-by: Vitaly Kuznetsov Signed-off-by: K. Y. 
Srinivasan Signed-off-by: Greg Kroah-Hartman --- kernel/time/clockevents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 55449909f114..888ecc114ddc 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -371,7 +371,7 @@ int clockevents_unbind_device(struct clock_event_device *ced, int cpu) mutex_unlock(&clockevents_mutex); return ret; } -EXPORT_SYMBOL_GPL(clockevents_unbind); +EXPORT_SYMBOL_GPL(clockevents_unbind_device); /** * clockevents_register_device - register a clock event device -- cgit v1.2.3 From 2ed11312eb19506c027e7cac039994ad42a9cb2c Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 2 Mar 2015 02:14:26 -0500 Subject: Revert "perf: Remove the extra validity check on nr_pages" This reverts commit 74390aa55678 ("perf: Remove the extra validity check on nr_pages"). In perf_mmap(), nr_pages equals the number of pages minus 1, so nr_pages = 0 is valid. Both parts of the nr_pages != 0 && !is_power_of_2(nr_pages) check are therefore needed. Otherwise, for example, perf test 6 failed. # perf test 6 6: x86 rdpmc test :Error: mmap() syscall returned with (Invalid argument) FAILED! Signed-off-by: Kan Liang Cc: Andi Kleen Cc: Kaixu Xia Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1425280466-7830-1-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index af924bc38121..8bb20cc39a92 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4446,7 +4446,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) * If we have rb pages ensure they're a power-of-two number, so we * can do bitmasks instead of modulo. */ - if (!is_power_of_2(nr_pages)) + if (nr_pages != 0 && !is_power_of_2(nr_pages)) return -EINVAL; if (vma_size != PAGE_SIZE * (1 + nr_pages)) -- cgit v1.2.3 From 34404ca8fb252ccee662c4368c555ccf774acc3b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 19 Jan 2015 20:39:20 -0800 Subject: rcu: Move early-boot callbacks to no-CBs lists for no-CBs CPUs When a CPU is first determined to be a no-CBs CPU, this commit causes any early boot callbacks to be moved to the no-CBs callback list, allowing them to be invoked. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 + kernel/rcu/tree_plugin.h | 24 ++++++++++++------------ 2 files changed, 13 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 92fd3eab5823..0317bf7d997f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2851,6 +2851,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), * and then drop through to queue the callback. */ BUG_ON(cpu != -1); + WARN_ON_ONCE(!rcu_is_watching()); if (!likely(rdp->nxtlist)) init_default_callback_list(rdp); } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 75d5f096bcb0..afddd5641bea 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2393,18 +2393,8 @@ void __init rcu_init_nohz(void) pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); for_each_rcu_flavor(rsp) { - for_each_cpu(cpu, rcu_nocb_mask) { - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - - /* - * If there are early callbacks, they will need - * to be moved to the nocb lists. 
- */ - WARN_ON_ONCE(rdp->nxttail[RCU_NEXT_TAIL] != - &rdp->nxtlist && - rdp->nxttail[RCU_NEXT_TAIL] != NULL); - init_nocb_callback_list(rdp); - } + for_each_cpu(cpu, rcu_nocb_mask) + init_nocb_callback_list(per_cpu_ptr(rsp->rda, cpu)); rcu_organize_nocb_kthreads(rsp); } } @@ -2541,6 +2531,16 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) if (!rcu_is_nocb_cpu(rdp->cpu)) return false; + /* If there are early-boot callbacks, move them to nocb lists. */ + if (rdp->nxtlist) { + rdp->nocb_head = rdp->nxtlist; + rdp->nocb_tail = rdp->nxttail[RCU_NEXT_TAIL]; + atomic_long_set(&rdp->nocb_q_count, rdp->qlen); + atomic_long_set(&rdp->nocb_q_count_lazy, rdp->qlen_lazy); + rdp->nxtlist = NULL; + rdp->qlen = 0; + rdp->qlen_lazy = 0; + } rdp->nxttail[RCU_NEXT_TAIL] = NULL; return true; } -- cgit v1.2.3 From 476276781095c79580abe27a65988549ac7f5f89 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 19 Jan 2015 21:10:21 -0800 Subject: rcu: Move early boot callback tests earlier Because callbacks can now be posted quite early in boot, move the early boot callback tests to precede RCU initialization. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0317bf7d997f..c8e6569c5fbd 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3948,6 +3948,8 @@ void __init rcu_init(void) { int cpu; + rcu_early_boot_tests(); + rcu_bootup_announce(); rcu_init_geometry(); rcu_init_one(&rcu_bh_state, &rcu_bh_data); @@ -3964,8 +3966,6 @@ void __init rcu_init(void) pm_notifier(rcu_pm_notify, 0); for_each_online_cpu(cpu) rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); - - rcu_early_boot_tests(); } #include "tree_plugin.h" -- cgit v1.2.3 From 6629240575992a6f0d18c46f5160b34527b0e501 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 19 Jan 2015 19:16:38 -0800 Subject: rcu: Use IS_ENABLED() to CONFIG_RCU_FANOUT_EXACT #ifdef This commit uses IS_ENABLED() to remove the #ifdef from the rcu_init_levelspread() functions. No effect on executable code. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4e37c7fd9e29..35e1604f7e3e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3734,30 +3734,26 @@ void rcu_scheduler_starting(void) * Compute the per-level fanout, either using the exact fanout specified * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT. 
*/ -#ifdef CONFIG_RCU_FANOUT_EXACT static void __init rcu_init_levelspread(struct rcu_state *rsp) { int i; - rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; - for (i = rcu_num_lvls - 2; i >= 0; i--) - rsp->levelspread[i] = CONFIG_RCU_FANOUT; -} -#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ -static void __init rcu_init_levelspread(struct rcu_state *rsp) -{ - int ccur; - int cprv; - int i; - - cprv = nr_cpu_ids; - for (i = rcu_num_lvls - 1; i >= 0; i--) { - ccur = rsp->levelcnt[i]; - rsp->levelspread[i] = (cprv + ccur - 1) / ccur; - cprv = ccur; + if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT)) { + rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; + for (i = rcu_num_lvls - 2; i >= 0; i--) + rsp->levelspread[i] = CONFIG_RCU_FANOUT; + } else { + int ccur; + int cprv; + + cprv = nr_cpu_ids; + for (i = rcu_num_lvls - 1; i >= 0; i--) { + ccur = rsp->levelcnt[i]; + rsp->levelspread[i] = (cprv + ccur - 1) / ccur; + cprv = ccur; + } } } -#endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */ /* * Helper function for rcu_init() that initializes one rcu_state structure. -- cgit v1.2.3 From d24209bb689e2c7f7418faec9b4a948e922d24da Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 21 Jan 2015 15:26:03 -0800 Subject: rcu: Improve diagnostics for blocked critical sections in irq If an RCU read-side critical section occurs within an interrupt handler or a softirq handler, it cannot have been preempted. Therefore, there is a check in rcu_read_unlock_special() checking for this error. However, when this check triggers, it lacks diagnostic information. This commit therefore moves rcu_read_unlock()'s lockdep annotation to follow the call to __rcu_read_unlock() and changes rcu_read_unlock_special()'s WARN_ON_ONCE() to an lockdep_rcu_suspicious() in order to locate where the offending RCU read-side critical section began. In addition, the value of the ->rcu_read_unlock_special field is printed. Signed-off-by: Paul E. McKenney --- include/linux/lockdep.h | 7 ++++++- include/linux/rcupdate.h | 2 +- kernel/rcu/tree_plugin.h | 8 +++++++- 3 files changed, 14 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 74ab23176e9b..066ba4157541 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -531,8 +531,13 @@ do { \ # define might_lock_read(lock) do { } while (0) #endif -#ifdef CONFIG_PROVE_RCU +#ifdef CONFIG_LOCKDEP void lockdep_rcu_suspicious(const char *file, const int line, const char *s); +#else +static inline void +lockdep_rcu_suspicious(const char *file, const int line, const char *s) +{ +} #endif #endif /* __LINUX_LOCKDEP_H */ diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 3e6afed51051..70b896e16f19 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -942,9 +942,9 @@ static inline void rcu_read_unlock(void) { rcu_lockdep_assert(rcu_is_watching(), "rcu_read_unlock() used illegally while idle"); - rcu_lock_release(&rcu_lock_map); __release(RCU); __rcu_read_unlock(); + rcu_lock_release(&rcu_lock_map); /* Keep acq info for rls diags. */ } /** diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0a571e9a0f1d..8a33920b8845 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -334,7 +334,13 @@ void rcu_read_unlock_special(struct task_struct *t) } /* Hardware IRQ handlers cannot block, complain if they get here. 
*/ - if (WARN_ON_ONCE(in_irq() || in_serving_softirq())) { + if (in_irq() || in_serving_softirq()) { + lockdep_rcu_suspicious(__FILE__, __LINE__, + "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n"); + pr_alert("->rcu_read_unlock_special: %#x (b: %d, nq: %d)\n", + t->rcu_read_unlock_special.s, + t->rcu_read_unlock_special.b.blocked, + t->rcu_read_unlock_special.b.need_qs); local_irq_restore(flags); return; } -- cgit v1.2.3 From ab6f5bd6741af7b157275de299b7b2b96f2df40e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 21 Jan 2015 16:58:06 -0800 Subject: rcu: Use IS_ENABLED() to simplify rcu_bootup_announce_oddness() This commit gets rid of some inline #ifdefs by replacing them with IS_ENABLED. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 48 ++++++++++++++++++++---------------------------- 1 file changed, 20 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 8a33920b8845..81c4d91fa18a 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -58,38 +58,30 @@ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */ */ static void __init rcu_bootup_announce_oddness(void) { -#ifdef CONFIG_RCU_TRACE - pr_info("\tRCU debugfs-based tracing is enabled.\n"); -#endif -#if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32) - pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n", - CONFIG_RCU_FANOUT); -#endif -#ifdef CONFIG_RCU_FANOUT_EXACT - pr_info("\tHierarchical RCU autobalancing is disabled.\n"); -#endif -#ifdef CONFIG_RCU_FAST_NO_HZ - pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); -#endif -#ifdef CONFIG_PROVE_RCU - pr_info("\tRCU lockdep checking is enabled.\n"); -#endif -#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE - pr_info("\tRCU torture testing starts during boot.\n"); -#endif -#if defined(CONFIG_RCU_CPU_STALL_INFO) - pr_info("\tAdditional per-CPU info printed with stalls.\n"); -#endif -#if NUM_RCU_LVL_4 != 0 - pr_info("\tFour-level hierarchy is enabled.\n"); -#endif + if (IS_ENABLED(CONFIG_RCU_TRACE)) + pr_info("\tRCU debugfs-based tracing is enabled.\n"); + if ((IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || + (!IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32)) + pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n", + CONFIG_RCU_FANOUT); + if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT)) + pr_info("\tHierarchical RCU autobalancing is disabled.\n"); + if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ)) + pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); + if (IS_ENABLED(CONFIG_PROVE_RCU)) + pr_info("\tRCU lockdep checking is enabled.\n"); + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_RUNNABLE)) + pr_info("\tRCU torture testing starts during boot.\n"); + if (IS_ENABLED(CONFIG_RCU_CPU_STALL_INFO)) + pr_info("\tAdditional per-CPU info printed with stalls.\n"); + if (NUM_RCU_LVL_4 != 0) + pr_info("\tFour-level hierarchy is enabled.\n"); if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); if (nr_cpu_ids != NR_CPUS) pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); -#ifdef CONFIG_RCU_BOOST - pr_info("\tRCU kthread priority: %d.\n", kthread_prio); -#endif + if (IS_ENABLED(CONFIG_RCU_BOOST)) + pr_info("\tRCU kthread priority: %d.\n", kthread_prio); } #ifdef CONFIG_PREEMPT_RCU -- cgit v1.2.3 From 
a3bd2c09adcc80946262fd15e63868de1f0f4963 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 21 Jan 2015 20:58:57 -0800 Subject: rcu: Add boot-up check for non-default CONFIG_RCU_FANOUT_LEAF values Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 81c4d91fa18a..c9225350d3ed 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -76,6 +76,9 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tAdditional per-CPU info printed with stalls.\n"); if (NUM_RCU_LVL_4 != 0) pr_info("\tFour-level hierarchy is enabled.\n"); + if (CONFIG_RCU_FANOUT_LEAF != 16) + pr_info("\tBuild-time adjustment of leaf fanout to %d.\n", + CONFIG_RCU_FANOUT_LEAF); if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); if (nr_cpu_ids != NR_CPUS) -- cgit v1.2.3 From e7580f33889299e484a80f42c20611ead42f199e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 24 Feb 2015 14:23:39 -0800 Subject: rcu: Get rcu_sched_force_quiescent_state() where it belongs The very similar functions rcu_force_quiescent_state(), rcu_bh_force_quiescent_state(), and rcu_sched_force_quiescent_state() are supposed to be together, but have drifted apart. This commit restores rcu_sched_force_quiescent_state() to its rightful place. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 35e1604f7e3e..fbe9dd9ced54 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -409,6 +409,15 @@ void rcu_bh_force_quiescent_state(void) } EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); +/* + * Force a quiescent state for RCU-sched. + */ +void rcu_sched_force_quiescent_state(void) +{ + force_quiescent_state(&rcu_sched_state); +} +EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); + /* * Show the state of the grace-period kthreads. */ @@ -482,15 +491,6 @@ void rcutorture_record_progress(unsigned long vernum) } EXPORT_SYMBOL_GPL(rcutorture_record_progress); -/* - * Force a quiescent state for RCU-sched. - */ -void rcu_sched_force_quiescent_state(void) -{ - force_quiescent_state(&rcu_sched_state); -} -EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); - /* * Does the CPU have callbacks ready to be invoked? */ -- cgit v1.2.3 From 9910affa89fe0895153880b115ec243636e70af3 Mon Sep 17 00:00:00 2001 From: Yao Dongdong Date: Wed, 25 Feb 2015 17:09:46 +0800 Subject: rcu: Remove redundant check of cpu_online() Because invoke_cpu_core() checks whether the current CPU is online, there is no need for __call_rcu_core() to redundantly check it. There should not be any performance degradation because the called function is visible to the compiler. This commit therefore removes the redundant check. Signed-off-by: Yao Dongdong Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index fbe9dd9ced54..23194a77a768 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2741,7 +2741,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, * If called from an extended quiescent state, invoke the RCU * core in order to force a re-evaluation of RCU's idleness. 
*/ - if (!rcu_is_watching() && cpu_online(smp_processor_id())) + if (!rcu_is_watching()) invoke_rcu_core(); /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ -- cgit v1.2.3 From 2e3ac940f2754d7dc616aba1643a668954fe892f Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 3 Mar 2015 17:02:21 -0600 Subject: livepatch: remove unnecessary call to klp_find_object_module() klp_find_object_module() is called from both the klp register and enable paths. Only the call from the register path is necessary because the module notifier will let us know if the patched module gets loaded or unloaded. Signed-off-by: Josh Poimboeuf Reviewed-by: Petr Mladek Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 26df09d56f7c..d03d6134e824 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -511,8 +511,6 @@ static int __klp_enable_patch(struct klp_patch *patch) pr_notice("enabling patch '%s'\n", patch->mod->name); for (obj = patch->objs; obj->funcs; obj++) { - klp_find_object_module(obj); - if (!klp_is_object_loaded(obj)) continue; -- cgit v1.2.3 From 3ba67dabaa58e3223325f0a813a6e830fb5f5cc5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 5 Mar 2015 23:27:51 +0100 Subject: ebpf: bpf_map_*: fix linker error on avr32 and openrisc arch Fengguang reported, that on openrisc and avr32 architectures, we get the following linker errors on *_defconfig builds that have no bpf syscall support: net/built-in.o:(.rodata+0x1cd0): undefined reference to `bpf_map_lookup_elem_proto' net/built-in.o:(.rodata+0x1cd4): undefined reference to `bpf_map_update_elem_proto' net/built-in.o:(.rodata+0x1cd8): undefined reference to `bpf_map_delete_elem_proto' Fix it up by providing built-in weak definitions of the symbols, so they can be overridden when the syscall is enabled. I think the issue might be that gcc is not able to optimize all that away. This patch fixes the linker errors for me, tested with Fengguang's make.cross [1] script. [1] https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross Reported-by: Fengguang Wu Fixes: d4052c4aea0c ("ebpf: remove CONFIG_BPF_SYSCALL ifdefs in socket filter code") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/core.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index a64e7a207d2b..50603aec766a 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -656,6 +656,11 @@ void bpf_prog_free(struct bpf_prog *fp) } EXPORT_SYMBOL_GPL(bpf_prog_free); +/* Weak definitions of helper functions in case we don't have bpf syscall. */ +const struct bpf_func_proto bpf_map_lookup_elem_proto __weak; +const struct bpf_func_proto bpf_map_update_elem_proto __weak; +const struct bpf_func_proto bpf_map_delete_elem_proto __weak; + /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call * skb_copy_bits(), so provide a weak definition of it for NET-less config. */ -- cgit v1.2.3 From f427c990e2d0fd30336b0c252aa7e38e4cffdea2 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Sun, 1 Mar 2015 11:05:47 -0500 Subject: console: Preserve index after console setup() Before register_console() calls the setup() method of the matched console, the registering console index is already equal to the index from the console command line; ie. newcon->index == c->index. 
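As a schematic of the ordering that statement relies on (a simplified sketch, not the verbatim register_console() code; the loop bounds and a couple of field accesses are assumptions based on the surrounding context), the command-line index is copied onto the console while matching, before setup() is ever called:

    for (i = 0, c = console_cmdline; i < MAX_CMDLINECONSOLES && c->name[0]; i++, c++) {
            if (strcmp(c->name, newcon->name) != 0)
                    continue;
            if (newcon->index < 0)
                    newcon->index = c->index;   /* index settled here, before setup() */
            if (newcon->setup &&
                newcon->setup(newcon, c->options) != 0)
                    break;
            newcon->flags |= CON_ENABLED;
            /* The assignment removed by this patch re-stored c->index at this
             * point, which is redundant (newcon->index == c->index already)
             * and would clobber an index chosen by a console-defined match(). */
            break;
    }
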
This change is also required to support extensible console matching; (the command line index may have no relation to the console index assigned by the console-defined match() function). Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- kernel/printk/printk.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 01cfd69c54c6..d261a7e5f51a 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2479,7 +2479,6 @@ void register_console(struct console *newcon) newcon->setup(newcon, console_cmdline[i].options) != 0) break; newcon->flags |= CON_ENABLED; - newcon->index = c->index; if (i == selected_console) { newcon->flags |= CON_CONSDEV; preferred_console = selected_console; -- cgit v1.2.3 From c467ea763fd5d8795b7d1b5a78eb94b6ad8f66ad Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 4 Mar 2015 18:06:33 +0100 Subject: context_tracking: Rename context symbols to prepare for transition state Current context tracking symbols are designed to express living state. As such they are prefixed with "IN_": IN_USER, IN_KERNEL. Now we are going to use these symbols to also express state transitions such as context_tracking_enter(IN_USER) or context_tracking_exit(IN_USER). But while the "IN_" prefix works well to express entering a context, it's confusing to depict a context exit: context_tracking_exit(IN_USER) could mean two things: 1) We are exiting the current context to enter user context. 2) We are exiting the user context We want 2) but the reviewer may be confused and understand 1) So lets disambiguate these symbols and rename them to CONTEXT_USER and CONTEXT_KERNEL. Acked-by: Rik van Riel Cc: Paul E. McKenney Cc: Andy Lutomirski Cc: Will deacon Cc: Marcelo Tosatti Cc: Christian Borntraeger Cc: Luiz Capitulino Cc: Paolo Bonzini Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/traps.c | 2 +- include/linux/context_tracking.h | 2 +- include/linux/context_tracking_state.h | 6 +++--- kernel/context_tracking.c | 8 ++++---- kernel/sched/core.c | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 9d2073e2ecc9..756f74eed35d 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -123,7 +123,7 @@ enum ctx_state ist_enter(struct pt_regs *regs) * but we need to notify RCU. */ rcu_nmi_enter(); - prev_state = IN_KERNEL; /* the value is irrelevant. */ + prev_state = CONTEXT_KERNEL; /* the value is irrelevant. 
*/ } /* diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index 37b81bd51ec0..427b056dfd3d 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -43,7 +43,7 @@ static inline enum ctx_state exception_enter(void) static inline void exception_exit(enum ctx_state prev_ctx) { if (context_tracking_is_enabled()) { - if (prev_ctx == IN_USER) + if (prev_ctx == CONTEXT_USER) context_tracking_user_enter(); } } diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h index 97a81225d037..8076f875c324 100644 --- a/include/linux/context_tracking_state.h +++ b/include/linux/context_tracking_state.h @@ -13,8 +13,8 @@ struct context_tracking { */ bool active; enum ctx_state { - IN_KERNEL = 0, - IN_USER, + CONTEXT_KERNEL = 0, + CONTEXT_USER, } state; }; @@ -34,7 +34,7 @@ static inline bool context_tracking_cpu_is_enabled(void) static inline bool context_tracking_in_user(void) { - return __this_cpu_read(context_tracking.state) == IN_USER; + return __this_cpu_read(context_tracking.state) == CONTEXT_USER; } #else static inline bool context_tracking_in_user(void) { return false; } diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 937ecdfdf258..8ad53c9d38b6 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -75,7 +75,7 @@ void context_tracking_user_enter(void) WARN_ON_ONCE(!current->mm); local_irq_save(flags); - if ( __this_cpu_read(context_tracking.state) != IN_USER) { + if ( __this_cpu_read(context_tracking.state) != CONTEXT_USER) { if (__this_cpu_read(context_tracking.active)) { trace_user_enter(0); /* @@ -101,7 +101,7 @@ void context_tracking_user_enter(void) * OTOH we can spare the calls to vtime and RCU when context_tracking.active * is false because we know that CPU is not tickless. */ - __this_cpu_write(context_tracking.state, IN_USER); + __this_cpu_write(context_tracking.state, CONTEXT_USER); } local_irq_restore(flags); } @@ -129,7 +129,7 @@ void context_tracking_user_exit(void) return; local_irq_save(flags); - if (__this_cpu_read(context_tracking.state) == IN_USER) { + if (__this_cpu_read(context_tracking.state) == CONTEXT_USER) { if (__this_cpu_read(context_tracking.active)) { /* * We are going to run code that may use RCU. Inform @@ -139,7 +139,7 @@ void context_tracking_user_exit(void) vtime_user_exit(current); trace_user_exit(0); } - __this_cpu_write(context_tracking.state, IN_KERNEL); + __this_cpu_write(context_tracking.state, CONTEXT_KERNEL); } local_irq_restore(flags); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f0f831e8a345..06b9a00871e0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2818,7 +2818,7 @@ asmlinkage __visible void __sched schedule_user(void) * we find a better solution. * * NB: There are buggy callers of this function. Ideally we - * should warn if prev_state != IN_USER, but that will trigger + * should warn if prev_state != CONTEXT_USER, but that will trigger * too frequently to make sense yet. */ enum ctx_state prev_state = exception_enter(); -- cgit v1.2.3 From 3aab4f50bff89bdea5066a05d4f3c5fa25bc37c7 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 10 Feb 2015 15:27:50 -0500 Subject: context_tracking: Generalize context tracking APIs to support user and guest Generalize the context tracking APIs to support various nature of contexts. 
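As a quick orientation before the details below, a minimal usage sketch of the generalized API follows; handle_some_exception() is a hypothetical caller invented for this note, while exception_enter()/exception_exit() and the CONTEXT_* states are the ones shown in the diff further down:

    /* Hypothetical caller, for illustration only. */
    static void handle_some_exception(void)
    {
            enum ctx_state prev_ctx;

            prev_ctx = exception_enter();   /* context_tracking_exit(prev_ctx) unless
                                             * we were already in CONTEXT_KERNEL */

            /* ... kernel-side handling, free to use RCU ... */

            exception_exit(prev_ctx);       /* context_tracking_enter(prev_ctx) restores
                                             * the saved non-kernel state: user today,
                                             * guest once such a state is added */
    }
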
This is performed by splitting out the mechanism from context_tracking_user_enter and context_tracking_user_exit into context_tracking_enter and context_tracking_exit. The nature of the context we track is now detailed in a ctx_state parameter pushed to these APIs, allowing the same functions to not just track kernel <> user space switching, but also kernel <> guest transitions. But leave the old functions in order to avoid breaking ARM, which calls these functions from assembler code, and cannot easily use C enum parameters. Reviewed-by: Paul E. McKenney Signed-off-by: Rik van Riel Cc: Paul E. McKenney Cc: Andy Lutomirski Cc: Will deacon Cc: Marcelo Tosatti Cc: Christian Borntraeger Cc: Luiz Capitulino Cc: Paolo Bonzini Signed-off-by: Frederic Weisbecker --- include/linux/context_tracking.h | 9 ++++++--- kernel/context_tracking.c | 43 ++++++++++++++++++++++++++-------------- 2 files changed, 34 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index 427b056dfd3d..7f1810a3b5a4 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -10,6 +10,8 @@ #ifdef CONFIG_CONTEXT_TRACKING extern void context_tracking_cpu_set(int cpu); +extern void context_tracking_enter(enum ctx_state state); +extern void context_tracking_exit(enum ctx_state state); extern void context_tracking_user_enter(void); extern void context_tracking_user_exit(void); extern void __context_tracking_task_switch(struct task_struct *prev, @@ -35,7 +37,8 @@ static inline enum ctx_state exception_enter(void) return 0; prev_ctx = this_cpu_read(context_tracking.state); - context_tracking_user_exit(); + if (prev_ctx != CONTEXT_KERNEL) + context_tracking_exit(prev_ctx); return prev_ctx; } @@ -43,8 +46,8 @@ static inline enum ctx_state exception_enter(void) static inline void exception_exit(enum ctx_state prev_ctx) { if (context_tracking_is_enabled()) { - if (prev_ctx == CONTEXT_USER) - context_tracking_user_enter(); + if (prev_ctx != CONTEXT_KERNEL) + context_tracking_enter(prev_ctx); } } diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 8ad53c9d38b6..17715d811b71 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -39,15 +39,15 @@ void context_tracking_cpu_set(int cpu) } /** - * context_tracking_user_enter - Inform the context tracking that the CPU is going to - * enter userspace mode. + * context_tracking_enter - Inform the context tracking that the CPU is going + * enter user or guest space mode. * * This function must be called right before we switch from the kernel - * to userspace, when it's guaranteed the remaining kernel instructions - * to execute won't use any RCU read side critical section because this - * function sets RCU in extended quiescent state. + * to user or guest space, when it's guaranteed the remaining kernel + * instructions to execute won't use any RCU read side critical section + * because this function sets RCU in extended quiescent state. 
*/ -void context_tracking_user_enter(void) +void context_tracking_enter(enum ctx_state state) { unsigned long flags; @@ -75,7 +75,7 @@ void context_tracking_user_enter(void) WARN_ON_ONCE(!current->mm); local_irq_save(flags); - if ( __this_cpu_read(context_tracking.state) != CONTEXT_USER) { + if ( __this_cpu_read(context_tracking.state) != state) { if (__this_cpu_read(context_tracking.active)) { trace_user_enter(0); /* @@ -101,24 +101,31 @@ void context_tracking_user_enter(void) * OTOH we can spare the calls to vtime and RCU when context_tracking.active * is false because we know that CPU is not tickless. */ - __this_cpu_write(context_tracking.state, CONTEXT_USER); + __this_cpu_write(context_tracking.state, state); } local_irq_restore(flags); } +NOKPROBE_SYMBOL(context_tracking_enter); + +void context_tracking_user_enter(void) +{ + context_tracking_enter(CONTEXT_USER); +} NOKPROBE_SYMBOL(context_tracking_user_enter); /** - * context_tracking_user_exit - Inform the context tracking that the CPU is - * exiting userspace mode and entering the kernel. + * context_tracking_exit - Inform the context tracking that the CPU is + * exiting user or guest mode and entering the kernel. * - * This function must be called after we entered the kernel from userspace - * before any use of RCU read side critical section. This potentially include - * any high level kernel code like syscalls, exceptions, signal handling, etc... + * This function must be called after we entered the kernel from user or + * guest space before any use of RCU read side critical section. This + * potentially include any high level kernel code like syscalls, exceptions, + * signal handling, etc... * * This call supports re-entrancy. This way it can be called from any exception * handler without needing to know if we came from userspace or not. */ -void context_tracking_user_exit(void) +void context_tracking_exit(enum ctx_state state) { unsigned long flags; @@ -129,7 +136,7 @@ void context_tracking_user_exit(void) return; local_irq_save(flags); - if (__this_cpu_read(context_tracking.state) == CONTEXT_USER) { + if (__this_cpu_read(context_tracking.state) == state) { if (__this_cpu_read(context_tracking.active)) { /* * We are going to run code that may use RCU. Inform @@ -143,6 +150,12 @@ void context_tracking_user_exit(void) } local_irq_restore(flags); } +NOKPROBE_SYMBOL(context_tracking_exit); + +void context_tracking_user_exit(void) +{ + context_tracking_exit(CONTEXT_USER); +} NOKPROBE_SYMBOL(context_tracking_user_exit); /** -- cgit v1.2.3 From 19fdd98b6253404c6bdd6927bde9f962729376f7 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 10 Feb 2015 15:27:52 -0500 Subject: context_tracking: Run vtime_user_enter/exit only when state == CONTEXT_USER Only run vtime_user_enter, vtime_user_exit, and the user enter & exit trace points when we are entering or exiting user state, respectively. The KVM code in guest_enter and guest_exit already take care of calling vtime_guest_enter and vtime_guest_exit, respectively. The RCU code only distinguishes between "idle" and "not idle or kernel". There should be no need to add an additional (unused) state there. Reviewed-by: Paul E. McKenney Signed-off-by: Rik van Riel Cc: Paul E. 
McKenney Cc: Andy Lutomirski Cc: Will deacon Cc: Marcelo Tosatti Cc: Christian Borntraeger Cc: Luiz Capitulino Cc: Paolo Bonzini Signed-off-by: Frederic Weisbecker --- kernel/context_tracking.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 17715d811b71..a2c0866384e8 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -77,7 +77,6 @@ void context_tracking_enter(enum ctx_state state) local_irq_save(flags); if ( __this_cpu_read(context_tracking.state) != state) { if (__this_cpu_read(context_tracking.active)) { - trace_user_enter(0); /* * At this stage, only low level arch entry code remains and * then we'll run in userspace. We can assume there won't be @@ -85,7 +84,10 @@ void context_tracking_enter(enum ctx_state state) * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency * on the tick. */ - vtime_user_enter(current); + if (state == CONTEXT_USER) { + trace_user_enter(0); + vtime_user_enter(current); + } rcu_user_enter(); } /* @@ -143,8 +145,10 @@ void context_tracking_exit(enum ctx_state state) * RCU core about that (ie: we may need the tick again). */ rcu_user_exit(); - vtime_user_exit(current); - trace_user_exit(0); + if (state == CONTEXT_USER) { + vtime_user_exit(current); + trace_user_exit(0); + } } __this_cpu_write(context_tracking.state, CONTEXT_KERNEL); } -- cgit v1.2.3 From efc1e2c9bcbab73797d7bc214014cb916d6a8eb5 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 10 Feb 2015 15:27:53 -0500 Subject: context_tracking: Export context_tracking_user_enter/exit Export context_tracking_user_enter/exit so it can be used by KVM. Reviewed-by: Paul E. McKenney Signed-off-by: Rik van Riel Cc: Paul E. McKenney Cc: Andy Lutomirski Cc: Will deacon Cc: Marcelo Tosatti Cc: Christian Borntraeger Cc: Luiz Capitulino Cc: Paolo Bonzini Signed-off-by: Frederic Weisbecker --- kernel/context_tracking.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index a2c0866384e8..72d59a1a6eb6 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -108,6 +108,7 @@ void context_tracking_enter(enum ctx_state state) local_irq_restore(flags); } NOKPROBE_SYMBOL(context_tracking_enter); +EXPORT_SYMBOL_GPL(context_tracking_enter); void context_tracking_user_enter(void) { @@ -155,6 +156,7 @@ void context_tracking_exit(enum ctx_state state) local_irq_restore(flags); } NOKPROBE_SYMBOL(context_tracking_exit); +EXPORT_SYMBOL_GPL(context_tracking_exit); void context_tracking_user_exit(void) { -- cgit v1.2.3 From 8038dad7e888581266c76df15d70ca457a3c5910 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 25 Feb 2015 10:34:39 -0800 Subject: smpboot: Add common code for notification from dying CPU RCU ignores offlined CPUs, so they cannot safely run RCU read-side code. (They -can- use SRCU, but not RCU.) This means that any use of RCU during or after the call to arch_cpu_idle_dead(). Unfortunately, commit 2ed53c0d6cc99 added a complete() call, which will contain RCU read-side critical sections if there is a task waiting to be awakened. Which, as it turns out, there almost never is. In my qemu/KVM testing, the to-be-awakened task is not yet asleep more than 99.5% of the time. 
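The replacement polling loop with gentle exponential backoff that this message arrives at further below can be sketched roughly as follows (schematic only; cpu_is_dead() is a stand-in for the per-CPU state check, and the real implementation is cpu_wait_death() in the diff):

    int jf_left = 5 * HZ;       /* five-second timeout, as before */
    int sleep_jf = 1;           /* start by waiting a single jiffy */

    while (!cpu_is_dead(cpu)) {
            schedule_timeout_uninterruptible(sleep_jf);
            jf_left -= sleep_jf;
            if (jf_left <= 0)
                    break;                              /* timed out */
            sleep_jf = DIV_ROUND_UP(sleep_jf * 11, 10); /* grow by 11/10ths, rounding up */
    }
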
In current mainline, failure is even harder to reproduce, requiring a virtualized environment that delays the outgoing CPU by at least three jiffies between the time it exits its stop_machine() task at CPU_DYING time and the time it calls arch_cpu_idle_dead() from the idle loop. However, this problem really can occur, especially in virtualized environments, and therefore really does need to be fixed This suggests moving back to the polling loop, but using a much shorter wait, with gentle exponential backoff instead of the old 100-millisecond wait. Most of the time, the loop will exit without waiting at all, and almost all of the remaining uses will wait only five microseconds. If the outgoing CPU is preempted, a loop will wait one jiffy, then increase the wait by a factor of 11/10ths, rounding up. As before, there is a five-second timeout. This commit therefore provides common-code infrastructure to do the dying-to-surviving CPU handoff in a safe manner. This code also provides an indication at CPU-online of whether the CPU to be onlined previously timed out on offline. The new cpu_check_up_prepare() function returns -EBUSY if this CPU previously took more than five seconds to go offline, or -EAGAIN if it has not yet managed to go offline. The rationale for -EAGAIN is that it might still be preempted, so an additional wait might well find it correctly offlined. Architecture-specific code can decide how to handle these conditions. Systems in which CPUs take themselves completely offline might respond to an -EBUSY return as if it was a zero (success) return. Systems in which the surviving CPU must take some action might take it at this time, or might simply mark the other CPU as unusable. Note that architectures that take the easy way out and simply pass the -EBUSY and -EAGAIN upwards will change the sysfs API. Signed-off-by: Paul E. McKenney Cc: Cc: [ paulmck: Fixed state machine for architectures that don't check earlier CPU-hotplug results as suggested by James Hogan. ] --- include/linux/cpu.h | 12 ++++ kernel/smpboot.c | 156 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 168 insertions(+) (limited to 'kernel') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 4260e8594bd7..4744ef915acd 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -95,6 +95,8 @@ enum { * Called on the new cpu, just before * enabling interrupts. Must not sleep, * must not fail */ +#define CPU_BROKEN 0x000C /* CPU (unsigned)v did not die properly, + * perhaps due to preemption. 
*/ /* Used for CPU hotplug events occurring while tasks are frozen due to a suspend * operation in progress @@ -271,4 +273,14 @@ void arch_cpu_idle_enter(void); void arch_cpu_idle_exit(void); void arch_cpu_idle_dead(void); +DECLARE_PER_CPU(bool, cpu_dead_idle); + +int cpu_report_state(int cpu); +int cpu_check_up_prepare(int cpu); +void cpu_set_state_online(int cpu); +#ifdef CONFIG_HOTPLUG_CPU +bool cpu_wait_death(unsigned int cpu, int seconds); +bool cpu_report_death(void); +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + #endif /* _LINUX_CPU_H_ */ diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 40190f28db35..c697f73d82d6 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -314,3 +315,158 @@ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread) put_online_cpus(); } EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); + +static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD); + +/* + * Called to poll specified CPU's state, for example, when waiting for + * a CPU to come online. + */ +int cpu_report_state(int cpu) +{ + return atomic_read(&per_cpu(cpu_hotplug_state, cpu)); +} + +/* + * If CPU has died properly, set its state to CPU_UP_PREPARE and + * return success. Otherwise, return -EBUSY if the CPU died after + * cpu_wait_death() timed out. And yet otherwise again, return -EAGAIN + * if cpu_wait_death() timed out and the CPU still hasn't gotten around + * to dying. In the latter two cases, the CPU might not be set up + * properly, but it is up to the arch-specific code to decide. + * Finally, -EIO indicates an unanticipated problem. + * + * Note that it is permissible to omit this call entirely, as is + * done in architectures that do no CPU-hotplug error checking. + */ +int cpu_check_up_prepare(int cpu) +{ + if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) { + atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_UP_PREPARE); + return 0; + } + + switch (atomic_read(&per_cpu(cpu_hotplug_state, cpu))) { + + case CPU_POST_DEAD: + + /* The CPU died properly, so just start it up again. */ + atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_UP_PREPARE); + return 0; + + case CPU_DEAD_FROZEN: + + /* + * Timeout during CPU death, so let caller know. + * The outgoing CPU completed its processing, but after + * cpu_wait_death() timed out and reported the error. The + * caller is free to proceed, in which case the state + * will be reset properly by cpu_set_state_online(). + * Proceeding despite this -EBUSY return makes sense + * for systems where the outgoing CPUs take themselves + * offline, with no post-death manipulation required from + * a surviving CPU. + */ + return -EBUSY; + + case CPU_BROKEN: + + /* + * The most likely reason we got here is that there was + * a timeout during CPU death, and the outgoing CPU never + * did complete its processing. This could happen on + * a virtualized system if the outgoing VCPU gets preempted + * for more than five seconds, and the user attempts to + * immediately online that same CPU. Trying again later + * might return -EBUSY above, hence -EAGAIN. + */ + return -EAGAIN; + + default: + + /* Should not happen. Famous last words. */ + return -EIO; + } +} + +/* + * Mark the specified CPU online. + * + * Note that it is permissible to omit this call entirely, as is + * done in architectures that do no CPU-hotplug error checking. 
+ */ +void cpu_set_state_online(int cpu) +{ + (void)atomic_xchg(&per_cpu(cpu_hotplug_state, cpu), CPU_ONLINE); +} + +#ifdef CONFIG_HOTPLUG_CPU + +/* + * Wait for the specified CPU to exit the idle loop and die. + */ +bool cpu_wait_death(unsigned int cpu, int seconds) +{ + int jf_left = seconds * HZ; + int oldstate; + bool ret = true; + int sleep_jf = 1; + + might_sleep(); + + /* The outgoing CPU will normally get done quite quickly. */ + if (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) == CPU_DEAD) + goto update_state; + udelay(5); + + /* But if the outgoing CPU dawdles, wait increasingly long times. */ + while (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) != CPU_DEAD) { + schedule_timeout_uninterruptible(sleep_jf); + jf_left -= sleep_jf; + if (jf_left <= 0) + break; + sleep_jf = DIV_ROUND_UP(sleep_jf * 11, 10); + } +update_state: + oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu)); + if (oldstate == CPU_DEAD) { + /* Outgoing CPU died normally, update state. */ + smp_mb(); /* atomic_read() before update. */ + atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_POST_DEAD); + } else { + /* Outgoing CPU still hasn't died, set state accordingly. */ + if (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu), + oldstate, CPU_BROKEN) != oldstate) + goto update_state; + ret = false; + } + return ret; +} + +/* + * Called by the outgoing CPU to report its successful death. Return + * false if this report follows the surviving CPU's timing out. + * + * A separate "CPU_DEAD_FROZEN" is used when the surviving CPU + * timed out. This approach allows architectures to omit calls to + * cpu_check_up_prepare() and cpu_set_state_online() without defeating + * the next cpu_wait_death()'s polling loop. + */ +bool cpu_report_death(void) +{ + int oldstate; + int newstate; + int cpu = smp_processor_id(); + + do { + oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu)); + if (oldstate != CPU_BROKEN) + newstate = CPU_DEAD; + else + newstate = CPU_DEAD_FROZEN; + } while (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu), + oldstate, newstate) != oldstate); + return newstate == CPU_DEAD; +} + +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ -- cgit v1.2.3 From b33078b6098148c3efdacc907249a247c9d5491e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 16 Jan 2015 14:01:21 -0800 Subject: rcu: Consolidate offline-CPU callback initialization Currently, both rcu_cleanup_dead_cpu() and rcu_send_cbs_to_orphanage() initialize the outgoing CPU's callback list. However, only rcu_cleanup_dead_cpu() invokes rcu_send_cbs_to_orphanage(), and it does so unconditionally, which means that only one of these initializations is required. This commit therefore consolidates the callback-list initialization with the rest of the callback handling in rcu_send_cbs_to_orphanage(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 48d640ca1a05..8e020c59ecfd 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2256,8 +2256,12 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL]; } - /* Finally, initialize the rcu_data structure's list to empty. */ + /* + * Finally, initialize the rcu_data structure's list to empty and + * disallow further callbacks on this CPU. 
+ */ init_callback_list(rdp); + rdp->nxttail[RCU_NEXT_TAIL] = NULL; } /* @@ -2398,9 +2402,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", cpu, rdp->qlen, rdp->nxtlist); - init_callback_list(rdp); - /* Disallow further callbacks on this CPU. */ - rdp->nxttail[RCU_NEXT_TAIL] = NULL; mutex_unlock(&rsp->onoff_mutex); } -- cgit v1.2.3 From 78043c467a91573cc1d51827fe10d7d15ae79a60 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 18 Jan 2015 17:46:24 -0800 Subject: rcu: Put all orphan-callback-related code under same comment Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8e020c59ecfd..98da632d1d49 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2385,9 +2385,9 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) /* Exclude any attempts to start a new grace period. */ mutex_lock(&rsp->onoff_mutex); - raw_spin_lock_irqsave(&rsp->orphan_lock, flags); /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ + raw_spin_lock_irqsave(&rsp->orphan_lock, flags); rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); rcu_adopt_orphan_cbs(rsp, flags); raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); -- cgit v1.2.3 From c8aead6a9b27fdd94b7bcb74b587ae012d8145f2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 19 Jan 2015 16:56:46 -0800 Subject: rcu: Simplify sync_rcu_preempt_exp_init() This commit eliminates a boolean and associated "if" statement by rearranging the code. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0a571e9a0f1d..d37c9fbdba71 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -677,19 +677,16 @@ static void sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) { unsigned long flags; - int must_wait = 0; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); if (!rcu_preempt_has_tasks(rnp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); + rcu_report_exp_rnp(rsp, rnp, false); /* No tasks, report. */ } else { rnp->exp_tasks = rnp->blkd_tasks.next; rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ - must_wait = 1; } - if (!must_wait) - rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */ } /** -- cgit v1.2.3 From 18c629eaebf1814ca7f0c27327f75aa93aa4a5de Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 19 Jan 2015 18:59:56 -0800 Subject: rcu: Eliminate empty HOTPLUG_CPU ifdef Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index d37c9fbdba71..79376e2461c9 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -520,10 +520,6 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) WARN_ON_ONCE(rnp->qsmask); } -#ifdef CONFIG_HOTPLUG_CPU - -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - /* * Check for a quiescent state from the current CPU. When a task blocks, * the task is recorded in the corresponding CPU's rcu_node structure, -- cgit v1.2.3 From 237a0f2193c6daf9b1edd7fd15d55e680f268952 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Thu, 22 Jan 2015 14:32:06 -0800 Subject: rcu: Detect stalls caused by failure to propagate up rcu_node tree If all CPUs have passed through quiescent states, then stalls might be due to starvation of the grace-period kthread or to failure to propagate the quiescent states up the rcu_node combining tree. The current stall warning messages do not differentiate, so this commit adds a printout of the root rcu_node structure's ->qsmask field. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 98da632d1d49..3b7e4133ca99 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1196,9 +1196,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) } else { j = jiffies; gpa = ACCESS_ONCE(rsp->gp_activity); - pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld\n", + pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n", rsp->name, j - gpa, j, gpa, - jiffies_till_next_fqs); + jiffies_till_next_fqs, + rcu_get_root(rsp)->qsmask); /* In this case, the current CPU might be at fault. */ sched_show_task(current); } -- cgit v1.2.3 From 37745d281069682d901f00c0121949a7d224195f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 22 Jan 2015 18:24:08 -0800 Subject: rcu: Provide diagnostic option to slow down grace-period initialization Grace-period initialization normally proceeds quite quickly, so that it is very difficult to reproduce races against grace-period initialization. This commit therefore allows grace-period initialization to be artificially slowed down, increasing race-reproduction probability. A pair of new Kconfig parameters are provided, CONFIG_RCU_TORTURE_TEST_SLOW_INIT to enable the slowdowns, and CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY to specify the number of jiffies of slowdown to apply. A boot-time parameter named rcutree.gp_init_delay allows boot-time delay to be specified. By default, no delay will be applied even if CONFIG_RCU_TORTURE_TEST_SLOW_INIT is set. Signed-off-by: Paul E. McKenney --- Documentation/kernel-parameters.txt | 6 ++++++ kernel/rcu/tree.c | 10 ++++++++++ lib/Kconfig.debug | 24 ++++++++++++++++++++++++ 3 files changed, 40 insertions(+) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index bfcb1a62a7b4..94de410ec341 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2968,6 +2968,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Set maximum number of finished RCU callbacks to process in one batch. + rcutree.gp_init_delay= [KNL] + Set the number of jiffies to delay each step of + RCU grace-period initialization. This only has + effect when CONFIG_RCU_TORTURE_TEST_SLOW_INIT is + set. + rcutree.rcu_fanout_leaf= [KNL] Increase the number of CPUs assigned to each leaf rcu_node structure. Useful for very large diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3b7e4133ca99..b42001fd55fb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -160,6 +160,12 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO; module_param(kthread_prio, int, 0644); +/* Delay in jiffies for grace-period initialization delays. 
*/ +static int gp_init_delay = IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_INIT) + ? CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY + : 0; +module_param(gp_init_delay, int, 0644); + /* * Track the rcutorture test sequence number and the update version * number within a given test. The rcutorture_testseq is incremented @@ -1769,6 +1775,10 @@ static int rcu_gp_init(struct rcu_state *rsp) raw_spin_unlock_irq(&rnp->lock); cond_resched_rcu_qs(); ACCESS_ONCE(rsp->gp_activity) = jiffies; + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_INIT) && + gp_init_delay > 0 && + !(rsp->gpnum % (rcu_num_nodes * 10))) + schedule_timeout_uninterruptible(gp_init_delay); } mutex_unlock(&rsp->onoff_mutex); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index c5cefb3c009c..feee8dab441e 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1257,6 +1257,30 @@ config RCU_TORTURE_TEST_RUNNABLE Say N here if you want the RCU torture tests to start only after being manually enabled via /proc. +config RCU_TORTURE_TEST_SLOW_INIT + bool "Slow down RCU grace-period initialization to expose races" + depends on RCU_TORTURE_TEST + help + This option makes grace-period initialization block for a + few jiffies between initializing each pair of consecutive + rcu_node structures. This helps to expose races involving + grace-period initialization, in other words, it makes your + kernel less stable. It can also greatly increase grace-period + latency, especially on systems with large numbers of CPUs. + This is useful when torture-testing RCU, but in almost no + other circumstance. + + Say Y here if you want your system to crash and hang more often. + Say N if you want a sane system. + +config RCU_TORTURE_TEST_SLOW_INIT_DELAY + int "How much to slow down RCU grace-period initialization" + range 0 5 + default 0 + help + This option specifies the number of jiffies to wait between + each rcu_node structure initialization. + config RCU_CPU_STALL_TIMEOUT int "RCU CPU stall timeout in seconds" depends on RCU_STALL_COMMON -- cgit v1.2.3 From 999c286347538388170f919146d7cfa58689472e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 31 Jan 2015 21:12:02 -0800 Subject: rcu: Remove event tracing from rcu_cpu_notify(), used by offline CPUs Offline CPUs cannot safely invoke trace events, but such CPUs do execute within rcu_cpu_notify(). Therefore, this commit removes the trace events from rcu_cpu_notify(). These trace events are for utilization, against which rcu_cpu_notify() execution time should be negligible. Reported-by: Fengguang Wu Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b42001fd55fb..a7151d26b940 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3629,7 +3629,6 @@ static int rcu_cpu_notify(struct notifier_block *self, struct rcu_node *rnp = rdp->mynode; struct rcu_state *rsp; - trace_rcu_utilization(TPS("Start CPU hotplug")); switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: @@ -3661,7 +3660,6 @@ static int rcu_cpu_notify(struct notifier_block *self, default: break; } - trace_rcu_utilization(TPS("End CPU hotplug")); return NOTIFY_OK; } -- cgit v1.2.3 From 80f1d68ccba70b1060c9c7360ca83da430f66bed Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 12 Mar 2015 17:21:42 +0100 Subject: ebpf: verifier: check that call reg with ARG_ANYTHING is initialized I noticed that a helper function with argument type ARG_ANYTHING does not need to have an initialized value (register). 
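To make the distinction concrete, here is a hypothetical helper prototype; bpf_example() and bpf_example_proto are invented for this note, while the field names follow struct bpf_func_proto and the argument types are the ones touched by this patch:

    static const struct bpf_func_proto bpf_example_proto = {
            .func           = bpf_example,
            .ret_type       = RET_INTEGER,
            .arg1_type      = ARG_ANYTHING, /* actually consumed by the helper: after this
                                             * patch the verifier rejects a NOT_INIT register */
            .arg2_type      = ARG_DONTCARE, /* unused trailing argument: the new default (0),
                                             * which the verifier simply skips */
    };
    /* Before this patch, ARG_ANYTHING itself was the zero/default value, so used
     * and unused arguments were indistinguishable and neither was checked for
     * initialization. */
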
This can worst case lead to unintented stack memory leakage in future helper functions if they are not carefully designed, or unintended application behaviour in case the application developer was not careful enough to match a correct helper function signature in the API. The underlying issue is that ARG_ANYTHING should actually be split into two different semantics: 1) ARG_DONTCARE for function arguments that the helper function does not care about (in other words: the default for unused function arguments), and 2) ARG_ANYTHING that is an argument actually being used by a helper function and *guaranteed* to be an initialized register. The current risk is low: ARG_ANYTHING is only used for the 'flags' argument (r4) in bpf_map_update_elem() that internally does strict checking. Fixes: 17a5267067f3 ("bpf: verifier (add verifier core)") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 4 +++- kernel/bpf/verifier.c | 5 ++++- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a884f5a2c503..80f2e0fc3d02 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -44,7 +44,7 @@ struct bpf_map_type_list { /* function argument constraints */ enum bpf_arg_type { - ARG_ANYTHING = 0, /* any argument is ok */ + ARG_DONTCARE = 0, /* unused argument in helper function */ /* the following constraints used to prototype * bpf_map_lookup/update/delete_elem() functions @@ -58,6 +58,8 @@ enum bpf_arg_type { */ ARG_PTR_TO_STACK, /* any pointer to eBPF program stack */ ARG_CONST_STACK_SIZE, /* number of bytes accessed from stack */ + + ARG_ANYTHING, /* any (initialized) argument is ok */ }; /* type of values returned from helper functions */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bdf4192a889b..e6b522496250 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -755,7 +755,7 @@ static int check_func_arg(struct verifier_env *env, u32 regno, enum bpf_reg_type expected_type; int err = 0; - if (arg_type == ARG_ANYTHING) + if (arg_type == ARG_DONTCARE) return 0; if (reg->type == NOT_INIT) { @@ -763,6 +763,9 @@ static int check_func_arg(struct verifier_env *env, u32 regno, return -EACCES; } + if (arg_type == ARG_ANYTHING) + return 0; + if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY || arg_type == ARG_PTR_TO_MAP_VALUE) { expected_type = PTR_TO_STACK; -- cgit v1.2.3 From 8eb74b2b291e7bf6aa59fcb4e59f236382f00bf5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 13 Feb 2015 10:52:28 -0800 Subject: rcu: Rework preemptible expedited bitmask handling Currently, the rcu_node tree ->expmask bitmasks are initially set to reflect the online CPUs. This is pointless, because only the CPUs preempted within RCU read-side critical sections by the preceding synchronize_sched_expedited() need to be tracked. This commit therefore instead sets up these bitmasks based on the state of the ->blkd_tasks lists. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 98 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 75 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 79376e2461c9..a22721547442 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -626,9 +626,6 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) * recursively up the tree. (Calm down, calm down, we do the recursion * iteratively!) 
* - * Most callers will set the "wake" flag, but the task initiating the - * expedited grace period need not wake itself. - * * Caller must hold sync_rcu_preempt_exp_mutex. */ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, @@ -663,26 +660,85 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, /* * Snapshot the tasks blocking the newly started preemptible-RCU expedited - * grace period for the specified rcu_node structure. If there are no such - * tasks, report it up the rcu_node hierarchy. + * grace period for the specified rcu_node structure, phase 1. If there + * are such tasks, set the ->expmask bits up the rcu_node tree and also + * set the ->expmask bits on the leaf rcu_node structures to tell phase 2 + * that work is needed here. * - * Caller must hold sync_rcu_preempt_exp_mutex and must exclude - * CPU hotplug operations. + * Caller must hold sync_rcu_preempt_exp_mutex. */ static void -sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) +sync_rcu_preempt_exp_init1(struct rcu_state *rsp, struct rcu_node *rnp) { unsigned long flags; + unsigned long mask; + struct rcu_node *rnp_up; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); + WARN_ON_ONCE(rnp->expmask); + WARN_ON_ONCE(rnp->exp_tasks); if (!rcu_preempt_has_tasks(rnp)) { + /* No blocked tasks, nothing to do. */ raw_spin_unlock_irqrestore(&rnp->lock, flags); - rcu_report_exp_rnp(rsp, rnp, false); /* No tasks, report. */ - } else { + return; + } + /* Call for Phase 2 and propagate ->expmask bits up the tree. */ + rnp->expmask = 1; + rnp_up = rnp; + while (rnp_up->parent) { + mask = rnp_up->grpmask; + rnp_up = rnp_up->parent; + if (rnp_up->expmask & mask) + break; + raw_spin_lock(&rnp_up->lock); /* irqs already off */ + smp_mb__after_unlock_lock(); + rnp_up->expmask |= mask; + raw_spin_unlock(&rnp_up->lock); /* irqs still off */ + } + raw_spin_unlock_irqrestore(&rnp->lock, flags); +} + +/* + * Snapshot the tasks blocking the newly started preemptible-RCU expedited + * grace period for the specified rcu_node structure, phase 2. If the + * leaf rcu_node structure has its ->expmask field set, check for tasks. + * If there are some, clear ->expmask and set ->exp_tasks accordingly, + * then initiate RCU priority boosting. Otherwise, clear ->expmask and + * invoke rcu_report_exp_rnp() to clear out the upper-level ->expmask bits, + * enabling rcu_read_unlock_special() to do the bit-clearing. + * + * Caller must hold sync_rcu_preempt_exp_mutex. + */ +static void +sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); + if (!rnp->expmask) { + /* Phase 1 didn't do anything, so Phase 2 doesn't either. */ + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + + /* Phase 1 is over. */ + rnp->expmask = 0; + + /* + * If there are still blocked tasks, set up ->exp_tasks so that + * rcu_read_unlock_special() will wake us and then boost them. + */ + if (rcu_preempt_has_tasks(rnp)) { rnp->exp_tasks = rnp->blkd_tasks.next; rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ + return; } + + /* No longer any blocked tasks, so undo bit setting. 
*/ + raw_spin_unlock_irqrestore(&rnp->lock, flags); + rcu_report_exp_rnp(rsp, rnp, false); } /** @@ -699,7 +755,6 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) */ void synchronize_rcu_expedited(void) { - unsigned long flags; struct rcu_node *rnp; struct rcu_state *rsp = &rcu_preempt_state; unsigned long snap; @@ -750,19 +805,16 @@ void synchronize_rcu_expedited(void) /* force all RCU readers onto ->blkd_tasks lists. */ synchronize_sched_expedited(); - /* Initialize ->expmask for all non-leaf rcu_node structures. */ - rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); - rnp->expmask = rnp->qsmaskinit; - raw_spin_unlock_irqrestore(&rnp->lock, flags); - } - - /* Snapshot current state of ->blkd_tasks lists. */ + /* + * Snapshot current state of ->blkd_tasks lists into ->expmask. + * Phase 1 sets bits and phase 2 permits rcu_read_unlock_special() + * to start clearing them. Doing this in one phase leads to + * strange races between setting and clearing bits, so just say "no"! + */ + rcu_for_each_leaf_node(rsp, rnp) + sync_rcu_preempt_exp_init1(rsp, rnp); rcu_for_each_leaf_node(rsp, rnp) - sync_rcu_preempt_exp_init(rsp, rnp); - if (NUM_RCU_NODES > 1) - sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); + sync_rcu_preempt_exp_init2(rsp, rnp); put_online_cpus(); -- cgit v1.2.3 From cc99a310caf811aebbd0986f433d824e4a5e7ce5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 23 Feb 2015 08:59:29 -0800 Subject: rcu: Move rcu_report_unblock_qs_rnp() to common code The rcu_report_unblock_qs_rnp() function is invoked when the last task blocking the current grace period exits its outermost RCU read-side critical section. Previously, this was called only from rcu_read_unlock_special(), and was therefore defined only when CONFIG_RCU_PREEMPT=y. However, this function will be invoked even when CONFIG_RCU_PREEMPT=n once CPU-hotplug operations are processed only at the beginnings of RCU grace periods. The reason for this change is that the last task on a given leaf rcu_node structure's ->blkd_tasks list might well exit its RCU read-side critical section between the time that recent CPU-hotplug operations were applied and when the new grace period was initialized. This situation could result in RCU waiting forever on that leaf rcu_node structure, because if all that structure's CPUs were already offline, there would be no quiescent-state events to drive that structure's part of the grace period. This commit therefore moves rcu_report_unblock_qs_rnp() to common code that is built unconditionally so that the quiescent-state-forcing code can clean up after this situation, avoiding the grace-period stall. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 39 +++++++++++++++++++++++++++++++++++++++ kernel/rcu/tree_plugin.h | 40 ++-------------------------------------- 2 files changed, 41 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a7151d26b940..5b5cb1ff73ed 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2126,6 +2126,45 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, rcu_report_qs_rsp(rsp, flags); /* releases rnp->lock. */ } +/* + * Record a quiescent state for all tasks that were previously queued + * on the specified rcu_node structure and that were blocking the current + * RCU grace period. 
The caller must hold the specified rnp->lock with + * irqs disabled, and this lock is released upon return, but irqs remain + * disabled. + */ +static void __maybe_unused rcu_report_unblock_qs_rnp(struct rcu_state *rsp, + struct rcu_node *rnp, unsigned long flags) + __releases(rnp->lock) +{ + unsigned long mask; + struct rcu_node *rnp_p; + + WARN_ON_ONCE(rsp == &rcu_bh_state || rsp == &rcu_sched_state); + if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; /* Still need more quiescent states! */ + } + + rnp_p = rnp->parent; + if (rnp_p == NULL) { + /* + * Either there is only one rcu_node in the tree, + * or tasks were kicked up to root rcu_node due to + * CPUs going offline. + */ + rcu_report_qs_rsp(rsp, flags); + return; + } + + /* Report up the rest of the hierarchy. */ + mask = rnp->grpmask; + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ + smp_mb__after_unlock_lock(); + rcu_report_qs_rnp(mask, rsp, rnp_p, flags); +} + /* * Record a quiescent state for the specified CPU to that CPU's rcu_data * structure. This must be either called from the specified CPU, or diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index a22721547442..ec6c2efb28cd 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -232,43 +232,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) return rnp->gp_tasks != NULL; } -/* - * Record a quiescent state for all tasks that were previously queued - * on the specified rcu_node structure and that were blocking the current - * RCU grace period. The caller must hold the specified rnp->lock with - * irqs disabled, and this lock is released upon return, but irqs remain - * disabled. - */ -static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) - __releases(rnp->lock) -{ - unsigned long mask; - struct rcu_node *rnp_p; - - if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; /* Still need more quiescent states! */ - } - - rnp_p = rnp->parent; - if (rnp_p == NULL) { - /* - * Either there is only one rcu_node in the tree, - * or tasks were kicked up to root rcu_node due to - * CPUs going offline. - */ - rcu_report_qs_rsp(&rcu_preempt_state, flags); - return; - } - - /* Report up the rest of the hierarchy. */ - mask = rnp->grpmask; - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); - rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags); -} - /* * Advance a ->blkd_tasks-list pointer to the next entry, instead * returning NULL if at the end of the list. @@ -399,7 +362,8 @@ void rcu_read_unlock_special(struct task_struct *t) rnp->grplo, rnp->grphi, !!rnp->gp_tasks); - rcu_report_unblock_qs_rnp(rnp, flags); + rcu_report_unblock_qs_rnp(&rcu_preempt_state, + rnp, flags); } else { raw_spin_unlock_irqrestore(&rnp->lock, flags); } -- cgit v1.2.3 From 0aa04b055e71bd3b8040dd71a126126c66b6f01e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 23 Jan 2015 21:52:37 -0800 Subject: rcu: Process offlining and onlining only at grace-period start Races between CPU hotplug and grace periods can be difficult to resolve, so the ->onoff_mutex is used to exclude the two events. 
Unfortunately, this means that it is impossible for an outgoing CPU to perform the last bits of its offlining from its last pass through the idle loop, because sleeplocks cannot be acquired in that context. This commit avoids these problems by buffering online and offline events in a new ->qsmaskinitnext field in the leaf rcu_node structures. When a grace period starts, the events accumulated in this mask are applied to the ->qsmaskinit field, and, if needed, up the rcu_node tree. The special case of all CPUs corresponding to a given leaf rcu_node structure being offline while there are still elements in that structure's ->blkd_tasks list is handled using a new ->wait_blkd_tasks field. In this case, propagating the offline bits up the tree is deferred until the beginning of the grace period after all of the tasks have exited their RCU read-side critical sections and removed themselves from the list, at which point the ->wait_blkd_tasks flag is cleared. If one of that leaf rcu_node structure's CPUs comes back online before the list empties, then the ->wait_blkd_tasks flag is simply cleared. This of course means that RCU's notion of which CPUs are offline can be out of date. This is OK because RCU need only wait on CPUs that were online at the time that the grace period started. In addition, RCU's force-quiescent-state actions will handle the case where a CPU goes offline after the grace period starts. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 154 +++++++++++++++++++++++++++++++++++++---------- kernel/rcu/tree.h | 9 +++ kernel/rcu/tree_plugin.h | 22 ++----- kernel/rcu/tree_trace.c | 4 +- 4 files changed, 136 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5b5cb1ff73ed..f0f4d3510d24 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -152,6 +152,8 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active); */ static int rcu_scheduler_fully_active __read_mostly; +static void rcu_init_new_rnp(struct rcu_node *rnp_leaf); +static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf); static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); static void invoke_rcu_core(void); static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); @@ -178,6 +180,17 @@ module_param(gp_init_delay, int, 0644); unsigned long rcutorture_testseq; unsigned long rcutorture_vernum; +/* + * Compute the mask of online CPUs for the specified rcu_node structure. + * This will not be stable unless the rcu_node structure's ->lock is + * held, but the bit corresponding to the current CPU will be stable + * in most contexts. + */ +unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp) +{ + return ACCESS_ONCE(rnp->qsmaskinitnext); +} + /* * Return true if an RCU grace period is in progress. 
The ACCESS_ONCE()s * permit this function to be invoked without holding the root rcu_node @@ -960,7 +973,7 @@ bool rcu_lockdep_current_cpu_online(void) preempt_disable(); rdp = this_cpu_ptr(&rcu_sched_data); rnp = rdp->mynode; - ret = (rdp->grpmask & rnp->qsmaskinit) || + ret = (rdp->grpmask & rcu_rnp_online_cpus(rnp)) || !rcu_scheduler_fully_active; preempt_enable(); return ret; @@ -1710,6 +1723,7 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) */ static int rcu_gp_init(struct rcu_state *rsp) { + unsigned long oldmask; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); @@ -1744,6 +1758,55 @@ static int rcu_gp_init(struct rcu_state *rsp) mutex_lock(&rsp->onoff_mutex); smp_mb__after_unlock_lock(); /* ->gpnum increment before GP! */ + /* + * Apply per-leaf buffered online and offline operations to the + * rcu_node tree. Note that this new grace period need not wait + * for subsequent online CPUs, and that quiescent-state forcing + * will handle subsequent offline CPUs. + */ + rcu_for_each_leaf_node(rsp, rnp) { + raw_spin_lock_irq(&rnp->lock); + smp_mb__after_unlock_lock(); + if (rnp->qsmaskinit == rnp->qsmaskinitnext && + !rnp->wait_blkd_tasks) { + /* Nothing to do on this leaf rcu_node structure. */ + raw_spin_unlock_irq(&rnp->lock); + continue; + } + + /* Record old state, apply changes to ->qsmaskinit field. */ + oldmask = rnp->qsmaskinit; + rnp->qsmaskinit = rnp->qsmaskinitnext; + + /* If zero-ness of ->qsmaskinit changed, propagate up tree. */ + if (!oldmask != !rnp->qsmaskinit) { + if (!oldmask) /* First online CPU for this rcu_node. */ + rcu_init_new_rnp(rnp); + else if (rcu_preempt_has_tasks(rnp)) /* blocked tasks */ + rnp->wait_blkd_tasks = true; + else /* Last offline CPU and can propagate. */ + rcu_cleanup_dead_rnp(rnp); + } + + /* + * If all waited-on tasks from prior grace period are + * done, and if all this rcu_node structure's CPUs are + * still offline, propagate up the rcu_node tree and + * clear ->wait_blkd_tasks. Otherwise, if one of this + * rcu_node structure's CPUs has since come back online, + * simply clear ->wait_blkd_tasks (but rcu_cleanup_dead_rnp() + * checks for this, so just call it unconditionally). + */ + if (rnp->wait_blkd_tasks && + (!rcu_preempt_has_tasks(rnp) || + rnp->qsmaskinit)) { + rnp->wait_blkd_tasks = false; + rcu_cleanup_dead_rnp(rnp); + } + + raw_spin_unlock_irq(&rnp->lock); + } + /* * Set the quiescent-state-needed bits in all the rcu_node * structures for all currently online CPUs in breadth-first order, @@ -2133,7 +2196,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, * irqs disabled, and this lock is released upon return, but irqs remain * disabled. */ -static void __maybe_unused rcu_report_unblock_qs_rnp(struct rcu_state *rsp, +static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, struct rcu_node *rnp, unsigned long flags) __releases(rnp->lock) { @@ -2409,6 +2472,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) raw_spin_lock(&rnp->lock); /* irqs already disabled. */ smp_mb__after_unlock_lock(); /* GP memory ordering. */ rnp->qsmaskinit &= ~mask; + rnp->qsmask &= ~mask; if (rnp->qsmaskinit) { raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ return; @@ -2427,6 +2491,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) { unsigned long flags; + unsigned long mask; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. 
*/ @@ -2443,12 +2508,12 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ + mask = rdp->grpmask; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */ - rnp->qsmaskinit &= ~rdp->grpmask; - if (rnp->qsmaskinit == 0 && !rcu_preempt_has_tasks(rnp)) - rcu_cleanup_dead_rnp(rnp); - rcu_report_qs_rnp(rdp->grpmask, rsp, rnp, flags); /* Rlses rnp->lock. */ + rnp->qsmaskinitnext &= ~mask; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", cpu, rdp->qlen, rdp->nxtlist); @@ -2654,12 +2719,21 @@ static void force_qs_rnp(struct rcu_state *rsp, } } if (mask != 0) { - - /* rcu_report_qs_rnp() releases rnp->lock. */ + /* Idle/offline CPUs, report. */ rcu_report_qs_rnp(mask, rsp, rnp, flags); - continue; + } else if (rnp->parent && + list_empty(&rnp->blkd_tasks) && + !rnp->qsmask && + (rnp->parent->qsmask & rnp->grpmask)) { + /* + * Race between grace-period initialization and task + * existing RCU read-side critical section, report. + */ + rcu_report_unblock_qs_rnp(rsp, rnp, flags); + } else { + /* Nothing to do here, so just drop the lock. */ + raw_spin_unlock_irqrestore(&rnp->lock, flags); } - raw_spin_unlock_irqrestore(&rnp->lock, flags); } } @@ -3568,6 +3642,28 @@ void rcu_barrier_sched(void) } EXPORT_SYMBOL_GPL(rcu_barrier_sched); +/* + * Propagate ->qsinitmask bits up the rcu_node tree to account for the + * first CPU in a given leaf rcu_node structure coming online. The caller + * must hold the corresponding leaf rcu_node ->lock with interrrupts + * disabled. + */ +static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) +{ + long mask; + struct rcu_node *rnp = rnp_leaf; + + for (;;) { + mask = rnp->grpmask; + rnp = rnp->parent; + if (rnp == NULL) + return; + raw_spin_lock(&rnp->lock); /* Interrupts already disabled. */ + rnp->qsmaskinit |= mask; + raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ + } +} + /* * Do boot-time initialization of a CPU's per-CPU RCU data. */ @@ -3620,31 +3716,23 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - /* Add CPU to rcu_node bitmasks. */ + /* + * Add CPU to leaf rcu_node pending-online bitmask. Any needed + * propagation up the rcu_node tree will happen at the beginning + * of the next grace period. + */ rnp = rdp->mynode; mask = rdp->grpmask; - do { - /* Exclude any attempts to start a new GP on small systems. */ - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - rnp->qsmaskinit |= mask; - mask = rnp->grpmask; - if (rnp == rdp->mynode) { - /* - * If there is a grace period in progress, we will - * set up to wait for it next time we run the - * RCU core code. - */ - rdp->gpnum = rnp->completed; - rdp->completed = rnp->completed; - rdp->passed_quiesce = 0; - rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); - rdp->qs_pending = 0; - trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); - } - raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ - rnp = rnp->parent; - } while (rnp != NULL && !(rnp->qsmaskinit & mask)); - local_irq_restore(flags); + raw_spin_lock(&rnp->lock); /* irqs already disabled. 
*/ + smp_mb__after_unlock_lock(); + rnp->qsmaskinitnext |= mask; + rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ + rdp->completed = rnp->completed; + rdp->passed_quiesce = false; + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); + rdp->qs_pending = false; + trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); + raw_spin_unlock_irqrestore(&rnp->lock, flags); mutex_unlock(&rsp->onoff_mutex); } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 119de399eb2f..aa42562ff5b2 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -141,12 +141,20 @@ struct rcu_node { /* complete (only for PREEMPT_RCU). */ unsigned long qsmaskinit; /* Per-GP initial value for qsmask & expmask. */ + /* Initialized from ->qsmaskinitnext at the */ + /* beginning of each grace period. */ + unsigned long qsmaskinitnext; + /* Online CPUs for next grace period. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ /* Only one bit will be set in this mask. */ int grplo; /* lowest-numbered CPU or group here. */ int grphi; /* highest-numbered CPU or group here. */ u8 grpnum; /* CPU/group number for next level up. */ u8 level; /* root is at level 0. */ + bool wait_blkd_tasks;/* Necessary to wait for blocked tasks to */ + /* exit RCU read-side critical sections */ + /* before propagating offline up the */ + /* rcu_node tree? */ struct rcu_node *parent; struct list_head blkd_tasks; /* Tasks blocked in RCU read-side critical */ @@ -559,6 +567,7 @@ static void rcu_prepare_kthreads(int cpu); static void rcu_cleanup_after_idle(void); static void rcu_prepare_for_idle(void); static void rcu_idle_count_callbacks_posted(void); +static bool rcu_preempt_has_tasks(struct rcu_node *rnp); static void print_cpu_stall_info_begin(void); static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); static void print_cpu_stall_info_end(void); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index ec6c2efb28cd..d45e961515c1 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -180,7 +180,7 @@ static void rcu_preempt_note_context_switch(void) * But first, note that the current CPU must still be * on line! */ - WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0); + WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0); WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) { list_add(&t->rcu_node_entry, rnp->gp_tasks->prev); @@ -263,7 +263,6 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp) */ void rcu_read_unlock_special(struct task_struct *t) { - bool empty; bool empty_exp; bool empty_norm; bool empty_exp_now; @@ -319,7 +318,6 @@ void rcu_read_unlock_special(struct task_struct *t) break; raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } - empty = !rcu_preempt_has_tasks(rnp); empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); empty_exp = !rcu_preempted_readers_exp(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ @@ -339,14 +337,6 @@ void rcu_read_unlock_special(struct task_struct *t) drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t; #endif /* #ifdef CONFIG_RCU_BOOST */ - /* - * If this was the last task on the list, go see if we - * need to propagate ->qsmaskinit bit clearing up the - * rcu_node tree. - */ - if (!empty && !rcu_preempt_has_tasks(rnp)) - rcu_cleanup_dead_rnp(rnp); - /* * If this was the last task on the current list, and if * we aren't waiting on any CPUs, report the quiescent state. 
@@ -868,8 +858,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) return 0; } -#ifdef CONFIG_HOTPLUG_CPU - /* * Because there is no preemptible RCU, there can be no readers blocked. */ @@ -878,8 +866,6 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp) return false; } -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - /* * Because preemptible RCU does not exist, we never have to check for * tasks blocked within RCU read-side critical sections. @@ -1179,7 +1165,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) * Returns zero if all is well, a negated errno otherwise. */ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, - struct rcu_node *rnp) + struct rcu_node *rnp) { int rnp_index = rnp - &rsp->node[0]; unsigned long flags; @@ -1189,7 +1175,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, if (&rcu_preempt_state != rsp) return 0; - if (!rcu_scheduler_fully_active || rnp->qsmaskinit == 0) + if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0) return 0; rsp->boost = 1; @@ -1282,7 +1268,7 @@ static void rcu_cpu_kthread(unsigned int cpu) static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) { struct task_struct *t = rnp->boost_kthread_task; - unsigned long mask = rnp->qsmaskinit; + unsigned long mask = rcu_rnp_online_cpus(rnp); cpumask_var_t cm; int cpu; diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index fbb6240509ea..f92361efd0f5 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -283,8 +283,8 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) seq_puts(m, "\n"); level = rnp->level; } - seq_printf(m, "%lx/%lx %c%c>%c %d:%d ^%d ", - rnp->qsmask, rnp->qsmaskinit, + seq_printf(m, "%lx/%lx->%lx %c%c>%c %d:%d ^%d ", + rnp->qsmask, rnp->qsmaskinit, rnp->qsmaskinitnext, ".G"[rnp->gp_tasks != NULL], ".E"[rnp->exp_tasks != NULL], ".T"[!list_empty(&rnp->blkd_tasks)], -- cgit v1.2.3 From c199068913c9c5cbb5498e289bb387703e087ea8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 23 Jan 2015 22:29:37 -0800 Subject: rcu: Eliminate ->onoff_mutex from rcu_node structure Because that RCU grace-period initialization need no longer exclude CPU-hotplug operations, this commit eliminates the ->onoff_mutex and its uses. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 15 --------------- kernel/rcu/tree.h | 2 -- 2 files changed, 17 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f0f4d3510d24..79d53399247e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -101,7 +101,6 @@ struct rcu_state sname##_state = { \ .orphan_nxttail = &sname##_state.orphan_nxtlist, \ .orphan_donetail = &sname##_state.orphan_donelist, \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ - .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ .name = RCU_STATE_NAME(sname), \ .abbr = sabbr, \ }; \ @@ -1754,10 +1753,6 @@ static int rcu_gp_init(struct rcu_state *rsp) trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); raw_spin_unlock_irq(&rnp->lock); - /* Exclude any concurrent CPU-hotplug operations. */ - mutex_lock(&rsp->onoff_mutex); - smp_mb__after_unlock_lock(); /* ->gpnum increment before GP! */ - /* * Apply per-leaf buffered online and offline operations to the * rcu_node tree. 
Note that this new grace period need not wait @@ -1844,7 +1839,6 @@ static int rcu_gp_init(struct rcu_state *rsp) schedule_timeout_uninterruptible(gp_init_delay); } - mutex_unlock(&rsp->onoff_mutex); return 1; } @@ -2498,9 +2492,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) /* Adjust any no-longer-needed kthreads. */ rcu_boost_kthread_setaffinity(rnp, -1); - /* Exclude any attempts to start a new grace period. */ - mutex_lock(&rsp->onoff_mutex); - /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ raw_spin_lock_irqsave(&rsp->orphan_lock, flags); rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); @@ -2517,7 +2508,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", cpu, rdp->qlen, rdp->nxtlist); - mutex_unlock(&rsp->onoff_mutex); } #else /* #ifdef CONFIG_HOTPLUG_CPU */ @@ -3700,9 +3690,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rcu_get_root(rsp); - /* Exclude new grace periods. */ - mutex_lock(&rsp->onoff_mutex); - /* Set up local state, ensuring consistent view of global state. */ raw_spin_lock_irqsave(&rnp->lock, flags); rdp->beenonline = 1; /* We have now been online. */ @@ -3733,8 +3720,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->qs_pending = false; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); raw_spin_unlock_irqrestore(&rnp->lock, flags); - - mutex_unlock(&rsp->onoff_mutex); } static void rcu_prepare_cpu(int cpu) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index aa42562ff5b2..a69d3dab2ec4 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -456,8 +456,6 @@ struct rcu_state { long qlen; /* Total number of callbacks. */ /* End of fields guarded by orphan_lock. */ - struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */ - struct mutex barrier_mutex; /* Guards barrier fields. */ atomic_t barrier_cpu_count; /* # CPUs waiting on. */ struct completion barrier_completion; /* Wake at barrier end. */ -- cgit v1.2.3 From 528a25b00e1f84eaba6c98e63f58ee0a8e472102 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 28 Jan 2015 14:09:43 -0800 Subject: cpu: Make CPU-offline idle-loop transition point more precise This commit uses a per-CPU variable to make the CPU-offline code path through the idle loop more precise, so that the outgoing CPU is guaranteed to make it into the idle loop before it is powered off. This commit is in preparation for putting the RCU offline-handling code on this code path, which will eliminate the magic one-jiffy wait that RCU uses as the maximum time for an outgoing CPU to get all the way through the scheduler. The magic one-jiffy wait for incoming CPUs remains a separate issue. Signed-off-by: Paul E. McKenney --- kernel/cpu.c | 4 +++- kernel/sched/idle.c | 7 ++++++- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 1972b161c61e..d46b4dae0ca0 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -408,8 +408,10 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) * * Wait for the stop thread to go away. */ - while (!idle_cpu(cpu)) + while (!per_cpu(cpu_dead_idle, cpu)) cpu_relax(); + smp_mb(); /* Read from cpu_dead_idle before __cpu_die(). */ + per_cpu(cpu_dead_idle, cpu) = false; /* This actually kills the CPU. 
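	 * (Aside, not in the original patch: the smp_mb() added above pairs
	 *  with the smp_mb() in cpu_idle_loop() that precedes the
	 *  cpu_dead_idle write, so everything the outgoing CPU did before
	 *  declaring itself dead is visible here before __cpu_die() runs.)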
*/ __cpu_die(cpu); diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 94b2d7b88a27..e99e361ade20 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -198,6 +198,8 @@ exit_idle: start_critical_timings(); } +DEFINE_PER_CPU(bool, cpu_dead_idle); + /* * Generic idle loop implementation * @@ -222,8 +224,11 @@ static void cpu_idle_loop(void) check_pgt_cache(); rmb(); - if (cpu_is_offline(smp_processor_id())) + if (cpu_is_offline(smp_processor_id())) { + smp_mb(); /* all activity before dead. */ + this_cpu_write(cpu_dead_idle, true); arch_cpu_idle_dead(); + } local_irq_disable(); arch_cpu_idle_enter(); -- cgit v1.2.3 From 88428cc5c27c63a4313e213813bc39b9899224d5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 28 Jan 2015 14:42:09 -0800 Subject: rcu: Handle outgoing CPUs on exit from idle loop This commit informs RCU of an outgoing CPU just before that CPU invokes arch_cpu_idle_dead() during its last pass through the idle loop (via a new CPU_DYING_IDLE notifier value). This change means that RCU need not deal with outgoing CPUs passing through the scheduler after informing RCU that they are no longer online. Note that removing the CPU from the rcu_node ->qsmaskinit bit masks is done at CPU_DYING_IDLE time, and orphaning callbacks is still done at CPU_DEAD time, the reason being that at CPU_DEAD time we have another CPU that can adopt them. Signed-off-by: Paul E. McKenney --- include/linux/cpu.h | 2 ++ include/linux/rcupdate.h | 2 ++ kernel/rcu/tree.c | 41 +++++++++++++++++++++++++++++++---------- kernel/sched/idle.c | 2 ++ 4 files changed, 37 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 4744ef915acd..d028721748d4 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -95,6 +95,8 @@ enum { * Called on the new cpu, just before * enabling interrupts. Must not sleep, * must not fail */ +#define CPU_DYING_IDLE 0x000B /* CPU (unsigned)v dying, reached + * idle loop. */ #define CPU_BROKEN 0x000C /* CPU (unsigned)v did not die properly, * perhaps due to preemption. */ diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 78097491cd99..762022f07afd 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -266,6 +266,8 @@ void rcu_idle_enter(void); void rcu_idle_exit(void); void rcu_irq_enter(void); void rcu_irq_exit(void); +int rcu_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu); #ifdef CONFIG_RCU_STALL_COMMON void rcu_sysrq_start(void); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 79d53399247e..d5247ed44004 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2475,6 +2475,26 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) } } +/* + * The CPU is exiting the idle loop into the arch_cpu_idle_dead() + * function. We now remove it from the rcu_node tree's ->qsmaskinit + * bit masks. + */ +static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) +{ + unsigned long flags; + unsigned long mask; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ + + /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ + mask = rdp->grpmask; + raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. 
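	 * (Aside, not in the original patch: clearing the ->qsmaskinitnext
	 *  bit below only affects grace periods that have not yet started;
	 *  a grace period already in progress handles the newly offline CPU
	 *  through force-quiescent-state processing, as noted in the
	 *  "Process offlining and onlining only at grace-period start"
	 *  commit above.)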
*/ + rnp->qsmaskinitnext &= ~mask; + raw_spin_unlock_irqrestore(&rnp->lock, flags); +} + /* * The CPU has been completely removed, and some other CPU is reporting * this fact from process context. Do the remainder of the cleanup, @@ -2485,7 +2505,6 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) { unsigned long flags; - unsigned long mask; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ @@ -2498,13 +2517,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) rcu_adopt_orphan_cbs(rsp, flags); raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); - /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ - mask = rdp->grpmask; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */ - rnp->qsmaskinitnext &= ~mask; - raw_spin_unlock_irqrestore(&rnp->lock, flags); - WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", cpu, rdp->qlen, rdp->nxtlist); @@ -2520,6 +2532,10 @@ static void __maybe_unused rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) { } +static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) +{ +} + static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) { } @@ -3733,8 +3749,8 @@ static void rcu_prepare_cpu(int cpu) /* * Handle CPU online/offline notification events. */ -static int rcu_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +int rcu_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) { long cpu = (long)hcpu; struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); @@ -3760,6 +3776,11 @@ static int rcu_cpu_notify(struct notifier_block *self, for_each_rcu_flavor(rsp) rcu_cleanup_dying_cpu(rsp); break; + case CPU_DYING_IDLE: + for_each_rcu_flavor(rsp) { + rcu_cleanup_dying_idle_cpu(cpu, rsp); + } + break; case CPU_DEAD: case CPU_DEAD_FROZEN: case CPU_UP_CANCELED: diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index e99e361ade20..b0090accfb5b 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -225,6 +225,8 @@ static void cpu_idle_loop(void) rmb(); if (cpu_is_offline(smp_processor_id())) { + rcu_cpu_notify(NULL, CPU_DYING_IDLE, + (void *)(long)smp_processor_id()); smp_mb(); /* all activity before dead. */ this_cpu_write(cpu_dead_idle, true); arch_cpu_idle_dead(); -- cgit v1.2.3 From 5c60d25fa1b22fdcf141f8006d31c32b08db7311 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 9 Feb 2015 05:37:47 -0800 Subject: rcu: Add diagnostics to grace-period cleanup At grace-period initialization time, RCU checks that all quiescent states were really reported for the previous grace period. Now that grace-period cleanup has been split out of grace-period initialization, this commit also performs those checks at grace-period cleanup time. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d5247ed44004..17b5abf999ca 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1920,6 +1920,8 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) rcu_for_each_node_breadth_first(rsp, rnp) { raw_spin_lock_irq(&rnp->lock); smp_mb__after_unlock_lock(); + WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); + WARN_ON_ONCE(rnp->qsmask); ACCESS_ONCE(rnp->completed) = rsp->gpnum; rdp = this_cpu_ptr(rsp->rda); if (rnp == rdp->mynode) -- cgit v1.2.3 From 66ee59af630fd8d5f4f56fb28162857e629aa0ab Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Feb 2015 19:56:46 +0100 Subject: fs: remove ki_nbytes There is no need to pass the total request length in the kiocb, as we already get passed in through the iov_iter argument. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/aio.c | 34 ++++++++++++++++++---------------- fs/ceph/file.c | 2 +- fs/nfs/direct.c | 2 +- fs/ocfs2/file.c | 8 +++----- fs/read_write.c | 8 -------- fs/udf/file.c | 2 +- include/linux/aio.h | 1 - kernel/printk/printk.c | 2 +- mm/page_io.c | 1 - net/socket.c | 6 +++--- 10 files changed, 28 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/fs/aio.c b/fs/aio.c index 118a2e0088d8..667054c7c067 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1344,12 +1344,13 @@ typedef ssize_t (rw_iter_op)(struct kiocb *, struct iov_iter *); static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb, int rw, char __user *buf, unsigned long *nr_segs, + size_t *len, struct iovec **iovec, bool compat) { ssize_t ret; - *nr_segs = kiocb->ki_nbytes; + *nr_segs = *len; #ifdef CONFIG_COMPAT if (compat) @@ -1364,21 +1365,22 @@ static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb, if (ret < 0) return ret; - /* ki_nbytes now reflect bytes instead of segs */ - kiocb->ki_nbytes = ret; + /* len now reflect bytes instead of segs */ + *len = ret; return 0; } static ssize_t aio_setup_single_vector(struct kiocb *kiocb, int rw, char __user *buf, unsigned long *nr_segs, + size_t len, struct iovec *iovec) { - if (unlikely(!access_ok(!rw, buf, kiocb->ki_nbytes))) + if (unlikely(!access_ok(!rw, buf, len))) return -EFAULT; iovec->iov_base = buf; - iovec->iov_len = kiocb->ki_nbytes; + iovec->iov_len = len; *nr_segs = 1; return 0; } @@ -1388,7 +1390,7 @@ static ssize_t aio_setup_single_vector(struct kiocb *kiocb, * Performs the initial checks and io submission. */ static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode, - char __user *buf, bool compat) + char __user *buf, size_t len, bool compat) { struct file *file = req->ki_filp; ssize_t ret; @@ -1423,21 +1425,21 @@ rw_common: if (!rw_op && !iter_op) return -EINVAL; - ret = (opcode == IOCB_CMD_PREADV || - opcode == IOCB_CMD_PWRITEV) - ? aio_setup_vectored_rw(req, rw, buf, &nr_segs, - &iovec, compat) - : aio_setup_single_vector(req, rw, buf, &nr_segs, - iovec); + if (opcode == IOCB_CMD_PREADV || opcode == IOCB_CMD_PWRITEV) + ret = aio_setup_vectored_rw(req, rw, buf, &nr_segs, + &len, &iovec, compat); + else + ret = aio_setup_single_vector(req, rw, buf, &nr_segs, + len, iovec); if (!ret) - ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes); + ret = rw_verify_area(rw, file, &req->ki_pos, len); if (ret < 0) { if (iovec != inline_vecs) kfree(iovec); return ret; } - req->ki_nbytes = ret; + len = ret; /* XXX: move/kill - rw_verify_area()? 
*/ /* This matches the pread()/pwrite() logic */ @@ -1450,7 +1452,7 @@ rw_common: file_start_write(file); if (iter_op) { - iov_iter_init(&iter, rw, iovec, nr_segs, req->ki_nbytes); + iov_iter_init(&iter, rw, iovec, nr_segs, len); ret = iter_op(req, &iter); } else { ret = rw_op(req, iovec, nr_segs, req->ki_pos); @@ -1553,10 +1555,10 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, req->ki_obj.user = user_iocb; req->ki_user_data = iocb->aio_data; req->ki_pos = iocb->aio_offset; - req->ki_nbytes = iocb->aio_nbytes; ret = aio_run_iocb(req, iocb->aio_lio_opcode, (char __user *)(unsigned long)iocb->aio_buf, + iocb->aio_nbytes, compat); if (ret) goto out_put_req; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 905986dd4c3c..081c4e3f9e49 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -807,7 +807,7 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *filp = iocb->ki_filp; struct ceph_file_info *fi = filp->private_data; - size_t len = iocb->ki_nbytes; + size_t len = iov_iter_count(to); struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); struct page *pinned_page = NULL; diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 7077521acdf4..27cebf164070 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -265,7 +265,7 @@ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t return -EINVAL; #else - VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE); + VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE); if (rw == READ) return nfs_file_direct_read(iocb, iter, pos); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 46e0d4e857c7..266845de2100 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2280,7 +2280,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, file->f_path.dentry->d_name.name, (unsigned int)from->nr_segs); /* GRRRRR */ - if (iocb->ki_nbytes == 0) + if (count == 0) return 0; appending = file->f_flags & O_APPEND ? 
1 : 0; @@ -2330,8 +2330,7 @@ relock: } can_do_direct = direct_io; - ret = ocfs2_prepare_inode_for_write(file, ppos, - iocb->ki_nbytes, appending, + ret = ocfs2_prepare_inode_for_write(file, ppos, count, appending, &can_do_direct, &has_refcount); if (ret < 0) { mlog_errno(ret); @@ -2339,8 +2338,7 @@ relock: } if (direct_io && !is_sync_kiocb(iocb)) - unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_nbytes, - *ppos); + unaligned_dio = ocfs2_is_io_unaligned(inode, count, *ppos); /* * We can't complete the direct I/O as requested, fall back to diff --git a/fs/read_write.c b/fs/read_write.c index 8e1b68786d66..f8b8fc1316ab 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -343,7 +343,6 @@ ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos) init_sync_kiocb(&kiocb, file); kiocb.ki_pos = *ppos; - kiocb.ki_nbytes = iov_iter_count(iter); iter->type |= READ; ret = file->f_op->read_iter(&kiocb, iter); @@ -366,7 +365,6 @@ ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos) init_sync_kiocb(&kiocb, file); kiocb.ki_pos = *ppos; - kiocb.ki_nbytes = iov_iter_count(iter); iter->type |= WRITE; ret = file->f_op->write_iter(&kiocb, iter); @@ -426,7 +424,6 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; - kiocb.ki_nbytes = len; ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); if (-EIOCBQUEUED == ret) @@ -446,7 +443,6 @@ ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *p init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; - kiocb.ki_nbytes = len; iov_iter_init(&iter, READ, &iov, 1, len); ret = filp->f_op->read_iter(&kiocb, &iter); @@ -510,7 +506,6 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; - kiocb.ki_nbytes = len; ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); if (-EIOCBQUEUED == ret) @@ -530,7 +525,6 @@ ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, lo init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; - kiocb.ki_nbytes = len; iov_iter_init(&iter, WRITE, &iov, 1, len); ret = filp->f_op->write_iter(&kiocb, &iter); @@ -719,7 +713,6 @@ static ssize_t do_iter_readv_writev(struct file *filp, int rw, const struct iove init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; - kiocb.ki_nbytes = len; iov_iter_init(&iter, rw, iov, nr_segs, len); ret = fn(&kiocb, &iter); @@ -737,7 +730,6 @@ static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; - kiocb.ki_nbytes = len; ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos); if (ret == -EIOCBQUEUED) diff --git a/fs/udf/file.c b/fs/udf/file.c index 08f3555fbeac..9c0b6da9dbb3 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -122,7 +122,7 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); int err, pos; - size_t count = iocb->ki_nbytes; + size_t count = iov_iter_count(from); struct udf_inode_info *iinfo = UDF_I(inode); mutex_lock(&inode->i_mutex); diff --git a/include/linux/aio.h b/include/linux/aio.h index d9c92daa3944..132d1ecba435 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -42,7 +42,6 @@ struct kiocb { __u64 ki_user_data; /* user's data for completion */ loff_t ki_pos; - size_t ki_nbytes; /* copy of iocb->aio_nbytes */ struct list_head ki_list; /* the aio core 
uses this * for cancellation */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index c06df7de0963..60b2aa2a2da9 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -521,7 +521,7 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) int i; int level = default_message_loglevel; int facility = 1; /* LOG_USER */ - size_t len = iocb->ki_nbytes; + size_t len = iov_iter_count(from); ssize_t ret = len; if (len > LOG_LINE_MAX) diff --git a/mm/page_io.c b/mm/page_io.c index e6045804c8d8..7ef21577856c 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -274,7 +274,6 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, iov_iter_bvec(&from, ITER_BVEC | WRITE, &bv, 1, PAGE_SIZE); init_sync_kiocb(&kiocb, swap_file); kiocb.ki_pos = page_file_offset(page); - kiocb.ki_nbytes = PAGE_SIZE; set_page_writeback(page); unlock_page(page); diff --git a/net/socket.c b/net/socket.c index bbedbfcb42c2..f92145554f34 100644 --- a/net/socket.c +++ b/net/socket.c @@ -858,11 +858,11 @@ static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to) if (iocb->ki_pos != 0) return -ESPIPE; - if (iocb->ki_nbytes == 0) /* Match SYS5 behaviour */ + if (!iov_iter_count(to)) /* Match SYS5 behaviour */ return 0; res = __sock_recvmsg(iocb, sock, &msg, - iocb->ki_nbytes, msg.msg_flags); + iov_iter_count(to), msg.msg_flags); *to = msg.msg_iter; return res; } @@ -883,7 +883,7 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from) if (sock->type == SOCK_SEQPACKET) msg.msg_flags |= MSG_EOR; - res = __sock_sendmsg(iocb, sock, &msg, iocb->ki_nbytes); + res = __sock_sendmsg(iocb, sock, &msg, iov_iter_count(from)); *from = msg.msg_iter; return res; } -- cgit v1.2.3 From 724e7bfcc566375158219c1454b4b6fc416b2c4a Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Wed, 11 Mar 2015 14:08:19 -0400 Subject: audit: Remove condition which always evaluates to false MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After commit 3e1d0bb6224f019893d1c498cc3327559d183674 ("audit: Convert int limit uses to u32"), by converting an int to u32, few conditions will always evaluate to false. These warnings were emitted during compilation: kernel/audit.c: In function ‘audit_set_enabled’: kernel/audit.c:347:2: warning: comparison of unsigned expression < 0 is always false [-Wtype-limits] if (state < AUDIT_OFF || state > AUDIT_LOCKED) ^ kernel/audit.c: In function ‘audit_receive_msg’: kernel/audit.c:880:9: warning: comparison of unsigned expression < 0 is always false [-Wtype-limits] if (s.backlog_wait_time < 0 || The following patch removes those unnecessary conditions. 
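For illustration only (this snippet is not part of the patch: the AUDIT_*
values are copied from include/uapi/linux/audit.h and the function name is
made up), the pattern gcc flags with -Wtype-limits boils down to comparing
an unsigned value against zero:

	#include <stdio.h>
	#include <stdint.h>

	#define AUDIT_OFF	0
	#define AUDIT_LOCKED	2

	/* mirrors the kernel check; the "< AUDIT_OFF" test is dead code */
	static int check_audit_state(uint32_t state)
	{
		if (state < AUDIT_OFF)		/* always false: uint32_t is never < 0 */
			return -1;
		return state > AUDIT_LOCKED ? -1 : 0;
	}

	int main(void)
	{
		printf("%d %d\n", check_audit_state(1), check_audit_state(7));
		return 0;	/* build with -Wtype-limits to see the warning */
	}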
Signed-off-by: Pranith Kumar Signed-off-by: Paul Moore --- kernel/audit.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 52ee8eee0e07..d5a1220c8620 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -346,7 +346,7 @@ static int audit_set_backlog_wait_time(u32 timeout) static int audit_set_enabled(u32 state) { int rc; - if (state < AUDIT_OFF || state > AUDIT_LOCKED) + if (state > AUDIT_LOCKED) return -EINVAL; rc = audit_do_config_change("audit_enabled", &audit_enabled, state); @@ -888,8 +888,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (s.mask & AUDIT_STATUS_BACKLOG_WAIT_TIME) { if (sizeof(s) > (size_t)nlh->nlmsg_len) return -EINVAL; - if (s.backlog_wait_time < 0 || - s.backlog_wait_time > 10*AUDIT_BACKLOG_WAIT_TIME) + if (s.backlog_wait_time > 10*AUDIT_BACKLOG_WAIT_TIME) return -EINVAL; err = audit_set_backlog_wait_time(s.backlog_wait_time); if (err < 0) -- cgit v1.2.3 From 03e69b508b6f7c51743055c9f61d1dfeadf4b635 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 14 Mar 2015 02:27:16 +0100 Subject: ebpf: add prandom helper for packet sampling This work is similar to commit 4cd3675ebf74 ("filter: added BPF random opcode") and adds a possibility for packet sampling in eBPF. Currently, this is only possible in classic BPF and useful to combine sampling with f.e. packet sockets, possible also with tc. Example function proto-type looks like: u32 (*prandom_u32)(void) = (void *)BPF_FUNC_get_prandom_u32; Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf.h | 2 ++ include/uapi/linux/bpf.h | 1 + kernel/bpf/core.c | 2 ++ kernel/bpf/helpers.c | 12 ++++++++++++ net/core/filter.c | 2 ++ 5 files changed, 19 insertions(+) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 80f2e0fc3d02..50bf95e29a96 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -154,4 +154,6 @@ extern const struct bpf_func_proto bpf_map_lookup_elem_proto; extern const struct bpf_func_proto bpf_map_update_elem_proto; extern const struct bpf_func_proto bpf_map_delete_elem_proto; +extern const struct bpf_func_proto bpf_get_prandom_u32_proto; + #endif /* _LINUX_BPF_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3fa1af8a58d7..1c2ca2b477c8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -165,6 +165,7 @@ enum bpf_func_id { BPF_FUNC_map_lookup_elem, /* void *map_lookup_elem(&map, &key) */ BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */ BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */ + BPF_FUNC_get_prandom_u32, /* u32 prandom_u32(void) */ __BPF_FUNC_MAX_ID, }; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 50603aec766a..c1dbbb5d289b 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -661,6 +661,8 @@ const struct bpf_func_proto bpf_map_lookup_elem_proto __weak; const struct bpf_func_proto bpf_map_update_elem_proto __weak; const struct bpf_func_proto bpf_map_delete_elem_proto __weak; +const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; + /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call * skb_copy_bits(), so provide a weak definition of it for NET-less config. 
*/ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a3c7701a8b5e..95eb59a045ea 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -11,6 +11,7 @@ */ #include #include +#include /* If kernel subsystem is allowing eBPF programs to call this function, * inside its own verifier_ops->get_func_proto() callback it should return @@ -87,3 +88,14 @@ const struct bpf_func_proto bpf_map_delete_elem_proto = { .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, }; + +static u64 bpf_get_prandom_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + return prandom_u32(); +} + +const struct bpf_func_proto bpf_get_prandom_u32_proto = { + .func = bpf_get_prandom_u32, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; diff --git a/net/core/filter.c b/net/core/filter.c index 7a4eb7030dba..4344db39af2e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1139,6 +1139,8 @@ sk_filter_func_proto(enum bpf_func_id func_id) return &bpf_map_update_elem_proto; case BPF_FUNC_map_delete_elem: return &bpf_map_delete_elem_proto; + case BPF_FUNC_get_prandom_u32: + return &bpf_get_prandom_u32_proto; default: return NULL; } -- cgit v1.2.3 From c04167ce2ca0ecaeaafef006cb0d65cf01b68e42 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 14 Mar 2015 02:27:17 +0100 Subject: ebpf: add helper for obtaining current processor id This patch adds the possibility to obtain raw_smp_processor_id() in eBPF. Currently, this is only possible in classic BPF where commit da2033c28226 ("filter: add SKF_AD_RXHASH and SKF_AD_CPU") has added facilities for this. Perhaps most importantly, this would also allow us to track per CPU statistics with eBPF maps, or to implement a poor-man's per CPU data structure through eBPF maps. Example function proto-type looks like: u32 (*smp_processor_id)(void) = (void *)BPF_FUNC_get_smp_processor_id; Signed-off-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 1 + kernel/bpf/core.c | 1 + kernel/bpf/helpers.c | 12 ++++++++++++ net/core/filter.c | 2 ++ 5 files changed, 17 insertions(+) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 50bf95e29a96..30bfd331882a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -155,5 +155,6 @@ extern const struct bpf_func_proto bpf_map_update_elem_proto; extern const struct bpf_func_proto bpf_map_delete_elem_proto; extern const struct bpf_func_proto bpf_get_prandom_u32_proto; +extern const struct bpf_func_proto bpf_get_smp_processor_id_proto; #endif /* _LINUX_BPF_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1c2ca2b477c8..de1f63668daf 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -166,6 +166,7 @@ enum bpf_func_id { BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */ BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */ BPF_FUNC_get_prandom_u32, /* u32 prandom_u32(void) */ + BPF_FUNC_get_smp_processor_id, /* u32 raw_smp_processor_id(void) */ __BPF_FUNC_MAX_ID, }; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c1dbbb5d289b..4139a0f8b558 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -662,6 +662,7 @@ const struct bpf_func_proto bpf_map_update_elem_proto __weak; const struct bpf_func_proto bpf_map_delete_elem_proto __weak; const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; +const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call * skb_copy_bits(), so provide a weak definition of it for NET-less config. diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 95eb59a045ea..bd7f5988ed9c 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -12,6 +12,7 @@ #include #include #include +#include /* If kernel subsystem is allowing eBPF programs to call this function, * inside its own verifier_ops->get_func_proto() callback it should return @@ -99,3 +100,14 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto = { .gpl_only = false, .ret_type = RET_INTEGER, }; + +static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + return raw_smp_processor_id(); +} + +const struct bpf_func_proto bpf_get_smp_processor_id_proto = { + .func = bpf_get_smp_processor_id, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; diff --git a/net/core/filter.c b/net/core/filter.c index 4344db39af2e..33310eee6134 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1141,6 +1141,8 @@ sk_filter_func_proto(enum bpf_func_id func_id) return &bpf_map_delete_elem_proto; case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_smp_processor_id_proto; default: return NULL; } -- cgit v1.2.3 From 9bac3d6d548e5cc925570b263f35b70a00a00ffd Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 13 Mar 2015 11:57:42 -0700 Subject: bpf: allow extended BPF programs access skb fields introduce user accessible mirror of in-kernel 'struct sk_buff': struct __sk_buff { __u32 len; __u32 pkt_type; __u32 mark; __u32 queue_mapping; }; bpf programs can do: int bpf_prog(struct __sk_buff *skb) { __u32 var = skb->pkt_type; which will be compiled to bpf assembler as: dst_reg = *(u32 *)(src_reg + 4) // 4 == offsetof(struct __sk_buff, pkt_type) bpf verifier will check validity of access and will convert it to: dst_reg = *(u8 *)(src_reg + offsetof(struct sk_buff, 
__pkt_type_offset)) dst_reg &= 7 since skb->pkt_type is a bitfield. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 5 +- include/uapi/linux/bpf.h | 10 ++++ kernel/bpf/syscall.c | 2 +- kernel/bpf/verifier.c | 152 ++++++++++++++++++++++++++++++++++++++++++----- net/core/filter.c | 100 +++++++++++++++++++++++++------ 5 files changed, 234 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 30bfd331882a..280a315de8d6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -103,6 +103,9 @@ struct bpf_verifier_ops { * with 'type' (read or write) is allowed */ bool (*is_valid_access)(int off, int size, enum bpf_access_type type); + + u32 (*convert_ctx_access)(int dst_reg, int src_reg, int ctx_off, + struct bpf_insn *insn); }; struct bpf_prog_type_list { @@ -133,7 +136,7 @@ struct bpf_map *bpf_map_get(struct fd f); void bpf_map_put(struct bpf_map *map); /* verify correctness of eBPF program */ -int bpf_check(struct bpf_prog *fp, union bpf_attr *attr); +int bpf_check(struct bpf_prog **fp, union bpf_attr *attr); #else static inline void bpf_register_prog_type(struct bpf_prog_type_list *tl) { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index de1f63668daf..929545a27546 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -170,4 +170,14 @@ enum bpf_func_id { __BPF_FUNC_MAX_ID, }; +/* user accessible mirror of in-kernel sk_buff. + * new fields can only be added to the end of this structure + */ +struct __sk_buff { + __u32 len; + __u32 pkt_type; + __u32 mark; + __u32 queue_mapping; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 669719ccc9ee..ea75c654af1b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -519,7 +519,7 @@ static int bpf_prog_load(union bpf_attr *attr) goto free_prog; /* run eBPF verifier */ - err = bpf_check(prog, attr); + err = bpf_check(&prog, attr); if (err < 0) goto free_used_maps; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e6b522496250..c22ebd36fa4b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1620,11 +1620,10 @@ static int do_check(struct verifier_env *env) return err; } else if (class == BPF_LDX) { - if (BPF_MODE(insn->code) != BPF_MEM || - insn->imm != 0) { - verbose("BPF_LDX uses reserved fields\n"); - return -EINVAL; - } + enum bpf_reg_type src_reg_type; + + /* check for reserved fields is already done */ + /* check src operand */ err = check_reg_arg(regs, insn->src_reg, SRC_OP); if (err) @@ -1643,6 +1642,29 @@ static int do_check(struct verifier_env *env) if (err) return err; + src_reg_type = regs[insn->src_reg].type; + + if (insn->imm == 0 && BPF_SIZE(insn->code) == BPF_W) { + /* saw a valid insn + * dst_reg = *(u32 *)(src_reg + off) + * use reserved 'imm' field to mark this insn + */ + insn->imm = src_reg_type; + + } else if (src_reg_type != insn->imm && + (src_reg_type == PTR_TO_CTX || + insn->imm == PTR_TO_CTX)) { + /* ABuser program is trying to use the same insn + * dst_reg = *(u32*) (src_reg + off) + * with different pointer types: + * src_reg == ctx in one branch and + * src_reg == stack|map in some other branch. + * Reject it. 
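	 * (Illustrative sketch, not part of this patch: with r1 = ctx on
	 *  entry, a program that sets r2 = r1 on one branch, and
	 *  r2 = r10; r2 += -8 (an already-written stack slot) on the
	 *  other, and then does r0 = *(u32 *)(r2 + 0) at the join point,
	 *  reaches this single load with src_reg == ctx on one verified
	 *  path and src_reg == stack on the other, and is rejected here.)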
+ */ + verbose("same insn cannot be used with different pointers\n"); + return -EINVAL; + } + } else if (class == BPF_STX) { if (BPF_MODE(insn->code) == BPF_XADD) { err = check_xadd(env, insn); @@ -1790,6 +1812,13 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) int i, j; for (i = 0; i < insn_cnt; i++, insn++) { + if (BPF_CLASS(insn->code) == BPF_LDX && + (BPF_MODE(insn->code) != BPF_MEM || + insn->imm != 0)) { + verbose("BPF_LDX uses reserved fields\n"); + return -EINVAL; + } + if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { struct bpf_map *map; struct fd f; @@ -1881,6 +1910,92 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env) insn->src_reg = 0; } +static void adjust_branches(struct bpf_prog *prog, int pos, int delta) +{ + struct bpf_insn *insn = prog->insnsi; + int insn_cnt = prog->len; + int i; + + for (i = 0; i < insn_cnt; i++, insn++) { + if (BPF_CLASS(insn->code) != BPF_JMP || + BPF_OP(insn->code) == BPF_CALL || + BPF_OP(insn->code) == BPF_EXIT) + continue; + + /* adjust offset of jmps if necessary */ + if (i < pos && i + insn->off + 1 > pos) + insn->off += delta; + else if (i > pos && i + insn->off + 1 < pos) + insn->off -= delta; + } +} + +/* convert load instructions that access fields of 'struct __sk_buff' + * into sequence of instructions that access fields of 'struct sk_buff' + */ +static int convert_ctx_accesses(struct verifier_env *env) +{ + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + struct bpf_insn insn_buf[16]; + struct bpf_prog *new_prog; + u32 cnt; + int i; + + if (!env->prog->aux->ops->convert_ctx_access) + return 0; + + for (i = 0; i < insn_cnt; i++, insn++) { + if (insn->code != (BPF_LDX | BPF_MEM | BPF_W)) + continue; + + if (insn->imm != PTR_TO_CTX) { + /* clear internal mark */ + insn->imm = 0; + continue; + } + + cnt = env->prog->aux->ops-> + convert_ctx_access(insn->dst_reg, insn->src_reg, + insn->off, insn_buf); + if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { + verbose("bpf verifier is misconfigured\n"); + return -EINVAL; + } + + if (cnt == 1) { + memcpy(insn, insn_buf, sizeof(*insn)); + continue; + } + + /* several new insns need to be inserted. 
Make room for them */ + insn_cnt += cnt - 1; + new_prog = bpf_prog_realloc(env->prog, + bpf_prog_size(insn_cnt), + GFP_USER); + if (!new_prog) + return -ENOMEM; + + new_prog->len = insn_cnt; + + memmove(new_prog->insnsi + i + cnt, new_prog->insns + i + 1, + sizeof(*insn) * (insn_cnt - i - cnt)); + + /* copy substitute insns in place of load instruction */ + memcpy(new_prog->insnsi + i, insn_buf, sizeof(*insn) * cnt); + + /* adjust branches in the whole program */ + adjust_branches(new_prog, i, cnt - 1); + + /* keep walking new program and skip insns we just inserted */ + env->prog = new_prog; + insn = new_prog->insnsi + i + cnt - 1; + i += cnt - 1; + } + + return 0; +} + static void free_states(struct verifier_env *env) { struct verifier_state_list *sl, *sln; @@ -1903,13 +2018,13 @@ static void free_states(struct verifier_env *env) kfree(env->explored_states); } -int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) +int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) { char __user *log_ubuf = NULL; struct verifier_env *env; int ret = -EINVAL; - if (prog->len <= 0 || prog->len > BPF_MAXINSNS) + if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS) return -E2BIG; /* 'struct verifier_env' can be global, but since it's not small, @@ -1919,7 +2034,7 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) if (!env) return -ENOMEM; - env->prog = prog; + env->prog = *prog; /* grab the mutex to protect few globals used by verifier */ mutex_lock(&bpf_verifier_lock); @@ -1951,7 +2066,7 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) if (ret < 0) goto skip_full_check; - env->explored_states = kcalloc(prog->len, + env->explored_states = kcalloc(env->prog->len, sizeof(struct verifier_state_list *), GFP_USER); ret = -ENOMEM; @@ -1968,6 +2083,10 @@ skip_full_check: while (pop_stack(env, NULL) >= 0); free_states(env); + if (ret == 0) + /* program is valid, convert *(u32*)(ctx + off) accesses */ + ret = convert_ctx_accesses(env); + if (log_level && log_len >= log_size - 1) { BUG_ON(log_len >= log_size); /* verifier log exceeded user supplied buffer */ @@ -1983,18 +2102,18 @@ skip_full_check: if (ret == 0 && env->used_map_cnt) { /* if program passed verifier, update used_maps in bpf_prog_info */ - prog->aux->used_maps = kmalloc_array(env->used_map_cnt, - sizeof(env->used_maps[0]), - GFP_KERNEL); + env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt, + sizeof(env->used_maps[0]), + GFP_KERNEL); - if (!prog->aux->used_maps) { + if (!env->prog->aux->used_maps) { ret = -ENOMEM; goto free_log_buf; } - memcpy(prog->aux->used_maps, env->used_maps, + memcpy(env->prog->aux->used_maps, env->used_maps, sizeof(env->used_maps[0]) * env->used_map_cnt); - prog->aux->used_map_cnt = env->used_map_cnt; + env->prog->aux->used_map_cnt = env->used_map_cnt; /* program is valid. Convert pseudo bpf_ld_imm64 into generic * bpf_ld_imm64 instructions @@ -2006,11 +2125,12 @@ free_log_buf: if (log_level) vfree(log_buf); free_env: - if (!prog->aux->used_maps) + if (!env->prog->aux->used_maps) /* if we didn't copy map pointers into bpf_prog_info, release * them now. Otherwise free_bpf_prog_info() will release them. 
*/ release_maps(env); + *prog = env->prog; kfree(env); mutex_unlock(&bpf_verifier_lock); return ret; diff --git a/net/core/filter.c b/net/core/filter.c index 33310eee6134..4e9dd0ad0d5b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -150,10 +150,43 @@ static u64 __get_random_u32(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) return prandom_u32(); } +static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, + struct bpf_insn *insn_buf) +{ + struct bpf_insn *insn = insn_buf; + + switch (skb_field) { + case SKF_AD_MARK: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); + + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sk_buff, mark)); + break; + + case SKF_AD_PKTTYPE: + *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET()); + *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX); +#ifdef __BIG_ENDIAN_BITFIELD + *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5); +#endif + break; + + case SKF_AD_QUEUE: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2); + + *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + offsetof(struct sk_buff, queue_mapping)); + break; + } + + return insn - insn_buf; +} + static bool convert_bpf_extensions(struct sock_filter *fp, struct bpf_insn **insnp) { struct bpf_insn *insn = *insnp; + u32 cnt; switch (fp->k) { case SKF_AD_OFF + SKF_AD_PROTOCOL: @@ -167,13 +200,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, break; case SKF_AD_OFF + SKF_AD_PKTTYPE: - *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_A, BPF_REG_CTX, - PKT_TYPE_OFFSET()); - *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, PKT_TYPE_MAX); -#ifdef __BIG_ENDIAN_BITFIELD - insn++; - *insn = BPF_ALU32_IMM(BPF_RSH, BPF_REG_A, 5); -#endif + cnt = convert_skb_access(SKF_AD_PKTTYPE, BPF_REG_A, BPF_REG_CTX, insn); + insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_IFINDEX: @@ -197,10 +225,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, break; case SKF_AD_OFF + SKF_AD_MARK: - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); - - *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, - offsetof(struct sk_buff, mark)); + cnt = convert_skb_access(SKF_AD_MARK, BPF_REG_A, BPF_REG_CTX, insn); + insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_RXHASH: @@ -211,10 +237,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, break; case SKF_AD_OFF + SKF_AD_QUEUE: - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2); - - *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, - offsetof(struct sk_buff, queue_mapping)); + cnt = convert_skb_access(SKF_AD_QUEUE, BPF_REG_A, BPF_REG_CTX, insn); + insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_VLAN_TAG: @@ -1151,13 +1175,55 @@ sk_filter_func_proto(enum bpf_func_id func_id) static bool sk_filter_is_valid_access(int off, int size, enum bpf_access_type type) { - /* skb fields cannot be accessed yet */ - return false; + /* only read is allowed */ + if (type != BPF_READ) + return false; + + /* check bounds */ + if (off < 0 || off >= sizeof(struct __sk_buff)) + return false; + + /* disallow misaligned access */ + if (off % size != 0) + return false; + + /* all __sk_buff fields are __u32 */ + if (size != 4) + return false; + + return true; +} + +static u32 sk_filter_convert_ctx_access(int dst_reg, int src_reg, int ctx_off, + struct bpf_insn *insn_buf) +{ + struct bpf_insn *insn = insn_buf; + + switch (ctx_off) { + case offsetof(struct __sk_buff, len): + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4); + + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sk_buff, len)); + break; 
+ + case offsetof(struct __sk_buff, mark): + return convert_skb_access(SKF_AD_MARK, dst_reg, src_reg, insn); + + case offsetof(struct __sk_buff, pkt_type): + return convert_skb_access(SKF_AD_PKTTYPE, dst_reg, src_reg, insn); + + case offsetof(struct __sk_buff, queue_mapping): + return convert_skb_access(SKF_AD_QUEUE, dst_reg, src_reg, insn); + } + + return insn - insn_buf; } static const struct bpf_verifier_ops sk_filter_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, + .convert_ctx_access = sk_filter_convert_ctx_access, }; static struct bpf_prog_type_list sk_filter_type __read_mostly = { -- cgit v1.2.3 From 431d452af13720463dda498999b2e9a08729c03a Mon Sep 17 00:00:00 2001 From: Zhonghui Fu Date: Wed, 18 Mar 2015 15:54:27 +0100 Subject: PM / sleep: add pm-trace support for suspending phase Occasionally, the system can't come back up after suspend/resume due to problems of device suspending phase. This patch make PM_TRACE infrastructure cover device suspending phase of suspend/resume process, and the information in RTC can tell developers which device suspending function make system hang. Signed-off-by: Zhonghui Fu Signed-off-by: Rafael J. Wysocki --- arch/x86/include/asm/pm-trace.h | 23 +++++++++++++++++++++++ arch/x86/include/asm/resume-trace.h | 21 --------------------- drivers/base/power/main.c | 20 ++++++++++++++++---- drivers/base/power/trace.c | 6 +++--- include/linux/pm-trace.h | 35 +++++++++++++++++++++++++++++++++++ include/linux/resume-trace.h | 34 ---------------------------------- kernel/power/main.c | 2 +- 7 files changed, 78 insertions(+), 63 deletions(-) create mode 100644 arch/x86/include/asm/pm-trace.h delete mode 100644 arch/x86/include/asm/resume-trace.h create mode 100644 include/linux/pm-trace.h delete mode 100644 include/linux/resume-trace.h (limited to 'kernel') diff --git a/arch/x86/include/asm/pm-trace.h b/arch/x86/include/asm/pm-trace.h new file mode 100644 index 000000000000..7b7ac42c3661 --- /dev/null +++ b/arch/x86/include/asm/pm-trace.h @@ -0,0 +1,23 @@ +#ifndef _ASM_X86_PM_TRACE_H +#define _ASM_X86_PM_TRACE_H + +#include + +#define TRACE_RESUME(user) \ +do { \ + if (pm_trace_enabled) { \ + const void *tracedata; \ + asm volatile(_ASM_MOV " $1f,%0\n" \ + ".section .tracedata,\"a\"\n" \ + "1:\t.word %c1\n\t" \ + _ASM_PTR " %c2\n" \ + ".previous" \ + :"=r" (tracedata) \ + : "i" (__LINE__), "i" (__FILE__)); \ + generate_pm_trace(tracedata, user); \ + } \ +} while (0) + +#define TRACE_SUSPEND(user) TRACE_RESUME(user) + +#endif /* _ASM_X86_PM_TRACE_H */ diff --git a/arch/x86/include/asm/resume-trace.h b/arch/x86/include/asm/resume-trace.h deleted file mode 100644 index 3ff1c2cb1da5..000000000000 --- a/arch/x86/include/asm/resume-trace.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef _ASM_X86_RESUME_TRACE_H -#define _ASM_X86_RESUME_TRACE_H - -#include - -#define TRACE_RESUME(user) \ -do { \ - if (pm_trace_enabled) { \ - const void *tracedata; \ - asm volatile(_ASM_MOV " $1f,%0\n" \ - ".section .tracedata,\"a\"\n" \ - "1:\t.word %c1\n\t" \ - _ASM_PTR " %c2\n" \ - ".previous" \ - :"=r" (tracedata) \ - : "i" (__LINE__), "i" (__FILE__)); \ - generate_resume_trace(tracedata, user); \ - } \ -} while (0) - -#endif /* _ASM_X86_RESUME_TRACE_H */ diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 9717d5f20139..3d874eca7104 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include @@ -1017,6 +1017,9 @@ static int 
__device_suspend_noirq(struct device *dev, pm_message_t state, bool a char *info = NULL; int error = 0; + TRACE_DEVICE(dev); + TRACE_SUSPEND(0); + if (async_error) goto Complete; @@ -1057,6 +1060,7 @@ static int __device_suspend_noirq(struct device *dev, pm_message_t state, bool a Complete: complete_all(&dev->power.completion); + TRACE_SUSPEND(error); return error; } @@ -1078,7 +1082,7 @@ static int device_suspend_noirq(struct device *dev) { reinit_completion(&dev->power.completion); - if (pm_async_enabled && dev->power.async_suspend) { + if (is_async(dev)) { get_device(dev); async_schedule(async_suspend_noirq, dev); return 0; @@ -1157,6 +1161,9 @@ static int __device_suspend_late(struct device *dev, pm_message_t state, bool as char *info = NULL; int error = 0; + TRACE_DEVICE(dev); + TRACE_SUSPEND(0); + __pm_runtime_disable(dev, false); if (async_error) @@ -1198,6 +1205,7 @@ static int __device_suspend_late(struct device *dev, pm_message_t state, bool as async_error = error; Complete: + TRACE_SUSPEND(error); complete_all(&dev->power.completion); return error; } @@ -1219,7 +1227,7 @@ static int device_suspend_late(struct device *dev) { reinit_completion(&dev->power.completion); - if (pm_async_enabled && dev->power.async_suspend) { + if (is_async(dev)) { get_device(dev); async_schedule(async_suspend_late, dev); return 0; @@ -1338,6 +1346,9 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async) int error = 0; DECLARE_DPM_WATCHDOG_ON_STACK(wd); + TRACE_DEVICE(dev); + TRACE_SUSPEND(0); + dpm_wait_for_children(dev, async); if (async_error) @@ -1444,6 +1455,7 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async) if (error) async_error = error; + TRACE_SUSPEND(error); return error; } @@ -1465,7 +1477,7 @@ static int device_suspend(struct device *dev) { reinit_completion(&dev->power.completion); - if (pm_async_enabled && dev->power.async_suspend) { + if (is_async(dev)) { get_device(dev); async_schedule(async_suspend, dev); return 0; diff --git a/drivers/base/power/trace.c b/drivers/base/power/trace.c index d94a1f5121cf..a311cfa4c5bd 100644 --- a/drivers/base/power/trace.c +++ b/drivers/base/power/trace.c @@ -7,7 +7,7 @@ * devices may be working. */ -#include +#include #include #include @@ -154,7 +154,7 @@ EXPORT_SYMBOL(set_trace_device); * it's not any guarantee, but it's a high _likelihood_ that * the match is valid). 
*/ -void generate_resume_trace(const void *tracedata, unsigned int user) +void generate_pm_trace(const void *tracedata, unsigned int user) { unsigned short lineno = *(unsigned short *)tracedata; const char *file = *(const char **)(tracedata + 2); @@ -164,7 +164,7 @@ void generate_resume_trace(const void *tracedata, unsigned int user) file_hash_value = hash_string(lineno, file, FILEHASH); set_magic_time(user_hash_value, file_hash_value, dev_hash_value); } -EXPORT_SYMBOL(generate_resume_trace); +EXPORT_SYMBOL(generate_pm_trace); extern char __tracedata_start, __tracedata_end; static int show_file_hash(unsigned int value) diff --git a/include/linux/pm-trace.h b/include/linux/pm-trace.h new file mode 100644 index 000000000000..ecbde7a5548e --- /dev/null +++ b/include/linux/pm-trace.h @@ -0,0 +1,35 @@ +#ifndef PM_TRACE_H +#define PM_TRACE_H + +#ifdef CONFIG_PM_TRACE +#include +#include + +extern int pm_trace_enabled; + +static inline int pm_trace_is_enabled(void) +{ + return pm_trace_enabled; +} + +struct device; +extern void set_trace_device(struct device *); +extern void generate_pm_trace(const void *tracedata, unsigned int user); +extern int show_trace_dev_match(char *buf, size_t size); + +#define TRACE_DEVICE(dev) do { \ + if (pm_trace_enabled) \ + set_trace_device(dev); \ + } while(0) + +#else + +static inline int pm_trace_is_enabled(void) { return 0; } + +#define TRACE_DEVICE(dev) do { } while (0) +#define TRACE_RESUME(dev) do { } while (0) +#define TRACE_SUSPEND(dev) do { } while (0) + +#endif + +#endif diff --git a/include/linux/resume-trace.h b/include/linux/resume-trace.h deleted file mode 100644 index f31db2368782..000000000000 --- a/include/linux/resume-trace.h +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef RESUME_TRACE_H -#define RESUME_TRACE_H - -#ifdef CONFIG_PM_TRACE -#include -#include - -extern int pm_trace_enabled; - -static inline int pm_trace_is_enabled(void) -{ - return pm_trace_enabled; -} - -struct device; -extern void set_trace_device(struct device *); -extern void generate_resume_trace(const void *tracedata, unsigned int user); -extern int show_trace_dev_match(char *buf, size_t size); - -#define TRACE_DEVICE(dev) do { \ - if (pm_trace_enabled) \ - set_trace_device(dev); \ - } while(0) - -#else - -static inline int pm_trace_is_enabled(void) { return 0; } - -#define TRACE_DEVICE(dev) do { } while (0) -#define TRACE_RESUME(dev) do { } while (0) - -#endif - -#endif diff --git a/kernel/power/main.c b/kernel/power/main.c index 9a59d042ea84..86e8157a450f 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include #include #include -- cgit v1.2.3 From a77da14ce9afb338040b405f6ab8afddc310411d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 8 Mar 2015 14:52:27 -0700 Subject: rcu: Yet another fix for preemption and CPU hotplug As noted earlier, the following sequence of events can occur when running PREEMPT_RCU and HOTPLUG_CPU on a system with a multi-level rcu_node combining tree: 1. A group of tasks block on CPUs corresponding to a given leaf rcu_node structure while within RCU read-side critical sections. 2. All CPUs corrsponding to that rcu_node structure go offline. 3. The next grace period starts, but because there are still tasks blocked, the upper-level bits corresponding to this leaf rcu_node structure remain set. 4. All the tasks exit their RCU read-side critical sections and remove themselves from the leaf rcu_node structure's list, leaving it empty. 5. 
But because there now is code to check for this condition at force-quiescent-state time, the upper bits are cleared and the grace period completes. However, there is another complication that can occur following step 4 above: 4a. The grace period starts, and the leaf rcu_node structure's gp_tasks pointer is set to NULL because there are no tasks blocked on this structure. 4b. One of the CPUs corresponding to the leaf rcu_node structure comes back online. 4b. An endless stream of tasks are preempted within RCU read-side critical sections on this CPU, such that the ->blkd_tasks list is always non-empty. The grace period will never end. This commit therefore makes the force-quiescent-state processing check only for absence of tasks blocking the current grace period rather than absence of tasks altogether. This will cause a quiescent state to be reported if the current leaf rcu_node structure is not blocking the current grace period and its parent thinks that it is, regardless of how RCU managed to get itself into this state. Signed-off-by: Paul E. McKenney Cc: # 4.0.x Tested-by: Sasha Levin --- kernel/rcu/tree.c | 43 +++++++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 17b5abf999ca..b3684b284677 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2199,8 +2199,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, unsigned long mask; struct rcu_node *rnp_p; - WARN_ON_ONCE(rsp == &rcu_bh_state || rsp == &rcu_sched_state); - if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { + if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || + rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); return; /* Still need more quiescent states! */ } @@ -2208,9 +2208,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, rnp_p = rnp->parent; if (rnp_p == NULL) { /* - * Either there is only one rcu_node in the tree, - * or tasks were kicked up to root rcu_node due to - * CPUs going offline. + * Only one rcu_node structure in the tree, so don't + * try to report up to its nonexistent parent! */ rcu_report_qs_rsp(rsp, flags); return; @@ -2713,8 +2712,29 @@ static void force_qs_rnp(struct rcu_state *rsp, return; } if (rnp->qsmask == 0) { - rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ - continue; + if (rcu_state_p == &rcu_sched_state || + rsp != rcu_state_p || + rcu_preempt_blocked_readers_cgp(rnp)) { + /* + * No point in scanning bits because they + * are all zero. But we might need to + * priority-boost blocked readers. + */ + rcu_initiate_boost(rnp, flags); + /* rcu_initiate_boost() releases rnp->lock */ + continue; + } + if (rnp->parent && + (rnp->parent->qsmask & rnp->grpmask)) { + /* + * Race between grace-period + * initialization and task exiting RCU + * read-side critical section: Report. + */ + rcu_report_unblock_qs_rnp(rsp, rnp, flags); + /* rcu_report_unblock_qs_rnp() rlses ->lock */ + continue; + } } cpu = rnp->grplo; bit = 1; @@ -2729,15 +2749,6 @@ static void force_qs_rnp(struct rcu_state *rsp, if (mask != 0) { /* Idle/offline CPUs, report. */ rcu_report_qs_rnp(mask, rsp, rnp, flags); - } else if (rnp->parent && - list_empty(&rnp->blkd_tasks) && - !rnp->qsmask && - (rnp->parent->qsmask & rnp->grpmask)) { - /* - * Race between grace-period initialization and task - * existing RCU read-side critical section, report. 
- */ - rcu_report_unblock_qs_rnp(rsp, rnp, flags); } else { /* Nothing to do here, so just drop the lock. */ raw_spin_unlock_irqrestore(&rnp->lock, flags); -- cgit v1.2.3 From 654e953340491e498871321d7e2c9b0a12821933 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 15 Mar 2015 09:19:35 -0700 Subject: rcu: Associate quiescent-state reports with grace period As noted in earlier commit logs, CPU hotplug operations running concurrently with grace-period initialization can result in a given leaf rcu_node structure having all CPUs offline and no blocked readers, but with this rcu_node structure nevertheless blocking the current grace period. Therefore, the quiescent-state forcing code now checks for this situation and repairs it. Unfortunately, this checking can result in false positives, for example, when the last task has just removed itself from this leaf rcu_node structure, but has not yet started clearing the ->qsmask bits further up the structure. This means that the grace-period kthread (which forces quiescent states) and some other task might be attempting to concurrently clear these ->qsmask bits. This is usually not a problem: One of these tasks will be the first to acquire the upper-level rcu_node structure's lock and with therefore clear the bit, and the other task, seeing the bit already cleared, will stop trying to clear bits. Sadly, this means that the following unusual sequence of events -can- result in a problem: 1. The grace-period kthread wins, and clears the ->qsmask bits. 2. This is the last thing blocking the current grace period, so that the grace-period kthread clears ->qsmask bits all the way to the root and finds that the root ->qsmask field is now zero. 3. Another grace period is required, so that the grace period kthread initializes it, including setting all the needed qsmask bits. 4. The leaf rcu_node structure (the one that started this whole mess) is blocking this new grace period, either because it has at least one online CPU or because there is at least one task that had blocked within an RCU read-side critical section while running on one of this leaf rcu_node structure's CPUs. (And yes, that CPU might well have gone offline before the grace period in step (3) above started, which can mean that there is a task on the leaf rcu_node structure's ->blkd_tasks list, but ->qsmask equal to zero.) 5. The other kthread didn't get around to trying to clear the upper level ->qsmask bits until all the above had happened. This means that it now sees bits set in the upper-level ->qsmask field, so it proceeds to clear them. Too bad that it is doing so on behalf of a quiescent state that does not apply to the current grace period! This sequence of events can result in the new grace period being too short. It can also result in the new grace period ending before the leaf rcu_node structure's ->qsmask bits have been cleared, which will result in splats during initialization of the next grace period. In addition, it can result in tasks blocking the new grace period still being queued at the start of the next grace period, which will result in other splats. Sasha's testing turned up another of these splats, as did rcutorture testing. (And yes, rcutorture is being adjusted to make these splats show up more quickly. Which probably is having the undesirable side effect of making other problems show up less quickly. Can't have everything!) Reported-by: Sasha Levin Signed-off-by: Paul E. 
McKenney Cc: # 4.0.x Tested-by: Sasha Levin --- kernel/rcu/tree.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b3684b284677..8fcc64ed858c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2132,25 +2132,32 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) * Similar to rcu_report_qs_rdp(), for which it is a helper function. * Allows quiescent states for a group of CPUs to be reported at one go * to the specified rcu_node structure, though all the CPUs in the group - * must be represented by the same rcu_node structure (which need not be - * a leaf rcu_node structure, though it often will be). That structure's - * lock must be held upon entry, and it is released before return. + * must be represented by the same rcu_node structure (which need not be a + * leaf rcu_node structure, though it often will be). The gps parameter + * is the grace-period snapshot, which means that the quiescent states + * are valid only if rnp->gpnum is equal to gps. That structure's lock + * must be held upon entry, and it is released before return. */ static void rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, - struct rcu_node *rnp, unsigned long flags) + struct rcu_node *rnp, unsigned long gps, unsigned long flags) __releases(rnp->lock) { + unsigned long oldmask = 0; struct rcu_node *rnp_c; /* Walk up the rcu_node hierarchy. */ for (;;) { - if (!(rnp->qsmask & mask)) { + if (!(rnp->qsmask & mask) || rnp->gpnum != gps) { - /* Our bit has already been cleared, so done. */ + /* + * Our bit has already been cleared, or the + * relevant grace period is already over, so done. + */ raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } + WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */ rnp->qsmask &= ~mask; trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum, mask, rnp->qsmask, rnp->level, @@ -2174,7 +2181,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, rnp = rnp->parent; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); - WARN_ON_ONCE(rnp_c->qsmask); + oldmask = rnp_c->qsmask; } /* @@ -2196,6 +2203,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, struct rcu_node *rnp, unsigned long flags) __releases(rnp->lock) { + unsigned long gps; unsigned long mask; struct rcu_node *rnp_p; @@ -2215,12 +2223,13 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, return; } - /* Report up the rest of the hierarchy. */ + /* Report up the rest of the hierarchy, tracking current ->gpnum. */ + gps = rnp->gpnum; mask = rnp->grpmask; raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ smp_mb__after_unlock_lock(); - rcu_report_qs_rnp(mask, rsp, rnp_p, flags); + rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags); } /* @@ -2271,7 +2280,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) */ needwake = rcu_accelerate_cbs(rsp, rnp, rdp); - rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */ + rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags); + /* ^^^ Released rnp->lock */ if (needwake) rcu_gp_kthread_wake(rsp); } @@ -2747,8 +2757,8 @@ static void force_qs_rnp(struct rcu_state *rsp, } } if (mask != 0) { - /* Idle/offline CPUs, report. */ - rcu_report_qs_rnp(mask, rsp, rnp, flags); + /* Idle/offline CPUs, report (releases rnp->lock. 
*/ + rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags); } else { /* Nothing to do here, so just drop the lock. */ raw_spin_unlock_irqrestore(&rnp->lock, flags); -- cgit v1.2.3 From 94caee8c312d96522bcdae88791aaa9ebcd5f22c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 20 Mar 2015 15:11:11 +0100 Subject: ebpf: add sched_act_type and map it to sk_filter's verifier ops In order to prepare eBPF support for tc action, we need to add sched_act_type, so that the eBPF verifier is aware of what helper function act_bpf may use, that it can load skb data and read out currently available skb fields. This is bascially analogous to 96be4325f443 ("ebpf: add sched_cls_type and map it to sk_filter's verifier ops"). BPF_PROG_TYPE_SCHED_CLS and BPF_PROG_TYPE_SCHED_ACT need to be separate since both will have a different set of functionality in future (classifier vs action), thus we won't run into ABI troubles when the point in time comes to diverge functionality from the classifier. The future plan for act_bpf would be that it will be able to write into skb->data and alter selected fields mirrored in struct __sk_buff. For an initial support, it's sufficient to map it to sk_filter_ops. Signed-off-by: Daniel Borkmann Cc: Jiri Pirko Reviewed-by: Jiri Pirko Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + kernel/bpf/verifier.c | 1 + net/core/filter.c | 6 ++++++ 3 files changed, 8 insertions(+) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1623047af463..3dd314a45d0d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -119,6 +119,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_UNSPEC, BPF_PROG_TYPE_SOCKET_FILTER, BPF_PROG_TYPE_SCHED_CLS, + BPF_PROG_TYPE_SCHED_ACT, }; #define BPF_PSEUDO_MAP_FD 1 diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c22ebd36fa4b..0e714f799ec0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1180,6 +1180,7 @@ static bool may_access_skb(enum bpf_prog_type type) switch (type) { case BPF_PROG_TYPE_SOCKET_FILTER: case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_SCHED_ACT: return true; default: return false; diff --git a/net/core/filter.c b/net/core/filter.c index bdaac5895def..084eacc4d1d4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1263,10 +1263,16 @@ static struct bpf_prog_type_list sched_cls_type __read_mostly = { .type = BPF_PROG_TYPE_SCHED_CLS, }; +static struct bpf_prog_type_list sched_act_type __read_mostly = { + .ops = &sk_filter_ops, + .type = BPF_PROG_TYPE_SCHED_ACT, +}; + static int __init register_sk_filter_ops(void) { bpf_register_prog_type(&sk_filter_type); bpf_register_prog_type(&sched_cls_type); + bpf_register_prog_type(&sched_act_type); return 0; } -- cgit v1.2.3 From 50f16a8bf9d7a92c437ed1867d0f7e1dc6a9aca9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 5 Mar 2015 22:10:19 +0100 Subject: perf: Remove type specific target pointers The only reason CQM had to use a hard-coded pmu type was so it could use cqm_target in hw_perf_event. Do away with the {tp,bp,cqm}_target pointers and provide a non type specific one. This allows us to do away with that silly pmu type as well. 
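A standalone sketch of the refactoring idea (the struct and field names below are illustrative stand-ins, not the kernel's real hw_perf_event layout): a single common target pointer replaces the per-type pointers, so generic code can reach the task without knowing the event type.

	/* Illustrative only: simplified stand-ins for the real definitions. */
	struct task;				/* stand-in for struct task_struct */

	struct hw_event_old {			/* before: one pointer per event type */
		union {
			struct { struct task *tp_target;  } tracepoint;
			struct { struct task *bp_target;  } breakpoint;
			struct { struct task *cqm_target; } cqm;
		} u;
	};

	struct hw_event_new {			/* after: one common pointer */
		union {
			char type_specific_state;	/* only per-type state stays here */
		} u;
		struct task *target;		/* valid for every event type */
	};

	/* Generic code no longer has to switch on the event type: */
	static inline struct task *event_task(struct hw_event_new *hw)
	{
		return hw->target;
	}
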
Signed-off-by: Peter Zijlstra (Intel) Cc: Vince Weaver Cc: acme@kernel.org Cc: acme@redhat.com Cc: hpa@zytor.com Cc: jolsa@redhat.com Cc: kanaka.d.juvva@intel.com Cc: matt.fleming@intel.com Cc: tglx@linutronix.de Cc: torvalds@linux-foundation.org Cc: vikas.shivappa@linux.intel.com Link: http://lkml.kernel.org/r/20150305211019.GU21418@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- arch/arm/kernel/hw_breakpoint.c | 2 +- arch/arm64/kernel/hw_breakpoint.c | 2 +- arch/x86/kernel/cpu/perf_event_intel_cqm.c | 7 +++---- include/linux/perf_event.h | 4 +--- include/uapi/linux/perf_event.h | 1 - kernel/events/core.c | 14 ++++---------- kernel/events/hw_breakpoint.c | 8 ++++---- kernel/trace/trace_uprobe.c | 10 +++++----- 8 files changed, 19 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c index 7fc70ae21185..dc7d0a95bd36 100644 --- a/arch/arm/kernel/hw_breakpoint.c +++ b/arch/arm/kernel/hw_breakpoint.c @@ -648,7 +648,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp) * Per-cpu breakpoints are not supported by our stepping * mechanism. */ - if (!bp->hw.bp_target) + if (!bp->hw.target) return -EINVAL; /* diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index df1cf15377b4..d062f35911c2 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -527,7 +527,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp) * Disallow per-task kernel breakpoints since these would * complicate the stepping code. */ - if (info->ctrl.privilege == AARCH64_BREAKPOINT_EL1 && bp->hw.bp_target) + if (info->ctrl.privilege == AARCH64_BREAKPOINT_EL1 && bp->hw.target) return -EINVAL; return 0; diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c index 9a8ef8376fcd..e4d1b8b738fa 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c @@ -263,7 +263,7 @@ static bool __match_event(struct perf_event *a, struct perf_event *b) /* * Events that target same task are placed into the same cache group. 
*/ - if (a->hw.cqm_target == b->hw.cqm_target) + if (a->hw.target == b->hw.target) return true; /* @@ -279,7 +279,7 @@ static bool __match_event(struct perf_event *a, struct perf_event *b) static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event) { if (event->attach_state & PERF_ATTACH_TASK) - return perf_cgroup_from_task(event->hw.cqm_target); + return perf_cgroup_from_task(event->hw.target); return event->cgrp; } @@ -1365,8 +1365,7 @@ static int __init intel_cqm_init(void) __perf_cpu_notifier(intel_cqm_cpu_notifier); - ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", - PERF_TYPE_INTEL_CQM); + ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1); if (ret) pr_err("Intel CQM perf registration failed: %d\n", ret); else diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index dac4c2831d82..5aa49d7bfd07 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -119,7 +119,6 @@ struct hw_perf_event { struct hrtimer hrtimer; }; struct { /* tracepoint */ - struct task_struct *tp_target; /* for tp_event->class */ struct list_head tp_list; }; @@ -129,7 +128,6 @@ struct hw_perf_event { struct list_head cqm_events_entry; struct list_head cqm_groups_entry; struct list_head cqm_group_entry; - struct task_struct *cqm_target; }; #ifdef CONFIG_HAVE_HW_BREAKPOINT struct { /* breakpoint */ @@ -138,12 +136,12 @@ struct hw_perf_event { * problem hw_breakpoint has with context * creation and event initalization. */ - struct task_struct *bp_target; struct arch_hw_breakpoint info; struct list_head bp_list; }; #endif }; + struct task_struct *target; int state; local64_t prev_count; u64 sample_period; diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 3c8b45de57ec..1e3cd07cf76e 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -32,7 +32,6 @@ enum perf_type_id { PERF_TYPE_HW_CACHE = 3, PERF_TYPE_RAW = 4, PERF_TYPE_BREAKPOINT = 5, - PERF_TYPE_INTEL_CQM = 6, PERF_TYPE_MAX, /* non-ABI */ }; diff --git a/kernel/events/core.c b/kernel/events/core.c index 71109a045450..525062b6fba1 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7171,18 +7171,12 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (task) { event->attach_state = PERF_ATTACH_TASK; - - if (attr->type == PERF_TYPE_TRACEPOINT) - event->hw.tp_target = task; -#ifdef CONFIG_HAVE_HW_BREAKPOINT /* - * hw_breakpoint is a bit difficult here.. + * XXX pmu::event_init needs to know what task to account to + * and we cannot use the ctx information because we need the + * pmu before we get a ctx. 
*/ - else if (attr->type == PERF_TYPE_BREAKPOINT) - event->hw.bp_target = task; -#endif - else if (attr->type == PERF_TYPE_INTEL_CQM) - event->hw.cqm_target = task; + event->hw.target = task; } if (!overflow_handler && parent_event) { diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 9803a6600d49..92ce5f4ccc26 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -116,12 +116,12 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) */ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) { - struct task_struct *tsk = bp->hw.bp_target; + struct task_struct *tsk = bp->hw.target; struct perf_event *iter; int count = 0; list_for_each_entry(iter, &bp_task_head, hw.bp_list) { - if (iter->hw.bp_target == tsk && + if (iter->hw.target == tsk && find_slot_idx(iter) == type && (iter->cpu < 0 || cpu == iter->cpu)) count += hw_breakpoint_weight(iter); @@ -153,7 +153,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, int nr; nr = info->cpu_pinned; - if (!bp->hw.bp_target) + if (!bp->hw.target) nr += max_task_bp_pinned(cpu, type); else nr += task_bp_pinned(cpu, bp, type); @@ -210,7 +210,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, weight = -weight; /* Pinned counter cpu profiling */ - if (!bp->hw.bp_target) { + if (!bp->hw.target) { get_bp_info(bp->cpu, type)->cpu_pinned += weight; return; } diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index b11441321e7a..93fdc7791eaa 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1005,7 +1005,7 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) return true; list_for_each_entry(event, &filter->perf_events, hw.tp_list) { - if (event->hw.tp_target->mm == mm) + if (event->hw.target->mm == mm) return true; } @@ -1015,7 +1015,7 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) static inline bool uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event) { - return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm); + return __uprobe_perf_filter(&tu->filter, event->hw.target->mm); } static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) @@ -1023,10 +1023,10 @@ static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) bool done; write_lock(&tu->filter.rwlock); - if (event->hw.tp_target) { + if (event->hw.target) { list_del(&event->hw.tp_list); done = tu->filter.nr_systemwide || - (event->hw.tp_target->flags & PF_EXITING) || + (event->hw.target->flags & PF_EXITING) || uprobe_filter_event(tu, event); } else { tu->filter.nr_systemwide--; @@ -1046,7 +1046,7 @@ static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) int err; write_lock(&tu->filter.rwlock); - if (event->hw.tp_target) { + if (event->hw.target) { /* * event->parent != NULL means copy_process(), we can avoid * uprobe_apply(). 
current->mm must be probed and we can rely -- cgit v1.2.3 From 7b63c3ab9b10b8a9cdc6b3a9e0e3d6a8a5d28970 Mon Sep 17 00:00:00 2001 From: Yannick Guerrini Date: Tue, 24 Mar 2015 12:31:40 +1030 Subject: kernel/module.c: fix typos in message about unused symbols Fix typos in pr_warn message about unused symbols Signed-off-by: Yannick Guerrini Signed-off-by: Rusty Russell --- kernel/module.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index b3d634ed06c9..3ab942f78760 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -387,9 +387,9 @@ static bool check_symbol(const struct symsearch *syms, pr_warn("Symbol %s is marked as UNUSED, however this module is " "using it.\n", fsa->name); pr_warn("This symbol will go away in the future.\n"); - pr_warn("Please evalute if this is the right api to use and if " - "it really is, submit a report the linux kernel " - "mailinglist together with submitting your code for " + pr_warn("Please evaluate if this is the right api to use and " + "if it really is, submit a report to the linux kernel " + "mailing list together with submitting your code for " "inclusion.\n"); } #endif -- cgit v1.2.3 From cc9e605dc6cb2e32fedae4ac2f61ad3b5f8d623d Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 24 Mar 2015 12:31:40 +1030 Subject: module: do not print allocation-fail warning on bogus user buffer size init_module(2) passes user-specified buffer length directly to vmalloc(). It makes warn_alloc_failed() to print out a lot of info into dmesg if user specified insane size, like -1. Let's silence the warning. It doesn't add much value to -ENOMEM return code. Without the patch the syscall is prohibitive noisy for testing with trinity. Signed-off-by: Kirill A. Shutemov Cc: Dave Jones Cc: Sasha Levin Signed-off-by: Rusty Russell --- kernel/module.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 3ab942f78760..65bd206e04a9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2494,7 +2494,8 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, return err; /* Suck in entire file: we'll want most of it. */ - info->hdr = vmalloc(info->len); + info->hdr = __vmalloc(info->len, + GFP_KERNEL | __GFP_HIGHMEM | __GFP_NOWARN, PAGE_KERNEL); if (!info->hdr) return -ENOMEM; -- cgit v1.2.3 From 80a9b64e2c156b6523e7a01f2ba6e5d86e722814 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 17 Mar 2015 10:40:38 -0400 Subject: ring-buffer: Replace this_cpu_*() with __this_cpu_*() It has come to my attention that this_cpu_read/write are horrible on architectures other than x86. Worse yet, they actually disable preemption or interrupts! This caused some unexpected tracing results on ARM. 101.356868: preempt_count_add <-ring_buffer_lock_reserve 101.356870: preempt_count_sub <-ring_buffer_lock_reserve The ring_buffer_lock_reserve has recursion protection that requires accessing a per cpu variable. But since preempt_disable() is traced, it too got traced while accessing the variable that is suppose to prevent recursion like this. 
The generic version of this_cpu_read() and write() are: #define this_cpu_generic_read(pcp) \ ({ typeof(pcp) ret__; \ preempt_disable(); \ ret__ = *this_cpu_ptr(&(pcp)); \ preempt_enable(); \ ret__; \ }) #define this_cpu_generic_to_op(pcp, val, op) \ do { \ unsigned long flags; \ raw_local_irq_save(flags); \ *__this_cpu_ptr(&(pcp)) op val; \ raw_local_irq_restore(flags); \ } while (0) Which is unacceptable for locations that know they are within preempt disabled or interrupt disabled locations. Paul McKenney stated that __this_cpu_() versions produce much better code on other architectures than this_cpu_() does, if we know that the call is done in a preempt disabled location. I also changed the recursive_unlock() to use two local variables instead of accessing the per_cpu variable twice. Link: http://lkml.kernel.org/r/20150317114411.GE3589@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/20150317104038.312e73d1@gandalf.local.home Cc: stable@vger.kernel.org Acked-by: Christoph Lameter Reported-by: Uwe Kleine-Koenig Tested-by: Uwe Kleine-Koenig Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 5040d44fe5a3..922048a0f7ea 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2679,7 +2679,7 @@ static DEFINE_PER_CPU(unsigned int, current_context); static __always_inline int trace_recursive_lock(void) { - unsigned int val = this_cpu_read(current_context); + unsigned int val = __this_cpu_read(current_context); int bit; if (in_interrupt()) { @@ -2696,18 +2696,17 @@ static __always_inline int trace_recursive_lock(void) return 1; val |= (1 << bit); - this_cpu_write(current_context, val); + __this_cpu_write(current_context, val); return 0; } static __always_inline void trace_recursive_unlock(void) { - unsigned int val = this_cpu_read(current_context); + unsigned int val = __this_cpu_read(current_context); - val--; - val &= this_cpu_read(current_context); - this_cpu_write(current_context, val); + val &= val & (val - 1); + __this_cpu_write(current_context, val); } #else -- cgit v1.2.3 From bbedb179944c29e5e449603163eec9951116fe39 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Wed, 11 Mar 2015 22:13:57 -0500 Subject: tracing: %pF is only for function pointers Use %pS for actual addresses, otherwise you'll get bad output on arches like ppc64 where %pF expects a function descriptor. 
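A minimal sketch of the distinction (kernel context assumed; report_caller() is a hypothetical helper, not part of the patch): %pF takes a real function pointer, which ppc64/ia64/parisc represent as a function descriptor that gets dereferenced, while %pS takes a plain text address such as a saved return address.

	#include <linux/kernel.h>
	#include <linux/printk.h>

	static void report_caller(unsigned long ip)
	{
		/* ip is a raw text address (e.g. _RET_IP_), not a function
		 * pointer, so %pS resolves it directly to a symbol.
		 */
		pr_info("caller %pS\n", (void *)ip);

		/* A genuine function pointer may be printed with %pF; on
		 * ppc64 this first dereferences the function descriptor.
		 */
		pr_info("handler %pF\n", report_caller);
	}

Passing a saved return address to %pF would make those architectures dereference it as if it were a descriptor and print a bogus symbol, which is exactly the failure the patch avoids.
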
Link: http://lkml.kernel.org/r/1426130037-17956-22-git-send-email-scottwood@freescale.com Signed-off-by: Scott Wood Signed-off-by: Steven Rostedt --- include/trace/events/btrfs.h | 4 ++-- include/trace/events/ext3.h | 2 +- include/trace/events/ext4.h | 6 +++--- include/trace/events/module.h | 4 ++-- include/trace/events/random.h | 10 +++++----- kernel/trace/trace_entries.h | 6 +++--- tools/lib/traceevent/event-parse.c | 2 +- 7 files changed, 17 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 1faecea101f3..572e6503394a 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -962,7 +962,7 @@ TRACE_EVENT(alloc_extent_state, __entry->ip = IP ), - TP_printk("state=%p; mask = %s; caller = %pF", __entry->state, + TP_printk("state=%p; mask = %s; caller = %pS", __entry->state, show_gfp_flags(__entry->mask), (void *)__entry->ip) ); @@ -982,7 +982,7 @@ TRACE_EVENT(free_extent_state, __entry->ip = IP ), - TP_printk(" state=%p; caller = %pF", __entry->state, + TP_printk(" state=%p; caller = %pS", __entry->state, (void *)__entry->ip) ); diff --git a/include/trace/events/ext3.h b/include/trace/events/ext3.h index 6797b9de90ed..7f20707849bb 100644 --- a/include/trace/events/ext3.h +++ b/include/trace/events/ext3.h @@ -144,7 +144,7 @@ TRACE_EVENT(ext3_mark_inode_dirty, __entry->ip = IP; ), - TP_printk("dev %d,%d ino %lu caller %pF", + TP_printk("dev %d,%d ino %lu caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (void *)__entry->ip) ); diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 6e5abd6d38a2..47fca36ee426 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -240,7 +240,7 @@ TRACE_EVENT(ext4_mark_inode_dirty, __entry->ip = IP; ), - TP_printk("dev %d,%d ino %lu caller %pF", + TP_printk("dev %d,%d ino %lu caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (void *)__entry->ip) ); @@ -1762,7 +1762,7 @@ TRACE_EVENT(ext4_journal_start, __entry->rsv_blocks = rsv_blocks; ), - TP_printk("dev %d,%d blocks, %d rsv_blocks, %d caller %pF", + TP_printk("dev %d,%d blocks, %d rsv_blocks, %d caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->blocks, __entry->rsv_blocks, (void *)__entry->ip) ); @@ -1784,7 +1784,7 @@ TRACE_EVENT(ext4_journal_start_reserved, __entry->blocks = blocks; ), - TP_printk("dev %d,%d blocks, %d caller %pF", + TP_printk("dev %d,%d blocks, %d caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->blocks, (void *)__entry->ip) ); diff --git a/include/trace/events/module.h b/include/trace/events/module.h index 81c4c183d348..28c45997e451 100644 --- a/include/trace/events/module.h +++ b/include/trace/events/module.h @@ -84,7 +84,7 @@ DECLARE_EVENT_CLASS(module_refcnt, __assign_str(name, mod->name); ), - TP_printk("%s call_site=%pf refcnt=%d", + TP_printk("%s call_site=%ps refcnt=%d", __get_str(name), (void *)__entry->ip, __entry->refcnt) ); @@ -121,7 +121,7 @@ TRACE_EVENT(module_request, __assign_str(name, name); ), - TP_printk("%s wait=%d call_site=%pf", + TP_printk("%s wait=%d call_site=%ps", __get_str(name), (int)__entry->wait, (void *)__entry->ip) ); diff --git a/include/trace/events/random.h b/include/trace/events/random.h index 805af6db41cc..4684de344c5d 100644 --- a/include/trace/events/random.h +++ b/include/trace/events/random.h @@ -22,7 +22,7 @@ TRACE_EVENT(add_device_randomness, __entry->IP = IP; ), - TP_printk("bytes %d caller %pF", + 
TP_printk("bytes %d caller %pS", __entry->bytes, (void *)__entry->IP) ); @@ -43,7 +43,7 @@ DECLARE_EVENT_CLASS(random__mix_pool_bytes, __entry->IP = IP; ), - TP_printk("%s pool: bytes %d caller %pF", + TP_printk("%s pool: bytes %d caller %pS", __entry->pool_name, __entry->bytes, (void *)__entry->IP) ); @@ -82,7 +82,7 @@ TRACE_EVENT(credit_entropy_bits, ), TP_printk("%s pool: bits %d entropy_count %d entropy_total %d " - "caller %pF", __entry->pool_name, __entry->bits, + "caller %pS", __entry->pool_name, __entry->bits, __entry->entropy_count, __entry->entropy_total, (void *)__entry->IP) ); @@ -207,7 +207,7 @@ DECLARE_EVENT_CLASS(random__get_random_bytes, __entry->IP = IP; ), - TP_printk("nbytes %d caller %pF", __entry->nbytes, (void *)__entry->IP) + TP_printk("nbytes %d caller %pS", __entry->nbytes, (void *)__entry->IP) ); DEFINE_EVENT(random__get_random_bytes, get_random_bytes, @@ -242,7 +242,7 @@ DECLARE_EVENT_CLASS(random__extract_entropy, __entry->IP = IP; ), - TP_printk("%s pool: nbytes %d entropy_count %d caller %pF", + TP_printk("%s pool: nbytes %d entropy_count %d caller %pS", __entry->pool_name, __entry->nbytes, __entry->entropy_count, (void *)__entry->IP) ); diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index e2d027ac66a2..ee7b94a4810a 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -223,7 +223,7 @@ FTRACE_ENTRY(bprint, bprint_entry, __dynamic_array( u32, buf ) ), - F_printk("%pf: %s", + F_printk("%ps: %s", (void *)__entry->ip, __entry->fmt), FILTER_OTHER @@ -238,7 +238,7 @@ FTRACE_ENTRY(print, print_entry, __dynamic_array( char, buf ) ), - F_printk("%pf: %s", + F_printk("%ps: %s", (void *)__entry->ip, __entry->buf), FILTER_OTHER @@ -253,7 +253,7 @@ FTRACE_ENTRY(bputs, bputs_entry, __field( const char *, str ) ), - F_printk("%pf: %s", + F_printk("%ps: %s", (void *)__entry->ip, __entry->str), FILTER_OTHER diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c index afe20ed9fac8..2c0bd8f2aad0 100644 --- a/tools/lib/traceevent/event-parse.c +++ b/tools/lib/traceevent/event-parse.c @@ -3976,7 +3976,7 @@ static struct print_arg *make_bprint_args(char *fmt, void *data, int size, struc if (asprintf(&arg->atom.atom, "%lld", ip) < 0) goto out_free; - /* skip the first "%pf: " */ + /* skip the first "%ps: " */ for (ptr = fmt + 5, bptr = data + field->offset; bptr < data + size && *ptr; ptr++) { int ls = 0; -- cgit v1.2.3 From 754cb0071a5c9576ccfa6523969ef6a2f6a71676 Mon Sep 17 00:00:00 2001 From: He Kuang Date: Tue, 3 Mar 2015 15:21:33 +0800 Subject: tracing: remove ftrace:function TRACE_EVENT_FL_USE_CALL_FILTER flag TRACE_EVENT_FL_USE_CALL_FILTER flag in ftrace:functon event can be removed. This flag was first introduced in commit f306cc82a93d ("tracing: Update event filters for multibuffer"). Now, the only place uses this flag is ftrace:function, but the filter of ftrace:function has a different code path with events/syscalls and events/tracepoints. It uses ftrace_filter_write() and perf's ftrace_profile_set_filter() to set the filter, the functionality of file 'tracing/events/ftrace/function/filter' is bypassed in function init_pred(), in which case, neither call->filter nor file->filter is used. So we can safely remove TRACE_EVENT_FL_USE_CALL_FILTER flag from ftrace:function events. 
Link: http://lkml.kernel.org/r/1425367294-27852-1-git-send-email-hekuang@huawei.com Signed-off-by: He Kuang Signed-off-by: Steven Rostedt --- kernel/trace/trace_export.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 12e2b99be862..174a6a71146c 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -177,7 +177,7 @@ struct ftrace_event_call __used event_##call = { \ }, \ .event.type = etype, \ .print_fmt = print, \ - .flags = TRACE_EVENT_FL_IGNORE_ENABLE | TRACE_EVENT_FL_USE_CALL_FILTER, \ + .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \ }; \ struct ftrace_event_call __used \ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; -- cgit v1.2.3 From d9a16d3ab8770357015c85a07387f1d2676a4773 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Thu, 12 Mar 2015 16:58:34 +1100 Subject: trace: Don't use __weak in header files The commit that added a check for this to checkpatch says: "Using weak declarations can have unintended link defects. The __weak on the declaration causes non-weak definitions to become weak." In this case, when a PowerPC kernel is built with CONFIG_KPROBE_EVENT but not CONFIG_UPROBE_EVENT, it generates the following warning: WARNING: 1 bad relocations c0000000014f2190 R_PPC64_ADDR64 uprobes_fetch_type_table This is fixed by passing the fetch_table arrays to traceprobe_parse_probe_arg() which also means that they can never be NULL. Link: http://lkml.kernel.org/r/20150312165834.4482cb48@canb.auug.org.au Acked-by: Masami Hiramatsu Signed-off-by: Stephen Rothwell Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 5 +++-- kernel/trace/trace_probe.c | 19 +++++++------------ kernel/trace/trace_probe.h | 10 ++-------- kernel/trace/trace_uprobe.c | 5 +++-- 4 files changed, 15 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index d73f565b4e06..f34c3ad1b5f4 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -250,7 +250,7 @@ DEFINE_FETCH_symbol(string_size) #define fetch_file_offset_string_size NULL /* Fetch type information table */ -const struct fetch_type kprobes_fetch_type_table[] = { +static const struct fetch_type kprobes_fetch_type_table[] = { /* Special types */ [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, sizeof(u32), 1, "__data_loc char[]"), @@ -760,7 +760,8 @@ static int create_trace_kprobe(int argc, char **argv) /* Parse fetch argument */ ret = traceprobe_parse_probe_arg(arg, &tk->tp.size, parg, - is_return, true); + is_return, true, + kprobes_fetch_type_table); if (ret) { pr_info("Parse error at argument[%d]. (%d)\n", i, ret); goto error; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index b983b2fd2ca1..1769a81da8a7 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -356,17 +356,14 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, /* Recursive argument parser */ static int parse_probe_arg(char *arg, const struct fetch_type *t, - struct fetch_param *f, bool is_return, bool is_kprobe) + struct fetch_param *f, bool is_return, bool is_kprobe, + const struct fetch_type *ftbl) { - const struct fetch_type *ftbl; unsigned long param; long offset; char *tmp; int ret = 0; - ftbl = is_kprobe ? 
kprobes_fetch_type_table : uprobes_fetch_type_table; - BUG_ON(ftbl == NULL); - switch (arg[0]) { case '$': ret = parse_probe_vars(arg + 1, t, f, is_return, is_kprobe); @@ -447,7 +444,7 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, dprm->fetch_size = get_fetch_size_function(t, dprm->fetch, ftbl); ret = parse_probe_arg(arg, t2, &dprm->orig, is_return, - is_kprobe); + is_kprobe, ftbl); if (ret) kfree(dprm); else { @@ -505,15 +502,12 @@ static int __parse_bitfield_probe_arg(const char *bf, /* String length checking wrapper */ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, - struct probe_arg *parg, bool is_return, bool is_kprobe) + struct probe_arg *parg, bool is_return, bool is_kprobe, + const struct fetch_type *ftbl) { - const struct fetch_type *ftbl; const char *t; int ret; - ftbl = is_kprobe ? kprobes_fetch_type_table : uprobes_fetch_type_table; - BUG_ON(ftbl == NULL); - if (strlen(arg) > MAX_ARGSTR_LEN) { pr_info("Argument is too long.: %s\n", arg); return -ENOSPC; @@ -535,7 +529,8 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, } parg->offset = *size; *size += parg->type->size; - ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return, is_kprobe); + ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return, + is_kprobe, ftbl); if (ret >= 0 && t != NULL) ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 4f815fbce16d..e30f6cce4af6 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -229,13 +229,6 @@ ASSIGN_FETCH_FUNC(file_offset, ftype), \ #define FETCH_TYPE_STRING 0 #define FETCH_TYPE_STRSIZE 1 -/* - * Fetch type information table. - * It's declared as a weak symbol due to conditional compilation. - */ -extern __weak const struct fetch_type kprobes_fetch_type_table[]; -extern __weak const struct fetch_type uprobes_fetch_type_table[]; - #ifdef CONFIG_KPROBE_EVENT struct symbol_cache; unsigned long update_symbol_cache(struct symbol_cache *sc); @@ -333,7 +326,8 @@ find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file) } extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size, - struct probe_arg *parg, bool is_return, bool is_kprobe); + struct probe_arg *parg, bool is_return, bool is_kprobe, + const struct fetch_type *ftbl); extern int traceprobe_conflict_field_name(const char *name, struct probe_arg *args, int narg); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 7dc1c8abecd6..74865465e0b7 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -196,7 +196,7 @@ DEFINE_FETCH_file_offset(string) DEFINE_FETCH_file_offset(string_size) /* Fetch type information table */ -const struct fetch_type uprobes_fetch_type_table[] = { +static const struct fetch_type uprobes_fetch_type_table[] = { /* Special types */ [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, sizeof(u32), 1, "__data_loc char[]"), @@ -535,7 +535,8 @@ static int create_trace_uprobe(int argc, char **argv) /* Parse fetch argument */ ret = traceprobe_parse_probe_arg(arg, &tu->tp.size, parg, - is_return, false); + is_return, false, + uprobes_fetch_type_table); if (ret) { pr_info("Parse error at argument[%d]. 
(%d)\n", i, ret); goto error; -- cgit v1.2.3 From e2e40f2c1ed433c5e224525c8c862fd32e5d3df2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 22 Feb 2015 08:58:50 -0800 Subject: fs: move struct kiocb to fs.h struct kiocb now is a generic I/O container, so move it to fs.h. Also do a #include diet for aio.h while we're at it. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- arch/s390/hypfs/inode.c | 2 +- drivers/char/mem.c | 2 +- drivers/char/tile-srom.c | 1 - drivers/infiniband/hw/ipath/ipath_file_ops.c | 1 - drivers/infiniband/hw/qib/qib_file_ops.c | 1 - drivers/misc/mei/amthif.c | 1 - drivers/misc/mei/main.c | 1 - drivers/misc/mei/pci-me.c | 1 - drivers/scsi/sg.c | 2 +- drivers/staging/unisys/include/timskmod.h | 1 - drivers/usb/gadget/function/f_fs.c | 1 + drivers/usb/gadget/legacy/inode.c | 1 + fs/9p/vfs_addr.c | 2 +- fs/affs/file.c | 2 +- fs/afs/write.c | 1 - fs/bfs/inode.c | 1 + fs/block_dev.c | 1 - fs/btrfs/file.c | 2 +- fs/btrfs/inode.c | 2 +- fs/ceph/file.c | 1 - fs/direct-io.c | 1 - fs/ecryptfs/file.c | 1 - fs/ext2/inode.c | 2 +- fs/ext3/inode.c | 2 +- fs/ext4/file.c | 2 +- fs/ext4/indirect.c | 2 +- fs/ext4/inode.c | 1 - fs/ext4/page-io.c | 1 - fs/f2fs/data.c | 2 +- fs/fat/inode.c | 1 - fs/fuse/cuse.c | 2 +- fs/fuse/dev.c | 1 - fs/fuse/file.c | 2 +- fs/gfs2/aops.c | 2 +- fs/gfs2/file.c | 1 - fs/hfs/inode.c | 2 +- fs/hfsplus/inode.c | 2 +- fs/jfs/inode.c | 2 +- fs/nfs/file.c | 1 - fs/nilfs2/inode.c | 2 +- fs/ntfs/file.c | 1 - fs/ntfs/inode.c | 1 - fs/ocfs2/aops.c | 1 + fs/ocfs2/aops.h | 2 +- fs/pipe.c | 1 - fs/read_write.c | 1 - fs/reiserfs/inode.c | 2 +- fs/splice.c | 1 - fs/ubifs/file.c | 1 - fs/udf/file.c | 2 +- fs/udf/inode.c | 2 +- fs/xfs/xfs_aops.c | 1 - fs/xfs/xfs_file.c | 1 - include/linux/aio.h | 31 +--------------------------- include/linux/fs.h | 22 ++++++++++++++++++++ include/net/sock.h | 1 - kernel/printk/printk.c | 2 +- kernel/sysctl.c | 1 + mm/filemap.c | 1 - mm/page_io.c | 2 +- mm/shmem.c | 2 +- net/ipv4/raw.c | 1 - sound/core/pcm_native.c | 2 +- 63 files changed, 55 insertions(+), 86 deletions(-) (limited to 'kernel') diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c index 4c8008dd938e..ad66b07f742e 100644 --- a/arch/s390/hypfs/inode.c +++ b/arch/s390/hypfs/inode.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include "hypfs.h" diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 297110c12635..9c4fd7a8e2e5 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include diff --git a/drivers/char/tile-srom.c b/drivers/char/tile-srom.c index 02e76ac6d282..69f6b4acc377 100644 --- a/drivers/char/tile-srom.c +++ b/drivers/char/tile-srom.c @@ -27,7 +27,6 @@ #include /* size_t */ #include #include /* O_ACCMODE */ -#include #include #include #include diff --git a/drivers/infiniband/hw/ipath/ipath_file_ops.c b/drivers/infiniband/hw/ipath/ipath_file_ops.c index 6d7f453b4d05..aed8afee56da 100644 --- a/drivers/infiniband/hw/ipath/ipath_file_ops.c +++ b/drivers/infiniband/hw/ipath/ipath_file_ops.c @@ -40,7 +40,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index b15e34eeef68..826c17ee4c7d 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -39,7 +39,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/misc/mei/amthif.c 
b/drivers/misc/mei/amthif.c index c4cb9a984a5f..40ea639fa413 100644 --- a/drivers/misc/mei/amthif.c +++ b/drivers/misc/mei/amthif.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/misc/mei/main.c b/drivers/misc/mei/main.c index 3c019c0e60eb..47680c84801c 100644 --- a/drivers/misc/mei/main.c +++ b/drivers/misc/mei/main.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/misc/mei/pci-me.c b/drivers/misc/mei/pci-me.c index bd3039ab8f98..af44ee26075d 100644 --- a/drivers/misc/mei/pci-me.c +++ b/drivers/misc/mei/pci-me.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 0cbc1fb45f10..c78a6f7bbc20 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -33,7 +33,6 @@ static int sg_version_num = 30536; /* 2 digits for each component */ #include #include #include -#include #include #include #include @@ -51,6 +50,7 @@ static int sg_version_num = 30536; /* 2 digits for each component */ #include #include #include +#include #include "scsi.h" #include diff --git a/drivers/staging/unisys/include/timskmod.h b/drivers/staging/unisys/include/timskmod.h index 4019a0d63645..52648d4d9922 100644 --- a/drivers/staging/unisys/include/timskmod.h +++ b/drivers/staging/unisys/include/timskmod.h @@ -46,7 +46,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index b64538b498dc..a12315a78248 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c index a4a80694f607..662ef2c1c62b 100644 --- a/drivers/usb/gadget/legacy/inode.c +++ b/drivers/usb/gadget/legacy/inode.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index eb14e055ea83..ff1a5bac4200 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -33,7 +33,7 @@ #include #include #include -#include +#include #include #include diff --git a/fs/affs/file.c b/fs/affs/file.c index d2468bf95669..33eaa67bb026 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -12,7 +12,7 @@ * affs regular file handling primitives */ -#include +#include #include "affs.h" static struct buffer_head *affs_get_extblock_slow(struct inode *inode, u32 ext); diff --git a/fs/afs/write.c b/fs/afs/write.c index c13cb08964ed..0714abcd7f32 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -14,7 +14,6 @@ #include #include #include -#include #include "internal.h" static int afs_write_back_from_locked_page(struct afs_writeback *wb, diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 90bc079d9982..fdcb4d69f430 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include "bfs.h" diff --git a/fs/block_dev.c b/fs/block_dev.c index 975266be67d3..2e522aed6584 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include "internal.h" diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index b78bbbac900d..69c9508d2c7e 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -32,6 +31,7 @@ #include #include #include +#include #include "ctree.h" #include "disk-io.h" 
#include "transaction.h" diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 54bcf639d1cf..b214ab178f3a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -32,7 +32,6 @@ #include #include #include -#include #include #include #include @@ -43,6 +42,7 @@ #include #include #include +#include #include "ctree.h" #include "disk-io.h" #include "transaction.h" diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 081c4e3f9e49..98e257c1b5b1 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -7,7 +7,6 @@ #include #include #include -#include #include #include "super.h" diff --git a/fs/direct-io.c b/fs/direct-io.c index c38b460776e6..6fb00e3f1059 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -37,7 +37,6 @@ #include #include #include -#include /* * How many user pages to map in one call to get_user_pages(). This determines diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index a36da8841e0c..273d36e3f0c0 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -31,7 +31,6 @@ #include #include #include -#include #include "ecryptfs_kernel.h" /** diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 6434bc000125..df9d6afbc5d5 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include "ext2.h" #include "acl.h" #include "xattr.h" diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 2c6ccc49ba27..db07ffbe7c85 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include "ext3.h" #include "xattr.h" #include "acl.h" diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 33a09da16c9c..598abbbe6786 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -23,9 +23,9 @@ #include #include #include -#include #include #include +#include #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 6b9878a24182..8611640856d3 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -20,9 +20,9 @@ * (sct@redhat.com), 1993, 1998 */ -#include #include "ext4_jbd2.h" #include "truncate.h" +#include #include diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 85404f15e53a..6325d2c1a65c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include "ext4_jbd2.h" diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index b24a2541a9ba..464984261e69 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 985ed023a750..497f8515d205 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -12,12 +12,12 @@ #include #include #include -#include #include #include #include #include #include +#include #include "f2fs.h" #include "node.h" diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 497c7c5263c7..8521207de229 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 28d0c7abba1c..b3fa05032234 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -38,7 +38,6 @@ #include #include #include -#include #include #include #include @@ -48,6 +47,7 @@ #include #include #include +#include #include "fuse_i.h" diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index ed19a7d622fa..8c92c727ddd6 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -19,7 +19,6 @@ #include #include #include -#include MODULE_ALIAS_MISCDEV(FUSE_MINOR); MODULE_ALIAS("devname:fuse"); 
diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a5c5e38b3ff8..ff102cbf16ea 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -15,8 +15,8 @@ #include #include #include -#include #include +#include static const struct file_operations fuse_direct_io_file_operations; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 4ad4f94edebe..fe6634d25d1d 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include "gfs2.h" diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 3e32bb8e2d7e..f6fc412b1100 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include "gfs2.h" diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index d0929bc81782..98d4ea45bb70 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include "hfs_fs.h" #include "btree.h" diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 0cf786f2d046..f541196d4ee9 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include "hfsplus_fs.h" #include "hfsplus_raw.h" diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index bd3df1ca3c9b..3197aed10614 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -22,8 +22,8 @@ #include #include #include +#include #include -#include #include "jfs_incore.h" #include "jfs_inode.h" #include "jfs_filsys.h" diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 94712fc781fa..5d8b89cb13c0 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 8b5969538f39..ab4987bc637f 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include "nilfs.h" #include "btnode.h" #include "segment.h" diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 1da9b2d184dc..f16f2d8401fe 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 898b9949d363..1d0c21df0d80 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -28,7 +28,6 @@ #include #include #include -#include #include "aops.h" #include "attrib.h" diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 44db1808cdb5..e1bf18c5d25e 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -29,6 +29,7 @@ #include #include #include +#include #include diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index 6cae155d54df..dd59599b022d 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -22,7 +22,7 @@ #ifndef OCFS2_AOPS_H #define OCFS2_AOPS_H -#include +#include handle_t *ocfs2_start_walk_page_trans(struct inode *inode, struct page *page, diff --git a/fs/pipe.c b/fs/pipe.c index 21981e58e2a6..2d084f2d0b83 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include diff --git a/fs/read_write.c b/fs/read_write.c index 76e324e8ce8d..99a6ef946d01 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index e72401e1f995..9312b7842e03 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include int reiserfs_commit_write(struct file *f, struct page *page, unsigned from, unsigned to); diff --git a/fs/splice.c 
b/fs/splice.c index 7968da96bebb..4bbfa95b5bfe 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -32,7 +32,6 @@ #include #include #include -#include #include "internal.h" /* diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index e627c0acf626..c3d15fe83403 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -50,7 +50,6 @@ */ #include "ubifs.h" -#include #include #include #include diff --git a/fs/udf/file.c b/fs/udf/file.c index 9c0b6da9dbb3..7f885cc8b0b7 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include "udf_i.h" #include "udf_sb.h" diff --git a/fs/udf/inode.c b/fs/udf/inode.c index a445d599098d..9c1fbd23913d 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -38,7 +38,7 @@ #include #include #include -#include +#include #include "udf_i.h" #include "udf_sb.h" diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 3a9b7a1b8704..4f8cdc59bc38 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -31,7 +31,6 @@ #include "xfs_bmap.h" #include "xfs_bmap_util.h" #include "xfs_bmap_btree.h" -#include #include #include #include diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 1cdba95c78cb..f527618cb42b 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -37,7 +37,6 @@ #include "xfs_log.h" #include "xfs_icache.h" -#include #include #include #include diff --git a/include/linux/aio.h b/include/linux/aio.h index 5c40b61285ac..9eb42dbc5582 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -1,52 +1,23 @@ #ifndef __LINUX__AIO_H #define __LINUX__AIO_H -#include -#include #include -#include -#include - -#include struct kioctx; struct kiocb; +struct mm_struct; #define KIOCB_KEY 0 typedef int (kiocb_cancel_fn)(struct kiocb *); -#define IOCB_EVENTFD (1 << 0) - -struct kiocb { - struct file *ki_filp; - loff_t ki_pos; - void (*ki_complete)(struct kiocb *iocb, long ret, long ret2); - void *private; - int ki_flags; -}; - -static inline bool is_sync_kiocb(struct kiocb *kiocb) -{ - return kiocb->ki_complete == NULL; -} - -static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) -{ - *kiocb = (struct kiocb) { - .ki_filp = filp, - }; -} - /* prototypes */ #ifdef CONFIG_AIO -struct mm_struct; extern void exit_aio(struct mm_struct *mm); extern long do_io_submit(aio_context_t ctx_id, long nr, struct iocb __user *__user *iocbpp, bool compat); void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel); #else -struct mm_struct; static inline void exit_aio(struct mm_struct *mm) { } static inline long do_io_submit(aio_context_t ctx_id, long nr, struct iocb __user * __user *iocbpp, diff --git a/include/linux/fs.h b/include/linux/fs.h index 447932aed1e1..48c1472bde4a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -314,6 +314,28 @@ struct page; struct address_space; struct writeback_control; +#define IOCB_EVENTFD (1 << 0) + +struct kiocb { + struct file *ki_filp; + loff_t ki_pos; + void (*ki_complete)(struct kiocb *iocb, long ret, long ret2); + void *private; + int ki_flags; +}; + +static inline bool is_sync_kiocb(struct kiocb *kiocb) +{ + return kiocb->ki_complete == NULL; +} + +static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) +{ + *kiocb = (struct kiocb) { + .ki_filp = filp, + }; +} + /* * "descriptor" for what we're up to with a read. 
* This allows us to use the same read code yet diff --git a/include/net/sock.h b/include/net/sock.h index ab186b1d31ff..71c1300025e2 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -57,7 +57,6 @@ #include #include #include -#include #include #include diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 60b2aa2a2da9..40d50cc4c686 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -32,7 +32,6 @@ #include #include #include -#include #include #include #include @@ -46,6 +45,7 @@ #include #include #include +#include #include diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 88ea2d6e0031..83d907afb4a6 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -19,6 +19,7 @@ */ #include +#include #include #include #include diff --git a/mm/filemap.c b/mm/filemap.c index ad7242043bdb..876f4e6f3ed6 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include diff --git a/mm/page_io.c b/mm/page_io.c index 7ef21577856c..a96c8562d835 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -20,8 +20,8 @@ #include #include #include -#include #include +#include #include static struct bio *get_swap_bio(gfp_t gfp_flags, diff --git a/mm/shmem.c b/mm/shmem.c index a63031fa3e0c..944b94079bb0 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include static struct vfsmount *shm_mnt; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index f027a708b7e0..4a356b7c081b 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -46,7 +46,6 @@ #include #include #include -#include #include #include #include diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index b03a638b420c..9ecff240a39b 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -35,6 +34,7 @@ #include #include #include +#include /* * Compatibility -- cgit v1.2.3 From c7cef0a84912cab3c9df8949b034e4aa62982ec9 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Mon, 9 Mar 2015 16:27:12 -0400 Subject: console: Add extensible console matching Add match() method to struct console which allows the console to perform console command line matching instead of (or in addition to) default console matching (ie., by fixed name and index). The match() method returns 0 to indicate a successful match; normal console matching occurs if no match() method is defined or the match() method returns non-zero. The match() method is expected to set the console index if required. Re-implement earlycon-to-console-handoff with direct matching of "console=uart|uart8250,..." to the 8250 ttyS console. 
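For illustration only, a user-space approximation of the matching described above; the port table, the simplified option parsing and all names in it are invented for the sketch, while the real callback relies on uart_parse_earlycon() and the registered serial8250_ports[] table:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Simplified stand-in for a registered 8250 port (demo only). */
struct demo_port {
	const char *iotype;	/* "io", "mmio" or "mmio32" */
	unsigned long addr;	/* iobase or mapbase */
};

static const struct demo_port ports[] = {
	{ "io",   0x3f8 },
	{ "mmio", 0x10000000 },
};

/*
 * Parse "uart8250,<iotype>,<addr>[,<options>]" and return the index of the
 * matching port, or -1: roughly the decision the new match() callback makes
 * before it sets co->index and hands off to the normal setup routine.
 */
static int demo_match(const char *arg)
{
	char buf[128];
	const char *name, *iotype, *addrs;
	unsigned long addr;
	size_t i;

	snprintf(buf, sizeof(buf), "%s", arg);
	name   = strtok(buf, ",");
	iotype = strtok(NULL, ",");
	addrs  = strtok(NULL, ",");
	if (!name || !iotype || !addrs || strncmp(name, "uart", 4) != 0)
		return -1;

	addr = strtoul(addrs, NULL, 0);	/* base 0 accepts 0x... addresses */
	for (i = 0; i < sizeof(ports) / sizeof(ports[0]); i++)
		if (strcmp(ports[i].iotype, iotype) == 0 && ports[i].addr == addr)
			return (int)i;
	return -1;	/* no port claimed it: fall back to default name/index matching */
}

int main(void)
{
	printf("uart8250,mmio,0x10000000 -> %d\n",
	       demo_match("uart8250,mmio,0x10000000,115200n8"));
	printf("uart8250,io,0x2f8        -> %d\n",
	       demo_match("uart8250,io,0x2f8,115200n8"));
	return 0;
}

The kernel-side callback makes the same decision against serial8250_ports[], additionally telling UPIO_PORT apart from UPIO_MEM/UPIO_MEM32 when comparing addresses, as the 8250_core.c hunk in this patch shows.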
Acked-by: Rob Herring Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_core.c | 64 +++++++++++++++++++++++++++--------- drivers/tty/serial/8250/8250_early.c | 23 ------------- include/linux/console.h | 3 +- include/linux/serial_8250.h | 2 -- kernel/printk/printk.c | 52 ++++++++++------------------- 5 files changed, 67 insertions(+), 77 deletions(-) (limited to 'kernel') diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c index bd06ab790c64..924ee1e13828 100644 --- a/drivers/tty/serial/8250/8250_core.c +++ b/drivers/tty/serial/8250/8250_core.c @@ -3322,9 +3322,54 @@ static int serial8250_console_setup(struct console *co, char *options) return uart_set_options(port, co, baud, parity, bits, flow); } -static int serial8250_console_early_setup(void) +/** + * serial8250_console_match - non-standard console matching + * @co: registering console + * @name: name from console command line + * @idx: index from console command line + * @options: ptr to option string from console command line + * + * Only attempts to match console command lines of the form: + * console=uart<>,io|mmio|mmio32,, + * console=uart<>,,options + * This form is used to register an initial earlycon boot console and + * replace it with the serial8250_console at 8250 driver init. + * + * Performs console setup for a match (as required by interface) + * + * Returns 0 if console matches; otherwise non-zero to use default matching + */ +static int serial8250_console_match(struct console *co, char *name, int idx, + char *options) { - return serial8250_find_port_for_earlycon(); + char match[] = "uart"; /* 8250-specific earlycon name */ + unsigned char iotype; + unsigned long addr; + int i; + + if (strncmp(name, match, 4) != 0) + return -ENODEV; + + if (uart_parse_earlycon(options, &iotype, &addr, &options)) + return -ENODEV; + + /* try to match the port specified on the command line */ + for (i = 0; i < nr_uarts; i++) { + struct uart_port *port = &serial8250_ports[i].port; + + if (port->iotype != iotype) + continue; + if ((iotype == UPIO_MEM || iotype == UPIO_MEM32) && + (port->mapbase != addr)) + continue; + if (iotype == UPIO_PORT && port->iobase != addr) + continue; + + co->index = i; + return serial8250_console_setup(co, options); + } + + return -ENODEV; } static struct console serial8250_console = { @@ -3332,7 +3377,7 @@ static struct console serial8250_console = { .write = serial8250_console_write, .device = uart_console_device, .setup = serial8250_console_setup, - .early_setup = serial8250_console_early_setup, + .match = serial8250_console_match, .flags = CON_PRINTBUFFER | CON_ANYTIME, .index = -1, .data = &serial8250_reg, @@ -3346,19 +3391,6 @@ static int __init serial8250_console_init(void) } console_initcall(serial8250_console_init); -int serial8250_find_port(struct uart_port *p) -{ - int line; - struct uart_port *port; - - for (line = 0; line < nr_uarts; line++) { - port = &serial8250_ports[line].port; - if (uart_match_port(p, port)) - return line; - } - return -ENODEV; -} - #define SERIAL8250_CONSOLE &serial8250_console #else #define SERIAL8250_CONSOLE NULL diff --git a/drivers/tty/serial/8250/8250_early.c b/drivers/tty/serial/8250/8250_early.c index c31a22b4f845..49bca65057e6 100644 --- a/drivers/tty/serial/8250/8250_early.c +++ b/drivers/tty/serial/8250/8250_early.c @@ -173,26 +173,3 @@ int __init setup_early_serial8250_console(char *cmdline) return setup_earlycon(cmdline, match, early_serial8250_setup); } - -int 
serial8250_find_port_for_earlycon(void) -{ - struct earlycon_device *device = early_device; - struct uart_port *port = device ? &device->port : NULL; - int line; - int ret; - - if (!port || (!port->membase && !port->iobase)) - return -ENODEV; - - line = serial8250_find_port(port); - if (line < 0) - return -ENODEV; - - ret = update_console_cmdline("uart", 8250, - "ttyS", line, device->options); - if (ret < 0) - ret = update_console_cmdline("uart", 0, - "ttyS", line, device->options); - - return ret; -} diff --git a/include/linux/console.h b/include/linux/console.h index 7571a16bd653..9f50fb413c11 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -123,7 +123,7 @@ struct console { struct tty_driver *(*device)(struct console *, int *); void (*unblank)(void); int (*setup)(struct console *, char *); - int (*early_setup)(void); + int (*match)(struct console *, char *name, int idx, char *options); short flags; short index; int cflag; @@ -141,7 +141,6 @@ extern int console_set_on_cmdline; extern struct console *early_console; extern int add_preferred_console(char *name, int idx, char *options); -extern int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options); extern void register_console(struct console *); extern int unregister_console(struct console *); extern struct console *console_drivers; diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h index a8efa235b7c1..f26ae7fa30ae 100644 --- a/include/linux/serial_8250.h +++ b/include/linux/serial_8250.h @@ -118,8 +118,6 @@ void serial8250_resume_port(int line); extern int early_serial_setup(struct uart_port *port); -extern int serial8250_find_port(struct uart_port *p); -extern int serial8250_find_port_for_earlycon(void); extern unsigned int serial8250_early_in(struct uart_port *port, int offset); extern void serial8250_early_out(struct uart_port *port, int offset, int value); extern int setup_early_serial8250_console(char *cmdline); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 26f899809539..dda959221086 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2017,24 +2017,6 @@ int add_preferred_console(char *name, int idx, char *options) return __add_preferred_console(name, idx, options, NULL); } -int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options) -{ - struct console_cmdline *c; - int i; - - for (i = 0, c = console_cmdline; - i < MAX_CMDLINECONSOLES && c->name[0]; - i++, c++) - if (strcmp(c->name, name) == 0 && c->index == idx) { - strlcpy(c->name, name_new, sizeof(c->name)); - c->options = options; - c->index = idx_new; - return i; - } - /* not found */ - return -1; -} - bool console_suspend_enabled = true; EXPORT_SYMBOL(console_suspend_enabled); @@ -2436,9 +2418,6 @@ void register_console(struct console *newcon) if (preferred_console < 0 || bcon || !console_drivers) preferred_console = selected_console; - if (newcon->early_setup) - newcon->early_setup(); - /* * See if we want to use this console driver. 
If we * didn't select a console we take the first one @@ -2464,21 +2443,26 @@ void register_console(struct console *newcon) for (i = 0, c = console_cmdline; i < MAX_CMDLINECONSOLES && c->name[0]; i++, c++) { - BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name)); - if (strcmp(c->name, newcon->name) != 0) - continue; - if (newcon->index >= 0 && - newcon->index != c->index) - continue; - if (newcon->index < 0) - newcon->index = c->index; + if (!newcon->match || + newcon->match(newcon, c->name, c->index, c->options) != 0) { + /* default matching */ + BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name)); + if (strcmp(c->name, newcon->name) != 0) + continue; + if (newcon->index >= 0 && + newcon->index != c->index) + continue; + if (newcon->index < 0) + newcon->index = c->index; - if (_braille_register_console(newcon, c)) - return; + if (_braille_register_console(newcon, c)) + return; + + if (newcon->setup && + newcon->setup(newcon, c->options) != 0) + break; + } - if (newcon->setup && - newcon->setup(newcon, console_cmdline[i].options) != 0) - break; newcon->flags |= CON_ENABLED; if (i == selected_console) { newcon->flags |= CON_CONSDEV; -- cgit v1.2.3 From ccd41c86ad4d464d0ed4e48d80759ff85c2115b0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 25 Feb 2015 15:56:04 +0100 Subject: perf: Fix racy group access While looking at some fuzzer output I noticed that we do not hold any locks on leader->ctx and therefore the sibling_list iteration is unsafe. Acquire the relevant ctx->mutex before calling into the pmu specific code. Signed-off-by: Peter Zijlstra (Intel) Cc: Vince Weaver Cc: Jiri Olsa Cc: Sasha Levin Link: http://lkml.kernel.org/r/20150225151639.GL5029@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/events/core.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index b01dfb602db1..bb1a7c36e794 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7036,12 +7036,23 @@ EXPORT_SYMBOL_GPL(perf_pmu_unregister); static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) { + struct perf_event_context *ctx = NULL; int ret; if (!try_module_get(pmu->module)) return -ENODEV; + + if (event->group_leader != event) { + ctx = perf_event_ctx_lock(event->group_leader); + BUG_ON(!ctx); + } + event->pmu = pmu; ret = pmu->event_init(event); + + if (ctx) + perf_event_ctx_unlock(event->group_leader, ctx); + if (ret) module_put(pmu->module); -- cgit v1.2.3 From 34f439278cef7b1177f8ce24f9fc81dfc6221d3b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 20 Feb 2015 14:05:38 +0100 Subject: perf: Add per event clockid support While thinking on the whole clock discussion it occurred to me we have two distinct uses of time: 1) the tracking of event/ctx/cgroup enabled/running/stopped times which includes the self-monitoring support in struct perf_event_mmap_page. 2) the actual timestamps visible in the data records. And we've been conflating them. The first is all about tracking time deltas, nobody should really care in what time base that happens, it's all relative information, as long as it's internally consistent it works. The second however is what people are worried about when having to merge their data with external sources. And here we have the discussion on MONOTONIC vs MONOTONIC_RAW etc.. Where MONOTONIC is good for correlating between machines (static offset), MONOTONIC_RAW is required for correlating against a fixed rate hardware clock. 
This means configurability; now 1) makes that hard because it needs to be internally consistent across groups of unrelated events; which is why we had to have a global perf_clock(). However, for 2) it doesn't really matter, perf itself doesn't care what it writes into the buffer. The below patch makes the distinction between these two cases by adding perf_event_clock() which is used for the second case. It further makes this configurable on a per-event basis, but adds a few sanity checks such that we cannot combine events with different clocks in confusing ways. And since we then have per-event configurability we might as well retain the 'legacy' behaviour as a default. Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: John Stultz Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 14 ++++++-- include/linux/perf_event.h | 2 ++ include/uapi/linux/perf_event.h | 6 ++-- kernel/events/core.c | 77 ++++++++++++++++++++++++++++++++++++++-- 4 files changed, 91 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index ac41b3ad1fc9..0420ebcac116 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1978,13 +1978,23 @@ void arch_perf_update_userpage(struct perf_event *event, data = cyc2ns_read_begin(); + /* + * Internal timekeeping for enabled/running/stopped times + * is always in the local_clock domain. + */ userpg->cap_user_time = 1; userpg->time_mult = data->cyc2ns_mul; userpg->time_shift = data->cyc2ns_shift; userpg->time_offset = data->cyc2ns_offset - now; - userpg->cap_user_time_zero = 1; - userpg->time_zero = data->cyc2ns_offset; + /* + * cap_user_time_zero doesn't make sense when we're using a different + * time base for the records. + */ + if (event->clock == &local_clock) { + userpg->cap_user_time_zero = 1; + userpg->time_zero = data->cyc2ns_offset; + } cyc2ns_read_end(data); } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index b16eac5f54ce..401554074de9 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -173,6 +173,7 @@ struct perf_event; * pmu::capabilities flags */ #define PERF_PMU_CAP_NO_INTERRUPT 0x01 +#define PERF_PMU_CAP_NO_NMI 0x02 /** * struct pmu - generic performance monitoring unit @@ -457,6 +458,7 @@ struct perf_event { struct pid_namespace *ns; u64 id; + u64 (*clock)(void); perf_overflow_handler_t overflow_handler; void *overflow_handler_context; diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 1e3cd07cf76e..3bb40ddadbe5 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -326,7 +326,8 @@ struct perf_event_attr { exclude_callchain_user : 1, /* exclude user callchains */ mmap2 : 1, /* include mmap with inode data */ comm_exec : 1, /* flag comm events that are due to an exec */ - __reserved_1 : 39; + use_clockid : 1, /* use @clockid for time fields */ + __reserved_1 : 38; union { __u32 wakeup_events; /* wakeup every n events */ @@ -355,8 +356,7 @@ struct perf_event_attr { */ __u32 sample_stack_user; - /* Align to u64. 
*/ - __u32 __reserved_2; + __s32 clockid; /* * Defines set of regs to dump for each sample * state captured on: diff --git a/kernel/events/core.c b/kernel/events/core.c index bb1a7c36e794..c40c2cac2d8e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -327,6 +327,11 @@ static inline u64 perf_clock(void) return local_clock(); } +static inline u64 perf_event_clock(struct perf_event *event) +{ + return event->clock(); +} + static inline struct perf_cpu_context * __get_cpu_context(struct perf_event_context *ctx) { @@ -4762,7 +4767,7 @@ static void __perf_event_header__init_id(struct perf_event_header *header, } if (sample_type & PERF_SAMPLE_TIME) - data->time = perf_clock(); + data->time = perf_event_clock(event); if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) data->id = primary_event_id(event); @@ -5340,6 +5345,8 @@ static void perf_event_task_output(struct perf_event *event, task_event->event_id.tid = perf_event_tid(event, task); task_event->event_id.ptid = perf_event_tid(event, current); + task_event->event_id.time = perf_event_clock(event); + perf_output_put(&handle, task_event->event_id); perf_event__output_id_sample(event, &handle, &sample); @@ -5373,7 +5380,7 @@ static void perf_event_task(struct task_struct *task, /* .ppid */ /* .tid */ /* .ptid */ - .time = perf_clock(), + /* .time */ }, }; @@ -5749,7 +5756,7 @@ static void perf_log_throttle(struct perf_event *event, int enable) .misc = 0, .size = sizeof(throttle_event), }, - .time = perf_clock(), + .time = perf_event_clock(event), .id = primary_event_id(event), .stream_id = event->id, }; @@ -6293,6 +6300,8 @@ static int perf_swevent_init(struct perf_event *event) static struct pmu perf_swevent = { .task_ctx_nr = perf_sw_context, + .capabilities = PERF_PMU_CAP_NO_NMI, + .event_init = perf_swevent_init, .add = perf_swevent_add, .del = perf_swevent_del, @@ -6636,6 +6645,8 @@ static int cpu_clock_event_init(struct perf_event *event) static struct pmu perf_cpu_clock = { .task_ctx_nr = perf_sw_context, + .capabilities = PERF_PMU_CAP_NO_NMI, + .event_init = cpu_clock_event_init, .add = cpu_clock_event_add, .del = cpu_clock_event_del, @@ -6715,6 +6726,8 @@ static int task_clock_event_init(struct perf_event *event) static struct pmu perf_task_clock = { .task_ctx_nr = perf_sw_context, + .capabilities = PERF_PMU_CAP_NO_NMI, + .event_init = task_clock_event_init, .add = task_clock_event_add, .del = task_clock_event_del, @@ -7200,6 +7213,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, event->hw.target = task; } + event->clock = &local_clock; + if (parent_event) + event->clock = parent_event->clock; + if (!overflow_handler && parent_event) { overflow_handler = parent_event->overflow_handler; context = parent_event->overflow_handler_context; @@ -7422,6 +7439,12 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) if (output_event->cpu == -1 && output_event->ctx != event->ctx) goto out; + /* + * Mixing clocks in the same buffer is trouble you don't need. 
+ */ + if (output_event->clock != event->clock) + goto out; + set: mutex_lock(&event->mmap_mutex); /* Can't redirect output if we've got an active mmap() */ @@ -7454,6 +7477,43 @@ static void mutex_lock_double(struct mutex *a, struct mutex *b) mutex_lock_nested(b, SINGLE_DEPTH_NESTING); } +static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id) +{ + bool nmi_safe = false; + + switch (clk_id) { + case CLOCK_MONOTONIC: + event->clock = &ktime_get_mono_fast_ns; + nmi_safe = true; + break; + + case CLOCK_MONOTONIC_RAW: + event->clock = &ktime_get_raw_fast_ns; + nmi_safe = true; + break; + + case CLOCK_REALTIME: + event->clock = &ktime_get_real_ns; + break; + + case CLOCK_BOOTTIME: + event->clock = &ktime_get_boot_ns; + break; + + case CLOCK_TAI: + event->clock = &ktime_get_tai_ns; + break; + + default: + return -EINVAL; + } + + if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI)) + return -EINVAL; + + return 0; +} + /** * sys_perf_event_open - open a performance event, associate it to a task/cpu * @@ -7569,6 +7629,12 @@ SYSCALL_DEFINE5(perf_event_open, */ pmu = event->pmu; + if (attr.use_clockid) { + err = perf_event_set_clock(event, attr.clockid); + if (err) + goto err_alloc; + } + if (group_leader && (is_software_event(event) != is_software_event(group_leader))) { if (is_software_event(event)) { @@ -7618,6 +7684,11 @@ SYSCALL_DEFINE5(perf_event_open, */ if (group_leader->group_leader != group_leader) goto err_context; + + /* All events in a group should have the same clock */ + if (group_leader->clock != event->clock) + goto err_context; + /* * Do not allow to attach to a group in a different * task or CPU context: -- cgit v1.2.3 From 608cd71a9c7c9db76e78a792c5a4101e12fea43f Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 26 Mar 2015 19:53:57 -0700 Subject: tc: bpf: generalize pedit action existing TC action 'pedit' can munge any bits of the packet. Generalize it for use in bpf programs attached as cls_bpf and act_bpf via bpf_skb_store_bytes() helper function. Signed-off-by: Alexei Starovoitov Reviewed-by: Jiri Pirko Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 1 + kernel/bpf/verifier.c | 2 ++ net/core/filter.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 73 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 280a315de8d6..d5cda067115a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -59,6 +59,7 @@ enum bpf_arg_type { ARG_PTR_TO_STACK, /* any pointer to eBPF program stack */ ARG_CONST_STACK_SIZE, /* number of bytes accessed from stack */ + ARG_PTR_TO_CTX, /* pointer to context */ ARG_ANYTHING, /* any (initialized) argument is ok */ }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 27dc4ec58840..74aab6e0d964 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -168,6 +168,7 @@ enum bpf_func_id { BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */ BPF_FUNC_get_prandom_u32, /* u32 prandom_u32(void) */ BPF_FUNC_get_smp_processor_id, /* u32 raw_smp_processor_id(void) */ + BPF_FUNC_skb_store_bytes, /* int skb_store_bytes(skb, offset, from, len) */ __BPF_FUNC_MAX_ID, }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0e714f799ec0..630a7bac1e51 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -773,6 +773,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno, expected_type = CONST_IMM; } else if (arg_type == ARG_CONST_MAP_PTR) { expected_type = CONST_PTR_TO_MAP; + } else if (arg_type == ARG_PTR_TO_CTX) { + expected_type = PTR_TO_CTX; } else { verbose("unsupported arg_type %d\n", arg_type); return -EFAULT; diff --git a/net/core/filter.c b/net/core/filter.c index 32f43c59908c..444a07e4f68d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1175,6 +1175,56 @@ int sk_attach_bpf(u32 ufd, struct sock *sk) return 0; } +static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + unsigned int offset = (unsigned int) r2; + void *from = (void *) (long) r3; + unsigned int len = (unsigned int) r4; + char buf[16]; + void *ptr; + + /* bpf verifier guarantees that: + * 'from' pointer points to bpf program stack + * 'len' bytes of it were initialized + * 'len' > 0 + * 'skb' is a valid pointer to 'struct sk_buff' + * + * so check for invalid 'offset' and too large 'len' + */ + if (offset > 0xffff || len > sizeof(buf)) + return -EFAULT; + + if (skb_cloned(skb) && !skb_clone_writable(skb, offset + len)) + return -EFAULT; + + ptr = skb_header_pointer(skb, offset, len, buf); + if (unlikely(!ptr)) + return -EFAULT; + + skb_postpull_rcsum(skb, ptr, len); + + memcpy(ptr, from, len); + + if (ptr == buf) + /* skb_store_bits cannot return -EFAULT here */ + skb_store_bits(skb, offset, ptr, len); + + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->csum = csum_add(skb->csum, csum_partial(ptr, len, 0)); + return 0; +} + +const struct bpf_func_proto bpf_skb_store_bytes_proto = { + .func = bpf_skb_store_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_STACK, + .arg4_type = ARG_CONST_STACK_SIZE, +}; + static const struct bpf_func_proto * sk_filter_func_proto(enum bpf_func_id func_id) { @@ -1194,6 +1244,17 @@ sk_filter_func_proto(enum bpf_func_id func_id) } } +static const struct bpf_func_proto * +tc_cls_act_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_skb_store_bytes: + return &bpf_skb_store_bytes_proto; + default: + return 
sk_filter_func_proto(func_id); + } +} + static bool sk_filter_is_valid_access(int off, int size, enum bpf_access_type type) { @@ -1270,18 +1331,24 @@ static const struct bpf_verifier_ops sk_filter_ops = { .convert_ctx_access = sk_filter_convert_ctx_access, }; +static const struct bpf_verifier_ops tc_cls_act_ops = { + .get_func_proto = tc_cls_act_func_proto, + .is_valid_access = sk_filter_is_valid_access, + .convert_ctx_access = sk_filter_convert_ctx_access, +}; + static struct bpf_prog_type_list sk_filter_type __read_mostly = { .ops = &sk_filter_ops, .type = BPF_PROG_TYPE_SOCKET_FILTER, }; static struct bpf_prog_type_list sched_cls_type __read_mostly = { - .ops = &sk_filter_ops, + .ops = &tc_cls_act_ops, .type = BPF_PROG_TYPE_SCHED_CLS, }; static struct bpf_prog_type_list sched_act_type __read_mostly = { - .ops = &sk_filter_ops, + .ops = &tc_cls_act_ops, .type = BPF_PROG_TYPE_SCHED_ACT, }; -- cgit v1.2.3 From d631c8cceb1d1d06f372878935949d421585186b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 27 Mar 2015 17:39:49 -0400 Subject: ring-buffer: Remove duplicate use of '&' in recursive code A clean up of the recursive protection code changed val = this_cpu_read(current_context); val--; val &= this_cpu_read(current_context); to val = this_cpu_read(current_context); val &= val & (val - 1); Which has a duplicate use of '&' as the above is the same as val = val & (val - 1); Actually, it would be best to remove that line altogether and just add it to where it is used. And Christoph even mentioned that it can be further compacted to just a single line: __this_cpu_and(current_context, __this_cpu_read(current_context) - 1); Link: http://lkml.kernel.org/alpine.DEB.2.11.1503271423580.23114@gentwo.org Suggested-by: Christoph Lameter Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 922048a0f7ea..0315d43176d8 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2703,10 +2703,7 @@ static __always_inline int trace_recursive_lock(void) static __always_inline void trace_recursive_unlock(void) { - unsigned int val = __this_cpu_read(current_context); - - val &= val & (val - 1); - __this_cpu_write(current_context, val); + __this_cpu_and(current_context, __this_cpu_read(current_context) - 1); } #else -- cgit v1.2.3 From 72cbbc8994242b5b43753738c01bf07bf29cb70d Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 25 Mar 2015 12:49:19 -0700 Subject: tracing: Add kprobe flag add TRACE_EVENT_FL_KPROBE flag to differentiate kprobe type of tracepoints, since bpf programs can only be attached to kprobe type of PERF_TYPE_TRACEPOINT perf events. Signed-off-by: Alexei Starovoitov Reviewed-by: Steven Rostedt Reviewed-by: Masami Hiramatsu Cc: Andrew Morton Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: David S. 
Miller Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1427312966-8434-3-git-send-email-ast@plumgrid.com Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 3 +++ kernel/trace/trace_kprobe.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index c674ee8f7fca..77325e1a1816 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -252,6 +252,7 @@ enum { TRACE_EVENT_FL_WAS_ENABLED_BIT, TRACE_EVENT_FL_USE_CALL_FILTER_BIT, TRACE_EVENT_FL_TRACEPOINT_BIT, + TRACE_EVENT_FL_KPROBE_BIT, }; /* @@ -265,6 +266,7 @@ enum { * it is best to clear the buffers that used it). * USE_CALL_FILTER - For ftrace internal events, don't use file filter * TRACEPOINT - Event is a tracepoint + * KPROBE - Event is a kprobe */ enum { TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT), @@ -274,6 +276,7 @@ enum { TRACE_EVENT_FL_WAS_ENABLED = (1 << TRACE_EVENT_FL_WAS_ENABLED_BIT), TRACE_EVENT_FL_USE_CALL_FILTER = (1 << TRACE_EVENT_FL_USE_CALL_FILTER_BIT), TRACE_EVENT_FL_TRACEPOINT = (1 << TRACE_EVENT_FL_TRACEPOINT_BIT), + TRACE_EVENT_FL_KPROBE = (1 << TRACE_EVENT_FL_KPROBE_BIT), }; struct ftrace_event_call { diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index d73f565b4e06..8fa549f6f528 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1286,7 +1286,7 @@ static int register_kprobe_event(struct trace_kprobe *tk) kfree(call->print_fmt); return -ENODEV; } - call->flags = 0; + call->flags = TRACE_EVENT_FL_KPROBE; call->class->reg = kprobe_register; call->data = tk; ret = trace_add_event_call(call); -- cgit v1.2.3 From 2541517c32be2531e0da59dfd7efc1ce844644f5 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 25 Mar 2015 12:49:20 -0700 Subject: tracing, perf: Implement BPF programs attached to kprobes BPF programs, attached to kprobes, provide a safe way to execute user-defined BPF byte-code programs without being able to crash or hang the kernel in any way. The BPF engine makes sure that such programs have a finite execution time and that they cannot break out of their sandbox. The user interface is to attach to a kprobe via the perf syscall: struct perf_event_attr attr = { .type = PERF_TYPE_TRACEPOINT, .config = event_id, ... }; event_fd = perf_event_open(&attr,...); ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd); 'prog_fd' is a file descriptor associated with BPF program previously loaded. 'event_id' is an ID of the kprobe created. Closing 'event_fd': close(event_fd); ... automatically detaches BPF program from it. BPF programs can call in-kernel helper functions to: - lookup/update/delete elements in maps - probe_read - wrapper of probe_kernel_read() used to access any kernel data structures BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is architecture dependent) and return 0 to ignore the event and 1 to store kprobe event into the ring buffer. Note, kprobes are fundamentally _not_ a stable kernel ABI, so BPF programs attached to kprobes must be recompiled for every kernel version and user must supply correct LINUX_VERSION_CODE in attr.kern_version during bpf_prog_load() call. Signed-off-by: Alexei Starovoitov Reviewed-by: Steven Rostedt Reviewed-by: Masami Hiramatsu Cc: Andrew Morton Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: David S. 
Miller Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 11 ++++ include/uapi/linux/bpf.h | 3 + include/uapi/linux/perf_event.h | 1 + kernel/bpf/syscall.c | 7 ++- kernel/events/core.c | 59 ++++++++++++++++++ kernel/trace/Makefile | 1 + kernel/trace/bpf_trace.c | 130 ++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace_kprobe.c | 8 +++ 8 files changed, 219 insertions(+), 1 deletion(-) create mode 100644 kernel/trace/bpf_trace.c (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 77325e1a1816..0aa535bc9f05 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -13,6 +13,7 @@ struct trace_array; struct trace_buffer; struct tracer; struct dentry; +struct bpf_prog; struct trace_print_flags { unsigned long mask; @@ -306,6 +307,7 @@ struct ftrace_event_call { #ifdef CONFIG_PERF_EVENTS int perf_refcount; struct hlist_head __percpu *perf_events; + struct bpf_prog *prog; int (*perf_perm)(struct ftrace_event_call *, struct perf_event *); @@ -551,6 +553,15 @@ event_trigger_unlock_commit_regs(struct ftrace_event_file *file, event_triggers_post_call(file, tt); } +#ifdef CONFIG_BPF_SYSCALL +unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx); +#else +static inline unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) +{ + return 1; +} +#endif + enum { FILTER_OTHER = 0, FILTER_STATIC_STRING, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 45da7ec7d274..b2948feeb70b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -118,6 +118,7 @@ enum bpf_map_type { enum bpf_prog_type { BPF_PROG_TYPE_UNSPEC, BPF_PROG_TYPE_SOCKET_FILTER, + BPF_PROG_TYPE_KPROBE, }; /* flags for BPF_MAP_UPDATE_ELEM command */ @@ -151,6 +152,7 @@ union bpf_attr { __u32 log_level; /* verbosity level of verifier */ __u32 log_size; /* size of user buffer */ __aligned_u64 log_buf; /* user supplied buffer */ + __u32 kern_version; /* checked when prog_type=kprobe */ }; } __attribute__((aligned(8))); @@ -162,6 +164,7 @@ enum bpf_func_id { BPF_FUNC_map_lookup_elem, /* void *map_lookup_elem(&map, &key) */ BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */ BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */ + BPF_FUNC_probe_read, /* int bpf_probe_read(void *dst, int size, void *src) */ __BPF_FUNC_MAX_ID, }; diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 3bb40ddadbe5..91803e54ee73 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -381,6 +381,7 @@ struct perf_event_attr { #define PERF_EVENT_IOC_SET_OUTPUT _IO ('$', 5) #define PERF_EVENT_IOC_SET_FILTER _IOW('$', 6, char *) #define PERF_EVENT_IOC_ID _IOR('$', 7, __u64 *) +#define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32) enum perf_event_ioc_flags { PERF_IOC_FLAG_GROUP = 1U << 0, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 536edc2be307..504c10b990ef 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -16,6 +16,7 @@ #include #include #include +#include static LIST_HEAD(bpf_map_types); @@ -467,7 +468,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd) } /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD log_buf +#define BPF_PROG_LOAD_LAST_FIELD kern_version static int bpf_prog_load(union bpf_attr *attr) { @@ 
-492,6 +493,10 @@ static int bpf_prog_load(union bpf_attr *attr) if (attr->insn_cnt >= BPF_MAXINSNS) return -EINVAL; + if (type == BPF_PROG_TYPE_KPROBE && + attr->kern_version != LINUX_VERSION_CODE) + return -EINVAL; + /* plain bpf_prog allocation */ prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); if (!prog) diff --git a/kernel/events/core.c b/kernel/events/core.c index c40c2cac2d8e..5c13862d3e85 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -42,6 +42,8 @@ #include #include #include +#include +#include #include "internal.h" @@ -3407,6 +3409,7 @@ errout: } static void perf_event_free_filter(struct perf_event *event); +static void perf_event_free_bpf_prog(struct perf_event *event); static void free_event_rcu(struct rcu_head *head) { @@ -3416,6 +3419,7 @@ static void free_event_rcu(struct rcu_head *head) if (event->ns) put_pid_ns(event->ns); perf_event_free_filter(event); + perf_event_free_bpf_prog(event); kfree(event); } @@ -3928,6 +3932,7 @@ static inline int perf_fget_light(int fd, struct fd *p) static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event); static int perf_event_set_filter(struct perf_event *event, void __user *arg); +static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd); static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) { @@ -3981,6 +3986,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon case PERF_EVENT_IOC_SET_FILTER: return perf_event_set_filter(event, (void __user *)arg); + case PERF_EVENT_IOC_SET_BPF: + return perf_event_set_bpf_prog(event, arg); + default: return -ENOTTY; } @@ -6455,6 +6463,49 @@ static void perf_event_free_filter(struct perf_event *event) ftrace_profile_free_filter(event); } +static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) +{ + struct bpf_prog *prog; + + if (event->attr.type != PERF_TYPE_TRACEPOINT) + return -EINVAL; + + if (event->tp_event->prog) + return -EEXIST; + + if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) + /* bpf programs can only be attached to kprobes */ + return -EINVAL; + + prog = bpf_prog_get(prog_fd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + if (prog->aux->prog_type != BPF_PROG_TYPE_KPROBE) { + /* valid fd, but invalid bpf program type */ + bpf_prog_put(prog); + return -EINVAL; + } + + event->tp_event->prog = prog; + + return 0; +} + +static void perf_event_free_bpf_prog(struct perf_event *event) +{ + struct bpf_prog *prog; + + if (!event->tp_event) + return; + + prog = event->tp_event->prog; + if (prog) { + event->tp_event->prog = NULL; + bpf_prog_put(prog); + } +} + #else static inline void perf_tp_register(void) @@ -6470,6 +6521,14 @@ static void perf_event_free_filter(struct perf_event *event) { } +static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) +{ + return -ENOENT; +} + +static void perf_event_free_bpf_prog(struct perf_event *event) +{ +} #endif /* CONFIG_EVENT_TRACING */ #ifdef CONFIG_HAVE_HW_BREAKPOINT diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 98f26588255e..c575a300103b 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -53,6 +53,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o +obj-$(CONFIG_BPF_SYSCALL) += bpf_trace.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o obj-$(CONFIG_TRACEPOINTS) += power-traces.o ifeq ($(CONFIG_PM),y) diff --git 
a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c new file mode 100644 index 000000000000..f1e87da91da3 --- /dev/null +++ b/kernel/trace/bpf_trace.c @@ -0,0 +1,130 @@ +/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include "trace.h" + +static DEFINE_PER_CPU(int, bpf_prog_active); + +/** + * trace_call_bpf - invoke BPF program + * @prog: BPF program + * @ctx: opaque context pointer + * + * kprobe handlers execute BPF programs via this helper. + * Can be used from static tracepoints in the future. + * + * Return: BPF programs always return an integer which is interpreted by + * kprobe handler as: + * 0 - return from kprobe (event is filtered out) + * 1 - store kprobe event into ring buffer + * Other values are reserved and currently alias to 1 + */ +unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) +{ + unsigned int ret; + + if (in_nmi()) /* not supported yet */ + return 1; + + preempt_disable(); + + if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) { + /* + * since some bpf program is already running on this cpu, + * don't call into another bpf program (same or different) + * and don't send kprobe event into ring-buffer, + * so return zero here + */ + ret = 0; + goto out; + } + + rcu_read_lock(); + ret = BPF_PROG_RUN(prog, ctx); + rcu_read_unlock(); + + out: + __this_cpu_dec(bpf_prog_active); + preempt_enable(); + + return ret; +} +EXPORT_SYMBOL_GPL(trace_call_bpf); + +static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + void *dst = (void *) (long) r1; + int size = (int) r2; + void *unsafe_ptr = (void *) (long) r3; + + return probe_kernel_read(dst, unsafe_ptr, size); +} + +static const struct bpf_func_proto bpf_probe_read_proto = { + .func = bpf_probe_read, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_STACK, + .arg2_type = ARG_CONST_STACK_SIZE, + .arg3_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_map_lookup_elem: + return &bpf_map_lookup_elem_proto; + case BPF_FUNC_map_update_elem: + return &bpf_map_update_elem_proto; + case BPF_FUNC_map_delete_elem: + return &bpf_map_delete_elem_proto; + case BPF_FUNC_probe_read: + return &bpf_probe_read_proto; + default: + return NULL; + } +} + +/* bpf+kprobe programs can access fields of 'struct pt_regs' */ +static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type) +{ + /* check bounds */ + if (off < 0 || off >= sizeof(struct pt_regs)) + return false; + + /* only read is allowed */ + if (type != BPF_READ) + return false; + + /* disallow misaligned access */ + if (off % size != 0) + return false; + + return true; +} + +static struct bpf_verifier_ops kprobe_prog_ops = { + .get_func_proto = kprobe_prog_func_proto, + .is_valid_access = kprobe_prog_is_valid_access, +}; + +static struct bpf_prog_type_list kprobe_tl = { + .ops = &kprobe_prog_ops, + .type = BPF_PROG_TYPE_KPROBE, +}; + +static int __init register_kprobe_prog_ops(void) +{ + bpf_register_prog_type(&kprobe_tl); + return 0; +} +late_initcall(register_kprobe_prog_ops); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 8fa549f6f528..dc3462507d7c 100644 --- a/kernel/trace/trace_kprobe.c +++ 
b/kernel/trace/trace_kprobe.c @@ -1134,11 +1134,15 @@ static void kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) { struct ftrace_event_call *call = &tk->tp.call; + struct bpf_prog *prog = call->prog; struct kprobe_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; int rctx; + if (prog && !trace_call_bpf(prog, regs)) + return; + head = this_cpu_ptr(call->perf_events); if (hlist_empty(head)) return; @@ -1165,11 +1169,15 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, struct pt_regs *regs) { struct ftrace_event_call *call = &tk->tp.call; + struct bpf_prog *prog = call->prog; struct kretprobe_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; int rctx; + if (prog && !trace_call_bpf(prog, regs)) + return; + head = this_cpu_ptr(call->perf_events); if (hlist_empty(head)) return; -- cgit v1.2.3 From d9847d310ab4003725e6ed1822682e24bd406908 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 25 Mar 2015 12:49:21 -0700 Subject: tracing: Allow BPF programs to call bpf_ktime_get_ns() bpf_ktime_get_ns() is used by programs to compute time delta between events or as a timestamp Signed-off-by: Alexei Starovoitov Reviewed-by: Steven Rostedt Cc: Andrew Morton Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: David S. Miller Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1427312966-8434-5-git-send-email-ast@plumgrid.com Signed-off-by: Ingo Molnar --- include/uapi/linux/bpf.h | 1 + kernel/trace/bpf_trace.c | 14 ++++++++++++++ 2 files changed, 15 insertions(+) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b2948feeb70b..238c6883877b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -165,6 +165,7 @@ enum bpf_func_id { BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */ BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */ BPF_FUNC_probe_read, /* int bpf_probe_read(void *dst, int size, void *src) */ + BPF_FUNC_ktime_get_ns, /* u64 bpf_ktime_get_ns(void) */ __BPF_FUNC_MAX_ID, }; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f1e87da91da3..8f5787294971 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -78,6 +78,18 @@ static const struct bpf_func_proto bpf_probe_read_proto = { .arg3_type = ARG_ANYTHING, }; +static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + /* NMI safe access to clock monotonic */ + return ktime_get_mono_fast_ns(); +} + +static const struct bpf_func_proto bpf_ktime_get_ns_proto = { + .func = bpf_ktime_get_ns, + .gpl_only = true, + .ret_type = RET_INTEGER, +}; + static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -89,6 +101,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_map_delete_elem_proto; case BPF_FUNC_probe_read: return &bpf_probe_read_proto; + case BPF_FUNC_ktime_get_ns: + return &bpf_ktime_get_ns_proto; default: return NULL; } -- cgit v1.2.3 From 9c959c863f8217a2ff3d7c296e8223654d240569 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 25 Mar 2015 12:49:22 -0700 Subject: tracing: Allow BPF programs to call bpf_trace_printk() Debugging of BPF programs needs some form of printk from the program, so let programs call limited trace_printk() with %d %u %x %p modifiers only. 
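As a rough illustration of how a program can use these helpers, the sketch below is written in the restricted C that LLVM's BPF backend compiles. The SEC()/license/version conventions and the fixed-id helper wrappers are assumptions modelled on the samples/bpf loaders of that period, not something these patches define:

#include <linux/ptrace.h>
#include <linux/version.h>
#include <uapi/linux/bpf.h>

#define SEC(NAME) __attribute__((section(NAME), used))

/* calling through a constant function pointer emits a BPF call to that helper id */
static unsigned long long (*bpf_ktime_get_ns)(void) =
	(void *) BPF_FUNC_ktime_get_ns;
static int (*bpf_trace_printk)(const char *fmt, int fmt_size, ...) =
	(void *) BPF_FUNC_trace_printk;

SEC("kprobe/sys_write")
int trace_write(struct pt_regs *ctx)
{
	/* only %d %u %x (optionally with l/ll) and a terminal %p are accepted */
	char fmt[] = "sys_write at %llu ns\n";

	bpf_trace_printk(fmt, sizeof(fmt), bpf_ktime_get_ns());
	return 1;	/* 1: keep the kprobe event, 0: filter it out */
}

/* both helpers are gpl_only, so the program must carry a GPL license */
char _license[] SEC("license") = "GPL";
/* sample loaders read this to set attr.kern_version, which kprobe programs must match */
unsigned int _version SEC("version") = LINUX_VERSION_CODE;

The bpf_trace_printk() call in such a program is exactly what the verifier looks for at load time.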
Similar to kernel modules, during program load verifier checks whether program is calling bpf_trace_printk() and if so, kernel allocates trace_printk buffers and emits big 'this is debug only' banner. Signed-off-by: Alexei Starovoitov Reviewed-by: Steven Rostedt Cc: Andrew Morton Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: David S. Miller Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1427312966-8434-6-git-send-email-ast@plumgrid.com Signed-off-by: Ingo Molnar --- include/uapi/linux/bpf.h | 1 + kernel/trace/bpf_trace.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 238c6883877b..cc47ef41076a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -166,6 +166,7 @@ enum bpf_func_id { BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */ BPF_FUNC_probe_read, /* int bpf_probe_read(void *dst, int size, void *src) */ BPF_FUNC_ktime_get_ns, /* u64 bpf_ktime_get_ns(void) */ + BPF_FUNC_trace_printk, /* int bpf_trace_printk(const char *fmt, int fmt_size, ...) */ __BPF_FUNC_MAX_ID, }; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 8f5787294971..2d56ce501632 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "trace.h" static DEFINE_PER_CPU(int, bpf_prog_active); @@ -90,6 +91,74 @@ static const struct bpf_func_proto bpf_ktime_get_ns_proto = { .ret_type = RET_INTEGER, }; +/* + * limited trace_printk() + * only %d %u %x %ld %lu %lx %lld %llu %llx %p conversion specifiers allowed + */ +static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) +{ + char *fmt = (char *) (long) r1; + int mod[3] = {}; + int fmt_cnt = 0; + int i; + + /* + * bpf_check()->check_func_arg()->check_stack_boundary() + * guarantees that fmt points to bpf program stack, + * fmt_size bytes of it were initialized and fmt_size > 0 + */ + if (fmt[--fmt_size] != 0) + return -EINVAL; + + /* check format string for allowed specifiers */ + for (i = 0; i < fmt_size; i++) { + if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) + return -EINVAL; + + if (fmt[i] != '%') + continue; + + if (fmt_cnt >= 3) + return -EINVAL; + + /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */ + i++; + if (fmt[i] == 'l') { + mod[fmt_cnt]++; + i++; + } else if (fmt[i] == 'p') { + mod[fmt_cnt]++; + i++; + if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0) + return -EINVAL; + fmt_cnt++; + continue; + } + + if (fmt[i] == 'l') { + mod[fmt_cnt]++; + i++; + } + + if (fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x') + return -EINVAL; + fmt_cnt++; + } + + return __trace_printk(1/* fake ip will not be printed */, fmt, + mod[0] == 2 ? r3 : mod[0] == 1 ? (long) r3 : (u32) r3, + mod[1] == 2 ? r4 : mod[1] == 1 ? (long) r4 : (u32) r4, + mod[2] == 2 ? r5 : mod[2] == 1 ? 
(long) r5 : (u32) r5); +} + +static const struct bpf_func_proto bpf_trace_printk_proto = { + .func = bpf_trace_printk, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_STACK, + .arg2_type = ARG_CONST_STACK_SIZE, +}; + static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -103,6 +172,15 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_probe_read_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; + + case BPF_FUNC_trace_printk: + /* + * this program might be calling bpf_trace_printk, + * so allocate per-cpu printk buffers + */ + trace_printk_init_buffers(); + + return &bpf_trace_printk_proto; default: return NULL; } -- cgit v1.2.3 From e1abf2cc8d5d80b41c4419368ec743ccadbb131e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 2 Apr 2015 15:51:39 +0200 Subject: bpf: Fix the build on BPF_SYSCALL=y && !CONFIG_TRACING kernels, make it more configurable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So bpf_tracing.o depends on CONFIG_BPF_SYSCALL - but that's not its only dependency, it also depends on the tracing infrastructure and on kprobes, without which it will fail to build with: In file included from kernel/trace/bpf_trace.c:14:0: kernel/trace/trace.h: In function ‘trace_test_and_set_recursion’: kernel/trace/trace.h:491:28: error: ‘struct task_struct’ has no member named ‘trace_recursion’ unsigned int val = current->trace_recursion; [...] It took quite some time to trigger this build failure, because right now BPF_SYSCALL is very obscure, depends on CONFIG_EXPERT. So also make BPF_SYSCALL more configurable, not just under CONFIG_EXPERT. If BPF_SYSCALL, tracing and kprobes are enabled then enable the bpf_tracing gateway as well. We might want to make this an interactive option later on, although I'd not complicate it unnecessarily: enabling BPF_SYSCALL is enough of an indicator that the user wants BPF support. Cc: Alexei Starovoitov Cc: Andrew Morton Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: David S. Miller Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Steven Rostedt Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- init/Kconfig | 2 +- kernel/trace/Kconfig | 8 ++++++++ kernel/trace/Makefile | 2 +- 3 files changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/init/Kconfig b/init/Kconfig index f5dbc6d4261b..2b4d055aca4a 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1513,7 +1513,7 @@ config EVENTFD # syscall, maps, verifier config BPF_SYSCALL - bool "Enable bpf() system call" if EXPERT + bool "Enable bpf() system call" select ANON_INODES select BPF default n diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a5da09c899dd..c8e53c051293 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -432,6 +432,14 @@ config UPROBE_EVENT This option is required if you plan to use perf-probe subcommand of perf tools on user space applications. +config BPF_EVENTS + depends on BPF_SYSCALL + depends on KPROBE_EVENT + bool + default y + help + This allows the user to attach BPF programs to kprobe events. 
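On the user-space side, attaching comes down to loading a BPF_PROG_TYPE_KPROBE program via bpf(2) and handing the resulting fd to a kprobe perf event through the new ioctl. A minimal sketch, assuming a kprobe event has already been created under events/kprobes/ in tracefs and its numeric id read from the "id" file there (the helper name and the pid/cpu choices are illustrative, and error handling is omitted):

#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

/*
 * prog_fd is the fd returned by bpf(BPF_PROG_LOAD, ...); remember that the
 * loader must set attr.kern_version = LINUX_VERSION_CODE for kprobe programs,
 * as enforced in bpf_prog_load() above.
 */
static int attach_bpf_to_kprobe(int prog_fd, unsigned long long kprobe_id)
{
	struct perf_event_attr attr = {
		.type = PERF_TYPE_TRACEPOINT,	/* kprobe events are tracepoint-typed */
		.size = sizeof(attr),
		.config = kprobe_id,
		.sample_period = 1,
	};
	int efd = syscall(__NR_perf_event_open, &attr, -1 /* pid */, 0 /* cpu */,
			  -1 /* group_fd */, 0 /* flags */);

	if (efd < 0)
		return -1;

	/* point the event at the program; fails with -EEXIST if one is attached */
	return ioctl(efd, PERF_EVENT_IOC_SET_BPF, prog_fd);
}

Because perf_event_set_bpf_prog() above takes its own reference via bpf_prog_get(), the program fd can be closed once the ioctl succeeds.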
+ config PROBE_EVENTS def_bool n diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index c575a300103b..9b1044e936a6 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -53,7 +53,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o -obj-$(CONFIG_BPF_SYSCALL) += bpf_trace.o +obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o obj-$(CONFIG_TRACEPOINTS) += power-traces.o ifeq ($(CONFIG_PM),y) -- cgit v1.2.3 From e8c6deac69629c0cb97c3d3272f8631ef17f8f0f Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 14 Jan 2015 14:18:10 +0200 Subject: perf: Add data_{offset,size} to user_page Currently, the actual perf ring buffer is one page into the mmap area, following the user page and the userspace follows this convention. This patch adds data_{offset,size} fields to user_page that can be used by userspace instead for locating perf data in the mmap area. This is also helpful when mapping existing or shared buffers if their size is not known in advance. Right now, it is made to follow the existing convention that data_offset == PAGE_SIZE and data_offset + data_size == mmap_size. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kaixu Xia Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul Mackerras Cc: Robert Richter Cc: Stephane Eranian Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: kan.liang@intel.com Cc: markus.t.metzger@intel.com Cc: mathieu.poirier@linaro.org Link: http://lkml.kernel.org/r/1421237903-181015-2-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- include/uapi/linux/perf_event.h | 5 +++++ kernel/events/core.c | 2 ++ 2 files changed, 7 insertions(+) (limited to 'kernel') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 91803e54ee73..86c44ae66d43 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -522,9 +522,14 @@ struct perf_event_mmap_page { * In this case the kernel will not over-write unread data. * * See perf_output_put_handle() for the data ordering. + * + * data_{offset,size} indicate the location and size of the perf record + * buffer within the mmapped area. */ __u64 data_head; /* head in the data section */ __u64 data_tail; /* user-space written tail */ + __u64 data_offset; /* where the buffer starts */ + __u64 data_size; /* data buffer size */ }; #define PERF_RECORD_MISC_CPUMODE_MASK (7 << 0) diff --git a/kernel/events/core.c b/kernel/events/core.c index 5c13862d3e85..6efa516f1ab8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4105,6 +4105,8 @@ static void perf_event_init_userpage(struct perf_event *event) /* Allow new userspace to detect that bit 0 is deprecated */ userpg->cap_bit0_is_deprecated = 1; userpg->size = offsetof(struct perf_event_mmap_page, __reserved); + userpg->data_offset = PAGE_SIZE; + userpg->data_size = perf_data_size(rb); unlock: rcu_read_unlock(); -- cgit v1.2.3 From 45bfb2e50471abbbfd83d40d28c986078b0d24ff Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 14 Jan 2015 14:18:11 +0200 Subject: perf: Add AUX area to ring buffer for raw data streams This patch introduces "AUX space" in the perf mmap buffer, intended for exporting high bandwidth data streams to userspace, such as instruction flow traces. 
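The practical effect of the data_{offset,size} fields added above is that a consumer no longer has to hard-code the "data starts one page in" convention. A small reader sketch (the helper name is illustrative; on kernels that predate these fields the values read back as zero, hence the fallback):

#include <linux/perf_event.h>
#include <stdint.h>
#include <unistd.h>

/* locate the data section of an already mmap()ed perf buffer */
static void *perf_data_area(void *base, size_t mmap_len, uint64_t *data_len)
{
	struct perf_event_mmap_page *up = base;
	uint64_t off = up->data_offset ? up->data_offset
				       : (uint64_t) sysconf(_SC_PAGESIZE);

	*data_len = up->data_size ? up->data_size : mmap_len - off;
	return (char *) base + off;
}

The AUX commit below extends the user page in the same spirit.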
AUX space is a ring buffer, defined by aux_{offset,size} fields in the user_page structure, and read/write pointers aux_{head,tail}, which abide by the same rules as data_* counterparts of the main perf buffer. In order to allocate/mmap AUX, userspace needs to set up aux_offset to such an offset that will be greater than data_offset+data_size and aux_size to be the desired buffer size. Both need to be page aligned. Then, same aux_offset and aux_size should be passed to mmap() call and if everything adds up, you should have an AUX buffer as a result. Pages that are mapped into this buffer also come out of user's mlock rlimit plus perf_event_mlock_kb allowance. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Alexander Shishkin Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kaixu Xia Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul Mackerras Cc: Robert Richter Cc: Stephane Eranian Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: kan.liang@intel.com Cc: markus.t.metzger@intel.com Cc: mathieu.poirier@linaro.org Link: http://lkml.kernel.org/r/1421237903-181015-3-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 17 +++++ include/uapi/linux/perf_event.h | 16 +++++ kernel/events/core.c | 141 +++++++++++++++++++++++++++++++++------- kernel/events/internal.h | 23 +++++++ kernel/events/ring_buffer.c | 97 +++++++++++++++++++++++++-- 5 files changed, 265 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 401554074de9..5a94f6d6fa91 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -284,6 +284,18 @@ struct pmu { * Return the count value for a counter. */ u64 (*count) (struct perf_event *event); /*optional*/ + + /* + * Set up pmu-private data structures for an AUX area + */ + void *(*setup_aux) (int cpu, void **pages, + int nr_pages, bool overwrite); + /* optional */ + + /* + * Free pmu-private AUX data structures + */ + void (*free_aux) (void *aux); /* optional */ }; /** @@ -862,6 +874,11 @@ static inline bool needs_branch_stack(struct perf_event *event) return event->attr.branch_sample_type != 0; } +static inline bool has_aux(struct perf_event *event) +{ + return event->pmu->setup_aux; +} + extern int perf_output_begin(struct perf_output_handle *handle, struct perf_event *event, unsigned int size); extern void perf_output_end(struct perf_output_handle *handle); diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 86c44ae66d43..6c5013a71714 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -530,6 +530,22 @@ struct perf_event_mmap_page { __u64 data_tail; /* user-space written tail */ __u64 data_offset; /* where the buffer starts */ __u64 data_size; /* data buffer size */ + + /* + * AUX area is defined by aux_{offset,size} fields that should be set + * by the userspace, so that + * + * aux_offset >= data_offset + data_size + * + * prior to mmap()ing it. Size of the mmap()ed area should be aux_size. + * + * Ring buffer pointers aux_{head,tail} have the same semantics as + * data_{head,tail} and same ordering rules apply. 
+ */ + __u64 aux_head; + __u64 aux_tail; + __u64 aux_offset; + __u64 aux_size; }; #define PERF_RECORD_MISC_CPUMODE_MASK (7 << 0) diff --git a/kernel/events/core.c b/kernel/events/core.c index 6efa516f1ab8..da51128c337a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4306,6 +4306,9 @@ static void perf_mmap_open(struct vm_area_struct *vma) atomic_inc(&event->mmap_count); atomic_inc(&event->rb->mmap_count); + if (vma->vm_pgoff) + atomic_inc(&event->rb->aux_mmap_count); + if (event->pmu->event_mapped) event->pmu->event_mapped(event); } @@ -4330,6 +4333,20 @@ static void perf_mmap_close(struct vm_area_struct *vma) if (event->pmu->event_unmapped) event->pmu->event_unmapped(event); + /* + * rb->aux_mmap_count will always drop before rb->mmap_count and + * event->mmap_count, so it is ok to use event->mmap_mutex to + * serialize with perf_mmap here. + */ + if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff && + atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) { + atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm); + vma->vm_mm->pinned_vm -= rb->aux_mmap_locked; + + rb_free_aux(rb); + mutex_unlock(&event->mmap_mutex); + } + atomic_dec(&rb->mmap_count); if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) @@ -4403,7 +4420,7 @@ out_put: static const struct vm_operations_struct perf_mmap_vmops = { .open = perf_mmap_open, - .close = perf_mmap_close, + .close = perf_mmap_close, /* non mergable */ .fault = perf_mmap_fault, .page_mkwrite = perf_mmap_fault, }; @@ -4414,10 +4431,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) unsigned long user_locked, user_lock_limit; struct user_struct *user = current_user(); unsigned long locked, lock_limit; - struct ring_buffer *rb; + struct ring_buffer *rb = NULL; unsigned long vma_size; unsigned long nr_pages; - long user_extra, extra; + long user_extra = 0, extra = 0; int ret = 0, flags = 0; /* @@ -4432,7 +4449,66 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) return -EINVAL; vma_size = vma->vm_end - vma->vm_start; - nr_pages = (vma_size / PAGE_SIZE) - 1; + + if (vma->vm_pgoff == 0) { + nr_pages = (vma_size / PAGE_SIZE) - 1; + } else { + /* + * AUX area mapping: if rb->aux_nr_pages != 0, it's already + * mapped, all subsequent mappings should have the same size + * and offset. Must be above the normal perf buffer. 
+ */ + u64 aux_offset, aux_size; + + if (!event->rb) + return -EINVAL; + + nr_pages = vma_size / PAGE_SIZE; + + mutex_lock(&event->mmap_mutex); + ret = -EINVAL; + + rb = event->rb; + if (!rb) + goto aux_unlock; + + aux_offset = ACCESS_ONCE(rb->user_page->aux_offset); + aux_size = ACCESS_ONCE(rb->user_page->aux_size); + + if (aux_offset < perf_data_size(rb) + PAGE_SIZE) + goto aux_unlock; + + if (aux_offset != vma->vm_pgoff << PAGE_SHIFT) + goto aux_unlock; + + /* already mapped with a different offset */ + if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff) + goto aux_unlock; + + if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE) + goto aux_unlock; + + /* already mapped with a different size */ + if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages) + goto aux_unlock; + + if (!is_power_of_2(nr_pages)) + goto aux_unlock; + + if (!atomic_inc_not_zero(&rb->mmap_count)) + goto aux_unlock; + + if (rb_has_aux(rb)) { + atomic_inc(&rb->aux_mmap_count); + ret = 0; + goto unlock; + } + + atomic_set(&rb->aux_mmap_count, 1); + user_extra = nr_pages; + + goto accounting; + } /* * If we have rb pages ensure they're a power-of-two number, so we @@ -4444,9 +4520,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (vma_size != PAGE_SIZE * (1 + nr_pages)) return -EINVAL; - if (vma->vm_pgoff != 0) - return -EINVAL; - WARN_ON_ONCE(event->ctx->parent_ctx); again: mutex_lock(&event->mmap_mutex); @@ -4470,6 +4543,8 @@ again: } user_extra = nr_pages + 1; + +accounting: user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); /* @@ -4479,7 +4554,6 @@ again: user_locked = atomic_long_read(&user->locked_vm) + user_extra; - extra = 0; if (user_locked > user_lock_limit) extra = user_locked - user_lock_limit; @@ -4493,35 +4567,45 @@ again: goto unlock; } - WARN_ON(event->rb); + WARN_ON(!rb && event->rb); if (vma->vm_flags & VM_WRITE) flags |= RING_BUFFER_WRITABLE; - rb = rb_alloc(nr_pages, - event->attr.watermark ? event->attr.wakeup_watermark : 0, - event->cpu, flags); - if (!rb) { - ret = -ENOMEM; - goto unlock; - } + rb = rb_alloc(nr_pages, + event->attr.watermark ? 
event->attr.wakeup_watermark : 0, + event->cpu, flags); - atomic_set(&rb->mmap_count, 1); - rb->mmap_locked = extra; - rb->mmap_user = get_current_user(); + if (!rb) { + ret = -ENOMEM; + goto unlock; + } - atomic_long_add(user_extra, &user->locked_vm); - vma->vm_mm->pinned_vm += extra; + atomic_set(&rb->mmap_count, 1); + rb->mmap_user = get_current_user(); + rb->mmap_locked = extra; - ring_buffer_attach(event, rb); + ring_buffer_attach(event, rb); - perf_event_init_userpage(event); - perf_event_update_userpage(event); + perf_event_init_userpage(event); + perf_event_update_userpage(event); + } else { + ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, flags); + if (!ret) + rb->aux_mmap_locked = extra; + } unlock: - if (!ret) + if (!ret) { + atomic_long_add(user_extra, &user->locked_vm); + vma->vm_mm->pinned_vm += extra; + atomic_inc(&event->mmap_count); + } else if (rb) { + atomic_dec(&rb->mmap_count); + } +aux_unlock: mutex_unlock(&event->mmap_mutex); /* @@ -7506,6 +7590,13 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) if (output_event->clock != event->clock) goto out; + /* + * If both events generate aux data, they must be on the same PMU + */ + if (has_aux(event) && has_aux(output_event) && + event->pmu != output_event->pmu) + goto out; + set: mutex_lock(&event->mmap_mutex); /* Can't redirect output if we've got an active mmap() */ diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 569b218782ad..0f6d08015927 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -35,6 +35,16 @@ struct ring_buffer { unsigned long mmap_locked; struct user_struct *mmap_user; + /* AUX area */ + unsigned long aux_pgoff; + int aux_nr_pages; + atomic_t aux_mmap_count; + unsigned long aux_mmap_locked; + void (*free_aux)(void *); + atomic_t aux_refcount; + void **aux_pages; + void *aux_priv; + struct perf_event_mmap_page *user_page; void *data_pages[0]; }; @@ -43,6 +53,14 @@ extern void rb_free(struct ring_buffer *rb); extern struct ring_buffer * rb_alloc(int nr_pages, long watermark, int cpu, int flags); extern void perf_event_wakeup(struct perf_event *event); +extern int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, + pgoff_t pgoff, int nr_pages, int flags); +extern void rb_free_aux(struct ring_buffer *rb); + +static inline bool rb_has_aux(struct ring_buffer *rb) +{ + return !!rb->aux_nr_pages; +} extern void perf_event_header__init_id(struct perf_event_header *header, @@ -81,6 +99,11 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb) return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); } +static inline unsigned long perf_aux_size(struct ring_buffer *rb) +{ + return rb->aux_nr_pages << PAGE_SHIFT; +} + #define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ static inline unsigned long \ func_name(struct perf_output_handle *handle, \ diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index eadb95ce7aac..3de9c4e9ea9f 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -243,14 +243,87 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) spin_lock_init(&rb->event_lock); } +int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, + pgoff_t pgoff, int nr_pages, int flags) +{ + bool overwrite = !(flags & RING_BUFFER_WRITABLE); + int node = (event->cpu == -1) ? 
-1 : cpu_to_node(event->cpu); + int ret = -ENOMEM; + + if (!has_aux(event)) + return -ENOTSUPP; + + rb->aux_pages = kzalloc_node(nr_pages * sizeof(void *), GFP_KERNEL, node); + if (!rb->aux_pages) + return -ENOMEM; + + rb->free_aux = event->pmu->free_aux; + for (rb->aux_nr_pages = 0; rb->aux_nr_pages < nr_pages; + rb->aux_nr_pages++) { + struct page *page; + + page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); + if (!page) + goto out; + + rb->aux_pages[rb->aux_nr_pages] = page_address(page); + } + + rb->aux_priv = event->pmu->setup_aux(event->cpu, rb->aux_pages, nr_pages, + overwrite); + if (!rb->aux_priv) + goto out; + + ret = 0; + + /* + * aux_pages (and pmu driver's private data, aux_priv) will be + * referenced in both producer's and consumer's contexts, thus + * we keep a refcount here to make sure either of the two can + * reference them safely. + */ + atomic_set(&rb->aux_refcount, 1); + +out: + if (!ret) + rb->aux_pgoff = pgoff; + else + rb_free_aux(rb); + + return ret; +} + +static void __rb_free_aux(struct ring_buffer *rb) +{ + int pg; + + if (rb->aux_priv) { + rb->free_aux(rb->aux_priv); + rb->free_aux = NULL; + rb->aux_priv = NULL; + } + + for (pg = 0; pg < rb->aux_nr_pages; pg++) + free_page((unsigned long)rb->aux_pages[pg]); + + kfree(rb->aux_pages); + rb->aux_nr_pages = 0; +} + +void rb_free_aux(struct ring_buffer *rb) +{ + if (atomic_dec_and_test(&rb->aux_refcount)) + __rb_free_aux(rb); +} + #ifndef CONFIG_PERF_USE_VMALLOC /* * Back perf_mmap() with regular GFP_KERNEL-0 pages. */ -struct page * -perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) +static struct page * +__perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) { if (pgoff > rb->nr_pages) return NULL; @@ -340,8 +413,8 @@ static int data_page_nr(struct ring_buffer *rb) return rb->nr_pages << page_order(rb); } -struct page * -perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) +static struct page * +__perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) { /* The '>' counts in the user page. */ if (pgoff > data_page_nr(rb)) @@ -416,3 +489,19 @@ fail: } #endif + +struct page * +perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) +{ + if (rb->aux_nr_pages) { + /* above AUX space */ + if (pgoff > rb->aux_pgoff + rb->aux_nr_pages) + return NULL; + + /* AUX space */ + if (pgoff >= rb->aux_pgoff) + return virt_to_page(rb->aux_pages[pgoff - rb->aux_pgoff]); + } + + return __perf_mmap_to_page(rb, pgoff); +} -- cgit v1.2.3 From 0a4e38e64f5e91ce131cc42ee5bb3925377ec840 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 14 Jan 2015 14:18:12 +0200 Subject: perf: Support high-order allocations for AUX space Some pmus (such as BTS or Intel PT without multiple-entry ToPA capability) don't support scatter-gather and will prefer larger contiguous areas for their output regions. This patch adds a new pmu capability to request higher order allocations. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kaixu Xia Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul Mackerras Cc: Robert Richter Cc: Stephane Eranian Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: kan.liang@intel.com Cc: markus.t.metzger@intel.com Cc: mathieu.poirier@linaro.org Link: http://lkml.kernel.org/r/1421237903-181015-4-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 1 + kernel/events/ring_buffer.c | 56 ++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 51 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 5a94f6d6fa91..d5a4a8e95808 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -174,6 +174,7 @@ struct perf_event; */ #define PERF_PMU_CAP_NO_INTERRUPT 0x01 #define PERF_PMU_CAP_NO_NMI 0x02 +#define PERF_PMU_CAP_AUX_NO_SG 0x04 /** * struct pmu - generic performance monitoring unit diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 3de9c4e9ea9f..ed0859e33b2f 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -243,30 +243,74 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) spin_lock_init(&rb->event_lock); } +#define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY) + +static struct page *rb_alloc_aux_page(int node, int order) +{ + struct page *page; + + if (order > MAX_ORDER) + order = MAX_ORDER; + + do { + page = alloc_pages_node(node, PERF_AUX_GFP, order); + } while (!page && order--); + + if (page && order) { + /* + * Communicate the allocation size to the driver + */ + split_page(page, order); + SetPagePrivate(page); + set_page_private(page, order); + } + + return page; +} + +static void rb_free_aux_page(struct ring_buffer *rb, int idx) +{ + struct page *page = virt_to_page(rb->aux_pages[idx]); + + ClearPagePrivate(page); + page->mapping = NULL; + __free_page(page); +} + int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, pgoff_t pgoff, int nr_pages, int flags) { bool overwrite = !(flags & RING_BUFFER_WRITABLE); int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu); - int ret = -ENOMEM; + int ret = -ENOMEM, max_order = 0; if (!has_aux(event)) return -ENOTSUPP; + if (event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) + /* + * We need to start with the max_order that fits in nr_pages, + * not the other way around, hence ilog2() and not get_order. 
+ */ + max_order = ilog2(nr_pages); + rb->aux_pages = kzalloc_node(nr_pages * sizeof(void *), GFP_KERNEL, node); if (!rb->aux_pages) return -ENOMEM; rb->free_aux = event->pmu->free_aux; - for (rb->aux_nr_pages = 0; rb->aux_nr_pages < nr_pages; - rb->aux_nr_pages++) { + for (rb->aux_nr_pages = 0; rb->aux_nr_pages < nr_pages;) { struct page *page; + int last, order; - page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); + order = min(max_order, ilog2(nr_pages - rb->aux_nr_pages)); + page = rb_alloc_aux_page(node, order); if (!page) goto out; - rb->aux_pages[rb->aux_nr_pages] = page_address(page); + for (last = rb->aux_nr_pages + (1 << page_private(page)); + last > rb->aux_nr_pages; rb->aux_nr_pages++) + rb->aux_pages[rb->aux_nr_pages] = page_address(page++); } rb->aux_priv = event->pmu->setup_aux(event->cpu, rb->aux_pages, nr_pages, @@ -304,7 +348,7 @@ static void __rb_free_aux(struct ring_buffer *rb) } for (pg = 0; pg < rb->aux_nr_pages; pg++) - free_page((unsigned long)rb->aux_pages[pg]); + rb_free_aux_page(rb, pg); kfree(rb->aux_pages); rb->aux_nr_pages = 0; -- cgit v1.2.3 From 6a279230391b63130070e0219b0ad09d34d28c89 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 14 Jan 2015 14:18:13 +0200 Subject: perf: Add a capability for AUX_NO_SG pmus to do software double buffering For pmus that don't support scatter-gather for AUX data in hardware, it might still make sense to implement software double buffering to avoid losing data while the user is reading data out. For this purpose, add a pmu capability that guarantees multiple high-order chunks for AUX buffer, so that the pmu driver can do switchover tricks. To make use of this feature, add PERF_PMU_CAP_AUX_SW_DOUBLEBUF to your pmu's capability mask. This will make the ring buffer AUX allocation code ensure that the biggest high order allocation for the aux buffer pages is no bigger than half of the total requested buffer size, thus making sure that the buffer has at least two high order allocations. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kaixu Xia Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul Mackerras Cc: Robert Richter Cc: Stephane Eranian Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: kan.liang@intel.com Cc: markus.t.metzger@intel.com Cc: mathieu.poirier@linaro.org Link: http://lkml.kernel.org/r/1421237903-181015-5-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 1 + kernel/events/ring_buffer.c | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index d5a4a8e95808..13a1eb3a2a2d 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -175,6 +175,7 @@ struct perf_event; #define PERF_PMU_CAP_NO_INTERRUPT 0x01 #define PERF_PMU_CAP_NO_NMI 0x02 #define PERF_PMU_CAP_AUX_NO_SG 0x04 +#define PERF_PMU_CAP_AUX_SW_DOUBLEBUF 0x08 /** * struct pmu - generic performance monitoring unit diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index ed0859e33b2f..6e3be7a10c50 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -287,13 +287,26 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, if (!has_aux(event)) return -ENOTSUPP; - if (event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) + if (event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) { /* * We need to start with the max_order that fits in nr_pages, * not the other way around, hence ilog2() and not get_order. */ max_order = ilog2(nr_pages); + /* + * PMU requests more than one contiguous chunks of memory + * for SW double buffering + */ + if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_SW_DOUBLEBUF) && + !overwrite) { + if (!max_order) + return -EINVAL; + + max_order--; + } + } + rb->aux_pages = kzalloc_node(nr_pages * sizeof(void *), GFP_KERNEL, node); if (!rb->aux_pages) return -ENOMEM; -- cgit v1.2.3 From bed5b25ad9c8a2f5d735ef0bc746ec870c01c1b0 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Fri, 30 Jan 2015 12:31:06 +0200 Subject: perf: Add a pmu capability for "exclusive" events Usually, pmus that do, for example, instruction tracing, would only ever be able to have one event per task per cpu (or per perf_event_context). For such pmus it makes sense to disallow creating conflicting events early on, so as to provide consistent behavior for the user. This patch adds a pmu capability that indicates such constraint on event creation. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kaixu Xia Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul Mackerras Cc: Robert Richter Cc: Stephane Eranian Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: kan.liang@intel.com Cc: markus.t.metzger@intel.com Cc: mathieu.poirier@linaro.org Link: http://lkml.kernel.org/r/1422613866-113186-1-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 2 + kernel/events/core.c | 119 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 119 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 13a1eb3a2a2d..f936a1e51f29 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -176,6 +176,7 @@ struct perf_event; #define PERF_PMU_CAP_NO_NMI 0x02 #define PERF_PMU_CAP_AUX_NO_SG 0x04 #define PERF_PMU_CAP_AUX_SW_DOUBLEBUF 0x08 +#define PERF_PMU_CAP_EXCLUSIVE 0x10 /** * struct pmu - generic performance monitoring unit @@ -196,6 +197,7 @@ struct pmu { int * __percpu pmu_disable_count; struct perf_cpu_context * __percpu pmu_cpu_context; + atomic_t exclusive_cnt; /* < 0: cpu; > 0: tsk */ int task_ctx_nr; int hrtimer_interval_ms; diff --git a/kernel/events/core.c b/kernel/events/core.c index da51128c337a..6d9fdaef7b57 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3459,6 +3459,91 @@ static void unaccount_event(struct perf_event *event) unaccount_event_cpu(event, event->cpu); } +/* + * The following implement mutual exclusion of events on "exclusive" pmus + * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled + * at a time, so we disallow creating events that might conflict, namely: + * + * 1) cpu-wide events in the presence of per-task events, + * 2) per-task events in the presence of cpu-wide events, + * 3) two matching events on the same context. + * + * The former two cases are handled in the allocation path (perf_event_alloc(), + * __free_event()), the latter -- before the first perf_install_in_context(). + */ +static int exclusive_event_init(struct perf_event *event) +{ + struct pmu *pmu = event->pmu; + + if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) + return 0; + + /* + * Prevent co-existence of per-task and cpu-wide events on the + * same exclusive pmu. + * + * Negative pmu::exclusive_cnt means there are cpu-wide + * events on this "exclusive" pmu, positive means there are + * per-task events. + * + * Since this is called in perf_event_alloc() path, event::ctx + * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK + * to mean "per-task event", because unlike other attach states it + * never gets cleared. 
+ */ + if (event->attach_state & PERF_ATTACH_TASK) { + if (!atomic_inc_unless_negative(&pmu->exclusive_cnt)) + return -EBUSY; + } else { + if (!atomic_dec_unless_positive(&pmu->exclusive_cnt)) + return -EBUSY; + } + + return 0; +} + +static void exclusive_event_destroy(struct perf_event *event) +{ + struct pmu *pmu = event->pmu; + + if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) + return; + + /* see comment in exclusive_event_init() */ + if (event->attach_state & PERF_ATTACH_TASK) + atomic_dec(&pmu->exclusive_cnt); + else + atomic_inc(&pmu->exclusive_cnt); +} + +static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2) +{ + if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && + (e1->cpu == e2->cpu || + e1->cpu == -1 || + e2->cpu == -1)) + return true; + return false; +} + +/* Called under the same ctx::mutex as perf_install_in_context() */ +static bool exclusive_event_installable(struct perf_event *event, + struct perf_event_context *ctx) +{ + struct perf_event *iter_event; + struct pmu *pmu = event->pmu; + + if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) + return true; + + list_for_each_entry(iter_event, &ctx->event_list, event_entry) { + if (exclusive_event_match(iter_event, event)) + return false; + } + + return true; +} + static void __free_event(struct perf_event *event) { if (!event->parent) { @@ -3472,8 +3557,10 @@ static void __free_event(struct perf_event *event) if (event->ctx) put_ctx(event->ctx); - if (event->pmu) + if (event->pmu) { + exclusive_event_destroy(event); module_put(event->pmu->module); + } call_rcu(&event->rcu_head, free_event_rcu); } @@ -7150,6 +7237,7 @@ got_cpu_context: pmu->event_idx = perf_event_idx_default; list_add_rcu(&pmu->entry, &pmus); + atomic_set(&pmu->exclusive_cnt, 0); ret = 0; unlock: mutex_unlock(&pmus_lock); @@ -7405,16 +7493,23 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, goto err_ns; } + err = exclusive_event_init(event); + if (err) + goto err_pmu; + if (!event->parent) { if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { err = get_callchain_buffers(); if (err) - goto err_pmu; + goto err_per_task; } } return event; +err_per_task: + exclusive_event_destroy(event); + err_pmu: if (event->destroy) event->destroy(event); @@ -7819,6 +7914,11 @@ SYSCALL_DEFINE5(perf_event_open, goto err_alloc; } + if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) { + err = -EBUSY; + goto err_context; + } + if (task) { put_task_struct(task); task = NULL; @@ -7941,6 +8041,13 @@ SYSCALL_DEFINE5(perf_event_open, get_ctx(ctx); } + if (!exclusive_event_installable(event, ctx)) { + err = -EBUSY; + mutex_unlock(&ctx->mutex); + fput(event_file); + goto err_context; + } + perf_install_in_context(ctx, event, event->cpu); perf_unpin_context(ctx); @@ -8032,6 +8139,14 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); + if (!exclusive_event_installable(event, ctx)) { + mutex_unlock(&ctx->mutex); + perf_unpin_context(ctx); + put_ctx(ctx); + err = -EBUSY; + goto err_free; + } + perf_install_in_context(ctx, event, cpu); perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); -- cgit v1.2.3 From 68db7e98c3a6ebe7284b6cf14906ed7c55f3f7f0 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 14 Jan 2015 14:18:15 +0200 Subject: perf: Add AUX record When there's new data in the AUX space, output a record indicating its offset and size and a set of flags, such as PERF_AUX_FLAG_TRUNCATED, to mean the described data was truncated to fit in the 
ring buffer. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kaixu Xia Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul Mackerras Cc: Robert Richter Cc: Stephane Eranian Cc: Thomas Gleixner Cc: adrian.hunter@intel.com Cc: kan.liang@intel.com Cc: markus.t.metzger@intel.com Cc: mathieu.poirier@linaro.org Link: http://lkml.kernel.org/r/1421237903-181015-7-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- include/uapi/linux/perf_event.h | 19 +++++++++++++++++++ kernel/events/core.c | 34 ++++++++++++++++++++++++++++++++++ kernel/events/internal.h | 3 +++ 3 files changed, 56 insertions(+) (limited to 'kernel') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 6c5013a71714..8904ad3a850b 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -768,6 +768,20 @@ enum perf_event_type { */ PERF_RECORD_MMAP2 = 10, + /* + * Records that new data landed in the AUX buffer part. + * + * struct { + * struct perf_event_header header; + * + * u64 aux_offset; + * u64 aux_size; + * u64 flags; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_AUX = 11, + PERF_RECORD_MAX, /* non-ABI */ }; @@ -785,6 +799,11 @@ enum perf_callchain_context { PERF_CONTEXT_MAX = (__u64)-4095, }; +/** + * PERF_RECORD_AUX::flags bits + */ +#define PERF_AUX_FLAG_TRUNCATED 0x01 /* record was truncated to fit */ + #define PERF_FLAG_FD_NO_GROUP (1UL << 0) #define PERF_FLAG_FD_OUTPUT (1UL << 1) #define PERF_FLAG_PID_CGROUP (1UL << 2) /* pid=cgroup id, per-cpu mode only */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 6d9fdaef7b57..dbc2eff32230 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5916,6 +5916,40 @@ void perf_event_mmap(struct vm_area_struct *vma) perf_event_mmap_event(&mmap_event); } +void perf_event_aux_event(struct perf_event *event, unsigned long head, + unsigned long size, u64 flags) +{ + struct perf_output_handle handle; + struct perf_sample_data sample; + struct perf_aux_event { + struct perf_event_header header; + u64 offset; + u64 size; + u64 flags; + } rec = { + .header = { + .type = PERF_RECORD_AUX, + .misc = 0, + .size = sizeof(rec), + }, + .offset = head, + .size = size, + .flags = flags, + }; + int ret; + + perf_event_header__init_id(&rec.header, &sample, event); + ret = perf_output_begin(&handle, event, rec.header.size); + + if (ret) + return; + + perf_output_put(&handle, rec); + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + /* * IRQ throttle logging */ diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 0f6d08015927..4d117a981431 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -62,6 +62,9 @@ static inline bool rb_has_aux(struct ring_buffer *rb) return !!rb->aux_nr_pages; } +void perf_event_aux_event(struct perf_event *event, unsigned long head, + unsigned long size, u64 flags); + extern void perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, -- cgit v1.2.3 From fdc2670666f40ab3e03143f04d1ebf4a05e2c24a Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 14 Jan 2015 14:18:16 +0200 Subject: perf: Add API for PMUs to write to the AUX area For pmus that wish to write data to ring buffer's AUX area, provide perf_aux_output_{begin,end}() calls to initiate/commit data writes, similarly to perf_output_{begin,end}. 
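On the consumer side, the new record is a fixed header plus three u64 values, followed by the usual sample_id fields when sample_id_all is in use. A sketch of how a tool might decode it (the struct and function names are illustrative, and the installed uapi header must already carry these definitions):

#include <linux/perf_event.h>
#include <stdio.h>

struct perf_record_aux {
	struct perf_event_header header;
	__u64 aux_offset;	/* where in the AUX buffer the new data begins */
	__u64 aux_size;		/* how much data landed there */
	__u64 flags;		/* PERF_AUX_FLAG_* */
};

static void handle_aux_record(const struct perf_record_aux *rec)
{
	if (rec->flags & PERF_AUX_FLAG_TRUNCATED)
		fprintf(stderr, "AUX data truncated, consider a larger AUX buffer\n");

	printf("new AUX data at offset %llu, size %llu\n",
	       (unsigned long long) rec->aux_offset,
	       (unsigned long long) rec->aux_size);
}

It is the perf_aux_output_{begin,end}() calls described here that emit these records from the pmu side.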
These also use the same output handle structure. Also, similarly to software counterparts, these will direct inherited events' output to parents' ring buffers. After the perf_aux_output_begin() returns successfully, handle->size is set to the maximum amount of data that can be written wrt aux_tail pointer, so that no data that the user hasn't seen will be overwritten, therefore this should always be called before hardware writing is enabled. On success, this will return the pointer to pmu driver's private structure allocated for this aux area by pmu::setup_aux. Same pointer can also be retrieved using perf_get_aux() while hardware writing is enabled. PMU driver should pass the actual amount of data written as a parameter to perf_aux_output_end(). All hardware writes should be completed and visible before this one is called. Additionally, perf_aux_output_skip() will adjust output handle and aux_head in case some part of the buffer has to be skipped over to maintain hardware's alignment constraints. Nested writers are forbidden and guards are in place to catch such attempts. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kaixu Xia Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul Mackerras Cc: Robert Richter Cc: Stephane Eranian Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: kan.liang@intel.com Cc: markus.t.metzger@intel.com Cc: mathieu.poirier@linaro.org Link: http://lkml.kernel.org/r/1421237903-181015-8-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 24 +++++++- kernel/events/core.c | 5 +- kernel/events/internal.h | 4 ++ kernel/events/ring_buffer.c | 139 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 168 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f936a1e51f29..45c5873ad9b3 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -573,7 +573,10 @@ struct perf_output_handle { struct ring_buffer *rb; unsigned long wakeup; unsigned long size; - void *addr; + union { + void *addr; + unsigned long head; + }; int page; }; @@ -608,6 +611,14 @@ perf_cgroup_from_task(struct task_struct *task) #ifdef CONFIG_PERF_EVENTS +extern void *perf_aux_output_begin(struct perf_output_handle *handle, + struct perf_event *event); +extern void perf_aux_output_end(struct perf_output_handle *handle, + unsigned long size, bool truncated); +extern int perf_aux_output_skip(struct perf_output_handle *handle, + unsigned long size); +extern void *perf_get_aux(struct perf_output_handle *handle); + extern int perf_pmu_register(struct pmu *pmu, const char *name, int type); extern void perf_pmu_unregister(struct pmu *pmu); @@ -898,6 +909,17 @@ extern void perf_event_disable(struct perf_event *event); extern int __perf_event_disable(void *info); extern void perf_event_task_tick(void); #else /* !CONFIG_PERF_EVENTS: */ +static inline void * +perf_aux_output_begin(struct perf_output_handle *handle, + struct perf_event *event) { return NULL; } +static inline void +perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, + bool truncated) { } +static inline int +perf_aux_output_skip(struct perf_output_handle *handle, + unsigned long size) { return -EINVAL; } +static inline void * +perf_get_aux(struct perf_output_handle *handle) { return NULL; } static inline void perf_event_task_sched_in(struct task_struct *prev, struct 
task_struct *task) { } diff --git a/kernel/events/core.c b/kernel/events/core.c index dbc2eff32230..81e8d14ac59a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3423,7 +3423,6 @@ static void free_event_rcu(struct rcu_head *head) kfree(event); } -static void ring_buffer_put(struct ring_buffer *rb); static void ring_buffer_attach(struct perf_event *event, struct ring_buffer *rb); @@ -4361,7 +4360,7 @@ static void rb_free_rcu(struct rcu_head *rcu_head) rb_free(rb); } -static struct ring_buffer *ring_buffer_get(struct perf_event *event) +struct ring_buffer *ring_buffer_get(struct perf_event *event) { struct ring_buffer *rb; @@ -4376,7 +4375,7 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event) return rb; } -static void ring_buffer_put(struct ring_buffer *rb) +void ring_buffer_put(struct ring_buffer *rb) { if (!atomic_dec_and_test(&rb->refcount)) return; diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 4d117a981431..b701ebc32570 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -36,6 +36,8 @@ struct ring_buffer { struct user_struct *mmap_user; /* AUX area */ + local_t aux_head; + local_t aux_nest; unsigned long aux_pgoff; int aux_nr_pages; atomic_t aux_mmap_count; @@ -56,6 +58,8 @@ extern void perf_event_wakeup(struct perf_event *event); extern int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, pgoff_t pgoff, int nr_pages, int flags); extern void rb_free_aux(struct ring_buffer *rb); +extern struct ring_buffer *ring_buffer_get(struct perf_event *event); +extern void ring_buffer_put(struct ring_buffer *rb); static inline bool rb_has_aux(struct ring_buffer *rb) { diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 6e3be7a10c50..0cc7b0f39058 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -243,6 +243,145 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) spin_lock_init(&rb->event_lock); } +/* + * This is called before hardware starts writing to the AUX area to + * obtain an output handle and make sure there's room in the buffer. + * When the capture completes, call perf_aux_output_end() to commit + * the recorded data to the buffer. + * + * The ordering is similar to that of perf_output_{begin,end}, with + * the exception of (B), which should be taken care of by the pmu + * driver, since ordering rules will differ depending on hardware. + */ +void *perf_aux_output_begin(struct perf_output_handle *handle, + struct perf_event *event) +{ + struct perf_event *output_event = event; + unsigned long aux_head, aux_tail; + struct ring_buffer *rb; + + if (output_event->parent) + output_event = output_event->parent; + + /* + * Since this will typically be open across pmu::add/pmu::del, we + * grab ring_buffer's refcount instead of holding rcu read lock + * to make sure it doesn't disappear under us. 
+ */ + rb = ring_buffer_get(output_event); + if (!rb) + return NULL; + + if (!rb_has_aux(rb) || !atomic_inc_not_zero(&rb->aux_refcount)) + goto err; + + /* + * Nesting is not supported for AUX area, make sure nested + * writers are caught early + */ + if (WARN_ON_ONCE(local_xchg(&rb->aux_nest, 1))) + goto err_put; + + aux_head = local_read(&rb->aux_head); + aux_tail = ACCESS_ONCE(rb->user_page->aux_tail); + + handle->rb = rb; + handle->event = event; + handle->head = aux_head; + if (aux_head - aux_tail < perf_aux_size(rb)) + handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb)); + else + handle->size = 0; + + /* + * handle->size computation depends on aux_tail load; this forms a + * control dependency barrier separating aux_tail load from aux data + * store that will be enabled on successful return + */ + if (!handle->size) { /* A, matches D */ + event->pending_disable = 1; + perf_output_wakeup(handle); + local_set(&rb->aux_nest, 0); + goto err_put; + } + + return handle->rb->aux_priv; + +err_put: + rb_free_aux(rb); + +err: + ring_buffer_put(rb); + handle->event = NULL; + + return NULL; +} + +/* + * Commit the data written by hardware into the ring buffer by adjusting + * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the + * pmu driver's responsibility to observe ordering rules of the hardware, + * so that all the data is externally visible before this is called. + */ +void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, + bool truncated) +{ + struct ring_buffer *rb = handle->rb; + unsigned long aux_head = local_read(&rb->aux_head); + u64 flags = 0; + + if (truncated) + flags |= PERF_AUX_FLAG_TRUNCATED; + + local_add(size, &rb->aux_head); + + if (size || flags) { + /* + * Only send RECORD_AUX if we have something useful to communicate + */ + + perf_event_aux_event(handle->event, aux_head, size, flags); + } + + rb->user_page->aux_head = local_read(&rb->aux_head); + + perf_output_wakeup(handle); + handle->event = NULL; + + local_set(&rb->aux_nest, 0); + rb_free_aux(rb); + ring_buffer_put(rb); +} + +/* + * Skip over a given number of bytes in the AUX buffer, due to, for example, + * hardware's alignment constraints. + */ +int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) +{ + struct ring_buffer *rb = handle->rb; + unsigned long aux_head; + + if (size > handle->size) + return -ENOSPC; + + local_add(size, &rb->aux_head); + + handle->head = aux_head; + handle->size -= size; + + return 0; +} + +void *perf_get_aux(struct perf_output_handle *handle) +{ + /* this is only valid between perf_aux_output_begin and *_end */ + if (!handle->event) + return NULL; + + return handle->rb->aux_priv; +} + #define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY) static struct page *rb_alloc_aux_page(int node, int order) -- cgit v1.2.3 From 2023a0d2829e521fe6ad6b9907f3f90bfbf57142 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 14 Jan 2015 14:18:17 +0200 Subject: perf: Support overwrite mode for the AUX area This adds support for overwrite mode in the AUX area, which means "keep collecting data till you're stopped", turning AUX area into a circular buffer, where new data overwrites old data. It does not depend on data buffer's overwrite mode, so that it doesn't lose sideband data that is instrumental for processing AUX data. Overwrite mode is enabled at mapping AUX area read only. Even though aux_tail in the buffer's user page might be user writable, it will be ignored in this mode. 
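From the pmu driver's point of view, the perf_aux_output_{begin,end,skip}() interface above brackets each hardware capture. A minimal sketch, with the my_pmu_* names and the per-cpu bookkeeping left as placeholders rather than taken from this patch:

#include <linux/perf_event.h>

/* placeholders for a real driver's buffer type and hardware hooks */
struct my_pmu_buf;
static struct perf_output_handle my_handle;	/* per-cpu in a real driver */
static void my_pmu_hw_start(struct my_pmu_buf *buf, unsigned long head,
			    unsigned long size);
static unsigned long my_pmu_hw_stop(void);

static void my_pmu_trace_start(struct perf_event *event)
{
	struct my_pmu_buf *buf = perf_aux_output_begin(&my_handle, event);

	if (!buf)
		return;	/* no AUX buffer, no free space, or a nested writer */

	/* write at most my_handle.size bytes, starting at offset my_handle.head */
	my_pmu_hw_start(buf, my_handle.head, my_handle.size);
}

static void my_pmu_trace_stop(void)
{
	unsigned long written = my_pmu_hw_stop();	/* bytes actually produced */

	/* hardware writes must be globally visible before this call */
	perf_aux_output_end(&my_handle, written,
			    written >= my_handle.size /* ran out of room */);
}

In the overwrite mode being added here, perf_aux_output_begin() leaves handle->size at zero and the final write position is taken from handle->head, which the driver updates, rather than by advancing aux_head by the size argument.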
A PERF_RECORD_AUX with PERF_AUX_FLAG_OVERWRITE set is written to the perf data stream every time an event writes new data to the AUX area. The pmu driver might not be able to infer the exact beginning of the new data in each snapshot, some drivers will only provide the tail, which is aux_offset + aux_size in the AUX record. Consumer has to be able to tell the new data from the old one, for example, by means of time stamps if such are provided in the trace. Consumer is also responsible for disabling any events that might write to the AUX area (thus potentially racing with the consumer) before collecting the data. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kaixu Xia Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul Mackerras Cc: Robert Richter Cc: Stephane Eranian Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: kan.liang@intel.com Cc: markus.t.metzger@intel.com Cc: mathieu.poirier@linaro.org Link: http://lkml.kernel.org/r/1421237903-181015-9-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- include/uapi/linux/perf_event.h | 1 + kernel/events/internal.h | 1 + kernel/events/ring_buffer.c | 48 ++++++++++++++++++++++++++++------------- 3 files changed, 35 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 8904ad3a850b..29ef2f73bb4a 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -803,6 +803,7 @@ enum perf_callchain_context { * PERF_RECORD_AUX::flags bits */ #define PERF_AUX_FLAG_TRUNCATED 0x01 /* record was truncated to fit */ +#define PERF_AUX_FLAG_OVERWRITE 0x02 /* snapshot from overwrite mode */ #define PERF_FLAG_FD_NO_GROUP (1UL << 0) #define PERF_FLAG_FD_OUTPUT (1UL << 1) diff --git a/kernel/events/internal.h b/kernel/events/internal.h index b701ebc32570..ffd51d9f5945 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -40,6 +40,7 @@ struct ring_buffer { local_t aux_nest; unsigned long aux_pgoff; int aux_nr_pages; + int aux_overwrite; atomic_t aux_mmap_count; unsigned long aux_mmap_locked; void (*free_aux)(void *); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 0cc7b0f39058..67b328337a41 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -283,26 +283,33 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, goto err_put; aux_head = local_read(&rb->aux_head); - aux_tail = ACCESS_ONCE(rb->user_page->aux_tail); handle->rb = rb; handle->event = event; handle->head = aux_head; - if (aux_head - aux_tail < perf_aux_size(rb)) - handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb)); - else - handle->size = 0; + handle->size = 0; /* - * handle->size computation depends on aux_tail load; this forms a - * control dependency barrier separating aux_tail load from aux data - * store that will be enabled on successful return + * In overwrite mode, AUX data stores do not depend on aux_tail, + * therefore (A) control dependency barrier does not exist. The + * (B) <-> (C) ordering is still observed by the pmu driver. 
*/ - if (!handle->size) { /* A, matches D */ - event->pending_disable = 1; - perf_output_wakeup(handle); - local_set(&rb->aux_nest, 0); - goto err_put; + if (!rb->aux_overwrite) { + aux_tail = ACCESS_ONCE(rb->user_page->aux_tail); + if (aux_head - aux_tail < perf_aux_size(rb)) + handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb)); + + /* + * handle->size computation depends on aux_tail load; this forms a + * control dependency barrier separating aux_tail load from aux data + * store that will be enabled on successful return + */ + if (!handle->size) { /* A, matches D */ + event->pending_disable = 1; + perf_output_wakeup(handle); + local_set(&rb->aux_nest, 0); + goto err_put; + } } return handle->rb->aux_priv; @@ -327,13 +334,22 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, bool truncated) { struct ring_buffer *rb = handle->rb; - unsigned long aux_head = local_read(&rb->aux_head); + unsigned long aux_head; u64 flags = 0; if (truncated) flags |= PERF_AUX_FLAG_TRUNCATED; - local_add(size, &rb->aux_head); + /* in overwrite mode, driver provides aux_head via handle */ + if (rb->aux_overwrite) { + flags |= PERF_AUX_FLAG_OVERWRITE; + + aux_head = handle->head; + local_set(&rb->aux_head, aux_head); + } else { + aux_head = local_read(&rb->aux_head); + local_add(size, &rb->aux_head); + } if (size || flags) { /* @@ -480,6 +496,8 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, */ atomic_set(&rb->aux_refcount, 1); + rb->aux_overwrite = overwrite; + out: if (!ret) rb->aux_pgoff = pgoff; -- cgit v1.2.3 From 1a5941312414c71dece6717da9a0fa1303127afa Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 14 Jan 2015 14:18:18 +0200 Subject: perf: Add wakeup watermark control to the AUX area When AUX area gets a certain amount of new data, we want to wake up userspace to collect it. This adds a new control to specify how much data will cause a wakeup. This is then passed down to pmu drivers via output handle's "wakeup" field, so that the driver can find the nearest point where it can generate an interrupt. We repurpose __reserved_2 in the event attribute for this, even though it was never checked to be zero before, aux_watermark will only matter for new AUX-aware code, so the old code should still be fine. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kaixu Xia Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul Mackerras Cc: Robert Richter Cc: Stephane Eranian Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: kan.liang@intel.com Cc: markus.t.metzger@intel.com Cc: mathieu.poirier@linaro.org Link: http://lkml.kernel.org/r/1421237903-181015-10-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- include/uapi/linux/perf_event.h | 7 +++++++ kernel/events/core.c | 3 ++- kernel/events/internal.h | 4 +++- kernel/events/ring_buffer.c | 22 +++++++++++++++++++--- 4 files changed, 31 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 29ef2f73bb4a..84819546c8ce 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -261,6 +261,7 @@ enum perf_event_read_format { #define PERF_ATTR_SIZE_VER3 96 /* add: sample_regs_user */ /* add: sample_stack_user */ #define PERF_ATTR_SIZE_VER4 104 /* add: sample_regs_intr */ +#define PERF_ATTR_SIZE_VER5 112 /* add: aux_watermark */ /* * Hardware event_id to monitor via a performance monitoring event: @@ -366,6 +367,12 @@ struct perf_event_attr { * See asm/perf_regs.h for details. */ __u64 sample_regs_intr; + + /* + * Wakeup watermark for AUX area + */ + __u32 aux_watermark; + __u32 __reserved_2; /* align to __u64 */ }; #define perf_flags(attr) (*(&(attr)->read_format + 1)) diff --git a/kernel/events/core.c b/kernel/events/core.c index 81e8d14ac59a..31f6b504ad62 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4677,7 +4677,8 @@ accounting: perf_event_init_userpage(event); perf_event_update_userpage(event); } else { - ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, flags); + ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, + event->attr.aux_watermark, flags); if (!ret) rb->aux_mmap_locked = extra; } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index ffd51d9f5945..9f6ce9ba4a04 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -27,6 +27,7 @@ struct ring_buffer { local_t lost; /* nr records lost */ long watermark; /* wakeup watermark */ + long aux_watermark; /* poll crap */ spinlock_t event_lock; struct list_head event_list; @@ -38,6 +39,7 @@ struct ring_buffer { /* AUX area */ local_t aux_head; local_t aux_nest; + local_t aux_wakeup; unsigned long aux_pgoff; int aux_nr_pages; int aux_overwrite; @@ -57,7 +59,7 @@ extern struct ring_buffer * rb_alloc(int nr_pages, long watermark, int cpu, int flags); extern void perf_event_wakeup(struct perf_event *event); extern int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, - pgoff_t pgoff, int nr_pages, int flags); + pgoff_t pgoff, int nr_pages, long watermark, int flags); extern void rb_free_aux(struct ring_buffer *rb); extern struct ring_buffer *ring_buffer_get(struct perf_event *event); extern void ring_buffer_put(struct ring_buffer *rb); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 67b328337a41..232f00f273cb 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -296,6 +296,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, */ if (!rb->aux_overwrite) { aux_tail = ACCESS_ONCE(rb->user_page->aux_tail); + handle->wakeup = local_read(&rb->aux_wakeup) + rb->aux_watermark; if (aux_head - aux_tail < perf_aux_size(rb)) handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb)); @@ -359,9 +360,12 @@ void perf_aux_output_end(struct 
perf_output_handle *handle, unsigned long size, perf_event_aux_event(handle->event, aux_head, size, flags); } - rb->user_page->aux_head = local_read(&rb->aux_head); + aux_head = rb->user_page->aux_head = local_read(&rb->aux_head); - perf_output_wakeup(handle); + if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) { + perf_output_wakeup(handle); + local_add(rb->aux_watermark, &rb->aux_wakeup); + } handle->event = NULL; local_set(&rb->aux_nest, 0); @@ -383,6 +387,14 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) local_add(size, &rb->aux_head); + aux_head = rb->user_page->aux_head = local_read(&rb->aux_head); + if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) { + perf_output_wakeup(handle); + local_add(rb->aux_watermark, &rb->aux_wakeup); + handle->wakeup = local_read(&rb->aux_wakeup) + + rb->aux_watermark; + } + handle->head = aux_head; handle->size -= size; @@ -433,7 +445,7 @@ static void rb_free_aux_page(struct ring_buffer *rb, int idx) } int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, - pgoff_t pgoff, int nr_pages, int flags) + pgoff_t pgoff, int nr_pages, long watermark, int flags) { bool overwrite = !(flags & RING_BUFFER_WRITABLE); int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu); @@ -497,6 +509,10 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, atomic_set(&rb->aux_refcount, 1); rb->aux_overwrite = overwrite; + rb->aux_watermark = watermark; + + if (!rb->aux_watermark && !rb->aux_overwrite) + rb->aux_watermark = nr_pages << (PAGE_SHIFT - 1); out: if (!ret) -- cgit v1.2.3 From ec0d7729bbaed4b9d2d3fada693278e13a3d1368 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 14 Jan 2015 14:18:23 +0200 Subject: perf: Add ITRACE_START record to indicate that tracing has started For counters that generate AUX data that is bound to the context of a running task, such as instruction tracing, the decoder needs to know exactly which task is running when the event is first scheduled in, before the first sched_switch. The decoder's need to know this stems from the fact that instruction flow trace decoding will almost always require program's object code in order to reconstruct said flow and for that we need at least its pid/tid in the perf stream. To single out such instruction tracing pmus, this patch introduces ITRACE PMU capability. The reason this is not part of RECORD_AUX record is that not all pmus capable of generating AUX data need this, and the opposite is *probably* also true. While sched_switch covers for most cases, there are two problems with it: the consumer will need to process events out of order (that is, having found RECORD_AUX, it will have to skip forward to the nearest sched_switch to figure out which task it was, then go back to the actual trace to decode it) and it completely misses the case when the tracing is enabled and disabled before sched_switch, for example, via PERF_EVENT_IOC_DISABLE. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kaixu Xia Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul Mackerras Cc: Robert Richter Cc: Stephane Eranian Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: kan.liang@intel.com Cc: markus.t.metzger@intel.com Cc: mathieu.poirier@linaro.org Link: http://lkml.kernel.org/r/1421237903-181015-15-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 4 ++++ include/uapi/linux/perf_event.h | 11 +++++++++++ kernel/events/core.c | 41 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 45c5873ad9b3..61992cf2e977 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -129,6 +129,9 @@ struct hw_perf_event { struct list_head cqm_groups_entry; struct list_head cqm_group_entry; }; + struct { /* itrace */ + int itrace_started; + }; #ifdef CONFIG_HAVE_HW_BREAKPOINT struct { /* breakpoint */ /* @@ -177,6 +180,7 @@ struct perf_event; #define PERF_PMU_CAP_AUX_NO_SG 0x04 #define PERF_PMU_CAP_AUX_SW_DOUBLEBUF 0x08 #define PERF_PMU_CAP_EXCLUSIVE 0x10 +#define PERF_PMU_CAP_ITRACE 0x20 /** * struct pmu - generic performance monitoring unit diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 84819546c8ce..309211b3eb67 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -789,6 +789,17 @@ enum perf_event_type { */ PERF_RECORD_AUX = 11, + /* + * Indicates that instruction trace has started + * + * struct { + * struct perf_event_header header; + * u32 pid; + * u32 tid; + * }; + */ + PERF_RECORD_ITRACE_START = 12, + PERF_RECORD_MAX, /* non-ABI */ }; diff --git a/kernel/events/core.c b/kernel/events/core.c index 31f6b504ad62..06917d537302 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1831,6 +1831,7 @@ static void perf_set_shadow_time(struct perf_event *event, #define MAX_INTERRUPTS (~0ULL) static void perf_log_throttle(struct perf_event *event, int enable); +static void perf_log_itrace_start(struct perf_event *event); static int event_sched_in(struct perf_event *event, @@ -1869,6 +1870,8 @@ event_sched_in(struct perf_event *event, perf_set_shadow_time(event, ctx, tstamp); + perf_log_itrace_start(event); + if (event->pmu->add(event, PERF_EF_START)) { event->state = PERF_EVENT_STATE_INACTIVE; event->oncpu = -1; @@ -5991,6 +5994,44 @@ static void perf_log_throttle(struct perf_event *event, int enable) perf_output_end(&handle); } +static void perf_log_itrace_start(struct perf_event *event) +{ + struct perf_output_handle handle; + struct perf_sample_data sample; + struct perf_aux_event { + struct perf_event_header header; + u32 pid; + u32 tid; + } rec; + int ret; + + if (event->parent) + event = event->parent; + + if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) || + event->hw.itrace_started) + return; + + event->hw.itrace_started = 1; + + rec.header.type = PERF_RECORD_ITRACE_START; + rec.header.misc = 0; + rec.header.size = sizeof(rec); + rec.pid = perf_event_pid(event, current); + rec.tid = perf_event_tid(event, current); + + perf_event_header__init_id(&rec.header, &sample, event); + ret = perf_output_begin(&handle, event, rec.header.size); + + if (ret) + return; + + perf_output_put(&handle, rec); + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + /* * Generic event overflow handling, sampling. 
*/ -- cgit v1.2.3 From b3738d29323344da3017a91010530cf3a58590fc Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 17 Nov 2014 20:07:03 +0100 Subject: watchdog: Add watchdog enable/disable all functions This patch adds two new functions to enable/disable the watchdog across all CPUs. This will be used by the HT PMU bug workaround code to disable/enable the NMI watchdog across quirk enablement. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Cc: bp@alien8.de Cc: jolsa@redhat.com Cc: kan.liang@intel.com Cc: maria.n.dimakopoulou@gmail.com Cc: Frederic Weisbecker Cc: Don Zickus Cc: Andrew Morton Link: http://lkml.kernel.org/r/1416251225-17721-12-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- include/linux/watchdog.h | 8 ++++++++ kernel/watchdog.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) (limited to 'kernel') diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h index 395b70e0eccf..a746bf5216f8 100644 --- a/include/linux/watchdog.h +++ b/include/linux/watchdog.h @@ -137,4 +137,12 @@ extern int watchdog_init_timeout(struct watchdog_device *wdd, extern int watchdog_register_device(struct watchdog_device *); extern void watchdog_unregister_device(struct watchdog_device *); +#ifdef CONFIG_HARDLOCKUP_DETECTOR +void watchdog_nmi_disable_all(void); +void watchdog_nmi_enable_all(void); +#else +static inline void watchdog_nmi_disable_all(void) {} +static inline void watchdog_nmi_enable_all(void) {} +#endif + #endif /* ifndef _LINUX_WATCHDOG_H */ diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 3174bf8e3538..9a056f5bc02c 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -567,9 +567,37 @@ static void watchdog_nmi_disable(unsigned int cpu) cpu0_err = 0; } } + +void watchdog_nmi_enable_all(void) +{ + int cpu; + + if (!watchdog_user_enabled) + return; + + get_online_cpus(); + for_each_online_cpu(cpu) + watchdog_nmi_enable(cpu); + put_online_cpus(); +} + +void watchdog_nmi_disable_all(void) +{ + int cpu; + + if (!watchdog_running) + return; + + get_online_cpus(); + for_each_online_cpu(cpu) + watchdog_nmi_disable(cpu); + put_online_cpus(); +} #else static int watchdog_nmi_enable(unsigned int cpu) { return 0; } static void watchdog_nmi_disable(unsigned int cpu) { return; } +void watchdog_nmi_enable_all(void) {} +void watchdog_nmi_disable_all(void) {} #endif /* CONFIG_HARDLOCKUP_DETECTOR */ static struct smp_hotplug_thread watchdog_threads = { -- cgit v1.2.3 From 00ccbf2f5b7580cd7dcdaeda84828d14f0cba3c9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 19 Feb 2015 15:56:14 +0100 Subject: ftrace/x86: Let dynamic trampolines call ops->func even for dynamic fops Dynamically allocated trampolines call ftrace_ops_get_func to get the function which they should call. For dynamic fops (FTRACE_OPS_FL_DYNAMIC flag is set) ftrace_ops_list_func is always returned. This is reasonable for static trampolines but goes against the main advantage of dynamic ones, that is avoidance of going through the list of all registered callbacks for functions that are only being traced by a single callback. We can fix it by returning ops->func (or recursion safe version) from ftrace_ops_get_func whenever it is possible for dynamic trampolines. Note that dynamic trampolines are not allowed for dynamic fops if CONFIG_PREEMPT=y. 
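For reference, the single-callback case this change is aimed at looks roughly like the following module-style sketch (hedged: the callback and filter names are illustrative, and FTRACE_OPS_FL_DYNAMIC is not set by the caller but by the ftrace core when it sees the ops living in module memory):

#include <linux/ftrace.h>
#include <linux/init.h>
#include <linux/module.h>

static void notrace my_callback(unsigned long ip, unsigned long parent_ip,
				struct ftrace_ops *ops, struct pt_regs *regs)
{
	/*
	 * With this change, a dynamically allocated trampoline for this
	 * (dynamic) ops can call here directly via ftrace_ops_get_func()
	 * instead of always going through ftrace_ops_list_func().
	 */
}

static struct ftrace_ops my_ops = {
	.func = my_callback,
};

static unsigned char target[] = "do_sys_open";	/* illustrative filter */

static int __init my_trace_init(void)
{
	/* One callback tracing one function: the case worth optimizing. */
	ftrace_set_filter(&my_ops, target, sizeof(target) - 1, 0);
	return register_ftrace_function(&my_ops);
}

static void __exit my_trace_exit(void)
{
	unregister_ftrace_function(&my_ops);
}

module_init(my_trace_init);
module_exit(my_trace_exit);
MODULE_LICENSE("GPL");

On CONFIG_PREEMPT=y kernels such an ops still falls back to the list function, per the restriction noted above.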
Link: http://lkml.kernel.org/r/alpine.LNX.2.00.1501291023000.25445@pobox.suse.cz Link: http://lkml.kernel.org/r/1424357773-13536-1-git-send-email-mbenes@suse.cz Reported-by: Miroslav Benes Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4f228024055b..d01d238d8ef4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -249,6 +249,19 @@ static void update_function_graph_func(void); static inline void update_function_graph_func(void) { } #endif + +static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops) +{ + /* + * If this is a dynamic ops or we force list func, + * then it needs to call the list anyway. + */ + if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC) + return ftrace_ops_list_func; + + return ftrace_ops_get_func(ops); +} + static void update_ftrace_function(void) { ftrace_func_t func; @@ -270,7 +283,7 @@ static void update_ftrace_function(void) * then have the mcount trampoline call the function directly. */ } else if (ftrace_ops_list->next == &ftrace_list_end) { - func = ftrace_ops_get_func(ftrace_ops_list); + func = ftrace_ops_get_list_func(ftrace_ops_list); } else { /* Just use the default ftrace_ops */ @@ -5208,13 +5221,6 @@ static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip, */ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) { - /* - * If this is a dynamic ops or we force list func, - * then it needs to call the list anyway. - */ - if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC) - return ftrace_ops_list_func; - /* * If the func handles its own recursion, call it directly. * Otherwise call the recursion protected function that -- cgit v1.2.3 From 0c564a538aa934ad15b2145aaf8b64f3feb0be63 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 24 Mar 2015 17:58:09 -0400 Subject: tracing: Add TRACE_DEFINE_ENUM() macro to map enums to their values Several tracepoints use the helper functions __print_symbolic() or __print_flags() and pass in enums that do the mapping between the binary data stored and the value to print. This works well for reading the ASCII trace files, but when the data is read via userspace tools such as perf and trace-cmd, the conversion of the binary value to a human string format is lost if an enum is used, as userspace does not have access to what the ENUM is. For example, the tracepoint trace_tlb_flush() has: __print_symbolic(REC->reason, { TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" }, { TLB_REMOTE_SHOOTDOWN, "remote shootdown" }, { TLB_LOCAL_SHOOTDOWN, "local shootdown" }, { TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" }) Which maps the enum values to the strings they represent. But perf and trace-cmd do no know what value TLB_LOCAL_MM_SHOOTDOWN is, and would not be able to map it. With TRACE_DEFINE_ENUM(), developers can place these in the event header files and ftrace will convert the enums to their values: By adding: TRACE_DEFINE_ENUM(TLB_FLUSH_ON_TASK_SWITCH); TRACE_DEFINE_ENUM(TLB_REMOTE_SHOOTDOWN); TRACE_DEFINE_ENUM(TLB_LOCAL_SHOOTDOWN); TRACE_DEFINE_ENUM(TLB_LOCAL_MM_SHOOTDOWN); $ cat /sys/kernel/debug/tracing/events/tlb/tlb_flush/format [...] 
__print_symbolic(REC->reason, { 0, "flush on task switch" }, { 1, "remote shootdown" }, { 2, "local shootdown" }, { 3, "local mm shootdown" }) The above is what userspace expects to see, and tools do not need to be modified to parse them. Link: http://lkml.kernel.org/r/20150403013802.220157513@goodmis.org Cc: Guilherme Cox Cc: Tony Luck Cc: Xie XiuQi Acked-by: Namhyung Kim Reviewed-by: Masami Hiramatsu Tested-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- include/asm-generic/vmlinux.lds.h | 5 +- include/linux/ftrace_event.h | 2 +- include/linux/tracepoint.h | 8 +++ include/trace/ftrace.h | 22 ++++++- kernel/trace/trace.c | 26 ++++++++- kernel/trace/trace.h | 2 + kernel/trace/trace_events.c | 119 ++++++++++++++++++++++++++++++++++++++ 7 files changed, 178 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index ac78910d7416..f8e8b34dc427 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -124,7 +124,10 @@ #define FTRACE_EVENTS() . = ALIGN(8); \ VMLINUX_SYMBOL(__start_ftrace_events) = .; \ *(_ftrace_events) \ - VMLINUX_SYMBOL(__stop_ftrace_events) = .; + VMLINUX_SYMBOL(__stop_ftrace_events) = .; \ + VMLINUX_SYMBOL(__start_ftrace_enum_maps) = .; \ + *(_ftrace_enum_map) \ + VMLINUX_SYMBOL(__stop_ftrace_enum_maps) = .; #else #define FTRACE_EVENTS() #endif diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 62b8fac7ded5..112cf49d9576 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -285,7 +285,7 @@ struct ftrace_event_call { struct tracepoint *tp; }; struct trace_event event; - const char *print_fmt; + char *print_fmt; struct event_filter *filter; void *mod; void *data; diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index c72851328ca9..a5f7f3ecafa3 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -36,6 +36,12 @@ struct tracepoint { struct tracepoint_func __rcu *funcs; }; +struct trace_enum_map { + const char *system; + const char *enum_string; + unsigned long enum_value; +}; + extern int tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data); extern int @@ -87,6 +93,8 @@ extern void syscall_unregfunc(void); #define PARAMS(args...) args +#define TRACE_DEFINE_ENUM(x) + #endif /* _LINUX_TRACEPOINT_H */ /* diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 2f9b95b6d3fb..37d4b10b111d 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -33,6 +33,19 @@ TRACE_MAKE_SYSTEM_STR(); +#undef TRACE_DEFINE_ENUM +#define TRACE_DEFINE_ENUM(a) \ + static struct trace_enum_map __used __initdata \ + __##TRACE_SYSTEM##_##a = \ + { \ + .system = TRACE_SYSTEM_STRING, \ + .enum_string = #a, \ + .enum_value = a \ + }; \ + static struct trace_enum_map __used \ + __attribute__((section("_ftrace_enum_map"))) \ + *TRACE_SYSTEM##_##a = &__##TRACE_SYSTEM##_##a + /* * DECLARE_EVENT_CLASS can be used to add a generic function * handlers for events. That is, if all events have the same @@ -136,6 +149,9 @@ TRACE_MAKE_SYSTEM_STR(); * The size of an array is also encoded, in the higher 16 bits of . */ +#undef TRACE_DEFINE_ENUM +#define TRACE_DEFINE_ENUM(a) + #undef __field #define __field(type, item) @@ -553,7 +569,7 @@ static inline notrace int ftrace_get_offsets_##call( \ * .trace = ftrace_raw_output_, <-- stage 2 * }; * - * static const char print_fmt_[] = ; + * static char print_fmt_[] = ; * * static struct ftrace_event_class __used event_class_