From 0eec481e8fb000a209fda9bf8f466aca87dc1150 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 15 Oct 2008 14:56:37 +0800 Subject: markers: simplify marker_set_format() The current marker_set_format() is complex. This patch simplifies it and decreases the overhead of marker_update_probes(). Signed-off-by: Lai Jiangshan Acked-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar --- kernel/marker.c | 71 ++++++++++++++++++++------------------------------------- 1 file changed, 25 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/marker.c b/kernel/marker.c index e9c6b2bc9400..75a1a17cd78a 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -64,6 +64,7 @@ struct marker_entry { void *oldptr; int rcu_pending; unsigned char ptype:1; + unsigned char format_allocated:1; char name[0]; /* Contains name'\0'format'\0' */ }; @@ -416,6 +417,7 @@ static struct marker_entry *add_marker(const char *name, const char *format) e->single.probe_private = NULL; e->multi = NULL; e->ptype = 0; + e->format_allocated = 0; e->refcount = 0; e->rcu_pending = 0; hlist_add_head(&e->hlist, head); @@ -447,6 +449,8 @@ static int remove_marker(const char *name) if (e->single.func != __mark_empty_function) return -EBUSY; hlist_del(&e->hlist); + if (e->format_allocated) + kfree(e->format); /* Make sure the call_rcu has been executed */ if (e->rcu_pending) rcu_barrier_sched(); @@ -457,57 +461,34 @@ static int remove_marker(const char *name) /* * Set the mark_entry format to the format found in the element. */ -static int marker_set_format(struct marker_entry **entry, const char *format) +static int marker_set_format(struct marker_entry *entry, const char *format) { - struct marker_entry *e; - size_t name_len = strlen((*entry)->name) + 1; - size_t format_len = strlen(format) + 1; - - - e = kmalloc(sizeof(struct marker_entry) + name_len + format_len, - GFP_KERNEL); - if (!e) + entry->format = kstrdup(format, GFP_KERNEL); + if (!entry->format) return -ENOMEM; - memcpy(&e->name[0], (*entry)->name, name_len); - e->format = &e->name[name_len]; - memcpy(e->format, format, format_len); - if (strcmp(e->format, MARK_NOARGS) == 0) - e->call = marker_probe_cb_noarg; - else - e->call = marker_probe_cb; - e->single = (*entry)->single; - e->multi = (*entry)->multi; - e->ptype = (*entry)->ptype; - e->refcount = (*entry)->refcount; - e->rcu_pending = 0; - hlist_add_before(&e->hlist, &(*entry)->hlist); - hlist_del(&(*entry)->hlist); - /* Make sure the call_rcu has been executed */ - if ((*entry)->rcu_pending) - rcu_barrier_sched(); - kfree(*entry); - *entry = e; + entry->format_allocated = 1; + trace_mark(core_marker_format, "name %s format %s", - e->name, e->format); + entry->name, entry->format); return 0; } /* * Sets the probe callback corresponding to one marker.
*/ -static int set_marker(struct marker_entry **entry, struct marker *elem, +static int set_marker(struct marker_entry *entry, struct marker *elem, int active) { int ret; - WARN_ON(strcmp((*entry)->name, elem->name) != 0); + WARN_ON(strcmp(entry->name, elem->name) != 0); - if ((*entry)->format) { - if (strcmp((*entry)->format, elem->format) != 0) { + if (entry->format) { + if (strcmp(entry->format, elem->format) != 0) { printk(KERN_NOTICE "Format mismatch for probe %s " "(%s), marker (%s)\n", - (*entry)->name, - (*entry)->format, + entry->name, + entry->format, elem->format); return -EPERM; } @@ -523,34 +504,33 @@ static int set_marker(struct marker_entry **entry, struct marker *elem, * pass from a "safe" callback (with argument) to an "unsafe" * callback (does not set arguments). */ - elem->call = (*entry)->call; + elem->call = entry->call; /* * Sanity check : * We only update the single probe private data when the ptr is * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1) */ WARN_ON(elem->single.func != __mark_empty_function - && elem->single.probe_private - != (*entry)->single.probe_private && - !elem->ptype); - elem->single.probe_private = (*entry)->single.probe_private; + && elem->single.probe_private != entry->single.probe_private + && !elem->ptype); + elem->single.probe_private = entry->single.probe_private; /* * Make sure the private data is valid when we update the * single probe ptr. */ smp_wmb(); - elem->single.func = (*entry)->single.func; + elem->single.func = entry->single.func; /* * We also make sure that the new probe callbacks array is consistent * before setting a pointer to it. */ - rcu_assign_pointer(elem->multi, (*entry)->multi); + rcu_assign_pointer(elem->multi, entry->multi); /* * Update the function or multi probe array pointer before setting the * ptype. */ smp_wmb(); - elem->ptype = (*entry)->ptype; + elem->ptype = entry->ptype; elem->state = active; return 0; @@ -594,8 +574,7 @@ void marker_update_probe_range(struct marker *begin, for (iter = begin; iter < end; iter++) { mark_entry = get_marker(iter->name); if (mark_entry) { - set_marker(&mark_entry, iter, - !!mark_entry->refcount); + set_marker(mark_entry, iter, !!mark_entry->refcount); /* * ignore error, continue */ @@ -657,7 +636,7 @@ int marker_probe_register(const char *name, const char *format, ret = PTR_ERR(entry); } else if (format) { if (!entry->format) - ret = marker_set_format(&entry, format); + ret = marker_set_format(entry, format); else if (strcmp(entry->format, format)) ret = -EPERM; } -- cgit v1.2.3 From 505e371da195fad20cb8aaf45407a2849774d6d0 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 15 Oct 2008 14:56:42 +0800 Subject: markers: remove exported symbol marker_probe_cb_noarg() marker_probe_cb_noarg() should not be seen by outer code. This patch removes the export and makes the function static. Signed-off-by: Lai Jiangshan Acked-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar --- include/linux/marker.h | 2 -- kernel/marker.c | 3 +-- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/marker.h b/include/linux/marker.h index 889196c7fbb1..4cf45472d9f5 100644 --- a/include/linux/marker.h +++ b/include/linux/marker.h @@ -136,8 +136,6 @@ extern marker_probe_func __mark_empty_function; extern void marker_probe_cb(const struct marker *mdata, void *call_private, ...); -extern void marker_probe_cb_noarg(const struct marker *mdata, - void *call_private, ...); /* * Connect a probe to a marker.
diff --git a/kernel/marker.c b/kernel/marker.c index 75a1a17cd78a..23192646555f 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -158,7 +158,7 @@ EXPORT_SYMBOL_GPL(marker_probe_cb); * * Should be connected to markers "MARK_NOARGS". */ -void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) +static void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) { va_list args; /* not initialized */ char ptype; @@ -198,7 +198,6 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) } rcu_read_unlock_sched(); } -EXPORT_SYMBOL_GPL(marker_probe_cb_noarg); static void free_old_closure(struct rcu_head *head) { -- cgit v1.2.3 From 4de62748e69c31fc4fd5bc43b73e9cf60a17ec53 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 15 Oct 2008 14:56:47 +0800 Subject: markers: let marker_table be close to its comments marker_table is defined far from its comments; this patch moves it next to them. Signed-off-by: Lai Jiangshan Acked-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar --- kernel/marker.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/marker.c b/kernel/marker.c index 23192646555f..0f2a944329d3 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -43,6 +43,7 @@ static DEFINE_MUTEX(markers_mutex); */ #define MARKER_HASH_BITS 6 #define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS) +static struct hlist_head marker_table[MARKER_TABLE_SIZE]; /* * Note about RCU : @@ -68,8 +69,6 @@ struct marker_entry { char name[0]; /* Contains name'\0'format'\0' */ }; -static struct hlist_head marker_table[MARKER_TABLE_SIZE]; - /** * __mark_empty_function - Empty probe callback * @probe_private: probe private data -- cgit v1.2.3 From 5d9881ea1440f046ee851bbaa2a2962543336a11 Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Wed, 22 Oct 2008 11:38:01 +0800 Subject: markers: break the redundant loop in kernel/marker.c Impact: cleanup, no functionality changed Because e->name is unique in the list, we don't need to continue the iteration after a match. Signed-off-by: Zhao Lei Acked-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar --- kernel/marker.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/marker.c b/kernel/marker.c index 0f2a944329d3..2898b647d415 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -825,8 +825,6 @@ void *marker_get_private_data(const char *name, marker_probe_func *probe, if (!e->ptype) { if (num == 0 && e->single.func == probe) return e->single.probe_private; - else - break; } else { struct marker_probe_closure *closure; int match = 0; @@ -838,6 +836,7 @@ void *marker_get_private_data(const char *name, marker_probe_func *probe, return closure[i].probe_private; } } + break; } } return ERR_PTR(-ENOENT); -- cgit v1.2.3 From 944ac4259e39801c843a915c3da8194ac9af0440 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 23 Oct 2008 19:26:08 -0400 Subject: ftrace: ftrace dump on oops control Impact: add (default-off) dump-trace-on-oops flag Currently, ftrace is set up to dump its contents to the console if the kernel panics or oopses. This can be annoying if you have trace data in the buffers and you experience an oops, but the trace data is old or static. Usually, the time you want ftrace to dump its contents is when you are debugging your system and have set up ftrace to trace the events leading to an oops. This patch adds a control variable called "ftrace_dump_on_oops" that will enable the ftrace dump to console on oops.
This variable is off by default, but a developer can enable it either through the kernel command line by adding "ftrace_dump_on_oops" or at run time by setting (or disabling) /proc/sys/kernel/ftrace_dump_on_oops. v2: Replaced /** with /* as Randy explained that kernel-doc does not yet handle variables. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 2 ++ kernel/sysctl.c | 10 ++++++++++ kernel/trace/trace.c | 29 ++++++++++++++++++++++++++--- 3 files changed, 38 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index a3d46151be19..9623b7b9e5a5 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -165,6 +165,8 @@ static inline void __ftrace_enabled_restore(int enabled) #endif #ifdef CONFIG_TRACING +extern int ftrace_dump_on_oops; + extern void ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index a13bd4dfaeb1..84754f5801e6 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -484,6 +484,16 @@ static struct ctl_table kern_table[] = { .proc_handler = &ftrace_enable_sysctl, }, #endif +#ifdef CONFIG_TRACING + { + .ctl_name = CTL_UNNUMBERED, + .procname = "ftrace_dump_on_oops", + .data = &ftrace_dump_on_oops, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif #ifdef CONFIG_MODULES { .ctl_name = KERN_MODPROBE, diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d345d649d073..47f46cbdd860 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -63,6 +63,28 @@ static cpumask_t __read_mostly tracing_buffer_mask; static int tracing_disabled = 1; +/* + * ftrace_dump_on_oops - variable to dump ftrace buffer on oops + * + * If there is an oops (or kernel panic) and the ftrace_dump_on_oops + * is set, then ftrace_dump is called. This will output the contents + * of the ftrace buffers to the console. This is very useful for + * capturing traces that lead to crashes and outputting them to a + * serial console. + * + * It is off by default, but you can enable it either by specifying + * "ftrace_dump_on_oops" on the kernel command line, or by setting + * /proc/sys/kernel/ftrace_dump_on_oops to true. + */ +int ftrace_dump_on_oops; + +static int __init set_ftrace_dump_on_oops(char *str) { ftrace_dump_on_oops = 1; return 1; } __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); long ns2usecs(cycle_t nsec) { @@ -3021,7 +3043,8 @@ EXPORT_SYMBOL_GPL(__ftrace_printk); static int trace_panic_handler(struct notifier_block *this, unsigned long event, void *unused) { - ftrace_dump(); + if (ftrace_dump_on_oops) + ftrace_dump(); return NOTIFY_OK; } @@ -3037,7 +3060,8 @@ static int trace_die_handler(struct notifier_block *self, { switch (val) { case DIE_OOPS: - ftrace_dump(); + if (ftrace_dump_on_oops) + ftrace_dump(); break; default: break; } @@ -3078,7 +3102,6 @@ trace_printk_seq(struct trace_seq *s) trace_seq_reset(s); } - void ftrace_dump(void) { static DEFINE_SPINLOCK(ftrace_dump_lock); -- cgit v1.2.3 From eab172294d5e24464f332dd8e94a57a9819c81c4 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 29 Oct 2008 17:03:22 +0800 Subject: sched: cleanup for alloc_rt/fair_sched_group() Impact: cleanup Remove checking parent == NULL. It won't be NULL, because we only create sub task_groups dynamically, and a sub task_group always has a parent. (The root task_group is statically defined.) Also replace kmalloc_node(GFP_ZERO) with kzalloc_node().
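The replacement is equivalent: kzalloc_node() is simply the zeroing wrapper around kmalloc_node(). A sketch of the idea (the real helper lives in include/linux/slab.h):

	static inline void *kzalloc_node(size_t size, gfp_t flags, int node)
	{
		/* same node-local allocation, with __GFP_ZERO folded in */
		return kmalloc_node(size, flags | __GFP_ZERO, node);
	}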
Signed-off-by: Li Zefan Signed-off-by: Ingo Molnar --- kernel/sched.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index e8819bc6f462..7dd6c860773b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8472,7 +8472,7 @@ static int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { struct cfs_rq *cfs_rq; - struct sched_entity *se, *parent_se; + struct sched_entity *se; struct rq *rq; int i; @@ -8488,18 +8488,17 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) for_each_possible_cpu(i) { rq = cpu_rq(i); - cfs_rq = kmalloc_node(sizeof(struct cfs_rq), - GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); + cfs_rq = kzalloc_node(sizeof(struct cfs_rq), + GFP_KERNEL, cpu_to_node(i)); if (!cfs_rq) goto err; - se = kmalloc_node(sizeof(struct sched_entity), - GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); + se = kzalloc_node(sizeof(struct sched_entity), + GFP_KERNEL, cpu_to_node(i)); if (!se) goto err; - parent_se = parent ? parent->se[i] : NULL; - init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se); + init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); } return 1; @@ -8560,7 +8559,7 @@ static int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) { struct rt_rq *rt_rq; - struct sched_rt_entity *rt_se, *parent_se; + struct sched_rt_entity *rt_se; struct rq *rq; int i; @@ -8577,18 +8576,17 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) for_each_possible_cpu(i) { rq = cpu_rq(i); - rt_rq = kmalloc_node(sizeof(struct rt_rq), - GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); + rt_rq = kzalloc_node(sizeof(struct rt_rq), + GFP_KERNEL, cpu_to_node(i)); if (!rt_rq) goto err; - rt_se = kmalloc_node(sizeof(struct sched_rt_entity), - GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); + rt_se = kzalloc_node(sizeof(struct sched_rt_entity), + GFP_KERNEL, cpu_to_node(i)); if (!rt_se) goto err; - parent_se = parent ? parent->rt_se[i] : NULL; - init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se); + init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); } return 1; -- cgit v1.2.3 From 34f3a814eef8069a24e5b3ebcf27aba9dabac2ea Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 30 Oct 2008 15:23:32 +0800 Subject: sched: switch sched_features to seqfile Impact: cleanup So handling of sched_features read is simplified. 
Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 52 ++++++++++++++++------------------------------------ 1 file changed, 16 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 7dd6c860773b..5419df9cc5c4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -703,45 +703,18 @@ static __read_mostly char *sched_feat_names[] = { #undef SCHED_FEAT -static int sched_feat_open(struct inode *inode, struct file *filp) +static int sched_feat_show(struct seq_file *m, void *v) { - filp->private_data = inode->i_private; - return 0; -} - -static ssize_t -sched_feat_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char *buf; - int r = 0; - int len = 0; int i; for (i = 0; sched_feat_names[i]; i++) { - len += strlen(sched_feat_names[i]); - len += 4; + if (!(sysctl_sched_features & (1UL << i))) + seq_puts(m, "NO_"); + seq_printf(m, "%s ", sched_feat_names[i]); } + seq_puts(m, "\n"); - buf = kmalloc(len + 2, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - for (i = 0; sched_feat_names[i]; i++) { - if (sysctl_sched_features & (1UL << i)) - r += sprintf(buf + r, "%s ", sched_feat_names[i]); - else - r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]); - } - - r += sprintf(buf + r, "\n"); - WARN_ON(r >= len + 2); - - r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); - - kfree(buf); - - return r; + return 0; } static ssize_t @@ -786,10 +759,17 @@ sched_feat_write(struct file *filp, const char __user *ubuf, return cnt; } +static int sched_feat_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_feat_show, NULL); +} + static struct file_operations sched_feat_fops = { - .open = sched_feat_open, - .read = sched_feat_read, - .write = sched_feat_write, + .open = sched_feat_open, + .write = sched_feat_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, }; static __init int sched_init_debug(void) -- cgit v1.2.3 From b807c3d0f8e39ed7cbbbe6da162650e305e8de15 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 30 Oct 2008 16:08:33 -0400 Subject: ftrace: nmi update statistics Impact: add more debug info to /debugfs/tracing/dyn_ftrace_total_info This patch adds dynamic ftrace NMI update statistics to the /debugfs/tracing/dyn_ftrace_total_info stat file. 
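A read of the stat file now yields the total update count followed by the two values appended by ftrace_arch_read_dyn_info(): the NMI wait count and the NMI update count. For example (illustrative, made-up numbers):

	# cat /debugfs/tracing/dyn_ftrace_total_info
	12562 5 37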
Signed-off-by: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Cc: Linus Torvalds Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 26 ++++++++++++++++++++++++-- kernel/trace/trace.c | 31 ++++++++++++++++++++++++------- 2 files changed, 48 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index fe5f859130b5..6685b0fc1b44 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -91,6 +91,19 @@ static int mod_code_write; static void *mod_code_ip; static void *mod_code_newcode; +static int nmi_wait_count; +static atomic_t nmi_update_count; + +int ftrace_arch_read_dyn_info(char *buf, int size) +{ + int r; + + r = snprintf(buf, size, "%u %u", + nmi_wait_count, + atomic_read(&nmi_update_count)); + return r; +} + static void ftrace_mod_code(void) { /* @@ -109,8 +122,10 @@ void ftrace_nmi_enter(void) atomic_inc(&in_nmi); /* Must have in_nmi seen before reading write flag */ smp_mb(); - if (mod_code_write) + if (mod_code_write) { ftrace_mod_code(); + atomic_inc(&nmi_update_count); + } } void ftrace_nmi_exit(void) @@ -122,8 +137,15 @@ void ftrace_nmi_exit(void) static void wait_for_nmi(void) { - while (atomic_read(&in_nmi)) + int waited = 0; + + while (atomic_read(&in_nmi)) { + waited = 1; cpu_relax(); + } + + if (waited) + nmi_wait_count++; } static int diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a610ca771558..bc36febc0771 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2815,22 +2815,39 @@ static struct file_operations tracing_mark_fops = { #ifdef CONFIG_DYNAMIC_FTRACE +#define DYN_INFO_BUF_SIZE 1023 +static char ftrace_dyn_info_buffer[DYN_INFO_BUF_SIZE+1]; +static DEFINE_MUTEX(dyn_info_mutex); + +int __weak ftrace_arch_read_dyn_info(char *buf, int size) +{ + return 0; +} + static ssize_t -tracing_read_long(struct file *filp, char __user *ubuf, +tracing_read_dyn_info(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { unsigned long *p = filp->private_data; - char buf[64]; + char *buf = ftrace_dyn_info_buffer; int r; - r = sprintf(buf, "%ld\n", *p); + mutex_lock(&dyn_info_mutex); + r = sprintf(buf, "%ld ", *p); - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + r += ftrace_arch_read_dyn_info(buf+r, DYN_INFO_BUF_SIZE-r); + buf[r++] = '\n'; + + r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + + mutex_unlock(&dyn_info_mutex); + + return r; } -static struct file_operations tracing_read_long_fops = { +static struct file_operations tracing_dyn_info_fops = { .open = tracing_open_generic, - .read = tracing_read_long, + .read = tracing_read_dyn_info, }; #endif @@ -2939,7 +2956,7 @@ static __init int tracer_init_debugfs(void) #ifdef CONFIG_DYNAMIC_FTRACE entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, &ftrace_update_tot_cnt, - &tracing_read_long_fops); + &tracing_dyn_info_fops); if (!entry) pr_warning("Could not create debugfs " "'dyn_ftrace_total_info' entry\n"); -- cgit v1.2.3 From a26a2a27396c0a0877aa701f8f92d08ba550a6c9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 31 Oct 2008 00:03:22 -0400 Subject: ftrace: nmi safe code clean ups Impact: cleanup This patch cleans up the NMI safe code for dynamic ftrace as suggested by Andrew Morton. 
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/arm/include/asm/ftrace.h | 4 ++-- arch/powerpc/include/asm/ftrace.h | 4 ++-- arch/sh/include/asm/ftrace.h | 4 ++-- arch/sparc/include/asm/ftrace.h | 4 ++-- arch/x86/include/asm/ftrace.h | 10 +++++----- arch/x86/kernel/ftrace.c | 16 ++++++++-------- include/linux/ftrace.h | 3 +++ kernel/trace/trace.c | 9 ++++----- 8 files changed, 28 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/arch/arm/include/asm/ftrace.h b/arch/arm/include/asm/ftrace.h index d4c24a7a9280..3f3a1d1508ea 100644 --- a/arch/arm/include/asm/ftrace.h +++ b/arch/arm/include/asm/ftrace.h @@ -2,8 +2,8 @@ #define _ASM_ARM_FTRACE #ifndef __ASSEMBLY__ -#define ftrace_nmi_enter() do { } while (0) -#define ftrace_nmi_exit() do { } while (0) +static inline void ftrace_nmi_enter(void) { } +static inline void ftrace_nmi_exit(void) { } #endif #ifdef CONFIG_FUNCTION_TRACER diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index 7652755dc000..1cd72700fbc0 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -2,8 +2,8 @@ #define _ASM_POWERPC_FTRACE #ifndef __ASSEMBLY__ -#define ftrace_nmi_enter() do { } while (0) -#define ftrace_nmi_exit() do { } while (0) +static inline void ftrace_nmi_enter(void) { } +static inline void ftrace_nmi_exit(void) { } #endif #ifdef CONFIG_FUNCTION_TRACER diff --git a/arch/sh/include/asm/ftrace.h b/arch/sh/include/asm/ftrace.h index cdf2cb0b9ffe..31ada0370cb6 100644 --- a/arch/sh/include/asm/ftrace.h +++ b/arch/sh/include/asm/ftrace.h @@ -2,8 +2,8 @@ #define __ASM_SH_FTRACE_H #ifndef __ASSEMBLY__ -#define ftrace_nmi_enter() do { } while (0) -#define ftrace_nmi_exit() do { } while (0) +static inline void ftrace_nmi_enter(void) { } +static inline void ftrace_nmi_exit(void) { } #endif #ifndef __ASSEMBLY__ diff --git a/arch/sparc/include/asm/ftrace.h b/arch/sparc/include/asm/ftrace.h index 33a95feeb137..62055ac0496e 100644 --- a/arch/sparc/include/asm/ftrace.h +++ b/arch/sparc/include/asm/ftrace.h @@ -2,8 +2,8 @@ #define _ASM_SPARC64_FTRACE #ifndef __ASSEMBLY__ -#define ftrace_nmi_enter() do { } while (0) -#define ftrace_nmi_exit() do { } while (0) +static inline void ftrace_nmi_enter(void) { } +static inline void ftrace_nmi_exit(void) { } #endif #ifdef CONFIG_MCOUNT diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index f2ed6b704a75..a23468194b8c 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -22,16 +22,16 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) extern void ftrace_nmi_enter(void); extern void ftrace_nmi_exit(void); #else -#define ftrace_nmi_enter() do { } while (0) -#define ftrace_nmi_exit() do { } while (0) -#endif +static inline void ftrace_nmi_enter(void) { } +static inline void ftrace_nmi_exit(void) { } #endif +#endif /* __ASSEMBLY__ */ #else /* CONFIG_FUNCTION_TRACER */ #ifndef __ASSEMBLY__ -#define ftrace_nmi_enter() do { } while (0) -#define ftrace_nmi_exit() do { } while (0) +static inline void ftrace_nmi_enter(void) { } +static inline void ftrace_nmi_exit(void) { } #endif #endif /* CONFIG_FUNCTION_TRACER */ diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 6685b0fc1b44..69149337f2fe 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -67,7 +67,7 @@ unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) * * Two buffers are added: An IP buffer and a "code" buffer. 
* - * 1) Put in the instruction pointer into the IP buffer + * 1) Put the instruction pointer into the IP buffer * and the new code into the "code" buffer. * 2) Set a flag that says we are modifying code * 3) Wait for any running NMIs to finish. @@ -85,14 +85,14 @@ unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) * are the same as what exists. */ -static atomic_t in_nmi; -static int mod_code_status; -static int mod_code_write; -static void *mod_code_ip; -static void *mod_code_newcode; +static atomic_t in_nmi = ATOMIC_INIT(0); +static int mod_code_status; /* holds return value of text write */ +static int mod_code_write; /* set when NMI should do the write */ +static void *mod_code_ip; /* holds the IP to write to */ +static void *mod_code_newcode; /* holds the text to write to the IP */ -static int nmi_wait_count; -static atomic_t nmi_update_count; +static unsigned nmi_wait_count; +static atomic_t nmi_update_count = ATOMIC_INIT(0); int ftrace_arch_read_dyn_info(char *buf, int size) { diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 703eb53cfa2b..22240dfe912e 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -74,6 +74,9 @@ extern void ftrace_caller(void); extern void ftrace_call(void); extern void mcount_call(void); +/* May be defined in arch */ +extern int ftrace_arch_read_dyn_info(char *buf, int size); + /** * ftrace_modify_code - modify code segment * @ip: the address of the code segment diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bc36febc0771..7f86067d760c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2815,10 +2815,6 @@ static struct file_operations tracing_mark_fops = { #ifdef CONFIG_DYNAMIC_FTRACE -#define DYN_INFO_BUF_SIZE 1023 -static char ftrace_dyn_info_buffer[DYN_INFO_BUF_SIZE+1]; -static DEFINE_MUTEX(dyn_info_mutex); - int __weak ftrace_arch_read_dyn_info(char *buf, int size) { return 0; } @@ -2828,14 +2824,17 @@ static ssize_t tracing_read_dyn_info(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { + static char ftrace_dyn_info_buffer[1024]; + static DEFINE_MUTEX(dyn_info_mutex); unsigned long *p = filp->private_data; char *buf = ftrace_dyn_info_buffer; + int size = ARRAY_SIZE(ftrace_dyn_info_buffer); int r; mutex_lock(&dyn_info_mutex); r = sprintf(buf, "%ld ", *p); - r += ftrace_arch_read_dyn_info(buf+r, DYN_INFO_BUF_SIZE-r); + r += ftrace_arch_read_dyn_info(buf+r, (size-1)-r); buf[r++] = '\n'; r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -- cgit v1.2.3 From 8bb8c4386d08f2cc5d871d22f220d35032213f84 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 1 Nov 2008 00:13:49 +0100 Subject: sched, ftrace: trace sched.c Impact: allow function tracing within sched.c It's useful to see what happens in sched.c.
Signed-off-by: Peter Zijlstra Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/Makefile | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 9a3ec66a9d84..e1af03972148 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -21,7 +21,6 @@ CFLAGS_REMOVE_mutex-debug.o = -pg CFLAGS_REMOVE_rtmutex-debug.o = -pg CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_sched_clock.o = -pg -CFLAGS_REMOVE_sched.o = -mno-spe -pg endif obj-$(CONFIG_FREEZER) += freezer.o -- cgit v1.2.3 From d9e540762f5cdd89f24e518ad1fd31142d0b9726 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 1 Nov 2008 19:57:37 +0100 Subject: ftrace: ftrace_dump_on_oops=[tracer] Impact: add new (optional) debug boot option In order to facilitate early boot troubleshooting, allow one to specify a tracer on the kernel boot line. Signed-off-by: Peter Zijlstra Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- Documentation/kernel-parameters.txt | 8 +++++ kernel/trace/trace.c | 58 ++++++++++++++++++++++++++------------- 2 files changed, 46 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 1bbcaa8982b6..4862284d3119 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -765,6 +765,14 @@ and is between 256 and 4096 characters. It is defined in the file parameter will force ia64_sal_cache_flush to call ia64_pal_cache_flush instead of SAL_CACHE_FLUSH. + ftrace=[tracer] + [ftrace] will set and start the specified tracer + as early as possible in order to facilitate early + boot debugging. + + ftrace_dump_on_oops + [ftrace] will dump the trace buffers on oops. + gamecon.map[2|3]= [HW,JOY] Multisystem joystick and NES/SNES/PSX pad support via parallel port (up to 5 devices per port) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bdb1df00fb10..482583eb8001 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -79,6 +79,15 @@ static int tracing_disabled = 1; */ int ftrace_dump_on_oops; +static int tracing_set_tracer(char *buf); + +static int __init set_ftrace(char *str) +{ + tracing_set_tracer(str); + return 1; +} +__setup("ftrace", set_ftrace); + static int __init set_ftrace_dump_on_oops(char *str) { ftrace_dump_on_oops = 1; @@ -2394,29 +2403,11 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf, return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } -static ssize_t -tracing_set_trace_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) +static int tracing_set_tracer(char *buf) { struct trace_array *tr = &global_trace; struct tracer *t; - char buf[max_tracer_type_len+1]; - int i; - size_t ret; - - ret = cnt; - - if (cnt > max_tracer_type_len) - cnt = max_tracer_type_len; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - /* strip ending whitespace.
*/ - for (i = cnt - 1; i > 0 && isspace(buf[i]); i--) - buf[i] = 0; + int ret = 0; mutex_lock(&trace_types_lock); for (t = trace_types; t; t = t->next) { @@ -2440,6 +2431,33 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, out: mutex_unlock(&trace_types_lock); + return ret; +} + +static ssize_t +tracing_set_trace_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[max_tracer_type_len+1]; + int i; + size_t ret; + + if (cnt > max_tracer_type_len) + cnt = max_tracer_type_len; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + /* strip ending whitespace. */ + for (i = cnt - 1; i > 0 && isspace(buf[i]); i--) + buf[i] = 0; + + ret = tracing_set_tracer(buf); + if (!ret) + ret = cnt; + if (ret > 0) filp->f_pos += ret; -- cgit v1.2.3 From 19dba33c43a2f0f2aa727ae075ec3b11330775ef Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 28 Oct 2008 10:51:49 +0800 Subject: tracepoint: simplification for tracepoints using RCU Impact: simplify implementation Now, unused memory is handled by struct tp_probes. The old code used these three fields to handle unused memory: struct tracepoint_entry { ... struct rcu_head rcu; void *oldptr; unsigned char rcu_pending:1; ... }; In that scheme, unused memory was handled by struct tracepoint_entry itself. It brought a reentrancy bug (since fixed) and filled tracepoint.c with ".*rcu.*" code statements. This patch removes all of that. In addition: rcu_barrier_sched() is removed, there is no need to regain tracepoints_mutex after tracepoint_update_probes(), and several little cleanups. Signed-off-by: Lai Jiangshan Acked-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar --- kernel/tracepoint.c | 111 ++++++++++++++++++---------------------------------- 1 file changed, 37 insertions(+), 74 deletions(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index af8c85664882..3e22867184e3 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -43,6 +43,7 @@ static DEFINE_MUTEX(tracepoints_mutex); */ #define TRACEPOINT_HASH_BITS 6 #define TRACEPOINT_TABLE_SIZE (1 << TRACEPOINT_HASH_BITS) +static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE]; /* * Note about RCU : @@ -54,40 +55,40 @@ struct tracepoint_entry { struct hlist_node hlist; void **funcs; int refcount; /* Number of times armed. 0 if disarmed. */ - struct rcu_head rcu; - void *oldptr; - unsigned char rcu_pending:1; char name[0]; }; -static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE]; +struct tp_probes { + struct rcu_head rcu; + void *probes[0]; +}; -static void free_old_closure(struct rcu_head *head) +static inline void *allocate_probes(int count) { - struct tracepoint_entry *entry = container_of(head, - struct tracepoint_entry, rcu); - kfree(entry->oldptr); - /* Make sure we free the data before setting the pending flag to 0 */ - smp_wmb(); - entry->rcu_pending = 0; + struct tp_probes *p = kmalloc(count * sizeof(void *) + + sizeof(struct tp_probes), GFP_KERNEL); + return p == NULL ?
NULL : p->probes; } -static void tracepoint_entry_free_old(struct tracepoint_entry *entry, void *old) +static void rcu_free_old_probes(struct rcu_head *head) { - if (!old) - return; - entry->oldptr = old; - entry->rcu_pending = 1; - /* write rcu_pending before calling the RCU callback */ - smp_wmb(); - call_rcu_sched(&entry->rcu, free_old_closure); + kfree(container_of(head, struct tp_probes, rcu)); +} + +static inline void release_probes(void *old) +{ + if (old) { + struct tp_probes *tp_probes = container_of(old, + struct tp_probes, probes[0]); + call_rcu(&tp_probes->rcu, rcu_free_old_probes); + } } static void debug_print_probes(struct tracepoint_entry *entry) { int i; - if (!tracepoint_debug) + if (!tracepoint_debug || !entry->funcs) return; for (i = 0; entry->funcs[i]; i++) @@ -111,12 +112,13 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe) return ERR_PTR(-EEXIST); } /* + 2 : one for new probe, one for NULL func */ - new = kzalloc((nr_probes + 2) * sizeof(void *), GFP_KERNEL); + new = allocate_probes(nr_probes + 2); if (new == NULL) return ERR_PTR(-ENOMEM); if (old) memcpy(new, old, nr_probes * sizeof(void *)); new[nr_probes] = probe; + new[nr_probes + 1] = NULL; entry->refcount = nr_probes + 1; entry->funcs = new; debug_print_probes(entry); @@ -132,7 +134,7 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe) old = entry->funcs; if (!old) - return NULL; + return ERR_PTR(-ENOENT); debug_print_probes(entry); /* (N -> M), (N > 1, M >= 0) probes */ @@ -151,13 +153,13 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe) int j = 0; /* N -> M, (N > 1, M > 0) */ /* + 1 for NULL */ - new = kzalloc((nr_probes - nr_del + 1) - * sizeof(void *), GFP_KERNEL); + new = allocate_probes(nr_probes - nr_del + 1); if (new == NULL) return ERR_PTR(-ENOMEM); for (i = 0; old[i]; i++) if ((probe && old[i] != probe)) new[j++] = old[i]; + new[nr_probes - nr_del] = NULL; entry->refcount = nr_probes - nr_del; entry->funcs = new; } @@ -215,7 +217,6 @@ static struct tracepoint_entry *add_tracepoint(const char *name) memcpy(&e->name[0], name, name_len); e->funcs = NULL; e->refcount = 0; - e->rcu_pending = 0; hlist_add_head(&e->hlist, head); return e; } @@ -224,32 +225,10 @@ static struct tracepoint_entry *add_tracepoint(const char *name) * Remove the tracepoint from the tracepoint hash table. Must be called with * mutex_lock held. */ -static int remove_tracepoint(const char *name) +static inline void remove_tracepoint(struct tracepoint_entry *e) { - struct hlist_head *head; - struct hlist_node *node; - struct tracepoint_entry *e; - int found = 0; - size_t len = strlen(name) + 1; - u32 hash = jhash(name, len-1, 0); - - head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; - hlist_for_each_entry(e, node, head, hlist) { - if (!strcmp(name, e->name)) { - found = 1; - break; - } - } - if (!found) - return -ENOENT; - if (e->refcount) - return -EBUSY; hlist_del(&e->hlist); - /* Make sure the call_rcu_sched has been executed */ - if (e->rcu_pending) - rcu_barrier_sched(); kfree(e); - return 0; } /* @@ -343,25 +322,17 @@ int tracepoint_probe_register(const char *name, void *probe) goto end; } } - /* - * If we detect that a call_rcu_sched is pending for this tracepoint, - * make sure it's executed now. 
- */ - if (entry->rcu_pending) - rcu_barrier_sched(); old = tracepoint_entry_add_probe(entry, probe); if (IS_ERR(old)) { + if (!entry->refcount) + remove_tracepoint(entry); ret = PTR_ERR(old); goto end; } mutex_unlock(&tracepoints_mutex); tracepoint_update_probes(); /* may update entry */ - mutex_lock(&tracepoints_mutex); - entry = get_tracepoint(name); - WARN_ON(!entry); - if (entry->rcu_pending) - rcu_barrier_sched(); - tracepoint_entry_free_old(entry, old); + release_probes(old); + return 0; end: mutex_unlock(&tracepoints_mutex); return ret; @@ -388,25 +359,17 @@ int tracepoint_probe_unregister(const char *name, void *probe) entry = get_tracepoint(name); if (!entry) goto end; - if (entry->rcu_pending) - rcu_barrier_sched(); old = tracepoint_entry_remove_probe(entry, probe); - if (!old) { - printk(KERN_WARNING "Warning: Trying to unregister a probe" - "that doesn't exist\n"); + if (IS_ERR(old)) { + ret = PTR_ERR(old); goto end; } + if (!entry->refcount) + remove_tracepoint(entry); mutex_unlock(&tracepoints_mutex); tracepoint_update_probes(); /* may update entry */ - mutex_lock(&tracepoints_mutex); - entry = get_tracepoint(name); - if (!entry) - goto end; - if (entry->rcu_pending) - rcu_barrier_sched(); - tracepoint_entry_free_old(entry, old); - remove_tracepoint(name); /* Ignore busy error message */ - ret = 0; + release_probes(old); + return 0; end: mutex_unlock(&tracepoints_mutex); return ret; -- cgit v1.2.3 From 127cafbb276266b1b8da967bfe25a062ab1d42ab Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 28 Oct 2008 10:51:53 +0800 Subject: tracepoint: introduce *_noupdate APIs. Impact: add new tracepoint APIs to allow the batched registration of probes The new APIs separate tracepoint_probe_register() and tracepoint_probe_unregister() into 2 steps. The first step only updates the tracepoint_entry; it does not connect or disconnect. This patch introduces tracepoint_probe_update_all() to perform the update for all of them. These APIs are very useful for registering lots of probes while updating just once. Another very important thing is that the *_noupdate APIs do not require module_mutex.
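For example (an illustrative sketch only — the tracepoint names and probe functions here are hypothetical), a subsystem registering many probes can batch the work and pay the connect cost once:

	tracepoint_probe_register_noupdate("sched_switch", probe_switch);
	tracepoint_probe_register_noupdate("sched_wakeup", probe_wakeup);
	/* one pass connects everything registered above */
	tracepoint_probe_update_all();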
Signed-off-by: Lai Jiangshan Acked-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar --- include/linux/tracepoint.h | 4 ++ kernel/tracepoint.c | 170 +++++++++++++++++++++++++++++++++++---------- 2 files changed, 136 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index c5bb39c7a770..63064e9403f2 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -112,6 +112,10 @@ extern int tracepoint_probe_register(const char *name, void *probe); */ extern int tracepoint_probe_unregister(const char *name, void *probe); +extern int tracepoint_probe_register_noupdate(const char *name, void *probe); +extern int tracepoint_probe_unregister_noupdate(const char *name, void *probe); +extern void tracepoint_probe_update_all(void); + struct tracepoint_iter { struct module *module; struct tracepoint *tracepoint; diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 3e22867184e3..e96590f17de1 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -59,7 +59,10 @@ struct tracepoint_entry { }; struct tp_probes { - struct rcu_head rcu; + union { + struct rcu_head rcu; + struct list_head list; + } u; void *probes[0]; }; @@ -72,7 +75,7 @@ static inline void *allocate_probes(int count) static void rcu_free_old_probes(struct rcu_head *head) { - kfree(container_of(head, struct tp_probes, rcu)); + kfree(container_of(head, struct tp_probes, u.rcu)); } static inline void release_probes(void *old) @@ -80,7 +83,7 @@ static inline void release_probes(void *old) if (old) { struct tp_probes *tp_probes = container_of(old, struct tp_probes, probes[0]); - call_rcu(&tp_probes->rcu, rcu_free_old_probes); + call_rcu_sched(&tp_probes->u.rcu, rcu_free_old_probes); } } @@ -299,6 +302,23 @@ static void tracepoint_update_probes(void) module_update_tracepoints(); } +static void *tracepoint_add_probe(const char *name, void *probe) +{ + struct tracepoint_entry *entry; + void *old; + + entry = get_tracepoint(name); + if (!entry) { + entry = add_tracepoint(name); + if (IS_ERR(entry)) + return entry; + } + old = tracepoint_entry_add_probe(entry, probe); + if (IS_ERR(old) && !entry->refcount) + remove_tracepoint(entry); + return old; +} + /** * tracepoint_probe_register - Connect a probe to a tracepoint * @name: tracepoint name @@ -309,36 +329,36 @@ static void tracepoint_update_probes(void) */ int tracepoint_probe_register(const char *name, void *probe) { - struct tracepoint_entry *entry; - int ret = 0; void *old; mutex_lock(&tracepoints_mutex); - entry = get_tracepoint(name); - if (!entry) { - entry = add_tracepoint(name); - if (IS_ERR(entry)) { - ret = PTR_ERR(entry); - goto end; - } - } - old = tracepoint_entry_add_probe(entry, probe); - if (IS_ERR(old)) { - if (!entry->refcount) - remove_tracepoint(entry); - ret = PTR_ERR(old); - goto end; - } + old = tracepoint_add_probe(name, probe); mutex_unlock(&tracepoints_mutex); + if (IS_ERR(old)) + return PTR_ERR(old); + tracepoint_update_probes(); /* may update entry */ release_probes(old); return 0; -end: - mutex_unlock(&tracepoints_mutex); - return ret; } EXPORT_SYMBOL_GPL(tracepoint_probe_register); +static void *tracepoint_remove_probe(const char *name, void *probe) +{ + struct tracepoint_entry *entry; + void *old; + + entry = get_tracepoint(name); + if (!entry) + return ERR_PTR(-ENOENT); + old = tracepoint_entry_remove_probe(entry, probe); + if (IS_ERR(old)) + return old; + if (!entry->refcount) + remove_tracepoint(entry); + return old; +} + /** * tracepoint_probe_unregister - Disconnect 
a probe from a tracepoint * @name: tracepoint name @@ -351,31 +371,105 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register); */ int tracepoint_probe_unregister(const char *name, void *probe) { - struct tracepoint_entry *entry; void *old; - int ret = -ENOENT; mutex_lock(&tracepoints_mutex); - entry = get_tracepoint(name); - if (!entry) - goto end; - old = tracepoint_entry_remove_probe(entry, probe); - if (IS_ERR(old)) { - ret = PTR_ERR(old); - goto end; - } - if (!entry->refcount) - remove_tracepoint(entry); + old = tracepoint_remove_probe(name, probe); mutex_unlock(&tracepoints_mutex); + if (IS_ERR(old)) + return PTR_ERR(old); + tracepoint_update_probes(); /* may update entry */ release_probes(old); return 0; -end: - mutex_unlock(&tracepoints_mutex); - return ret; } EXPORT_SYMBOL_GPL(tracepoint_probe_unregister); +static LIST_HEAD(old_probes); +static int need_update; + +static void tracepoint_add_old_probes(void *old) +{ + need_update = 1; + if (old) { + struct tp_probes *tp_probes = container_of(old, + struct tp_probes, probes[0]); + list_add(&tp_probes->u.list, &old_probes); + } +} + +/** + * tracepoint_probe_register_noupdate - register a probe but not connect + * @name: tracepoint name + * @probe: probe handler + * + * caller must call tracepoint_probe_update_all() + */ +int tracepoint_probe_register_noupdate(const char *name, void *probe) +{ + void *old; + + mutex_lock(&tracepoints_mutex); + old = tracepoint_add_probe(name, probe); + if (IS_ERR(old)) { + mutex_unlock(&tracepoints_mutex); + return PTR_ERR(old); + } + tracepoint_add_old_probes(old); + mutex_unlock(&tracepoints_mutex); + return 0; +} +EXPORT_SYMBOL_GPL(tracepoint_probe_register_noupdate); + +/** + * tracepoint_probe_unregister_noupdate - remove a probe but not disconnect + * @name: tracepoint name + * @probe: probe function pointer + * + * caller must call tracepoint_probe_update_all() + */ +int tracepoint_probe_unregister_noupdate(const char *name, void *probe) +{ + void *old; + + mutex_lock(&tracepoints_mutex); + old = tracepoint_remove_probe(name, probe); + if (IS_ERR(old)) { + mutex_unlock(&tracepoints_mutex); + return PTR_ERR(old); + } + tracepoint_add_old_probes(old); + mutex_unlock(&tracepoints_mutex); + return 0; +} +EXPORT_SYMBOL_GPL(tracepoint_probe_unregister_noupdate); + +/** + * tracepoint_probe_update_all - update tracepoints + */ +void tracepoint_probe_update_all(void) +{ + LIST_HEAD(release_probes); + struct tp_probes *pos, *next; + + mutex_lock(&tracepoints_mutex); + if (!need_update) { + mutex_unlock(&tracepoints_mutex); + return; + } + if (!list_empty(&old_probes)) + list_replace_init(&old_probes, &release_probes); + need_update = 0; + mutex_unlock(&tracepoints_mutex); + + tracepoint_update_probes(); + list_for_each_entry_safe(pos, next, &release_probes, u.list) { + list_del(&pos->u.list); + call_rcu_sched(&pos->u.rcu, rcu_free_old_probes); + } +} +EXPORT_SYMBOL_GPL(tracepoint_probe_update_all); + /** * tracepoint_get_iter_range - Get a next tracepoint iterator given a range. * @tracepoint: current tracepoints (in), next tracepoint (out) -- cgit v1.2.3 From e113a745f693af196c8081b328bf42def086989b Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Fri, 31 Oct 2008 08:03:41 -0500 Subject: sched/rt: small optimization to update_curr_rt() Impact: micro-optimization to SCHED_FIFO/RR scheduling A very minor improvement, but might it be better to check sched_rt_runtime(rt_rq) before taking the rt_runtime_lock? Peter Zijlstra observes: > Yes, I think its ok to do so. 
> > Like pointed out in the other thread, there are two races: > > - sched_rt_runtime() going to RUNTIME_INF, and that will be handled > properly by sched_rt_runtime_exceeded() > > - sched_rt_runtime() going to !RUNTIME_INF, and here we can miss an > accounting cycle, but I don't think that is something to worry too > much about. Signed-off-by: Dimitri Sivanich Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar -- kernel/sched_rt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) --- kernel/sched_rt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index d9ba9d5f99d6..c7963d5d0625 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -537,13 +537,13 @@ static void update_curr_rt(struct rq *rq) for_each_sched_rt_entity(rt_se) { rt_rq = rt_rq_of_se(rt_se); - spin_lock(&rt_rq->rt_runtime_lock); if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { + spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_time += delta_exec; if (sched_rt_runtime_exceeded(rt_rq)) resched_task(curr); + spin_unlock(&rt_rq->rt_runtime_lock); } - spin_unlock(&rt_rq->rt_runtime_lock); } } -- cgit v1.2.3 From 8f0a056fcb2f83a069fb5d60c2383304b7456687 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 3 Nov 2008 23:15:55 -0500 Subject: ftrace: introduce ftrace_preempt_disable()/enable() Impact: add new ftrace-plugin internal APIs Parts of the tracer need to be careful about schedule recursion. If the NEED_RESCHED flag is set, a preempt_enable will call schedule. Inside the schedule function, the NEED_RESCHED flag is cleared. The problem arises when a trace happens in the schedule function but before NEED_RESCHED is cleared. The race is as follows: schedule() >> tracer called trace_function() preempt_disable() [ record trace ] preempt_enable() <<- here's the issue. [check NEED_RESCHED] schedule() [ Repeat the above, over and over again ] The naive approach is simply to use preempt_enable_no_resched instead. The problem with that approach is that, although we solve the schedule recursion issue, we now might lose a preemption check when not in the schedule function. trace_function() preempt_disable() [ record trace ] [Interrupt comes in and sets NEED_RESCHED] preempt_enable_no_resched() [continue without scheduling] The way ftrace handles this problem is with the following approach: int resched; resched = need_resched(); preempt_disable_notrace(); [record trace] if (resched) preempt_enable_no_resched_notrace(); else preempt_enable_notrace(); This may seem like the opposite of what we want. If resched is set then we call the "no_resched" version?? The reason we do this is because if NEED_RESCHED is set before we disable preemption, there are two reasons for that: 1) we are in an atomic code path 2) we are already on our way to the schedule function, and maybe even in the schedule function, but have yet to clear the flag. In both of the above cases we do not want to schedule. This solution has already been implemented within the ftrace infrastructure. But the problem is that it has been implemented several times. This patch encapsulates this code into two nice functions. resched = ftrace_preempt_disable(); [ record trace] ftrace_preempt_enable(resched); This way the tracers do not need to worry about getting it right.
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8465ad052707..10c6dae76894 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -419,4 +419,52 @@ enum trace_iterator_flags { extern struct tracer nop_trace; +/** + * ftrace_preempt_disable - disable preemption scheduler safe + * + * When tracing can happen inside the scheduler, there exist + * cases where the tracing might happen before the need_resched + * flag is checked. If this happens and the tracer calls + * preempt_enable (after a disable), a schedule might take place + * causing an infinite recursion. + * + * To prevent this, we read the need_resched flag before + * disabling preemption. When we want to enable preemption we + * check the flag; if it is set, then we call preempt_enable_no_resched. + * Otherwise, we call preempt_enable. + * + * The rationale for doing the above is that if need resched is set + * and we have yet to reschedule, we are either in an atomic location + * (where we do not need to check for scheduling) or we are inside + * the scheduler and do not want to resched. + */ +static inline int ftrace_preempt_disable(void) +{ + int resched; + + resched = need_resched(); + preempt_disable_notrace(); + + return resched; +} + +/** + * ftrace_preempt_enable - enable preemption scheduler safe + * @resched: the return value from ftrace_preempt_disable + * + * This is a scheduler safe way to enable preemption and not miss + * any preemption checks. The disable saved the state of preemption. + * If resched is set, then we were either inside an atomic or + * are inside the scheduler (we would have already scheduled + * otherwise). In this case, we do not want to call normal + * preempt_enable, but preempt_enable_no_resched instead. + */ +static inline void ftrace_preempt_enable(int resched) +{ + if (resched) + preempt_enable_no_resched_notrace(); + else + preempt_enable_notrace(); +} + #endif /* _LINUX_KERNEL_TRACE_H */ -- cgit v1.2.3 From 182e9f5f704ed6b9175142fe8da33c9ce0c52b52 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 3 Nov 2008 23:15:56 -0500 Subject: ftrace: insert in the ftrace_preempt_disable()/enable() functions Impact: use new, consolidated APIs in ftrace plugins This patch replaces the schedule safe preempt disable code with the ftrace_preempt_disable() and ftrace_preempt_enable() safe functions.
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 27 +++++++++------------------ kernel/trace/trace.c | 8 ++------ kernel/trace/trace_sched_wakeup.c | 13 ++----------- kernel/trace/trace_stack.c | 8 ++------ 4 files changed, 15 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index cedf4e268285..151f6a748676 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -16,6 +16,8 @@ #include #include +#include "trace.h" + /* Up this if you want to test the TIME_EXTENTS and normalization */ #define DEBUG_SHIFT 0 @@ -1122,8 +1124,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, return NULL; /* If we are tracing schedule, we don't want to recurse */ - resched = need_resched(); - preempt_disable_notrace(); + resched = ftrace_preempt_disable(); cpu = raw_smp_processor_id(); @@ -1154,10 +1155,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, return event; out: - if (resched) - preempt_enable_notrace(); - else - preempt_enable_notrace(); + ftrace_preempt_enable(resched); return NULL; } @@ -1199,12 +1197,9 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, /* * Only the last preempt count needs to restore preemption. */ - if (preempt_count() == 1) { - if (per_cpu(rb_need_resched, cpu)) - preempt_enable_no_resched_notrace(); - else - preempt_enable_notrace(); - } else + if (preempt_count() == 1) + ftrace_preempt_enable(per_cpu(rb_need_resched, cpu)); + else preempt_enable_no_resched_notrace(); return 0; @@ -1237,8 +1232,7 @@ int ring_buffer_write(struct ring_buffer *buffer, if (atomic_read(&buffer->record_disabled)) return -EBUSY; - resched = need_resched(); - preempt_disable_notrace(); + resched = ftrace_preempt_disable(); cpu = raw_smp_processor_id(); @@ -1264,10 +1258,7 @@ int ring_buffer_write(struct ring_buffer *buffer, ret = 0; out: - if (resched) - preempt_enable_no_resched_notrace(); - else - preempt_enable_notrace(); + ftrace_preempt_enable(resched); return ret; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e4c40c868d67..3e7bf5eb9007 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -904,8 +904,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) return; pc = preempt_count(); - resched = need_resched(); - preempt_disable_notrace(); + resched = ftrace_preempt_disable(); local_save_flags(flags); cpu = raw_smp_processor_id(); data = tr->data[cpu]; @@ -915,10 +914,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) trace_function(tr, data, ip, parent_ip, flags, pc); atomic_dec(&data->disabled); - if (resched) - preempt_enable_no_resched_notrace(); - else - preempt_enable_notrace(); + ftrace_preempt_enable(resched); } static struct ftrace_ops trace_ops __read_mostly = diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 3ae93f16b565..7bc4abf6fca8 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -50,8 +50,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) return; pc = preempt_count(); - resched = need_resched(); - preempt_disable_notrace(); + resched = ftrace_preempt_disable(); cpu = raw_smp_processor_id(); data = tr->data[cpu]; @@ -81,15 +80,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) out: atomic_dec(&data->disabled); - /* - * To prevent recursion from the scheduler, if the - * resched flag was set before we entered, then - * don't reschedule. 
- */ - if (resched) - preempt_enable_no_resched_notrace(); - else - preempt_enable_notrace(); + ftrace_preempt_enable(resched); } static struct ftrace_ops trace_ops __read_mostly = diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index be682b62fe58..d39e8b7de6a2 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -107,8 +107,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip) if (unlikely(!ftrace_enabled || stack_trace_disabled)) return; - resched = need_resched(); - preempt_disable_notrace(); + resched = ftrace_preempt_disable(); cpu = raw_smp_processor_id(); /* no atomic needed, we only modify this variable by this cpu */ @@ -120,10 +119,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip) out: per_cpu(trace_active, cpu)--; /* prevent recursion in schedule */ - if (resched) - preempt_enable_no_resched_notrace(); - else - preempt_enable_notrace(); + ftrace_preempt_enable(resched); } static struct ftrace_ops trace_ops __read_mostly = -- cgit v1.2.3 From b2a866f9344cb30d7ddf5d67b5b8393daf8bef07 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 3 Nov 2008 23:15:57 -0500 Subject: ftrace: function tracer with irqs disabled Impact: disable interrupts during trace entry creation (as opposed to preempt) To help with performance, I set the ftracer to not disable interrupts, and only to disable preemption. If an interrupt occurred, it would not be traced, because the function tracer protects itself from recursion. This may be faster, but the trace output might miss some traces. This patch makes the function tracer disable interrupts, but it also adds a runtime feature to disable preemption instead. It does this by having two different tracer functions. When the function tracer is enabled, it will check to see which version is requested (irqs disabled or preemption disabled). Then it will use the corresponding function as the tracer. Irq disabling is the default behaviour, but if the user wants better performance, with the chance of missing traces, then they can choose the preempt disabled version. Running hackbench 3 times with the irqs disabled and 3 times with the preempt disabled function tracer yielded:

tracing type       times      entries recorded
------------       --------   ----------------
irq disabled       43.393     166433066
                   43.282     166172618
                   43.298     166256704

preempt disabled   38.969     159871710
                   38.943     159972935
                   39.325     161056510

Average:

   irqs disabled:  43.324     166287462
preempt disabled:  39.079     160300385

preempt is 10.8 percent faster than irqs disabled. I wrote a patch to count function trace recursion and reran hackbench. With irq disabled: 1,150 times the function tracer did not trace due to recursion. With preempt disabled: 5,117,718 times. The thousand times with irq disabled could be due to NMIs, or simply a case where it called a function that was not protected by notrace. But we also see that a large amount of the trace is lost with the preempt version.
Signed-off-by: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 40 +++++++++++++++++++++++++++++++++++++++- kernel/trace/trace.h | 1 + 2 files changed, 40 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3e7bf5eb9007..d576dbd6defe 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -244,6 +244,7 @@ static const char *trace_options[] = { "stacktrace", "sched-tree", "ftrace_printk", + "ftrace_preempt", NULL }; @@ -891,7 +892,7 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) #ifdef CONFIG_FUNCTION_TRACER static void -function_trace_call(unsigned long ip, unsigned long parent_ip) +function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) { struct trace_array *tr = &global_trace; struct trace_array_cpu *data; @@ -917,6 +918,37 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) ftrace_preempt_enable(resched); } +static void +function_trace_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = &global_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + int pc; + + if (unlikely(!ftrace_function_enabled)) + return; + + /* + * Need to use raw, since this must be called before the + * recursive protection is performed. + */ + raw_local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) { + pc = preempt_count(); + trace_function(tr, data, ip, parent_ip, flags, pc); + } + + atomic_dec(&data->disabled); + raw_local_irq_restore(flags); +} + static struct ftrace_ops trace_ops __read_mostly = { .func = function_trace_call, @@ -925,6 +957,12 @@ static struct ftrace_ops trace_ops __read_mostly = void tracing_start_function_trace(void) { ftrace_function_enabled = 0; + + if (trace_flags & TRACE_ITER_PREEMPTONLY) + trace_ops.func = function_trace_call_preempt_only; + else + trace_ops.func = function_trace_call; + register_ftrace_function(&trace_ops); if (tracer_enabled) ftrace_function_enabled = 1; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 10c6dae76894..bb547e933af7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -415,6 +415,7 @@ enum trace_iterator_flags { TRACE_ITER_STACKTRACE = 0x100, TRACE_ITER_SCHED_TREE = 0x200, TRACE_ITER_PRINTK = 0x400, + TRACE_ITER_PREEMPTONLY = 0x800, }; extern struct tracer nop_trace; -- cgit v1.2.3 From eefd796a8e831408ce17e633d73d70430748c47a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 4 Nov 2008 16:15:37 +0800 Subject: sched debug: remove sd_level_to_string() Impact: cleanup Just use the newly introduced sd->name. 
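For context, sd->name is assumed to be filled in when each sched domain
is initialized, by a helper along these lines (a sketch of the presumed
mechanism, not a hunk from this patch):

#ifdef CONFIG_SCHED_DEBUG
# define SD_INIT_NAME(sd, type)	((sd)->name = #type)	/* "SIBLING", "MC", ... */
#else
# define SD_INIT_NAME(sd, type)	do { } while (0)
#endif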
Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 25 +------------------------ 1 file changed, 1 insertion(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 5419df9cc5c4..7ac59bae87d2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6602,28 +6602,6 @@ early_initcall(migration_init); #ifdef CONFIG_SCHED_DEBUG -static inline const char *sd_level_to_string(enum sched_domain_level lvl) -{ - switch (lvl) { - case SD_LV_NONE: - return "NONE"; - case SD_LV_SIBLING: - return "SIBLING"; - case SD_LV_MC: - return "MC"; - case SD_LV_CPU: - return "CPU"; - case SD_LV_NODE: - return "NODE"; - case SD_LV_ALLNODES: - return "ALLNODES"; - case SD_LV_MAX: - return "MAX"; - - } - return "MAX"; -} - static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpumask_t *groupmask) { @@ -6643,8 +6621,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, return -1; } - printk(KERN_CONT "span %s level %s\n", - str, sd_level_to_string(sd->level)); + printk(KERN_CONT "span %s level %s\n", str, sd->name); if (!cpu_isset(cpu, sd->span)) { printk(KERN_ERR "ERROR: domain->span does not contain " -- cgit v1.2.3 From 0a0db8f5c9d4bbb9bbfcc2b6cb6bce2d0ef4d73d Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 4 Nov 2008 16:17:05 +0800 Subject: sched debug: remove NULL checking in print_cfs/rt_rq() Impact: cleanup cfs->tg is initialized in init_tg_cfs_entry() with tg != NULL, and will never be invalidated to NULL. And the underlying cgroup of a valid task_group is always valid. Same for rt->tg. Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_debug.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 5ae17762ec32..d25cefe3f0eb 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -121,14 +121,9 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) char path[128] = ""; - struct cgroup *cgroup = NULL; struct task_group *tg = cfs_rq->tg; - if (tg) - cgroup = tg->css.cgroup; - - if (cgroup) - cgroup_path(cgroup, path, sizeof(path)); + cgroup_path(tg->css.cgroup, path, sizeof(path)); SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); #else @@ -193,14 +188,9 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) { #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED) char path[128] = ""; - struct cgroup *cgroup = NULL; struct task_group *tg = rt_rq->tg; - if (tg) - cgroup = tg->css.cgroup; - - if (cgroup) - cgroup_path(cgroup, path, sizeof(path)); + cgroup_path(tg->css.cgroup, path, sizeof(path)); SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path); #else -- cgit v1.2.3 From a17e2260926f681a0eb983c1e3cb859ba2064bce Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 4 Nov 2008 16:19:13 +0800 Subject: sched: remove redundant call to unregister_sched_domain_sysctl() Impact: cleanup The sysctl has been unregistered by partition_sched_domains(). 
Signed-off-by: Li Zefan
Acked-by: Peter Zijlstra
Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 7ac59bae87d2..3cb94fad33ca 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7691,8 +7691,6 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
 	cpumask_t tmpmask;
 	int i;
 
-	unregister_sched_domain_sysctl();
-
 	for_each_cpu_mask_nr(i, *cpu_map)
 		cpu_attach_domain(NULL, &def_root_domain, i);
 	synchronize_sched();
--
cgit v1.2.3

From faa2f98f856e89d1afb6e4a91707284d242e816e Mon Sep 17 00:00:00 2001
From: Li Zefan
Date: Tue, 4 Nov 2008 16:20:23 +0800
Subject: sched: add sanity check in partition_sched_domains()

Impact: cleanup, add debug check

It's wrong to set dattr_new = NULL when doms_new == NULL: it introduces
a memory leak if dattr_new != NULL. Fortunately dattr_new is always NULL
in this case. So remove the code and add a sanity check.

Signed-off-by: Li Zefan
Acked-by: Peter Zijlstra
Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 3cb94fad33ca..213cad5e50aa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7767,7 +7767,7 @@ match1:
 		ndoms_cur = 0;
 		doms_new = &fallback_doms;
 		cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
-		dattr_new = NULL;
+		WARN_ON_ONCE(dattr_new);
 	}
 
 	/* Build new domains */
--
cgit v1.2.3

From 3299b4dd1180762da831be5eb6adc44553eaec26 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Tue, 4 Nov 2008 11:58:21 +0100
Subject: ftrace: sysctl typo

Impact: fix sysctl name typo

Steve must have needed more coffee ;-)

Signed-off-by: Peter Zijlstra
Signed-off-by: Ingo Molnar
---
 kernel/sysctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6b6b727258b5..65d4a9ba79e4 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -487,7 +487,7 @@ static struct ctl_table kern_table[] = {
 #ifdef CONFIG_TRACING
 	{
 		.ctl_name	= CTL_UNNUMBERED,
-		.procname	= "ftrace_dump_on_opps",
+		.procname	= "ftrace_dump_on_oops",
 		.data		= &ftrace_dump_on_oops,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
--
cgit v1.2.3

From 71566a0d161edec70361b7f90f6e54af6a6d5d05 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Fri, 31 Oct 2008 12:57:20 +0100
Subject: tracing/fastboot: Enable boot tracing only during initcalls

Impact: modify boot tracer

We used to disable the initcall tracing at a specified time (i.e. the
end of built-in initcalls). But we don't need that anymore: it will be
stopped when initcalls are finished.

However we want two things:

_ Start this tracing only after pre-smp initcalls are finished.

_ Since we are planning to trace sched_switches at the same time, we
want to enable them only during the initcall execution.

For this purpose, this patch introduces two functions to enable/disable
the sched_switch tracing during boot.
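The intended sequencing, sketched (the concrete call sites appear in the
init/main.c hunks below; the surrounding code here is illustrative):

	/* after the pre-SMP initcalls have run: */
	do_pre_smp_initcalls();
	start_boot_trace();	/* boot tracing may now record initcalls */

	/* then, around each initcall in do_one_initcall(): */
	enable_boot_trace();	/* also record sched_switch events */
	it.result = fn();	/* the initcall itself */
	disable_boot_trace();	/* mute sched_switch between initcalls */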
Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 24 ++++++++++++++++++++++-- init/main.c | 4 +++- kernel/trace/trace_boot.c | 28 ++++++++++++++++------------ 3 files changed, 41 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index e46a7b34037c..4642959e5bda 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -234,6 +234,11 @@ ftrace_init_module(unsigned long *start, unsigned long *end) { } #endif +/* + * Structure which defines the trace of an initcall. + * You don't have to fill the func field since it is + * only used internally by the tracer. + */ struct boot_trace { pid_t caller; char func[KSYM_NAME_LEN]; @@ -244,13 +249,28 @@ struct boot_trace { }; #ifdef CONFIG_BOOT_TRACER +/* Append the trace on the ring-buffer */ extern void trace_boot(struct boot_trace *it, initcall_t fn); + +/* Tells the tracer that smp_pre_initcall is finished. + * So we can start the tracing + */ extern void start_boot_trace(void); -extern void stop_boot_trace(void); + +/* Resume the tracing of other necessary events + * such as sched switches + */ +extern void enable_boot_trace(void); + +/* Suspend this tracing. Actually, only sched_switches tracing have + * to be suspended. Initcalls doesn't need it.) + */ +extern void disable_boot_trace(void); #else static inline void trace_boot(struct boot_trace *it, initcall_t fn) { } static inline void start_boot_trace(void) { } -static inline void stop_boot_trace(void) { } +static inline void enable_boot_trace(void) { } +static inline void disable_boot_trace(void) { } #endif diff --git a/init/main.c b/init/main.c index 7e117a231af1..4b03cd5656ca 100644 --- a/init/main.c +++ b/init/main.c @@ -711,6 +711,7 @@ int do_one_initcall(initcall_t fn) it.caller = task_pid_nr(current); printk("calling %pF @ %i\n", fn, it.caller); it.calltime = ktime_get(); + enable_boot_trace(); } it.result = fn(); @@ -722,6 +723,7 @@ int do_one_initcall(initcall_t fn) printk("initcall %pF returned %d after %Ld usecs\n", fn, it.result, it.duration); trace_boot(&it, fn); + disable_boot_trace(); } msgbuf[0] = 0; @@ -882,7 +884,7 @@ static int __init kernel_init(void * unused) * we're essentially up and running. Get rid of the * initmem segments and start the user-mode stuff.. */ - stop_boot_trace(); + init_post(); return 0; } diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index d0a5e50eeff2..d104d5b46413 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -13,23 +13,29 @@ #include "trace.h" static struct trace_array *boot_trace; -static int trace_boot_enabled; +static bool pre_initcalls_finished; - -/* Should be started after do_pre_smp_initcalls() in init/main.c */ +/* Tells the boot tracer that the pre_smp_initcalls are finished. + * So we are ready . + * It doesn't enable sched events tracing however. + * You have to call enable_boot_trace to do so. 
+ */
 void start_boot_trace(void)
 {
-	trace_boot_enabled = 1;
+	pre_initcalls_finished = true;
+}
+
+void enable_boot_trace(void)
+{
 }
 
-void stop_boot_trace(void)
+void disable_boot_trace(void)
 {
-	trace_boot_enabled = 0;
 }
 
 void reset_boot_trace(struct trace_array *tr)
 {
-	stop_boot_trace();
+	disable_boot_trace();
 }
 
 static void boot_trace_init(struct trace_array *tr)
@@ -37,8 +43,6 @@ static void boot_trace_init(struct trace_array *tr)
 	int cpu;
 	boot_trace = tr;
 
-	trace_boot_enabled = 0;
-
 	for_each_cpu_mask(cpu, cpu_possible_map)
 		tracing_reset(tr, cpu);
 }
@@ -46,9 +50,9 @@ static void boot_trace_ctrl_update(struct trace_array *tr)
 {
 	if (tr->ctrl)
-		start_boot_trace();
+		enable_boot_trace();
 	else
-		stop_boot_trace();
+		disable_boot_trace();
 }
 
 static enum print_line_t initcall_print_line(struct trace_iterator *iter)
@@ -99,7 +103,7 @@ void trace_boot(struct boot_trace *it, initcall_t fn)
 	unsigned long irq_flags;
 	struct trace_array *tr = boot_trace;
 
-	if (!trace_boot_enabled)
+	if (!pre_initcalls_finished)
 		return;
 
 	/* Get its name now since this function could
--
cgit v1.2.3

From 07695fa04e8a3384b0c855398ce1f7885bd7dc3b Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Fri, 31 Oct 2008 13:08:28 +0100
Subject: tracing/ftrace: fix a race condition in sched_switch tracer

Impact: fix race condition in sched_switch tracer

This patch fixes a race condition in the sched_switch tracer. If
several tasks (e.g. concurrent initcalls) are playing with
tracing_start_cmdline_record() and tracing_stop_cmdline_record(), the
following situation could happen:

_ Task A and B are using the same tracepoint probe. Task A holds it.
Task B is sleeping and doesn't hold it.

_ Task A frees the sched tracer, then sched_ref is decremented to 0.

_ Task A is preempted and hadn't yet unregistered its tracepoint
probe, then B runs.

_ B increments sched_ref, sees it is 1, and concludes it has to
register its probe. But task A has not yet unregistered it, so the
probe ends up registered twice.

_ A lot of bad things can happen after that...

Signed-off-by: Frederic Weisbecker
Signed-off-by: Ingo Molnar
---
 kernel/trace/trace_sched_switch.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index b8f56beb1a62..59de5141207c 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -17,6 +17,7 @@ static struct trace_array *ctx_trace;
 static int __read_mostly tracer_enabled;
 static atomic_t sched_ref;
+static DEFINE_MUTEX(tracepoint_mutex);
 
 static void
 probe_sched_switch(struct rq *__rq, struct task_struct *prev,
@@ -125,18 +126,22 @@ static void tracing_start_sched_switch(void)
 {
 	long ref;
 
+	mutex_lock(&tracepoint_mutex);
 	ref = atomic_inc_return(&sched_ref);
 	if (ref == 1)
 		tracing_sched_register();
+	mutex_unlock(&tracepoint_mutex);
 }
 
 static void tracing_stop_sched_switch(void)
 {
 	long ref;
 
+	mutex_lock(&tracepoint_mutex);
 	ref = atomic_dec_and_test(&sched_ref);
 	if (ref)
 		tracing_sched_unregister();
+	mutex_unlock(&tracepoint_mutex);
 }
 
 void tracing_start_cmdline_record(void)
--
cgit v1.2.3

From e55f605c14679c30be41473e60b7ad26524cdc35 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Fri, 31 Oct 2008 13:14:28 +0100
Subject: tracing/ftrace: remove unused code in sched_switch tracer

Impact: cleanup

When init_sched_switch_trace() is called, it has no reason to start
the sched tracer if the sched_ref is not zero.
_ If this is non-zero, the tracer is already in use, but we can still
register it with the tracing engine: there is already a safeguard that
prevents the tracer probes from being registered twice.

_ If this is zero, this block will not be used.

Signed-off-by: Frederic Weisbecker
Signed-off-by: Ingo Molnar
---
 kernel/trace/trace_sched_switch.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 59de5141207c..96620c714300 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -203,14 +203,6 @@ static struct tracer sched_switch_trace __read_mostly =
 __init static int init_sched_switch_trace(void)
 {
-	int ret = 0;
-
-	if (atomic_read(&sched_ref))
-		ret = tracing_sched_register();
-	if (ret) {
-		pr_info("error registering scheduler trace\n");
-		return ret;
-	}
 	return register_tracer(&sched_switch_trace);
 }
 device_initcall(init_sched_switch_trace);
--
cgit v1.2.3

From d7ad44b697c9d13e445ddc7d16f736fbac333249 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Fri, 31 Oct 2008 13:20:08 +0100
Subject: tracing/fastboot: use sched switch tracer from boot tracer

Impact: enhance boot trace output with scheduling events

Use the sched_switch tracer from the boot tracer. We can also trace
schedule events inside the initcalls. Sched tracing is disabled after
the initcall has finished and then reenabled before the next one is
started.

Signed-off-by: Frederic Weisbecker
Signed-off-by: Ingo Molnar
---
 kernel/trace/trace.c              | 2 ++
 kernel/trace/trace.h              | 1 +
 kernel/trace/trace_boot.c         | 6 ++++++
 kernel/trace/trace_sched_switch.c | 6 +++---
 4 files changed, 12 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e4c40c868d67..50d7018163f6 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3251,6 +3251,8 @@ __init static int tracer_alloc_buffers(void)
 	register_tracer(&nop_trace);
 #ifdef CONFIG_BOOT_TRACER
+	/* We don't want to launch sched_switch tracer yet */
+	global_trace.ctrl = 0;
 	register_tracer(&boot_tracer);
 	current_trace = &boot_tracer;
 	current_trace->init(&global_trace);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 8465ad052707..9911277b268b 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -49,6 +49,7 @@ struct ftrace_entry {
 	unsigned long parent_ip;
 };
 extern struct tracer boot_tracer;
+extern struct tracer sched_switch_trace; /* Used by the boot tracer */
 
 /*
  * Context switch trace entry - which task (and prio) we switched from/to:
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index d104d5b46413..6bbc8794a6df 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -27,10 +27,14 @@ void start_boot_trace(void)
 
 void enable_boot_trace(void)
 {
+	if (pre_initcalls_finished)
+		tracing_start_cmdline_record();
 }
 
 void disable_boot_trace(void)
 {
+	if (pre_initcalls_finished)
+		tracing_stop_cmdline_record();
 }
 
 void reset_boot_trace(struct trace_array *tr)
@@ -45,6 +49,8 @@ static void boot_trace_init(struct trace_array *tr)
 
 	for_each_cpu_mask(cpu, cpu_possible_map)
 		tracing_reset(tr, cpu);
+
+	sched_switch_trace.init(tr);
 }
 
 static void boot_trace_ctrl_update(struct trace_array *tr)
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 96620c714300..9d7bdac331dd 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -127,6 +127,7 @@ static void tracing_start_sched_switch(void)
 	long ref;
 
 	mutex_lock(&tracepoint_mutex);
+	tracer_enabled = 1;
 	ref = atomic_inc_return(&sched_ref);
 	if (ref == 1)
 		tracing_sched_register();
@@ -138,6 +139,7 @@ static void tracing_stop_sched_switch(void)
 	long ref;
 
 	mutex_lock(&tracepoint_mutex);
+	tracer_enabled = 0;
 	ref = atomic_dec_and_test(&sched_ref);
 	if (ref)
 		tracing_sched_unregister();
@@ -158,12 +160,10 @@ static void start_sched_trace(struct trace_array *tr)
 {
 	sched_switch_reset(tr);
 	tracing_start_cmdline_record();
-	tracer_enabled = 1;
 }
 
 static void stop_sched_trace(struct trace_array *tr)
 {
-	tracer_enabled = 0;
 	tracing_stop_cmdline_record();
 }
 
@@ -190,7 +190,7 @@ static void sched_switch_trace_ctrl_update(struct trace_array *tr)
 		stop_sched_trace(tr);
 }
 
-static struct tracer sched_switch_trace __read_mostly =
+struct tracer sched_switch_trace __read_mostly =
 {
 	.name		= "sched_switch",
 	.init		= sched_switch_trace_init,
--
cgit v1.2.3

From efade6e7821c4219818e9da08f9315dfa617048b Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Fri, 31 Oct 2008 13:28:58 +0100
Subject: tracing/ftrace: types and naming corrections for sched tracer

Impact: cleanup

This patch applies some corrections suggested by Steven Rostedt.

Change the type of sched_ref to int: since it is now protected by a
mutex, it no longer needs to be an atomic variable in the sched_switch
tracer. Also change the name of the register mutex.

Signed-off-by: Frederic Weisbecker
Signed-off-by: Ingo Molnar
---
 kernel/trace/trace_sched_switch.c | 30 +++++++++++++-----------------
 1 file changed, 13 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 9d7bdac331dd..969953bf678f 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -16,8 +16,8 @@ static struct trace_array *ctx_trace;
 static int __read_mostly tracer_enabled;
-static atomic_t sched_ref;
-static DEFINE_MUTEX(tracepoint_mutex);
+static int sched_ref;
+static DEFINE_MUTEX(sched_register_mutex);
 
 static void
 probe_sched_switch(struct rq *__rq, struct task_struct *prev,
@@ -28,7 +28,7 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
 	int cpu;
 	int pc;
 
-	if (!atomic_read(&sched_ref))
+	if (!sched_ref)
 		return;
 
 	tracing_record_cmdline(prev);
@@ -124,26 +124,22 @@ static void tracing_sched_unregister(void)
 
 static void tracing_start_sched_switch(void)
 {
-	long ref;
-
-	mutex_lock(&tracepoint_mutex);
-	tracer_enabled = 1;
-	ref = atomic_inc_return(&sched_ref);
-	if (ref == 1)
+	mutex_lock(&sched_register_mutex);
+	if (!(sched_ref++)) {
+		tracer_enabled = 1;
 		tracing_sched_register();
-	mutex_unlock(&tracepoint_mutex);
+	}
+	mutex_unlock(&sched_register_mutex);
 }
 
 static void tracing_stop_sched_switch(void)
 {
-	long ref;
-
-	mutex_lock(&tracepoint_mutex);
-	tracer_enabled = 0;
-	ref = atomic_dec_and_test(&sched_ref);
-	if (ref)
+	mutex_lock(&sched_register_mutex);
+	if (!(--sched_ref)) {
 		tracing_sched_unregister();
-	mutex_unlock(&tracepoint_mutex);
+		tracer_enabled = 0;
+	}
+	mutex_unlock(&sched_register_mutex);
 }
 
 void tracing_start_cmdline_record(void)
--
cgit v1.2.3

From 79a9d461fd521f133f0e66485aa9ed09c21f5191 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Fri, 31 Oct 2008 13:34:45 +0100
Subject: tracing/ftrace: fix a bug when switch current tracer to sched tracer

Impact: fix boot tracer + sched tracer coupling bug

Fix a bug that made the sched_switch tracer unable to run if set as
the current_tracer after the boot tracer.
Signed-off-by: Frederic Weisbecker
Signed-off-by: Ingo Molnar
---
 kernel/trace/trace_boot.c         | 4 ++--
 kernel/trace/trace_sched_switch.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 6bbc8794a6df..bd5046c9deb7 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -37,9 +37,9 @@ void disable_boot_trace(void)
 		tracing_stop_cmdline_record();
 }
 
-void reset_boot_trace(struct trace_array *tr)
+static void reset_boot_trace(struct trace_array *tr)
 {
-	disable_boot_trace();
+	sched_switch_trace.reset(tr);
 }
 
 static void boot_trace_init(struct trace_array *tr)
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 969953bf678f..888944d3409d 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -173,7 +173,7 @@ static void sched_switch_trace_init(struct trace_array *tr)
 
 static void sched_switch_trace_reset(struct trace_array *tr)
 {
-	if (tr->ctrl)
+	if (tr->ctrl && sched_ref)
 		stop_sched_trace(tr);
 }
--
cgit v1.2.3

From 1f29fae29709b4668979e244c09b2fa78ff1ad59 Mon Sep 17 00:00:00 2001
From: "Serge E. Hallyn"
Date: Wed, 5 Nov 2008 16:08:52 -0600
Subject: file capabilities: add no_file_caps switch (v4)

Add a no_file_caps boot option when file capabilities are compiled
into the kernel (CONFIG_SECURITY_FILE_CAPABILITIES=y).

This allows distributions to ship a kernel with file capabilities
compiled in, without forcing users to use (and understand and trust)
them.

When no_file_caps is specified at boot, then when a process executes
a file, any file capabilities stored with that file will not be used
in the calculation of the process' new capability sets.

This means that booting with the no_file_caps boot option will not be
the same as booting a kernel with file capabilities compiled out - in
particular a task with CAP_SETPCAP will not have any chance of passing
capabilities to another task (which isn't "really" possible anyway,
and which may soon be killed altogether by David Howells in any case),
and it will instead be able to put new capabilities in its pI. However
since fI will always be empty and pI is masked with fI, it gains the
task nothing.

We also support the extra prctl options, setting securebits and
dropping capabilities from the per-process bounding set.

The other remaining difference is that killpriv, task_setscheduler,
setioprio, and setnice will continue to be hooked. That will be
noticeable in the case where a root task changed its uid while keeping
some caps, and another task owned by the new uid tries to change
settings for the more privileged task.

Changelog:
	Nov 05 2008: (v4) trivial port on top of the
		always-start-with-clear-caps patch
	Sep 23 2008: nixed file_caps_enabled when file caps are
		not compiled in as it isn't used. Document
		no_file_caps in kernel-parameters.txt.

Signed-off-by: Serge Hallyn
Acked-by: Andrew G. Morgan
Signed-off-by: James Morris
---
 Documentation/kernel-parameters.txt |  4 ++++
 include/linux/capability.h          |  3 +++
 kernel/capability.c                 | 11 +++++++++++
 security/commoncap.c                |  3 +++
 4 files changed, 21 insertions(+)

(limited to 'kernel')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 1bbcaa8982b6..784443acca9c 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1459,6 +1459,10 @@ and is between 256 and 4096 characters. It is defined in the file
 			instruction doesn't work correctly and not to
 			use it.
+	no_file_caps	Tells the kernel not to honor file capabilities. The
+			only way then for a file to be executed with privilege
+			is to be setuid root or executed by root.
+
 	nohalt		[IA-64] Tells the kernel not to use the power saving
 			function PAL_HALT_LIGHT when idle. This increases
 			power-consumption. On the positive side, it reduces
diff --git a/include/linux/capability.h b/include/linux/capability.h
index 9d1fe30b6f6c..5bc145bd759a 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -68,6 +68,9 @@ typedef struct __user_cap_data_struct {
 #define VFS_CAP_U32		VFS_CAP_U32_2
 #define VFS_CAP_REVISION	VFS_CAP_REVISION_2
 
+#ifdef CONFIG_SECURITY_FILE_CAPABILITIES
+extern int file_caps_enabled;
+#endif
 
 struct vfs_cap_data {
 	__le32 magic_etc;            /* Little endian */
diff --git a/kernel/capability.c b/kernel/capability.c
index 33e51e78c2d8..e13a68535ad5 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -33,6 +33,17 @@ EXPORT_SYMBOL(__cap_empty_set);
 EXPORT_SYMBOL(__cap_full_set);
 EXPORT_SYMBOL(__cap_init_eff_set);
 
+#ifdef CONFIG_SECURITY_FILE_CAPABILITIES
+int file_caps_enabled = 1;
+
+static int __init file_caps_disable(char *str)
+{
+	file_caps_enabled = 0;
+	return 1;
+}
+__setup("no_file_caps", file_caps_disable);
+#endif
+
 /*
  * More recent versions of libcap are available from:
 *
diff --git a/security/commoncap.c b/security/commoncap.c
index 3976613db829..f88119cb2bc2 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -281,6 +281,9 @@ static int get_file_caps(struct linux_binprm *bprm)
 
 	bprm_clear_caps(bprm);
 
+	if (!file_caps_enabled)
+		return 0;
+
 	if (bprm->file->f_vfsmnt->mnt_flags & MNT_NOSUID)
 		return 0;
--
cgit v1.2.3

From 60a7ecf42661f2b22168751298592da6ee210c9e Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Wed, 5 Nov 2008 16:05:44 -0500
Subject: ftrace: add quick function trace stop

Impact: quick start and stop of function tracer

This patch adds a way to disable the function tracer quickly without
the need to run kstop_machine. It adds a new variable called
function_trace_stop which will stop the calls to functions from mcount
when set. This is just an on/off switch and does not handle recursion
like preempt_disable().

Its main purpose is to help other tracers/debuggers start and stop
tracing functions without the need to call kstop_machine.

The config option HAVE_FUNCTION_TRACE_MCOUNT_TEST is added for archs
that implement the testing of the function_trace_stop in the mcount
arch dependent code. Otherwise, the test is done in the C code.

x86 is the only arch at the moment that supports this.
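A sketch of the intended use by another tracer or debugger (the caller
here is illustrative, not part of this patch):

	/* quickly silence function tracing without kstop_machine */
	ftrace_stop();		/* mcount still fires; its callbacks are skipped */
	do_fragile_work();	/* illustrative: code that must not be traced */
	ftrace_start();		/* resume calling the trace functions */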
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + arch/x86/kernel/entry_32.S | 6 ++++++ arch/x86/kernel/entry_64.S | 5 +++++ include/linux/ftrace.h | 30 +++++++++++++++++++++++++++++ kernel/trace/Kconfig | 7 +++++++ kernel/trace/ftrace.c | 47 ++++++++++++++++++++++++++++++++++++---------- 6 files changed, 86 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6f20718d3156..d09e812c6223 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -29,6 +29,7 @@ config X86 select HAVE_FTRACE_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_TRACER + select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) select HAVE_ARCH_KGDB if !X86_VOYAGER select HAVE_ARCH_TRACEHOOK diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 28b597ef9ca1..9134de814c97 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1157,6 +1157,9 @@ ENTRY(mcount) END(mcount) ENTRY(ftrace_caller) + cmpl $0, function_trace_stop + jne ftrace_stub + pushl %eax pushl %ecx pushl %edx @@ -1180,6 +1183,9 @@ END(ftrace_caller) #else /* ! CONFIG_DYNAMIC_FTRACE */ ENTRY(mcount) + cmpl $0, function_trace_stop + jne ftrace_stub + cmpl $ftrace_stub, ftrace_trace_function jnz trace .globl ftrace_stub diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b86f332c96a6..08aa6b10933c 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -68,6 +68,8 @@ ENTRY(mcount) END(mcount) ENTRY(ftrace_caller) + cmpl $0, function_trace_stop + jne ftrace_stub /* taken from glibc */ subq $0x38, %rsp @@ -103,6 +105,9 @@ END(ftrace_caller) #else /* ! CONFIG_DYNAMIC_FTRACE */ ENTRY(mcount) + cmpl $0, function_trace_stop + jne ftrace_stub + cmpq $ftrace_stub, ftrace_trace_function jnz trace .globl ftrace_stub diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 4642959e5bda..794ab907dbfe 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -23,6 +23,34 @@ struct ftrace_ops { struct ftrace_ops *next; }; +extern int function_trace_stop; + +/** + * ftrace_stop - stop function tracer. + * + * A quick way to stop the function tracer. Note this an on off switch, + * it is not something that is recursive like preempt_disable. + * This does not disable the calling of mcount, it only stops the + * calling of functions from mcount. + */ +static inline void ftrace_stop(void) +{ + function_trace_stop = 1; +} + +/** + * ftrace_start - start the function tracer. + * + * This function is the inverse of ftrace_stop. This does not enable + * the function tracing if the function tracer is disabled. This only + * sets the function tracer flag to continue calling the functions + * from mcount. + */ +static inline void ftrace_start(void) +{ + function_trace_stop = 0; +} + /* * The ftrace_ops must be a static and should also * be read_mostly. 
These functions do modify read_mostly variables @@ -41,6 +69,8 @@ extern void ftrace_stub(unsigned long a0, unsigned long a1); # define unregister_ftrace_function(ops) do { } while (0) # define clear_ftrace_function(ops) do { } while (0) static inline void ftrace_kill(void) { } +static inline void ftrace_stop(void) { } +static inline void ftrace_start(void) { } #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_DYNAMIC_FTRACE diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 33dbefd471e8..fc4febc3334a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -9,6 +9,13 @@ config NOP_TRACER config HAVE_FUNCTION_TRACER bool +config HAVE_FUNCTION_TRACE_MCOUNT_TEST + bool + help + This gets selected when the arch tests the function_trace_stop + variable at the mcount call site. Otherwise, this variable + is tested by the called function. + config HAVE_DYNAMIC_FTRACE bool diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4a39d24568c8..896c71f0f4c4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -47,6 +47,9 @@ int ftrace_enabled __read_mostly; static int last_ftrace_enabled; +/* Quick disabling of function tracer. */ +int function_trace_stop; + /* * ftrace_disabled is set when an anomaly is discovered. * ftrace_disabled is much stronger than ftrace_enabled. @@ -63,6 +66,7 @@ static struct ftrace_ops ftrace_list_end __read_mostly = static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; +ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) { @@ -88,8 +92,23 @@ static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) void clear_ftrace_function(void) { ftrace_trace_function = ftrace_stub; + __ftrace_trace_function = ftrace_stub; } +#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST +/* + * For those archs that do not test ftrace_trace_stop in their + * mcount call site, we need to do it from C. + */ +static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip) +{ + if (function_trace_stop) + return; + + __ftrace_trace_function(ip, parent_ip); +} +#endif + static int __register_ftrace_function(struct ftrace_ops *ops) { /* should not be called from interrupt context */ @@ -110,10 +129,18 @@ static int __register_ftrace_function(struct ftrace_ops *ops) * For one func, simply call it directly. * For more than one func, call the chain. 
*/ +#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST if (ops->next == &ftrace_list_end) ftrace_trace_function = ops->func; else ftrace_trace_function = ftrace_list_func; +#else + if (ops->next == &ftrace_list_end) + __ftrace_trace_function = ops->func; + else + __ftrace_trace_function = ftrace_list_func; + ftrace_trace_function = ftrace_test_stop_func; +#endif } spin_unlock(&ftrace_lock); @@ -526,7 +553,7 @@ static void ftrace_run_update_code(int command) } static ftrace_func_t saved_ftrace_func; -static int ftrace_start; +static int ftrace_start_up; static DEFINE_MUTEX(ftrace_start_lock); static void ftrace_startup(void) @@ -537,8 +564,8 @@ static void ftrace_startup(void) return; mutex_lock(&ftrace_start_lock); - ftrace_start++; - if (ftrace_start == 1) + ftrace_start_up++; + if (ftrace_start_up == 1) command |= FTRACE_ENABLE_CALLS; if (saved_ftrace_func != ftrace_trace_function) { @@ -562,8 +589,8 @@ static void ftrace_shutdown(void) return; mutex_lock(&ftrace_start_lock); - ftrace_start--; - if (!ftrace_start) + ftrace_start_up--; + if (!ftrace_start_up) command |= FTRACE_DISABLE_CALLS; if (saved_ftrace_func != ftrace_trace_function) { @@ -589,8 +616,8 @@ static void ftrace_startup_sysctl(void) mutex_lock(&ftrace_start_lock); /* Force update next time */ saved_ftrace_func = NULL; - /* ftrace_start is true if we want ftrace running */ - if (ftrace_start) + /* ftrace_start_up is true if we want ftrace running */ + if (ftrace_start_up) command |= FTRACE_ENABLE_CALLS; ftrace_run_update_code(command); @@ -605,8 +632,8 @@ static void ftrace_shutdown_sysctl(void) return; mutex_lock(&ftrace_start_lock); - /* ftrace_start is true if ftrace is running */ - if (ftrace_start) + /* ftrace_start_up is true if ftrace is running */ + if (ftrace_start_up) command |= FTRACE_DISABLE_CALLS; ftrace_run_update_code(command); @@ -1186,7 +1213,7 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) mutex_lock(&ftrace_sysctl_lock); mutex_lock(&ftrace_start_lock); - if (iter->filtered && ftrace_start && ftrace_enabled) + if (iter->filtered && ftrace_start_up && ftrace_enabled) ftrace_run_update_code(FTRACE_ENABLE_CALLS); mutex_unlock(&ftrace_start_lock); mutex_unlock(&ftrace_sysctl_lock); -- cgit v1.2.3 From 0f04870148ecb825133bc2733f473b1c5773ac0b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 5 Nov 2008 16:05:44 -0500 Subject: ftrace: soft tracing stop and start Impact: add way to quickly start stop tracing from the kernel This patch adds a soft stop and start to the trace. This simply disables function tracing via the ftrace_disabled flag, and disables the trace buffers to prevent recording. The tracing code may still be executed, but the trace will not be recorded. 
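A usage sketch (the read-out helper is illustrative; the stop/start
pair is the API this patch adds):

	/* freeze the trace for post-mortem inspection, then resume */
	tracing_stop();		/* buffers keep their contents, record nothing */
	read_out_trace();	/* illustrative: inspect what was captured */
	tracing_start();	/* stops and starts nest via an internal count */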
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 5 ++++ kernel/trace/trace.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 84 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 794ab907dbfe..7a75fc6d41f4 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -216,6 +216,9 @@ static inline void __ftrace_enabled_restore(int enabled) #ifdef CONFIG_TRACING extern int ftrace_dump_on_oops; +extern void tracing_start(void); +extern void tracing_stop(void); + extern void ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); @@ -246,6 +249,8 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { } static inline int ftrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 0))); +static inline void tracing_start(void) { } +static inline void tracing_stop(void) { } static inline int ftrace_printk(const char *fmt, ...) { diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 29ab40a764c8..113aea9447ec 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -43,6 +43,15 @@ unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; unsigned long __read_mostly tracing_thresh; + +/* + * Kill all tracing for good (never come back). + * It is initialized to 1 but will turn to zero if the initialization + * of the tracer is successful. But that is the only place that sets + * this back to zero. + */ +int tracing_disabled = 1; + static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); static inline void ftrace_disable_cpu(void) @@ -62,8 +71,6 @@ static cpumask_t __read_mostly tracing_buffer_mask; #define for_each_tracing_cpu(cpu) \ for_each_cpu_mask(cpu, tracing_buffer_mask) -static int tracing_disabled = 1; - /* * ftrace_dump_on_oops - variable to dump ftrace buffer on oops * @@ -613,6 +620,76 @@ static void trace_init_cmdlines(void) cmdline_idx = 0; } +static int trace_stop_count; +static DEFINE_SPINLOCK(tracing_start_lock); + +/** + * tracing_start - quick start of the tracer + * + * If tracing is enabled but was stopped by tracing_stop, + * this will start the tracer back up. + */ +void tracing_start(void) +{ + struct ring_buffer *buffer; + unsigned long flags; + + if (tracing_disabled) + return; + + spin_lock_irqsave(&tracing_start_lock, flags); + if (--trace_stop_count) + goto out; + + if (trace_stop_count < 0) { + /* Someone screwed up their debugging */ + WARN_ON_ONCE(1); + trace_stop_count = 0; + goto out; + } + + + buffer = global_trace.buffer; + if (buffer) + ring_buffer_record_enable(buffer); + + buffer = max_tr.buffer; + if (buffer) + ring_buffer_record_enable(buffer); + + ftrace_start(); + out: + spin_unlock_irqrestore(&tracing_start_lock, flags); +} + +/** + * tracing_stop - quick stop of the tracer + * + * Light weight way to stop tracing. Use in conjunction with + * tracing_start. 
+ */ +void tracing_stop(void) +{ + struct ring_buffer *buffer; + unsigned long flags; + + ftrace_stop(); + spin_lock_irqsave(&tracing_start_lock, flags); + if (trace_stop_count++) + goto out; + + buffer = global_trace.buffer; + if (buffer) + ring_buffer_record_disable(buffer); + + buffer = max_tr.buffer; + if (buffer) + ring_buffer_record_disable(buffer); + + out: + spin_unlock_irqrestore(&tracing_start_lock, flags); +} + void trace_stop_cmdline_recording(void); static void trace_save_cmdline(struct task_struct *tsk) -- cgit v1.2.3 From 9036990d462e09366f7297a2d1da6582c3e6b1d3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 5 Nov 2008 16:05:44 -0500 Subject: ftrace: restructure tracing start/stop infrastructure Impact: change where tracing is started up and stopped Currently, when a new tracer is selected via echo'ing a tracer name into the current_tracer file, the startup is only done if tracing_enabled is set to one. If tracing_enabled is changed to zero (by echo'ing 0 into the tracing_enabled file) a full shutdown is performed. The full startup and shutdown of a tracer can be expensive and the user can lose out traces when echo'ing in 0 to the tracing_enabled file, because the process takes too long. There can also be places that the user would like to start and stop the tracer several times and doing the full startup and shutdown of a tracer might be too expensive. This patch performs the full startup and shutdown when a tracer is selected. It also adds a way to do a quick start or stop of a tracer. The quick version is just a flag that prevents the tracing from taking place, but the overhead of the code is still there. For example, the startup of a tracer may enable tracepoints, or enable the function tracer. The stop and start will just set a flag to have the tracer ignore the calls when the tracepoint or function trace is called. The overhead of the tracer may still be present when the tracer is stopped, but no tracing will occur. Setting the tracer to the 'nop' tracer (or any other tracer) will perform the shutdown of the tracer which will disable the tracepoint or disable the function tracer. The tracing_enabled file will simply start or stop tracing. This change is all internal. The end result for the user should be the same as before. If tracing_enabled is not set, no trace will happen. If tracing_enabled is set, then the trace will happen. The tracing_enabled variable is static between tracers. Enabling tracing_enabled and going to another tracer will keep tracing_enabled enabled. Same is true with disabling tracing_enabled. This patch will now provide a fast start/stop method to the users for enabling or disabling tracing. Note: There were two methods to the struct tracer that were never used: The methods start and stop. These were to be used as a hook to the reading of the trace output, but ended up not being necessary. These two methods are now used to enable the start and stop of each tracer, in case the tracer needs to do more than just not write into the buffer. For example, the irqsoff tracer must stop recording max latencies when tracing is stopped. 
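The resulting model, condensed as a sketch from the hunks below (the
tracer-selection path is paraphrased, not quoted):

	/* selecting a tracer: the expensive, one-time transition */
	current_trace->reset(tr);	/* full shutdown of the old tracer */
	current_trace = t;
	current_trace->init(tr);	/* full startup of the new tracer */

	/* writing tracing_enabled: the cheap toggle */
	if (val) {
		tracer_enabled = 1;
		if (current_trace->start)
			current_trace->start(tr);	/* tracer-specific un-pause */
		tracing_start();			/* resume buffer recording */
	} else {
		tracer_enabled = 0;
		tracing_stop();
		if (current_trace->stop)
			current_trace->stop(tr);
	}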
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 64 +++++++++++++++++---------------------- kernel/trace/trace.h | 5 +-- kernel/trace/trace_functions.c | 6 ++++ kernel/trace/trace_irqsoff.c | 41 ++++++++++++++++++++++--- kernel/trace/trace_sched_switch.c | 13 ++++++++ kernel/trace/trace_sched_wakeup.c | 39 +++++++++++++++++++++--- 6 files changed, 120 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 113aea9447ec..ff1e9ed9b587 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -150,6 +150,19 @@ static DEFINE_PER_CPU(struct trace_array_cpu, max_data); /* tracer_enabled is used to toggle activation of a tracer */ static int tracer_enabled = 1; +/** + * tracing_is_enabled - return tracer_enabled status + * + * This function is used by other tracers to know the status + * of the tracer_enabled flag. Tracers may use this function + * to know if it should enable their features when starting + * up. See irqsoff tracer for an example (start_irqsoff_tracer). + */ +int tracing_is_enabled(void) +{ + return tracer_enabled; +} + /* function tracing enabled */ int ftrace_function_enabled; @@ -1041,8 +1054,7 @@ void tracing_start_function_trace(void) trace_ops.func = function_trace_call; register_ftrace_function(&trace_ops); - if (tracer_enabled) - ftrace_function_enabled = 1; + ftrace_function_enabled = 1; } void tracing_stop_function_trace(void) @@ -1189,10 +1201,6 @@ static void *s_start(struct seq_file *m, loff_t *pos) atomic_inc(&trace_record_cmdline_disabled); - /* let the tracer grab locks here if needed */ - if (current_trace->start) - current_trace->start(iter); - if (*pos != iter->pos) { iter->ent = NULL; iter->cpu = 0; @@ -1219,14 +1227,7 @@ static void *s_start(struct seq_file *m, loff_t *pos) static void s_stop(struct seq_file *m, void *p) { - struct trace_iterator *iter = m->private; - atomic_dec(&trace_record_cmdline_disabled); - - /* let the tracer release locks here if needed */ - if (current_trace && current_trace == iter->trace && iter->trace->stop) - iter->trace->stop(iter); - mutex_unlock(&trace_types_lock); } @@ -2056,10 +2057,7 @@ __tracing_open(struct inode *inode, struct file *file, int *ret) m->private = iter; /* stop the trace while dumping */ - if (iter->tr->ctrl) { - tracer_enabled = 0; - ftrace_function_enabled = 0; - } + tracing_stop(); if (iter->trace && iter->trace->open) iter->trace->open(iter); @@ -2104,14 +2102,7 @@ int tracing_release(struct inode *inode, struct file *file) iter->trace->close(iter); /* reenable tracing if it was previously enabled */ - if (iter->tr->ctrl) { - tracer_enabled = 1; - /* - * It is safe to enable function tracing even if it - * isn't used - */ - ftrace_function_enabled = 1; - } + tracing_start(); mutex_unlock(&trace_types_lock); seq_release(inode, file); @@ -2449,11 +2440,10 @@ static ssize_t tracing_ctrl_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct trace_array *tr = filp->private_data; char buf[64]; int r; - r = sprintf(buf, "%ld\n", tr->ctrl); + r = sprintf(buf, "%u\n", tracer_enabled); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } @@ -2481,16 +2471,18 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf, val = !!val; mutex_lock(&trace_types_lock); - if (tr->ctrl ^ val) { - if (val) + if (tracer_enabled ^ val) { + if (val) { tracer_enabled = 1; - else + if (current_trace->start) + current_trace->start(tr); + tracing_start(); + } else { tracer_enabled = 0; - - 
tr->ctrl = val; - - if (current_trace && current_trace->ctrl_update) - current_trace->ctrl_update(tr); + tracing_stop(); + if (current_trace->stop) + current_trace->stop(tr); + } } mutex_unlock(&trace_types_lock); @@ -3372,7 +3364,7 @@ __init static int tracer_alloc_buffers(void) #endif /* All seems OK, enable tracing */ - global_trace.ctrl = tracer_enabled; + global_trace.ctrl = 1; tracing_disabled = 0; atomic_notifier_chain_register(&panic_notifier_list, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index cc14a6bc1094..3422489fad5e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -237,11 +237,11 @@ struct tracer { const char *name; void (*init)(struct trace_array *tr); void (*reset)(struct trace_array *tr); + void (*start)(struct trace_array *tr); + void (*stop)(struct trace_array *tr); void (*open)(struct trace_iterator *iter); void (*pipe_open)(struct trace_iterator *iter); void (*close)(struct trace_iterator *iter); - void (*start)(struct trace_iterator *iter); - void (*stop)(struct trace_iterator *iter); ssize_t (*read)(struct trace_iterator *iter, struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos); @@ -282,6 +282,7 @@ struct trace_iterator { long idx; }; +int tracing_is_enabled(void); void trace_wake_up(void); void tracing_reset(struct trace_array *tr, int cpu); int tracing_open_generic(struct inode *inode, struct file *filp); diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 0f85a64003d3..9f1b0de71284 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -62,11 +62,17 @@ static void function_trace_ctrl_update(struct trace_array *tr) stop_function_trace(tr); } +static void function_trace_start(struct trace_array *tr) +{ + function_reset(tr); +} + static struct tracer function_trace __read_mostly = { .name = "function", .init = function_trace_init, .reset = function_trace_reset, + .start = function_trace_start, .ctrl_update = function_trace_ctrl_update, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_function, diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 9c74071c10e0..a87a20fa3fc6 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -353,15 +353,28 @@ void trace_preempt_off(unsigned long a0, unsigned long a1) } #endif /* CONFIG_PREEMPT_TRACER */ +/* + * save_tracer_enabled is used to save the state of the tracer_enabled + * variable when we disable it when we open a trace output file. 
+ */ +static int save_tracer_enabled; + static void start_irqsoff_tracer(struct trace_array *tr) { register_ftrace_function(&trace_ops); - tracer_enabled = 1; + if (tracing_is_enabled()) { + tracer_enabled = 1; + save_tracer_enabled = 1; + } else { + tracer_enabled = 0; + save_tracer_enabled = 0; + } } static void stop_irqsoff_tracer(struct trace_array *tr) { tracer_enabled = 0; + save_tracer_enabled = 0; unregister_ftrace_function(&trace_ops); } @@ -389,17 +402,29 @@ static void irqsoff_tracer_ctrl_update(struct trace_array *tr) stop_irqsoff_tracer(tr); } +static void irqsoff_tracer_start(struct trace_array *tr) +{ + irqsoff_tracer_reset(tr); + tracer_enabled = 1; + save_tracer_enabled = 1; +} + +static void irqsoff_tracer_stop(struct trace_array *tr) +{ + tracer_enabled = 0; + save_tracer_enabled = 0; +} + static void irqsoff_tracer_open(struct trace_iterator *iter) { /* stop the trace while dumping */ - if (iter->tr->ctrl) - stop_irqsoff_tracer(iter->tr); + tracer_enabled = 0; } static void irqsoff_tracer_close(struct trace_iterator *iter) { - if (iter->tr->ctrl) - start_irqsoff_tracer(iter->tr); + /* restart tracing */ + tracer_enabled = save_tracer_enabled; } #ifdef CONFIG_IRQSOFF_TRACER @@ -414,6 +439,8 @@ static struct tracer irqsoff_tracer __read_mostly = .name = "irqsoff", .init = irqsoff_tracer_init, .reset = irqsoff_tracer_reset, + .start = irqsoff_tracer_start, + .stop = irqsoff_tracer_stop, .open = irqsoff_tracer_open, .close = irqsoff_tracer_close, .ctrl_update = irqsoff_tracer_ctrl_update, @@ -440,6 +467,8 @@ static struct tracer preemptoff_tracer __read_mostly = .name = "preemptoff", .init = preemptoff_tracer_init, .reset = irqsoff_tracer_reset, + .start = irqsoff_tracer_start, + .stop = irqsoff_tracer_stop, .open = irqsoff_tracer_open, .close = irqsoff_tracer_close, .ctrl_update = irqsoff_tracer_ctrl_update, @@ -468,6 +497,8 @@ static struct tracer preemptirqsoff_tracer __read_mostly = .name = "preemptirqsoff", .init = preemptirqsoff_tracer_init, .reset = irqsoff_tracer_reset, + .start = irqsoff_tracer_start, + .stop = irqsoff_tracer_stop, .open = irqsoff_tracer_open, .close = irqsoff_tracer_close, .ctrl_update = irqsoff_tracer_ctrl_update, diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 888944d3409d..91c699be8c87 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -186,11 +186,24 @@ static void sched_switch_trace_ctrl_update(struct trace_array *tr) stop_sched_trace(tr); } +static void sched_switch_trace_start(struct trace_array *tr) +{ + sched_switch_reset(tr); + tracing_start_sched_switch(); +} + +static void sched_switch_trace_stop(struct trace_array *tr) +{ + tracing_stop_sched_switch(); +} + struct tracer sched_switch_trace __read_mostly = { .name = "sched_switch", .init = sched_switch_trace_init, .reset = sched_switch_trace_reset, + .start = sched_switch_trace_start, + .stop = sched_switch_trace_stop, .ctrl_update = sched_switch_trace_ctrl_update, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_sched_switch, diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 7bc4abf6fca8..240577bc8ba5 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -262,6 +262,12 @@ out: atomic_dec(&wakeup_trace->data[cpu]->disabled); } +/* + * save_tracer_enabled is used to save the state of the tracer_enabled + * variable when we disable it when we open a trace output file. 
+ */ +static int save_tracer_enabled; + static void start_wakeup_tracer(struct trace_array *tr) { int ret; @@ -300,7 +306,13 @@ static void start_wakeup_tracer(struct trace_array *tr) register_ftrace_function(&trace_ops); - tracer_enabled = 1; + if (tracing_is_enabled()) { + tracer_enabled = 1; + save_tracer_enabled = 1; + } else { + tracer_enabled = 0; + save_tracer_enabled = 0; + } return; fail_deprobe_wake_new: @@ -312,6 +324,7 @@ fail_deprobe: static void stop_wakeup_tracer(struct trace_array *tr) { tracer_enabled = 0; + save_tracer_enabled = 0; unregister_ftrace_function(&trace_ops); unregister_trace_sched_switch(probe_wakeup_sched_switch); unregister_trace_sched_wakeup_new(probe_wakeup); @@ -343,18 +356,32 @@ static void wakeup_tracer_ctrl_update(struct trace_array *tr) stop_wakeup_tracer(tr); } +static void wakeup_tracer_start(struct trace_array *tr) +{ + wakeup_reset(tr); + tracer_enabled = 1; + save_tracer_enabled = 1; +} + +static void wakeup_tracer_stop(struct trace_array *tr) +{ + tracer_enabled = 0; + save_tracer_enabled = 0; +} + static void wakeup_tracer_open(struct trace_iterator *iter) { /* stop the trace while dumping */ - if (iter->tr->ctrl) - stop_wakeup_tracer(iter->tr); + tracer_enabled = 0; } static void wakeup_tracer_close(struct trace_iterator *iter) { /* forget about any processes we were recording */ - if (iter->tr->ctrl) - start_wakeup_tracer(iter->tr); + if (save_tracer_enabled) { + wakeup_reset(iter->tr); + tracer_enabled = 1; + } } static struct tracer wakeup_tracer __read_mostly = @@ -362,6 +389,8 @@ static struct tracer wakeup_tracer __read_mostly = .name = "wakeup", .init = wakeup_tracer_init, .reset = wakeup_tracer_reset, + .start = wakeup_tracer_start, + .stop = wakeup_tracer_stop, .open = wakeup_tracer_open, .close = wakeup_tracer_close, .ctrl_update = wakeup_tracer_ctrl_update, -- cgit v1.2.3 From 3e03fb7f1da2e691644526c0d6df42d778716349 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 6 Nov 2008 00:09:43 -0500 Subject: ring-buffer: convert to raw spinlocks Impact: no lockdep debugging of ring buffer The problem with running lockdep on the ring buffer is that the ring buffer is the core infrastructure of ftrace. What happens is that the tracer will start tracing the lockdep code while lockdep is testing the ring buffers locks. This can cause lockdep to fail due to testing cases that have not fully finished their locking transition. This patch converts the spin locks used by the ring buffer back into raw spin locks which lockdep does not check. 
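The conversion pattern, distilled from the hunks below:

	/* raw_spinlock_t carries no lockdep class, so taking it cannot
	 * re-enter lockdep while lockdep itself is being traced */
	cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;

	/* the irq state, previously handled by spin_lock_irqsave(),
	 * must now be saved and restored by hand: */
	local_irq_save(flags);
	__raw_spin_lock(&cpu_buffer->lock);
	/* ... critical section ... */
	__raw_spin_unlock(&cpu_buffer->lock);
	local_irq_restore(flags);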
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 151f6a748676..a2dea5008826 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -154,7 +154,7 @@ static inline int test_time_stamp(u64 delta) struct ring_buffer_per_cpu { int cpu; struct ring_buffer *buffer; - spinlock_t lock; + raw_spinlock_t lock; struct lock_class_key lock_key; struct list_head pages; struct buffer_page *head_page; /* read from head */ @@ -291,7 +291,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) cpu_buffer->cpu = cpu; cpu_buffer->buffer = buffer; - spin_lock_init(&cpu_buffer->lock); + cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; INIT_LIST_HEAD(&cpu_buffer->pages); page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()), @@ -854,7 +854,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (write > BUF_PAGE_SIZE) { struct buffer_page *next_page = tail_page; - spin_lock_irqsave(&cpu_buffer->lock, flags); + local_irq_save(flags); + __raw_spin_lock(&cpu_buffer->lock); rb_inc_page(cpu_buffer, &next_page); @@ -930,7 +931,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, rb_set_commit_to_write(cpu_buffer); } - spin_unlock_irqrestore(&cpu_buffer->lock, flags); + __raw_spin_unlock(&cpu_buffer->lock); + local_irq_restore(flags); /* fail and let the caller try again */ return ERR_PTR(-EAGAIN); @@ -953,7 +955,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, return event; out_unlock: - spin_unlock_irqrestore(&cpu_buffer->lock, flags); + __raw_spin_unlock(&cpu_buffer->lock); + local_irq_restore(flags); return NULL; } @@ -1524,7 +1527,8 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) struct buffer_page *reader = NULL; unsigned long flags; - spin_lock_irqsave(&cpu_buffer->lock, flags); + local_irq_save(flags); + __raw_spin_lock(&cpu_buffer->lock); again: reader = cpu_buffer->reader_page; @@ -1574,7 +1578,8 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) goto again; out: - spin_unlock_irqrestore(&cpu_buffer->lock, flags); + __raw_spin_unlock(&cpu_buffer->lock); + local_irq_restore(flags); return reader; } @@ -1815,9 +1820,11 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu) atomic_inc(&cpu_buffer->record_disabled); synchronize_sched(); - spin_lock_irqsave(&cpu_buffer->lock, flags); + local_irq_save(flags); + __raw_spin_lock(&cpu_buffer->lock); ring_buffer_iter_reset(iter); - spin_unlock_irqrestore(&cpu_buffer->lock, flags); + __raw_spin_unlock(&cpu_buffer->lock); + local_irq_restore(flags); return iter; } @@ -1903,11 +1910,13 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) if (!cpu_isset(cpu, buffer->cpumask)) return; - spin_lock_irqsave(&cpu_buffer->lock, flags); + local_irq_save(flags); + __raw_spin_lock(&cpu_buffer->lock); rb_reset_cpu(cpu_buffer); - spin_unlock_irqrestore(&cpu_buffer->lock, flags); + __raw_spin_unlock(&cpu_buffer->lock); + local_irq_restore(flags); } /** -- cgit v1.2.3 From cf7f8690e864c6fe11e77202dd847fa60f483418 Mon Sep 17 00:00:00 2001 From: Sripathi Kodi Date: Wed, 5 Nov 2008 18:57:14 +0530 Subject: sched, lockdep: inline double_unlock_balance() We have a test case which measures the variation in the amount of time needed to perform a fixed amount of work on the preempt_rt kernel. 
We started seeing deterioration in its performance recently. The test should never take more than 10 microseconds, but we started seeing a 5-10% failure rate. Using the elimination method, we traced the problem to commit 1b12bbc747560ea68bcc132c3d05699e52271da0 (lockdep: re-annotate scheduler runqueues). When LOCKDEP is disabled, this patch only adds an additional function call to double_unlock_balance(). Hence I inlined double_unlock_balance() and the problem went away. Here is a patch to make this change. Signed-off-by: Sripathi Kodi Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- kernel/sched_rt.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index e8819bc6f462..ad10d0aae1d7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2825,7 +2825,7 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest) return ret; } -static void double_unlock_balance(struct rq *this_rq, struct rq *busiest) +static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) __releases(busiest->lock) { spin_unlock(&busiest->lock); diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index c7963d5d0625..2bdd44423599 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -910,7 +910,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) #define RT_MAX_TRIES 3 static int double_lock_balance(struct rq *this_rq, struct rq *busiest); -static void double_unlock_balance(struct rq *this_rq, struct rq *busiest); +static inline void double_unlock_balance(struct rq *this_rq, + struct rq *busiest); static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); -- cgit v1.2.3 From 6d21cd62516a9697cb7ec33cc52e6b814fb65a13 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 7 Nov 2008 17:03:18 +0800 Subject: sched: clean up SCHED_CPUMASK_ALLOC Impact: cleanup The #if/#endif is ugly. Change SCHED_CPUMASK_ALLOC and SCHED_CPUMASK_FREE to static inline functions.
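The payoff is at the call site: __build_sched_domains() no longer needs preprocessor conditionals around the allocation, because the on-stack variant compiles to an empty inline. Sketch of the caller before and after, condensed from the hunks below (the real error path also frees rd and prints a warning):

    /* before: the #if block leaks into the function body */
    #if SCHED_CPUMASK_ALLOC
    	allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL);
    	if (!allmasks)
    		return -ENOMEM;
    #endif

    /* after: plain C; a no-op when the masks already live on the stack */
    sched_cpumask_alloc(&allmasks);
    if (!allmasks)
    	return -ENOMEM;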
Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index b24e57a10f6f..59db86c915f9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7279,13 +7279,21 @@ struct allmasks { }; #if NR_CPUS > 128 -#define SCHED_CPUMASK_ALLOC 1 -#define SCHED_CPUMASK_FREE(v) kfree(v) -#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v +#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v +static inline void sched_cpumask_alloc(struct allmasks **masks) +{ + *masks = kmalloc(sizeof(**masks), GFP_KERNEL); +} +static inline void sched_cpumask_free(struct allmasks *masks) +{ + kfree(masks); +} #else -#define SCHED_CPUMASK_ALLOC 0 -#define SCHED_CPUMASK_FREE(v) -#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v +#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v +static inline void sched_cpumask_alloc(struct allmasks **masks) +{ } +static inline void sched_cpumask_free(struct allmasks *masks) +{ } #endif #define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \ @@ -7361,9 +7369,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map, return -ENOMEM; } -#if SCHED_CPUMASK_ALLOC /* get space for all scratch cpumask variables */ - allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL); + sched_cpumask_alloc(&allmasks); if (!allmasks) { printk(KERN_WARNING "Cannot alloc cpumask array\n"); kfree(rd); @@ -7372,7 +7379,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #endif return -ENOMEM; } -#endif + tmpmask = (cpumask_t *)allmasks; @@ -7626,13 +7633,13 @@ static int __build_sched_domains(const cpumask_t *cpu_map, cpu_attach_domain(sd, rd, i); } - SCHED_CPUMASK_FREE((void *)allmasks); + sched_cpumask_free(allmasks); return 0; #ifdef CONFIG_NUMA error: free_sched_groups(cpu_map, tmpmask); - SCHED_CPUMASK_FREE((void *)allmasks); + sched_cpumask_free(allmasks); kfree(rd); return -ENOMEM; #endif -- cgit v1.2.3 From 0183fb1c94b74862b073590fc52c56b7364b7bad Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 7 Nov 2008 22:36:02 -0500 Subject: ftrace: fix set_ftrace_filter Impact: fix of output of set_ftrace_filter Commit ftrace: do not show freed records in available_filter_functions Removed a bit too much from the set_ftrace_filter code, where we now see all functions in the set_ftrace_filter file even when we set a filter. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 896c71f0f4c4..4d2e751bfb11 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -765,6 +765,9 @@ t_next(struct seq_file *m, void *v, loff_t *pos) ((iter->flags & FTRACE_ITER_FAILURES) && !(rec->flags & FTRACE_FL_FAILED)) || + ((iter->flags & FTRACE_ITER_FILTER) && + !(rec->flags & FTRACE_FL_FILTER)) || + ((iter->flags & FTRACE_ITER_NOTRACE) && !(rec->flags & FTRACE_FL_NOTRACE))) { rec = NULL; -- cgit v1.2.3 From 75f5c47da386445ba0c5a8b7e3ca0c906e763369 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 7 Nov 2008 22:36:02 -0500 Subject: ftrace: fix boot trace sched startup Impact: boot tracer startup modified The boot tracer calls into some of the schedule tracing private functions that should not be exported. This patch cleans it up, and makes way for further changes in the ftrace infrastructure. 
This patch adds an API to assign a tracer array to the schedule context switch tracer. Signed-off-by: Steven Rostedt Acked-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 2 +- kernel/trace/trace_boot.c | 9 +++++++-- kernel/trace/trace_sched_switch.c | 15 ++++++++++++++- 3 files changed, 22 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3422489fad5e..db12e16137e1 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -49,7 +49,6 @@ struct ftrace_entry { unsigned long parent_ip; }; extern struct tracer boot_tracer; -extern struct tracer sched_switch_trace; /* Used by the boot tracer */ /* * Context switch trace entry - which task (and prio) we switched from/to: */ @@ -325,6 +324,7 @@ void trace_function(struct trace_array *tr, void tracing_start_cmdline_record(void); void tracing_stop_cmdline_record(void); +void tracing_cmdline_assign_trace(struct trace_array *tr); int register_tracer(struct tracer *type); void unregister_tracer(struct tracer *type); diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index bd5046c9deb7..662cb9198906 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -39,7 +39,12 @@ void disable_boot_trace(void) static void reset_boot_trace(struct trace_array *tr) { - sched_switch_trace.reset(tr); + int cpu; + + tr->time_start = ftrace_now(tr->cpu); + + for_each_online_cpu(cpu) + tracing_reset(tr, cpu); } static void boot_trace_init(struct trace_array *tr) @@ -50,7 +55,7 @@ static void boot_trace_init(struct trace_array *tr) for_each_cpu_mask(cpu, cpu_possible_map) tracing_reset(tr, cpu); - sched_switch_trace.init(tr); + tracing_cmdline_assign_trace(tr); } static void boot_trace_ctrl_update(struct trace_array *tr) diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 91c699be8c87..fbf05df7134d 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -152,6 +152,19 @@ void tracing_stop_cmdline_record(void) tracing_stop_sched_switch(); } +/** + * tracing_cmdline_assign_trace - assign a trace array for ctx switch + * @tr: trace array pointer to assign + * + * Some tracers might want to record the context switches in their + * trace. This function lets those tracers assign the trace array + * to use. + */ +void tracing_cmdline_assign_trace(struct trace_array *tr) +{ + ctx_trace = tr; +} + static void start_sched_trace(struct trace_array *tr) { sched_switch_reset(tr); @@ -197,7 +210,7 @@ static void sched_switch_trace_stop(struct trace_array *tr) tracing_stop_sched_switch(); } -struct tracer sched_switch_trace __read_mostly = +static struct tracer sched_switch_trace __read_mostly = { .name = "sched_switch", .init = sched_switch_trace_init, -- cgit v1.2.3 From e168e0516e476070faa9e8e7b23dfcba79b76d82 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 7 Nov 2008 22:36:02 -0500 Subject: ftrace: fix sched_switch API Impact: fix for sched_switch that broke dynamic ftrace startup The commit: tracing/fastboot: use sched switch tracer from boot tracer broke the API of the sched_switch trace. The tracing_start/stop_cmdline_record interface is only for recording the cmdline, NOT for recording the schedule switches themselves. Seeing that the boot tracer broke the API to do something that it wanted, this patch adds a new interface for the API while putting back the original interface of the old API.
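With the interfaces split, a tracer that wants the context switch events themselves (and not merely cmdline recording) uses the pair added below. A sketch with a hypothetical tracer; the boot tracer conversion below is the real in-tree user:

    static void my_tracer_init(struct trace_array *tr)
    {
    	/* tell the sched_switch probes which array to fill */
    	tracing_sched_switch_assign_trace(tr);
    	tracing_start_sched_switch_record();
    }

    static void my_tracer_reset(struct trace_array *tr)
    {
    	tracing_stop_sched_switch_record();
    }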
Signed-off-by: Steven Rostedt Acked-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 4 +++- kernel/trace/trace_boot.c | 6 ++--- kernel/trace/trace_sched_switch.c | 50 +++++++++++++++++++++++++++++++-------- 3 files changed, 46 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index db12e16137e1..25abfc45f081 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -324,7 +324,9 @@ void trace_function(struct trace_array *tr, void tracing_start_cmdline_record(void); void tracing_stop_cmdline_record(void); -void tracing_cmdline_assign_trace(struct trace_array *tr); +void tracing_sched_switch_assign_trace(struct trace_array *tr); +void tracing_stop_sched_switch_record(void); +void tracing_start_sched_switch_record(void); int register_tracer(struct tracer *type); void unregister_tracer(struct tracer *type); diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 662cb9198906..0203c1054012 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -28,13 +28,13 @@ void start_boot_trace(void) void enable_boot_trace(void) { if (pre_initcalls_finished) - tracing_start_cmdline_record(); + tracing_start_sched_switch_record(); } void disable_boot_trace(void) { if (pre_initcalls_finished) - tracing_stop_cmdline_record(); + tracing_stop_sched_switch_record(); } static void reset_boot_trace(struct trace_array *tr) @@ -55,7 +55,7 @@ static void boot_trace_init(struct trace_array *tr) for_each_cpu_mask(cpu, cpu_possible_map) tracing_reset(tr, cpu); - tracing_cmdline_assign_trace(tr); + tracing_sched_switch_assign_trace(tr); } static void boot_trace_ctrl_update(struct trace_array *tr) diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index fbf05df7134d..79410db64d6f 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -125,20 +125,16 @@ static void tracing_sched_unregister(void) static void tracing_start_sched_switch(void) { mutex_lock(&sched_register_mutex); - if (!(sched_ref++)) { - tracer_enabled = 1; + if (!(sched_ref++)) tracing_sched_register(); - } mutex_unlock(&sched_register_mutex); } static void tracing_stop_sched_switch(void) { mutex_lock(&sched_register_mutex); - if (!(--sched_ref)) { + if (!(--sched_ref)) tracing_sched_unregister(); - tracer_enabled = 0; - } mutex_unlock(&sched_register_mutex); } @@ -153,14 +149,48 @@ void tracing_stop_cmdline_record(void) } /** - * tracing_cmdline_assign_trace - assign a trace array for ctx switch + * tracing_start_sched_switch_record - start tracing context switches + * + * Turns on context switch tracing for a tracer. + */ +void tracing_start_sched_switch_record(void) +{ + if (unlikely(!ctx_trace)) { + WARN_ON(1); + return; + } + + tracing_start_sched_switch(); + + mutex_lock(&sched_register_mutex); + tracer_enabled++; + mutex_unlock(&sched_register_mutex); +} + +/** + * tracing_stop_sched_switch_record - start tracing context switches + * + * Turns off context switch tracing for a tracer. + */ +void tracing_stop_sched_switch_record(void) +{ + mutex_lock(&sched_register_mutex); + tracer_enabled--; + WARN_ON(tracer_enabled < 0); + mutex_unlock(&sched_register_mutex); + + tracing_stop_sched_switch(); +} + +/** + * tracing_sched_switch_assign_trace - assign a trace array for ctx switch * @tr: trace array pointer to assign * * Some tracers might want to record the context switches in their * trace. This function lets those tracers assign the trace array * to use. 
*/ -void tracing_cmdline_assign_trace(struct trace_array *tr) +void tracing_sched_switch_assign_trace(struct trace_array *tr) { ctx_trace = tr; } @@ -168,12 +198,12 @@ void tracing_cmdline_assign_trace(struct trace_array *tr) static void start_sched_trace(struct trace_array *tr) { sched_switch_reset(tr); - tracing_start_cmdline_record(); + tracing_start_sched_switch_record(); } static void stop_sched_trace(struct trace_array *tr) { - tracing_stop_cmdline_record(); + tracing_stop_sched_switch_record(); } static void sched_switch_trace_init(struct trace_array *tr) -- cgit v1.2.3 From 451931702017951f74624ddc4f7f02e4641b0e20 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 7 Nov 2008 22:36:02 -0500 Subject: ftrace: irqsoff tracer incorrect reset Impact: fix to irqsoff tracer output In converting to the new start / stop ftrace handling, the irqsoff tracer start called the irqsoff reset function. irqsoff tracer is not the same as the other traces, and it resets the buffers while searching for the longest latency. The reset that the irqsoff stop method calls disables the function tracing. That means that, by starting the tracer, the function tracer is disabled incorrectly. This patch simply removes the call to reset which keeps the function tracing enabled. Reset is not needed for the irqsoff stop method. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_irqsoff.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index a87a20fa3fc6..3509086cdc4c 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -404,7 +404,6 @@ static void irqsoff_tracer_ctrl_update(struct trace_array *tr) static void irqsoff_tracer_start(struct trace_array *tr) { - irqsoff_tracer_reset(tr); tracer_enabled = 1; save_tracer_enabled = 1; } -- cgit v1.2.3 From 49833fc232bd6a5076496994d855f601354501d7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 7 Nov 2008 22:36:02 -0500 Subject: ftrace: enable trace_printk by default Impact: have the ftrace_printk enabled on startup It is confusing to have to "echo trace_printk > /debug/tracing/iter_ctrl" after adding ftrace_printk in the kernel. Currently the trace_printk is set to off by default. ftrace_printk should only be in open kernel code when used for debugging, and thus it should be enabled by default. It may also be used to record data within a tracer, but those ftrace_printks should be within wrappers that are either enabled by trace_points or have a variable protecting the code path from being entered when the tracer is disabled. 
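As an example of such a wrapper, a tracer-internal ftrace_printk() can be kept off the fast path behind a flag. This is only a sketch; my_tracer_enabled and my_trace_note() are hypothetical names, not something this patch adds:

    static int my_tracer_enabled;

    static inline void my_trace_note(unsigned long ip)
    {
    	if (unlikely(my_tracer_enabled))
    		ftrace_printk("hit %08lx\n", ip);
    }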
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d55ccfc8d674..dbbdacfaaf9e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -205,7 +205,7 @@ static DEFINE_MUTEX(trace_types_lock); static DECLARE_WAIT_QUEUE_HEAD(trace_wait); /* trace_flags holds iter_ctrl options */ -unsigned long trace_flags = TRACE_ITER_PRINT_PARENT; +unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK; /** * trace_wake_up - wake up tasks waiting for trace input -- cgit v1.2.3 From bbf5b1a0cecb56de6236db8b01c5bfb7ab8ba8b2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 7 Nov 2008 22:36:02 -0500 Subject: ftrace: remove ctrl_update method Impact: Remove the ctrl_update tracer method With the new quick start/stop method of tracing, the ctrl_update method is out of date. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 2 -- kernel/trace/trace.h | 1 - kernel/trace/trace_boot.c | 9 -------- kernel/trace/trace_functions.c | 9 -------- kernel/trace/trace_irqsoff.c | 11 --------- kernel/trace/trace_mmiotrace.c | 11 +++------ kernel/trace/trace_nop.c | 10 -------- kernel/trace/trace_sched_switch.c | 10 -------- kernel/trace/trace_sched_wakeup.c | 9 -------- kernel/trace/trace_selftest.c | 48 +++++++++++++++++++++------------------ kernel/trace/trace_sysprof.c | 10 -------- 11 files changed, 29 insertions(+), 101 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dbbdacfaaf9e..9e83188172a1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3357,8 +3357,6 @@ __init static int tracer_alloc_buffers(void) register_tracer(&nop_trace); #ifdef CONFIG_BOOT_TRACER - /* We don't want to launch sched_switch tracer yet */ - global_trace.ctrl = 0; register_tracer(&boot_tracer); current_trace = &boot_tracer; current_trace->init(&global_trace); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 25abfc45f081..e481edaae1c4 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -244,7 +244,6 @@ struct tracer { ssize_t (*read)(struct trace_iterator *iter, struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos); - void (*ctrl_update)(struct trace_array *tr); #ifdef CONFIG_FTRACE_STARTUP_TEST int (*selftest)(struct tracer *trace, struct trace_array *tr); diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 0203c1054012..8f71915e8bb4 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -58,14 +58,6 @@ static void boot_trace_init(struct trace_array *tr) tracing_sched_switch_assign_trace(tr); } -static void boot_trace_ctrl_update(struct trace_array *tr) -{ - if (tr->ctrl) - enable_boot_trace(); - else - disable_boot_trace(); -} - static enum print_line_t initcall_print_line(struct trace_iterator *iter) { int ret; @@ -102,7 +94,6 @@ struct tracer boot_tracer __read_mostly = .name = "initcall", .init = boot_trace_init, .reset = reset_boot_trace, - .ctrl_update = boot_trace_ctrl_update, .print_line = initcall_print_line, }; diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 9f1b0de71284..e980b872bef5 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -54,14 +54,6 @@ static void function_trace_reset(struct trace_array *tr) stop_function_trace(tr); } -static void function_trace_ctrl_update(struct trace_array *tr) -{ - if 
(tr->ctrl) - start_function_trace(tr); - else - stop_function_trace(tr); -} - static void function_trace_start(struct trace_array *tr) { function_reset(tr); @@ -73,7 +65,6 @@ static struct tracer function_trace __read_mostly = .init = function_trace_init, .reset = function_trace_reset, .start = function_trace_start, - .ctrl_update = function_trace_ctrl_update, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_function, #endif diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 3509086cdc4c..ffdf592a54e3 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -394,14 +394,6 @@ static void irqsoff_tracer_reset(struct trace_array *tr) stop_irqsoff_tracer(tr); } -static void irqsoff_tracer_ctrl_update(struct trace_array *tr) -{ - if (tr->ctrl) - start_irqsoff_tracer(tr); - else - stop_irqsoff_tracer(tr); -} - static void irqsoff_tracer_start(struct trace_array *tr) { tracer_enabled = 1; @@ -442,7 +434,6 @@ static struct tracer irqsoff_tracer __read_mostly = .stop = irqsoff_tracer_stop, .open = irqsoff_tracer_open, .close = irqsoff_tracer_close, - .ctrl_update = irqsoff_tracer_ctrl_update, .print_max = 1, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_irqsoff, @@ -470,7 +461,6 @@ static struct tracer preemptoff_tracer __read_mostly = .stop = irqsoff_tracer_stop, .open = irqsoff_tracer_open, .close = irqsoff_tracer_close, - .ctrl_update = irqsoff_tracer_ctrl_update, .print_max = 1, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_preemptoff, @@ -500,7 +490,6 @@ static struct tracer preemptirqsoff_tracer __read_mostly = .stop = irqsoff_tracer_stop, .open = irqsoff_tracer_open, .close = irqsoff_tracer_close, - .ctrl_update = irqsoff_tracer_ctrl_update, .print_max = 1, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_preemptirqsoff, diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index f28484618ff0..fa9354e78b57 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -49,15 +49,10 @@ static void mmio_trace_reset(struct trace_array *tr) mmio_trace_array = NULL; } -static void mmio_trace_ctrl_update(struct trace_array *tr) +static void mmio_trace_start(struct trace_array *tr) { pr_debug("in %s\n", __func__); - if (tr->ctrl) { - mmio_reset_data(tr); - enable_mmiotrace(); - } else { - disable_mmiotrace(); - } + mmio_reset_data(tr); } static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) @@ -298,10 +293,10 @@ static struct tracer mmio_tracer __read_mostly = .name = "mmiotrace", .init = mmio_trace_init, .reset = mmio_trace_reset, + .start = mmio_trace_start, .pipe_open = mmio_pipe_open, .close = mmio_close, .read = mmio_read, - .ctrl_update = mmio_trace_ctrl_update, .print_line = mmio_print_line, }; diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index 4592b4862515..e3c5fbfcdd36 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c @@ -42,21 +42,11 @@ static void nop_trace_reset(struct trace_array *tr) stop_nop_trace(tr); } -static void nop_trace_ctrl_update(struct trace_array *tr) -{ - /* When starting a new trace, reset the buffers */ - if (tr->ctrl) - start_nop_trace(tr); - else - stop_nop_trace(tr); -} - struct tracer nop_trace __read_mostly = { .name = "nop", .init = nop_trace_init, .reset = nop_trace_reset, - .ctrl_update = nop_trace_ctrl_update, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_nop, #endif diff --git a/kernel/trace/trace_sched_switch.c 
b/kernel/trace/trace_sched_switch.c index 79410db64d6f..7b73fd11e4aa 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -220,15 +220,6 @@ static void sched_switch_trace_reset(struct trace_array *tr) stop_sched_trace(tr); } -static void sched_switch_trace_ctrl_update(struct trace_array *tr) -{ - /* When starting a new trace, reset the buffers */ - if (tr->ctrl) - start_sched_trace(tr); - else - stop_sched_trace(tr); -} - static void sched_switch_trace_start(struct trace_array *tr) { sched_switch_reset(tr); @@ -247,7 +238,6 @@ static struct tracer sched_switch_trace __read_mostly = .reset = sched_switch_trace_reset, .start = sched_switch_trace_start, .stop = sched_switch_trace_stop, - .ctrl_update = sched_switch_trace_ctrl_update, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_sched_switch, #endif diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 240577bc8ba5..23e54d4e4d92 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -348,14 +348,6 @@ static void wakeup_tracer_reset(struct trace_array *tr) } } -static void wakeup_tracer_ctrl_update(struct trace_array *tr) -{ - if (tr->ctrl) - start_wakeup_tracer(tr); - else - stop_wakeup_tracer(tr); -} - static void wakeup_tracer_start(struct trace_array *tr) { wakeup_reset(tr); @@ -393,7 +385,6 @@ static struct tracer wakeup_tracer __read_mostly = .stop = wakeup_tracer_stop, .open = wakeup_tracer_open, .close = wakeup_tracer_close, - .ctrl_update = wakeup_tracer_ctrl_update, .print_max = 1, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 90bc752a7580..746934340474 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -134,13 +134,13 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, msleep(100); /* stop the tracing. */ - tr->ctrl = 0; - trace->ctrl_update(tr); + tracing_stop(); ftrace_enabled = 0; /* check the trace buffer */ ret = trace_test_buffer(tr, &count); trace->reset(tr); + tracing_start(); /* we should only have one item */ if (!ret && count != 1) { @@ -148,6 +148,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, ret = -1; goto out; } + out: ftrace_enabled = save_ftrace_enabled; tracer_enabled = save_tracer_enabled; @@ -185,13 +186,13 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) /* Sleep for a 1/10 of a second */ msleep(100); /* stop the tracing. */ - tr->ctrl = 0; - trace->ctrl_update(tr); + tracing_stop(); ftrace_enabled = 0; /* check the trace buffer */ ret = trace_test_buffer(tr, &count); trace->reset(tr); + tracing_start(); if (!ret && !count) { printk(KERN_CONT ".. no entries found .."); @@ -232,13 +233,13 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) udelay(100); local_irq_enable(); /* stop the tracing. */ - tr->ctrl = 0; - trace->ctrl_update(tr); + tracing_stop(); /* check both trace buffers */ ret = trace_test_buffer(tr, NULL); if (!ret) ret = trace_test_buffer(&max_tr, &count); trace->reset(tr); + tracing_start(); if (!ret && !count) { printk(KERN_CONT ".. no entries found .."); @@ -269,13 +270,13 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) udelay(100); preempt_enable(); /* stop the tracing. 
*/ - tr->ctrl = 0; - trace->ctrl_update(tr); + tracing_stop(); /* check both trace buffers */ ret = trace_test_buffer(tr, NULL); if (!ret) ret = trace_test_buffer(&max_tr, &count); trace->reset(tr); + tracing_start(); if (!ret && !count) { printk(KERN_CONT ".. no entries found .."); @@ -312,27 +313,30 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * local_irq_enable(); /* stop the tracing. */ - tr->ctrl = 0; - trace->ctrl_update(tr); + tracing_stop(); /* check both trace buffers */ ret = trace_test_buffer(tr, NULL); - if (ret) + if (ret) { + tracing_start(); goto out; + } ret = trace_test_buffer(&max_tr, &count); - if (ret) + if (ret) { + tracing_start(); goto out; + } if (!ret && !count) { printk(KERN_CONT ".. no entries found .."); ret = -1; + tracing_start(); goto out; } /* do the test by disabling interrupts first this time */ tracing_max_latency = 0; - tr->ctrl = 1; - trace->ctrl_update(tr); + tracing_start(); preempt_disable(); local_irq_disable(); udelay(100); @@ -341,8 +345,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * local_irq_enable(); /* stop the tracing. */ - tr->ctrl = 0; - trace->ctrl_update(tr); + tracing_stop(); /* check both trace buffers */ ret = trace_test_buffer(tr, NULL); if (ret) @@ -358,6 +361,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * out: trace->reset(tr); + tracing_start(); tracing_max_latency = save_max; return ret; @@ -448,8 +452,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) msleep(100); /* stop the tracing. */ - tr->ctrl = 0; - trace->ctrl_update(tr); + tracing_stop(); /* check both trace buffers */ ret = trace_test_buffer(tr, NULL); if (!ret) @@ -457,6 +460,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) trace->reset(tr); + tracing_start(); tracing_max_latency = save_max; @@ -485,11 +489,11 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr /* Sleep for a 1/10 of a second */ msleep(100); /* stop the tracing. */ - tr->ctrl = 0; - trace->ctrl_update(tr); + tracing_stop(); /* check the trace buffer */ ret = trace_test_buffer(tr, &count); trace->reset(tr); + tracing_start(); if (!ret && !count) { printk(KERN_CONT ".. no entries found .."); @@ -513,11 +517,11 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr) /* Sleep for a 1/10 of a second */ msleep(100); /* stop the tracing. 
*/ - tr->ctrl = 0; - trace->ctrl_update(tr); + tracing_stop(); /* check the trace buffer */ ret = trace_test_buffer(tr, &count); trace->reset(tr); + tracing_start(); return ret; } diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index 9587d3bcba55..8097430edff9 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -275,21 +275,11 @@ static void stack_trace_reset(struct trace_array *tr) stop_stack_trace(tr); } -static void stack_trace_ctrl_update(struct trace_array *tr) -{ - /* When starting a new trace, reset the buffers */ - if (tr->ctrl) - start_stack_trace(tr); - else - stop_stack_trace(tr); -} - static struct tracer stack_trace __read_mostly = { .name = "sysprof", .init = stack_trace_init, .reset = stack_trace_reset, - .ctrl_update = stack_trace_ctrl_update, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_sysprof, #endif -- cgit v1.2.3 From c76f06945be50564f925799ddfb6235ee4c26aa0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 7 Nov 2008 22:36:02 -0500 Subject: ftrace: remove trace array ctrl Impact: remove obsolete variable in trace_array structure With the new start / stop method of ftrace, the ctrl variable in the trace_array structure is now obsolete. Remove it. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 20 +++++--------------- kernel/trace/trace.h | 1 - kernel/trace/trace_functions.c | 6 ++---- kernel/trace/trace_irqsoff.c | 7 ++----- kernel/trace/trace_mmiotrace.c | 11 +++++------ kernel/trace/trace_nop.c | 6 ++---- kernel/trace/trace_sched_switch.c | 6 ++---- kernel/trace/trace_sched_wakeup.c | 12 ++++-------- kernel/trace/trace_selftest.c | 8 -------- kernel/trace/trace_sysprof.c | 6 ++---- 10 files changed, 24 insertions(+), 59 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9e83188172a1..58435415b366 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -537,7 +537,6 @@ int register_tracer(struct tracer *type) if (type->selftest) { struct tracer *saved_tracer = current_trace; struct trace_array *tr = &global_trace; - int saved_ctrl = tr->ctrl; int i; /* * Run a selftest on this tracer. 
@@ -550,13 +549,11 @@ int register_tracer(struct tracer *type) tracing_reset(tr, i); } current_trace = type; - tr->ctrl = 0; /* the test is responsible for initializing and enabling */ pr_info("Testing tracer %s: ", type->name); ret = type->selftest(type, tr); /* the test is responsible for resetting too */ current_trace = saved_tracer; - tr->ctrl = saved_ctrl; if (ret) { printk(KERN_CONT "FAILED!\n"); goto out; @@ -966,7 +963,7 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) int cpu; int pc; - if (tracing_disabled || !tr->ctrl) + if (tracing_disabled) return; pc = preempt_count(); @@ -2820,7 +2817,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, unsigned long val; char buf[64]; int ret; - struct trace_array *tr = filp->private_data; if (cnt >= sizeof(buf)) return -EINVAL; @@ -2840,12 +2836,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, mutex_lock(&trace_types_lock); - if (tr->ctrl) { - cnt = -EBUSY; - pr_info("ftrace: please disable tracing" - " before modifying buffer size\n"); - goto out; - } + tracing_stop(); if (val != global_trace.entries) { ret = ring_buffer_resize(global_trace.buffer, val); @@ -2878,6 +2869,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, if (tracing_disabled) cnt = -ENOMEM; out: + tracing_start(); max_tr.entries = global_trace.entries; mutex_unlock(&trace_types_lock); @@ -2900,9 +2892,8 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, { char *buf; char *end; - struct trace_array *tr = &global_trace; - if (!tr->ctrl || tracing_disabled) + if (tracing_disabled) return -EINVAL; if (cnt > TRACE_BUF_SIZE) @@ -3131,7 +3122,7 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) unsigned long flags, irq_flags; int cpu, len = 0, size, pc; - if (!tr->ctrl || tracing_disabled) + if (tracing_disabled) return 0; pc = preempt_count(); @@ -3365,7 +3356,6 @@ __init static int tracer_alloc_buffers(void) #endif /* All seems OK, enable tracing */ - global_trace.ctrl = 1; tracing_disabled = 0; atomic_notifier_chain_register(&panic_notifier_list, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index e481edaae1c4..cfda9d219e66 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -172,7 +172,6 @@ struct trace_iterator; struct trace_array { struct ring_buffer *buffer; unsigned long entries; - long ctrl; int cpu; cycle_t time_start; struct task_struct *waiter; diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index e980b872bef5..8693b7a0a5b2 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -44,14 +44,12 @@ static void stop_function_trace(struct trace_array *tr) static void function_trace_init(struct trace_array *tr) { - if (tr->ctrl) - start_function_trace(tr); + start_function_trace(tr); } static void function_trace_reset(struct trace_array *tr) { - if (tr->ctrl) - stop_function_trace(tr); + stop_function_trace(tr); } static void function_trace_start(struct trace_array *tr) diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index ffdf592a54e3..d919d4eaa7cc 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -383,15 +383,12 @@ static void __irqsoff_tracer_init(struct trace_array *tr) irqsoff_trace = tr; /* make sure that the tracer is visible */ smp_wmb(); - - if (tr->ctrl) - start_irqsoff_tracer(tr); + start_irqsoff_tracer(tr); } static void irqsoff_tracer_reset(struct trace_array *tr) { - if (tr->ctrl) - 
stop_irqsoff_tracer(tr); + stop_irqsoff_tracer(tr); } static void irqsoff_tracer_start(struct trace_array *tr) diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index fa9354e78b57..51bcf370215e 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -34,17 +34,16 @@ static void mmio_trace_init(struct trace_array *tr) { pr_debug("in %s\n", __func__); mmio_trace_array = tr; - if (tr->ctrl) { - mmio_reset_data(tr); - enable_mmiotrace(); - } + + mmio_reset_data(tr); + enable_mmiotrace(); } static void mmio_trace_reset(struct trace_array *tr) { pr_debug("in %s\n", __func__); - if (tr->ctrl) - disable_mmiotrace(); + + disable_mmiotrace(); mmio_reset_data(tr); mmio_trace_array = NULL; } diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index e3c5fbfcdd36..2ef1d227e7d8 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c @@ -32,14 +32,12 @@ static void nop_trace_init(struct trace_array *tr) for_each_online_cpu(cpu) tracing_reset(tr, cpu); - if (tr->ctrl) - start_nop_trace(tr); + start_nop_trace(tr); } static void nop_trace_reset(struct trace_array *tr) { - if (tr->ctrl) - stop_nop_trace(tr); + stop_nop_trace(tr); } struct tracer nop_trace __read_mostly = diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 7b73fd11e4aa..be35bdfe2e38 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -209,14 +209,12 @@ static void stop_sched_trace(struct trace_array *tr) static void sched_switch_trace_init(struct trace_array *tr) { ctx_trace = tr; - - if (tr->ctrl) - start_sched_trace(tr); + start_sched_trace(tr); } static void sched_switch_trace_reset(struct trace_array *tr) { - if (tr->ctrl && sched_ref) + if (sched_ref) stop_sched_trace(tr); } diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 23e54d4e4d92..983f2b1478c9 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -334,18 +334,14 @@ static void stop_wakeup_tracer(struct trace_array *tr) static void wakeup_tracer_init(struct trace_array *tr) { wakeup_trace = tr; - - if (tr->ctrl) - start_wakeup_tracer(tr); + start_wakeup_tracer(tr); } static void wakeup_tracer_reset(struct trace_array *tr) { - if (tr->ctrl) { - stop_wakeup_tracer(tr); - /* make sure we put back any tasks we are tracing */ - wakeup_reset(tr); - } + stop_wakeup_tracer(tr); + /* make sure we put back any tasks we are tracing */ + wakeup_reset(tr); } static void wakeup_tracer_start(struct trace_array *tr) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 746934340474..ea4e5d3b15df 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -110,7 +110,6 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, ftrace_set_filter(func_name, strlen(func_name), 1); /* enable tracing */ - tr->ctrl = 1; trace->init(tr); /* Sleep for a 1/10 of a second */ @@ -181,7 +180,6 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) ftrace_enabled = 1; tracer_enabled = 1; - tr->ctrl = 1; trace->init(tr); /* Sleep for a 1/10 of a second */ msleep(100); @@ -224,7 +222,6 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) int ret; /* start the tracing */ - tr->ctrl = 1; trace->init(tr); /* reset the max latency */ tracing_max_latency = 0; @@ -261,7 +258,6 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) int ret; /* start the 
tracing */ - tr->ctrl = 1; trace->init(tr); /* reset the max latency */ tracing_max_latency = 0; @@ -298,7 +294,6 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * int ret; /* start the tracing */ - tr->ctrl = 1; trace->init(tr); /* reset the max latency */ @@ -427,7 +422,6 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) wait_for_completion(&isrt); /* start the tracing */ - tr->ctrl = 1; trace->init(tr); /* reset the max latency */ tracing_max_latency = 0; @@ -484,7 +478,6 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr int ret; /* start the tracing */ - tr->ctrl = 1; trace->init(tr); /* Sleep for a 1/10 of a second */ msleep(100); @@ -512,7 +505,6 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr) int ret; /* start the tracing */ - tr->ctrl = 1; trace->init(tr); /* Sleep for a 1/10 of a second */ msleep(100); diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index 8097430edff9..05f753422aea 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -265,14 +265,12 @@ static void stack_trace_init(struct trace_array *tr) { sysprof_trace = tr; - if (tr->ctrl) - start_stack_trace(tr); + start_stack_trace(tr); } static void stack_trace_reset(struct trace_array *tr) { - if (tr->ctrl) - stop_stack_trace(tr); + stop_stack_trace(tr); } static struct tracer stack_trace __read_mostly = -- cgit v1.2.3 From 769c48eb2530c5c1a393e2c82063f4f050571d24 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 7 Nov 2008 22:36:02 -0500 Subject: ftrace: force pass of preemptoff selftest Impact: preemptoff not tested in selftest Due to the BKL not being preemptable anymore, the selftest of the preemptoff code can not be tested. It requires that it is called with preemption enabled, but since the BKL is held, that is no longer the case. This patch simply skips those tests if it detects that the context is not preemptable. The following will now show up in the tests: Testing tracer preemptoff: can not test ... force PASSED Testing tracer preemptirqsoff: can not test ... force PASSED When the BKL is removed, or it becomes preemptable once again, then the tests will be performed. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_selftest.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index ea4e5d3b15df..0728a105dcc1 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -257,6 +257,19 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) unsigned long count; int ret; + /* + * Now that the big kernel lock is no longer preemptable, + * and this is called with the BKL held, it will always + * fail. If preemption is already disabled, simply + * pass the test. When the BKL is removed, or becomes + * preemptible again, we will once again test this, + * so keep it in. + */ + if (preempt_count()) { + printk(KERN_CONT "can not test ... force "); + return 0; + } + /* start the tracing */ trace->init(tr); /* reset the max latency */ @@ -293,6 +306,19 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * unsigned long count; int ret; + /* + * Now that the big kernel lock is no longer preemptable, + * and this is called with the BKL held, it will always + * fail. If preemption is already disabled, simply + * pass the test. 
When the BKL is removed, or becomes + * preemptible again, we will once again test this, + * so keep it in. + */ + if (preempt_count()) { + printk(KERN_CONT "can not test ... force "); + return 0; + } + /* start the tracing */ trace->init(tr); -- cgit v1.2.3 From a309720c876d7ad2e224bfd1982c92ae4364c82e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 7 Nov 2008 22:36:02 -0500 Subject: ftrace: display start of CPU buffer in trace output Impact: change in trace output Because the trace buffers are per cpu ring buffers, the start of the trace can be confusing. If one CPU is very active at the end of the trace, its history will not go as far back as the other CPU traces. This means that output for a particular CPU may not appear for the first part of a trace. To help annotate what is happening, and to prevent any more confusion, this patch adds a line that annotates the start of a CPU buffer output. For example: automount-3495 [001] 184.596443: dnotify_parent <-vfs_write [...] automount-3495 [001] 184.596449: dput <-path_put automount-3496 [002] 184.596450: down_read_trylock <-do_page_fault [...] sshd-3497 [001] 184.597069: up_read <-do_page_fault -0 [000] 184.597074: __exit_idle <-exit_idle [...] automount-3496 [002] 184.597257: filemap_fault <-__do_fault -0 [003] 184.597261: exit_idle <-smp_apic_timer_interrupt Note, parsers of a trace output should always ignore any lines that start with a '#'. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 19 +++++++++++++++++++ kernel/trace/trace.h | 2 ++ 2 files changed, 21 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 58435415b366..f147f198b9a6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1478,6 +1478,17 @@ void trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter) trace_seq_putc(s, '\n'); } +static void test_cpu_buff_start(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + + if (cpu_isset(iter->cpu, iter->started)) + return; + + cpu_set(iter->cpu, iter->started); + trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu); +} + static enum print_line_t print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) { @@ -1497,6 +1508,8 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) if (entry->type == TRACE_CONT) return TRACE_TYPE_HANDLED; + test_cpu_buff_start(iter); + next_entry = find_next_entry(iter, NULL, &next_ts); if (!next_entry) next_ts = iter->ts; @@ -1612,6 +1625,8 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) if (entry->type == TRACE_CONT) return TRACE_TYPE_HANDLED; + test_cpu_buff_start(iter); + comm = trace_find_cmdline(iter->ent->pid); t = ns2usecs(iter->ts); @@ -2631,6 +2646,10 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) return -ENOMEM; mutex_lock(&trace_types_lock); + + /* trace pipe does not show start of buffer */ + cpus_setall(iter->started); + iter->tr = &global_trace; iter->trace = current_trace; filp->private_data = iter; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index cfda9d219e66..978145088fb8 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -277,6 +277,8 @@ struct trace_iterator { unsigned long iter_flags; loff_t pos; long idx; + + cpumask_t started; }; int tracing_is_enabled(void); -- cgit v1.2.3 From ae1e9130bfb9ad55eb97ec3fb17a122b7a118f98 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Nov 2008 09:05:16 +0100 Subject: sched: 
rename SCHED_NO_NO_OMIT_FRAME_POINTER => SCHED_OMIT_FRAME_POINTER Impact: cleanup, change .config option name We had this ugly config name for a long time for hysteric raisons. Rename it to a saner name. We still cannot get rid of it completely, until /proc//stack usage replaces WCHAN usage for good. We'll be able to do that in the v2.6.29/v2.6.30 timeframe. Signed-off-by: Ingo Molnar --- arch/ia64/Kconfig | 2 +- arch/m32r/Kconfig | 2 +- arch/mips/Kconfig | 2 +- arch/powerpc/Kconfig | 2 +- arch/x86/Kconfig | 2 +- include/asm-m32r/system.h | 2 +- kernel/Makefile | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 27eec71429b0..59d12788b60c 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -99,7 +99,7 @@ config GENERIC_IOMAP bool default y -config SCHED_NO_NO_OMIT_FRAME_POINTER +config SCHED_OMIT_FRAME_POINTER bool default y diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig index dbaed4a63815..29047d5c259a 100644 --- a/arch/m32r/Kconfig +++ b/arch/m32r/Kconfig @@ -273,7 +273,7 @@ config GENERIC_CALIBRATE_DELAY bool default y -config SCHED_NO_NO_OMIT_FRAME_POINTER +config SCHED_OMIT_FRAME_POINTER bool default y diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index f4af967a6b30..a5255e7c79e0 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -653,7 +653,7 @@ config GENERIC_CMOS_UPDATE bool default y -config SCHED_NO_NO_OMIT_FRAME_POINTER +config SCHED_OMIT_FRAME_POINTER bool default y diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 525c13a4de93..adb23ea1c1ef 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -141,7 +141,7 @@ config GENERIC_NVRAM bool default y if PPC32 -config SCHED_NO_NO_OMIT_FRAME_POINTER +config SCHED_OMIT_FRAME_POINTER bool default y diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1d5550d19b66..74db682ec1ce 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -364,7 +364,7 @@ config X86_RDC321X as R-8610-(G). If you don't have one of these chips, you should say N here. -config SCHED_NO_NO_OMIT_FRAME_POINTER +config SCHED_OMIT_FRAME_POINTER def_bool y prompt "Single-depth WCHAN output" depends on X86 diff --git a/include/asm-m32r/system.h b/include/asm-m32r/system.h index 70a57c8c002b..c980f5ba8de7 100644 --- a/include/asm-m32r/system.h +++ b/include/asm-m32r/system.h @@ -23,7 +23,7 @@ */ #if defined(CONFIG_FRAME_POINTER) || \ - !defined(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER) + !defined(CONFIG_SCHED_OMIT_FRAME_POINTER) #define M32R_PUSH_FP " push fp\n" #define M32R_POP_FP " pop fp\n" #else diff --git a/kernel/Makefile b/kernel/Makefile index e1af03972148..46e67a398495 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -91,7 +91,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o -ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) +ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is # needed for x86 only. Why this used to be enabled for all architectures is beyond # me. I suspect most platforms don't need this, but until we know that for sure -- cgit v1.2.3 From 5aa1ba6a6c710e747838a22d798ac97a8b248745 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 10 Nov 2008 23:07:30 -0500 Subject: ftrace: prevent ftrace_special from recursion Impact: stop ftrace_special from recursion The ftrace_special is used to help debug areas of the kernel. 
Because of this, if it is put in certain locations, the fact that it allows recursion can become a problem if the kernel developer using it does not realize that. This patch changes ftrace_special to not allow recursion into itself to make it more robust. It also switches from disabling preemption to disabling interrupts, to prevent any loss of trace entries. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0c22fe2d43a7..216bbe7547a4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -960,6 +960,7 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { struct trace_array *tr = &global_trace; struct trace_array_cpu *data; + unsigned long flags; int cpu; int pc; @@ -967,14 +968,15 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) return; pc = preempt_count(); - preempt_disable_notrace(); + local_irq_save(flags); cpu = raw_smp_processor_id(); data = tr->data[cpu]; - if (likely(!atomic_read(&data->disabled))) + if (likely(atomic_inc_return(&data->disabled) == 1)) ftrace_trace_special(tr, data, arg1, arg2, arg3, pc); - preempt_enable_notrace(); + atomic_dec(&data->disabled); + local_irq_restore(flags); } #ifdef CONFIG_FUNCTION_TRACER -- cgit v1.2.3 From f536aafc5a2e6f0c8f1577a155e6f93db5e469f0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 10 Nov 2008 23:07:30 -0500 Subject: ring-buffer: replace most bug ons with warn on and disable buffer This patch replaces most of the BUG_ONs in the ring_buffer code with RB_WARN_ON variants. It adds some more variants as needed for the replacement. This lets the buffer die nicely and still warn the user. One BUG_ON remains in the code, and that is because it detects a bad pointer passed in by the calling function, and not a bug by the ring buffer code itself.
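A typical call-site conversion (both lines are excerpted from the hunks below):

    /* before: a corrupted page list takes the whole machine down */
    BUG_ON(list_empty(&cpu_buffer->pages));

    /* after: warn once, disable further recording, and back out */
    RB_WARN_ON_RET(cpu_buffer, list_empty(&cpu_buffer->pages));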
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 65 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 49 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index ee9b93d318b9..a6b8f9d7ac96 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -188,6 +188,7 @@ struct ring_buffer_iter { u64 read_stamp; }; +/* buffer may be either ring_buffer or ring_buffer_per_cpu */ #define RB_WARN_ON(buffer, cond) \ do { \ if (unlikely(cond)) { \ @@ -197,6 +198,15 @@ struct ring_buffer_iter { } while (0) #define RB_WARN_ON_RET(buffer, cond) \ + do { \ + if (unlikely(cond)) { \ + atomic_inc(&buffer->record_disabled); \ + WARN_ON(1); \ + return; \ + } \ + } while (0) + +#define RB_WARN_ON_RET_INT(buffer, cond) \ do { \ if (unlikely(cond)) { \ atomic_inc(&buffer->record_disabled); \ @@ -205,6 +215,15 @@ struct ring_buffer_iter { } \ } while (0) +#define RB_WARN_ON_RET_NULL(buffer, cond) \ + do { \ + if (unlikely(cond)) { \ + atomic_inc(&buffer->record_disabled); \ + WARN_ON(1); \ + return NULL; \ + } \ + } while (0) + #define RB_WARN_ON_ONCE(buffer, cond) \ do { \ static int once; \ @@ -215,6 +234,17 @@ struct ring_buffer_iter { } \ } while (0) +/* buffer must be ring_buffer not per_cpu */ +#define RB_WARN_ON_UNLOCK(buffer, cond) \ + do { \ + if (unlikely(cond)) { \ + mutex_unlock(&buffer->mutex); \ + atomic_inc(&buffer->record_disabled); \ + WARN_ON(1); \ + return -1; \ + } \ + } while (0) + /** * check_pages - integrity check of buffer pages * @cpu_buffer: CPU buffer with pages to test @@ -227,13 +257,13 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) struct list_head *head = &cpu_buffer->pages; struct buffer_page *page, *tmp; - RB_WARN_ON_RET(cpu_buffer, head->next->prev != head); - RB_WARN_ON_RET(cpu_buffer, head->prev->next != head); + RB_WARN_ON_RET_INT(cpu_buffer, head->next->prev != head); + RB_WARN_ON_RET_INT(cpu_buffer, head->prev->next != head); list_for_each_entry_safe(page, tmp, head, list) { - RB_WARN_ON_RET(cpu_buffer, + RB_WARN_ON_RET_INT(cpu_buffer, page->list.next->prev != &page->list); - RB_WARN_ON_RET(cpu_buffer, + RB_WARN_ON_RET_INT(cpu_buffer, page->list.prev->next != &page->list); } @@ -440,13 +470,13 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) synchronize_sched(); for (i = 0; i < nr_pages; i++) { - BUG_ON(list_empty(&cpu_buffer->pages)); + RB_WARN_ON_RET(cpu_buffer, list_empty(&cpu_buffer->pages)); p = cpu_buffer->pages.next; page = list_entry(p, struct buffer_page, list); list_del_init(&page->list); free_buffer_page(page); } - BUG_ON(list_empty(&cpu_buffer->pages)); + RB_WARN_ON_RET(cpu_buffer, list_empty(&cpu_buffer->pages)); rb_reset_cpu(cpu_buffer); @@ -468,7 +498,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, synchronize_sched(); for (i = 0; i < nr_pages; i++) { - BUG_ON(list_empty(pages)); + RB_WARN_ON_RET(cpu_buffer, list_empty(pages)); p = pages->next; page = list_entry(p, struct buffer_page, list); list_del_init(&page->list); @@ -523,7 +553,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) if (size < buffer_size) { /* easy case, just free pages */ - BUG_ON(nr_pages >= buffer->pages); + RB_WARN_ON_UNLOCK(buffer, nr_pages >= buffer->pages); rm_pages = buffer->pages - nr_pages; @@ -542,7 +572,8 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) * add these pages to the cpu_buffers. 
Otherwise we just free * them all and return -ENOMEM; */ - BUG_ON(nr_pages <= buffer->pages); + RB_WARN_ON_UNLOCK(buffer, nr_pages <= buffer->pages); + new_pages = nr_pages - buffer->pages; for_each_buffer_cpu(buffer, cpu) { @@ -565,7 +596,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) rb_insert_pages(cpu_buffer, &pages, new_pages); } - BUG_ON(!list_empty(&pages)); + RB_WARN_ON_UNLOCK(buffer, !list_empty(&pages)); out: buffer->pages = nr_pages; @@ -653,7 +684,7 @@ static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer) head += rb_event_length(event)) { event = __rb_page_index(cpu_buffer->head_page, head); - BUG_ON(rb_null_event(event)); + RB_WARN_ON_RET(cpu_buffer, rb_null_event(event)); /* Only count data entries */ if (event->type != RINGBUF_TYPE_DATA) continue; @@ -940,7 +971,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /* We reserved something on the buffer */ - BUG_ON(write > BUF_PAGE_SIZE); + RB_WARN_ON_RET_NULL(cpu_buffer, write > BUF_PAGE_SIZE); event = __rb_page_index(tail_page, tail); rb_update_event(event, type, length); @@ -1621,7 +1652,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) reader = rb_get_reader_page(cpu_buffer); /* This function should not be called when buffer is empty */ - BUG_ON(!reader); + RB_WARN_ON_RET(cpu_buffer, !reader); event = rb_reader_event(cpu_buffer); @@ -1648,7 +1679,8 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) * Check if we are at the end of the buffer. */ if (iter->head >= rb_page_size(iter->head_page)) { - BUG_ON(iter->head_page == cpu_buffer->commit_page); + RB_WARN_ON_RET(buffer, + iter->head_page == cpu_buffer->commit_page); rb_inc_iter(iter); return; } @@ -1661,8 +1693,9 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) * This should not be called to advance the header if we are * at the tail of the buffer. */ - BUG_ON((iter->head_page == cpu_buffer->commit_page) && - (iter->head + length > rb_commit_index(cpu_buffer))); + RB_WARN_ON_RET(cpu_buffer, + (iter->head_page == cpu_buffer->commit_page) && + (iter->head + length > rb_commit_index(cpu_buffer))); rb_update_iter_read_stamp(iter, event); -- cgit v1.2.3 From caf4b323b02a16c92fba449952ac6515ddc76d7a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 11 Nov 2008 07:03:45 +0100 Subject: tracing, x86: add low level support for ftrace return tracing Impact: add infrastructure for function-return tracing Add low level support for ftrace return tracing. This plug-in stores return addresses on the thread_info structure of the current task. The index of the current return address is initialized when the task is the first one (init) and when a process forks (the child). It is not needed when a task does a sys_execve because after this syscall, it still needs to return on the kernel functions it called. Note that the code of return_to_handler has been suggested by Steven Rostedt as almost all of the ideas of improvements in this V3. For purpose of security, arch/x86/kernel/process_32.c is not traced because __switch_to() changes the current task during its execution. That could cause inconsistency in the stored return address of this function even if I didn't have any crash after testing with tracing on this function enabled. 
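In outline, the hook works as follows (conceptual sketch of the control flow implemented by the patch below, not new code):

    /*
     * foo() entry:
     *   mcount -> prepare_ftrace_return(&parent_ret, foo)
     *     push_return_trace(original ret addr, calltime, foo)
     *     parent_ret = return_to_handler
     *
     * foo() "returns" into return_to_handler:
     *   ftrace_return_to_handler()
     *     pop_return_trace() fills a struct ftrace_retfunc
     *     hands the record to the registered tracer
     *     returns the original address, which is jumped to
     */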
Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + arch/x86/include/asm/ftrace.h | 26 ++++++ arch/x86/include/asm/thread_info.h | 24 +++++ arch/x86/kernel/Makefile | 6 ++ arch/x86/kernel/entry_32.S | 33 +++++++ arch/x86/kernel/ftrace.c | 181 +++++++++++++++++++++++++++++++++++-- include/linux/ftrace.h | 20 ++++ include/linux/ftrace_irq.h | 2 +- include/linux/sched.h | 11 +++ kernel/Makefile | 4 + 10 files changed, 300 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 27b8a3a39911..ca91e50bdb10 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -11,6 +11,7 @@ config 64BIT config X86_32 def_bool !64BIT + select HAVE_FUNCTION_RET_TRACER config X86_64 def_bool 64BIT diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index f8173ed1c970..9b6a1fa19e70 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -20,4 +20,30 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) #endif /* __ASSEMBLY__ */ #endif /* CONFIG_FUNCTION_TRACER */ +#ifdef CONFIG_FUNCTION_RET_TRACER +#define FTRACE_RET_STACK_SIZE 20 + +#ifndef __ASSEMBLY__ + +/* + * Stack of return addresses for functions + * of a thread. + * Used in struct thread_info + */ +struct ftrace_ret_stack { + unsigned long ret; + unsigned long func; + unsigned long long calltime; +}; + +/* + * Primary handler of a function return. + * It relays on ftrace_return_to_handler. + * Defined in entry32.S + */ +extern void return_to_handler(void); + +#endif /* __ASSEMBLY__ */ +#endif /* CONFIG_FUNCTION_RET_TRACER */ + #endif /* _ASM_X86_FTRACE_H */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index e44d379faad2..a71158369fd4 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -20,6 +20,7 @@ struct task_struct; struct exec_domain; #include +#include struct thread_info { struct task_struct *task; /* main task structure */ @@ -38,8 +39,30 @@ struct thread_info { */ __u8 supervisor_stack[0]; #endif + +#ifdef CONFIG_FUNCTION_RET_TRACER + /* Index of current stored adress in ret_stack */ + int curr_ret_stack; + /* Stack of return addresses for return function tracing */ + struct ftrace_ret_stack ret_stack[FTRACE_RET_STACK_SIZE]; +#endif }; +#ifdef CONFIG_FUNCTION_RET_TRACER +#define INIT_THREAD_INFO(tsk) \ +{ \ + .task = &tsk, \ + .exec_domain = &default_exec_domain, \ + .flags = 0, \ + .cpu = 0, \ + .preempt_count = 1, \ + .addr_limit = KERNEL_DS, \ + .restart_block = { \ + .fn = do_no_restart_syscall, \ + }, \ + .curr_ret_stack = -1,\ +} +#else #define INIT_THREAD_INFO(tsk) \ { \ .task = &tsk, \ @@ -52,6 +75,7 @@ struct thread_info { .fn = do_no_restart_syscall, \ }, \ } +#endif #define init_thread_info (init_thread_union.thread_info) #define init_stack (init_thread_union.stack) diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index e489ff9cb3e2..1d8ed95da846 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -14,6 +14,11 @@ CFLAGS_REMOVE_paravirt-spinlocks.o = -pg CFLAGS_REMOVE_ftrace.o = -pg endif +ifdef CONFIG_FUNCTION_RET_TRACER +# Don't trace __switch_to() but let it for function tracer +CFLAGS_REMOVE_process_32.o = -pg +endif + # # vsyscalls (which work on the user stack) should have # no stack-protector checks: @@ -65,6 +70,7 @@ obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_X86_REBOOTFIXUPS) += 
reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o +obj-$(CONFIG_FUNCTION_RET_TRACER) += ftrace.o obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 9134de814c97..9a0ac85946db 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1188,6 +1188,10 @@ ENTRY(mcount) cmpl $ftrace_stub, ftrace_trace_function jnz trace +#ifdef CONFIG_FUNCTION_RET_TRACER + cmpl $ftrace_stub, ftrace_function_return + jnz trace_return +#endif .globl ftrace_stub ftrace_stub: ret @@ -1206,8 +1210,37 @@ trace: popl %edx popl %ecx popl %eax + jmp ftrace_stub +#ifdef CONFIG_FUNCTION_RET_TRACER +trace_return: + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + pushl %eax + lea 0x4(%ebp), %eax + pushl %eax + call prepare_ftrace_return + addl $8, %esp + popl %edx + popl %ecx + popl %eax jmp ftrace_stub + +.globl return_to_handler +return_to_handler: + pushl $0 + pushl %eax + pushl %ecx + pushl %edx + call ftrace_return_to_handler + movl %eax, 0xc(%esp) + popl %edx + popl %ecx + popl %eax + ret +#endif /* CONFIG_FUNCTION_RET_TRACER */ END(mcount) #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* CONFIG_FUNCTION_TRACER */ diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 69149337f2fe..d68033bba223 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -18,10 +18,173 @@ #include #include +#include #include +#include -static unsigned char ftrace_nop[MCOUNT_INSN_SIZE]; + +#ifdef CONFIG_FUNCTION_RET_TRACER + +/* + * These functions are picked from those used on + * this page for dynamic ftrace. They have been + * simplified to ignore all traces in NMI context. + */ +static atomic_t in_nmi; + +void ftrace_nmi_enter(void) +{ + atomic_inc(&in_nmi); +} + +void ftrace_nmi_exit(void) +{ + atomic_dec(&in_nmi); +} + +/* + * Synchronize accesses to return adresses stack with + * interrupts. + */ +static raw_spinlock_t ret_stack_lock; + +/* Add a function return address to the trace stack on thread info.*/ +static int push_return_trace(unsigned long ret, unsigned long long time, + unsigned long func) +{ + int index; + struct thread_info *ti; + unsigned long flags; + int err = 0; + + raw_local_irq_save(flags); + __raw_spin_lock(&ret_stack_lock); + + ti = current_thread_info(); + /* The return trace stack is full */ + if (ti->curr_ret_stack == FTRACE_RET_STACK_SIZE - 1) { + err = -EBUSY; + goto out; + } + + index = ++ti->curr_ret_stack; + ti->ret_stack[index].ret = ret; + ti->ret_stack[index].func = func; + ti->ret_stack[index].calltime = time; + +out: + __raw_spin_unlock(&ret_stack_lock); + raw_local_irq_restore(flags); + return err; +} + +/* Retrieve a function return address to the trace stack on thread info.*/ +static void pop_return_trace(unsigned long *ret, unsigned long long *time, + unsigned long *func) +{ + struct thread_info *ti; + int index; + unsigned long flags; + + raw_local_irq_save(flags); + __raw_spin_lock(&ret_stack_lock); + + ti = current_thread_info(); + index = ti->curr_ret_stack; + *ret = ti->ret_stack[index].ret; + *func = ti->ret_stack[index].func; + *time = ti->ret_stack[index].calltime; + ti->curr_ret_stack--; + + __raw_spin_unlock(&ret_stack_lock); + raw_local_irq_restore(flags); +} + +/* + * Send the trace to the ring-buffer. + * @return the original return address. 
+ */ +unsigned long ftrace_return_to_handler(void) +{ + struct ftrace_retfunc trace; + pop_return_trace(&trace.ret, &trace.calltime, &trace.func); + trace.rettime = cpu_clock(raw_smp_processor_id()); + ftrace_function_return(&trace); + + return trace.ret; +} + +/* + * Hook the return address and push it in the stack of return addrs + * in current thread info. + */ +asmlinkage +void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) +{ + unsigned long old; + unsigned long long calltime; + int faulted; + unsigned long return_hooker = (unsigned long) + &return_to_handler; + + /* Nmi's are currently unsupported */ + if (atomic_read(&in_nmi)) + return; + + /* + * Protect against fault, even if it shouldn't + * happen. This tool is too much intrusive to + * ignore such a protection. + */ + asm volatile( + "1: movl (%[parent_old]), %[old]\n" + "2: movl %[return_hooker], (%[parent_replaced])\n" + " movl $0, %[faulted]\n" + + ".section .fixup, \"ax\"\n" + "3: movl $1, %[faulted]\n" + ".previous\n" + + ".section __ex_table, \"a\"\n" + " .long 1b, 3b\n" + " .long 2b, 3b\n" + ".previous\n" + + : [parent_replaced] "=rm" (parent), [old] "=r" (old), + [faulted] "=r" (faulted) + : [parent_old] "0" (parent), [return_hooker] "r" (return_hooker) + : "memory" + ); + + if (WARN_ON(faulted)) { + unregister_ftrace_return(); + return; + } + + if (WARN_ON(!__kernel_text_address(old))) { + unregister_ftrace_return(); + *parent = old; + return; + } + + calltime = cpu_clock(raw_smp_processor_id()); + + if (push_return_trace(old, calltime, self_addr) == -EBUSY) + *parent = old; +} + +static int __init init_ftrace_function_return(void) +{ + ret_stack_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + return 0; +} +device_initcall(init_ftrace_function_return); + + +#endif + +#ifdef CONFIG_DYNAMIC_FTRACE union ftrace_code_union { char code[MCOUNT_INSN_SIZE]; @@ -31,17 +194,11 @@ union ftrace_code_union { } __attribute__((packed)); }; - static int ftrace_calc_offset(long ip, long addr) { return (int)(addr - ip); } -unsigned char *ftrace_nop_replace(void) -{ - return ftrace_nop; -} - unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) { static union ftrace_code_union calc; @@ -183,6 +340,15 @@ do_ftrace_mod_code(unsigned long ip, void *new_code) } + + +static unsigned char ftrace_nop[MCOUNT_INSN_SIZE]; + +unsigned char *ftrace_nop_replace(void) +{ + return ftrace_nop; +} + int ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char *new_code) @@ -292,3 +458,4 @@ int __init ftrace_dyn_arch_init(void *data) return 0; } +#endif diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 1f5608c11023..dcbbf72a88b1 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -267,6 +267,26 @@ ftrace_init_module(unsigned long *start, unsigned long *end) { } #endif +/* + * Structure that defines a return function trace. + */ +struct ftrace_retfunc { + unsigned long ret; /* Return address */ + unsigned long func; /* Current function */ + unsigned long long calltime; + unsigned long long rettime; +}; + +#ifdef CONFIG_FUNCTION_RET_TRACER +/* Type of a callback handler of tracing return function */ +typedef void (*trace_function_return_t)(struct ftrace_retfunc *); + +extern void register_ftrace_return(trace_function_return_t func); +/* The current handler in use */ +extern trace_function_return_t ftrace_function_return; +extern void unregister_ftrace_return(void); +#endif + /* * Structure which defines the trace of an initcall. 
* You don't have to fill the func field since it is diff --git a/include/linux/ftrace_irq.h b/include/linux/ftrace_irq.h index b1299d6729f2..0b4df55d7a74 100644 --- a/include/linux/ftrace_irq.h +++ b/include/linux/ftrace_irq.h @@ -2,7 +2,7 @@ #define _LINUX_FTRACE_IRQ_H -#ifdef CONFIG_DYNAMIC_FTRACE +#if defined(CONFIG_DYNAMIC_FTRACE) || defined(CONFIG_FUNCTION_RET_TRACER) extern void ftrace_nmi_enter(void); extern void ftrace_nmi_exit(void); #else diff --git a/include/linux/sched.h b/include/linux/sched.h index 295b7c756ca6..df77abe860c9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2005,6 +2005,17 @@ static inline void setup_thread_stack(struct task_struct *p, struct task_struct { *task_thread_info(p) = *task_thread_info(org); task_thread_info(p)->task = p; + +#ifdef CONFIG_FUNCTION_RET_TRACER + /* + * When fork() creates a child process, this function is called. + * But the child task may not inherit the return addresses traced + * by the return function tracer because it will directly execute + * in userspace and will not return to the kernel functions its parent + * used. + */ + task_thread_info(p)->curr_ret_stack = -1; +#endif } static inline unsigned long *end_of_stack(struct task_struct *p) diff --git a/kernel/Makefile b/kernel/Makefile index 9a3ec66a9d84..af3be57acbbb 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -23,6 +23,10 @@ CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_sched_clock.o = -pg CFLAGS_REMOVE_sched.o = -mno-spe -pg endif +ifdef CONFIG_FUNCTION_RET_TRACER +CFLAGS_REMOVE_extable.o = -pg # For __kernel_text_address() +CFLAGS_REMOVE_module.o = -pg # For __module_text_address() +endif obj-$(CONFIG_FREEZER) += freezer.o obj-$(CONFIG_PROFILING) += profile.o -- cgit v1.2.3 From 15e6cb3673ea6277999642802406a764b49391b0 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 11 Nov 2008 07:14:25 +0100 Subject: tracing: add a tracer to catch execution time of kernel functions Impact: add new tracing plugin which can trace full (entry+exit) function calls This tracer uses the low level function return ftrace plugin to measure the execution time of the kernel functions. The first field is the caller of the function, the second is the measured function, and the last one is the execution time in nanoseconds. - v3: - HAVE_FUNCTION_RET_TRACER has been added. Each arch that supports ftrace return should enable it. - ftrace_return_stub becomes ftrace_stub. - CONFIG_FUNCTION_RET_TRACER now depends on CONFIG_FUNCTION_TRACER - Return-trace printing can be used by other tracers in trace.c - Adapt to the new tracing API (no more ctrl_update callback) - Correct the check of "disabled" during insertion. - Minor changes...
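The consumer side of the hook is small. A hedged sketch of a client of this interface, using only the names the patch introduces (struct ftrace_retfunc, register_ftrace_return(), unregister_ftrace_return()); the module wrapping is illustrative:

    #include <linux/ftrace.h>
    #include <linux/kernel.h>
    #include <linux/module.h>

    /* Callback invoked once per traced function exit. */
    static void ret_demo_handler(struct ftrace_retfunc *trace)
    {
        /* calltime/rettime come from cpu_clock(), in nanoseconds */
        printk(KERN_INFO "%pF -> %pF (%llu ns)\n",
               (void *)trace->ret, (void *)trace->func,
               trace->rettime - trace->calltime);
    }

    static int __init ret_demo_init(void)
    {
        register_ftrace_return(ret_demo_handler);
        return 0;
    }

    static void __exit ret_demo_exit(void)
    {
        unregister_ftrace_return();
    }

    module_init(ret_demo_init);
    module_exit(ret_demo_exit);
    MODULE_LICENSE("GPL");

The "return" tracer added below renders the same information via print_return_function() as caller -> function (delta ns).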
Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 14 ++++++ kernel/trace/Makefile | 1 + kernel/trace/ftrace.c | 16 +++++++ kernel/trace/trace.c | 65 +++++++++++++++++++++++---- kernel/trace/trace.h | 35 +++++++++++++++ kernel/trace/trace_functions_return.c | 82 +++++++++++++++++++++++++++++++++++ 6 files changed, 205 insertions(+), 8 deletions(-) create mode 100644 kernel/trace/trace_functions_return.c (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index fc4febc3334a..d986216c8327 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -9,6 +9,9 @@ config NOP_TRACER config HAVE_FUNCTION_TRACER bool +config HAVE_FUNCTION_RET_TRACER + bool + config HAVE_FUNCTION_TRACE_MCOUNT_TEST bool help @@ -54,6 +57,17 @@ config FUNCTION_TRACER (the bootup default), then the overhead of the instructions is very small and not measurable even in micro-benchmarks. +config FUNCTION_RET_TRACER + bool "Kernel Function return Tracer" + depends on !DYNAMIC_FTRACE + depends on HAVE_FUNCTION_RET_TRACER + depends on FUNCTION_TRACER + help + Enable the kernel to trace a function at its return. + It's first purpose is to trace the duration of functions. + This is done by setting the current return address on the thread + info structure of the current task. + config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" default n diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index c8228b1a49e9..3e1f361bbc17 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -24,5 +24,6 @@ obj-$(CONFIG_NOP_TRACER) += trace_nop.o obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o obj-$(CONFIG_BOOT_TRACER) += trace_boot.o +obj-$(CONFIG_FUNCTION_RET_TRACER) += trace_functions_return.o libftrace-y := ftrace.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4d2e751bfb11..f03fe74ecd67 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1484,3 +1484,19 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, return ret; } +#ifdef CONFIG_FUNCTION_RET_TRACER +trace_function_return_t ftrace_function_return = + (trace_function_return_t)ftrace_stub; +void register_ftrace_return(trace_function_return_t func) +{ + ftrace_function_return = func; +} + +void unregister_ftrace_return(void) +{ + ftrace_function_return = (trace_function_return_t)ftrace_stub; +} +#endif + + + diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 216bbe7547a4..a3f7ae9cd8e1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -244,13 +244,6 @@ unsigned long nsecs_to_usecs(unsigned long nsecs) return nsecs / 1000; } -/* - * TRACE_ITER_SYM_MASK masks the options in trace_flags that - * control the output of kernel symbols. 
- */ -#define TRACE_ITER_SYM_MASK \ - (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR) - /* These must match the bit postions in trace_iterator_flags */ static const char *trace_options[] = { "print-parent", @@ -810,6 +803,35 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data, ring_buffer_unlock_commit(tr->buffer, event, irq_flags); } +#ifdef CONFIG_FUNCTION_RET_TRACER +static void __trace_function_return(struct trace_array *tr, + struct trace_array_cpu *data, + struct ftrace_retfunc *trace, + unsigned long flags, + int pc) +{ + struct ring_buffer_event *event; + struct ftrace_ret_entry *entry; + unsigned long irq_flags; + + if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) + return; + + event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry), + &irq_flags); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, flags, pc); + entry->ent.type = TRACE_FN_RET; + entry->ip = trace->func; + entry->parent_ip = trace->ret; + entry->rettime = trace->rettime; + entry->calltime = trace->calltime; + ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags); +} +#endif + void ftrace(struct trace_array *tr, struct trace_array_cpu *data, unsigned long ip, unsigned long parent_ip, unsigned long flags, @@ -1038,6 +1060,29 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) raw_local_irq_restore(flags); } +#ifdef CONFIG_FUNCTION_RET_TRACER +void trace_function_return(struct ftrace_retfunc *trace) +{ + struct trace_array *tr = &global_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + int pc; + + raw_local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + if (likely(disabled == 1)) { + pc = preempt_count(); + __trace_function_return(tr, data, trace, flags, pc); + } + atomic_dec(&data->disabled); + raw_local_irq_restore(flags); +} +#endif /* CONFIG_FUNCTION_RET_TRACER */ + static struct ftrace_ops trace_ops __read_mostly = { .func = function_trace_call, @@ -1285,7 +1330,7 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt, # define IP_FMT "%016lx" #endif -static int +int seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) { int ret; @@ -1738,6 +1783,10 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) trace_seq_print_cont(s, iter); break; } + case TRACE_FN_RET: { + return print_return_function(iter); + break; + } } return TRACE_TYPE_HANDLED; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 978145088fb8..e40ce0c14690 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -22,6 +22,7 @@ enum trace_type { TRACE_MMIO_RW, TRACE_MMIO_MAP, TRACE_BOOT, + TRACE_FN_RET, __TRACE_LAST_TYPE }; @@ -48,6 +49,15 @@ struct ftrace_entry { unsigned long ip; unsigned long parent_ip; }; + +/* Function return entry */ +struct ftrace_ret_entry { + struct trace_entry ent; + unsigned long ip; + unsigned long parent_ip; + unsigned long long calltime; + unsigned long long rettime; +}; extern struct tracer boot_tracer; /* @@ -218,6 +228,7 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ TRACE_MMIO_MAP); \ IF_ASSIGN(var, ent, struct trace_boot, TRACE_BOOT); \ + IF_ASSIGN(var, ent, struct ftrace_ret_entry, TRACE_FN_RET); \ __ftrace_bad_type(); \ } while (0) @@ -321,6 +332,8 @@ void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, 
unsigned long flags, int pc); +void +trace_function_return(struct ftrace_retfunc *trace); void tracing_start_cmdline_record(void); void tracing_stop_cmdline_record(void); @@ -393,6 +406,10 @@ extern void *head_page(struct trace_array_cpu *data); extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...); extern void trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter); + +extern int +seq_print_ip_sym(struct trace_seq *s, unsigned long ip, + unsigned long sym_flags); extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt); extern long ns2usecs(cycle_t nsec); @@ -400,6 +417,17 @@ extern int trace_vprintk(unsigned long ip, const char *fmt, va_list args); extern unsigned long trace_flags; +/* Standard output formatting function used for function return traces */ +#ifdef CONFIG_FUNCTION_RET_TRACER +extern enum print_line_t print_return_function(struct trace_iterator *iter); +#else +static inline enum print_line_t +print_return_function(struct trace_iterator *iter) +{ + return TRACE_TYPE_UNHANDLED; +} +#endif + /* * trace_iterator_flags is an enumeration that defines bit * positions into trace_flags that controls the output. @@ -422,6 +450,13 @@ enum trace_iterator_flags { TRACE_ITER_PREEMPTONLY = 0x800, }; +/* + * TRACE_ITER_SYM_MASK masks the options in trace_flags that + * control the output of kernel symbols. + */ +#define TRACE_ITER_SYM_MASK \ + (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR) + extern struct tracer nop_trace; /** diff --git a/kernel/trace/trace_functions_return.c b/kernel/trace/trace_functions_return.c new file mode 100644 index 000000000000..7680b21537dd --- /dev/null +++ b/kernel/trace/trace_functions_return.c @@ -0,0 +1,82 @@ +/* + * + * Function return tracer. 
+ * Copyright (c) 2008 Frederic Weisbecker + * Mostly borrowed from function tracer which + * is Copyright (c) Steven Rostedt + * + */ +#include +#include +#include +#include + +#include "trace.h" + + +static void start_return_trace(struct trace_array *tr) +{ + register_ftrace_return(&trace_function_return); +} + +static void stop_return_trace(struct trace_array *tr) +{ + unregister_ftrace_return(); +} + +static void return_trace_init(struct trace_array *tr) +{ + int cpu; + for_each_online_cpu(cpu) + tracing_reset(tr, cpu); + + start_return_trace(tr); +} + +static void return_trace_reset(struct trace_array *tr) +{ + stop_return_trace(tr); +} + + +enum print_line_t +print_return_function(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + struct ftrace_ret_entry *field; + int ret; + + if (entry->type == TRACE_FN_RET) { + trace_assign_type(field, entry); + ret = trace_seq_printf(s, "%pF -> ", (void *)field->parent_ip); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + ret = seq_print_ip_sym(s, field->ip, + trace_flags & TRACE_ITER_SYM_MASK); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + ret = trace_seq_printf(s, " (%llu ns)\n", + field->rettime - field->calltime); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + else + return TRACE_TYPE_HANDLED; + } + return TRACE_TYPE_UNHANDLED; +} + +static struct tracer return_trace __read_mostly = +{ + .name = "return", + .init = return_trace_init, + .reset = return_trace_reset, + .print_line = print_return_function +}; + +static __init int init_return_trace(void) +{ + return register_tracer(&return_trace); +} + +device_initcall(init_return_trace); -- cgit v1.2.3 From ff9b48c3598732926fa09afd7f526981c32a48cc Mon Sep 17 00:00:00 2001 From: Bharata B Rao Date: Mon, 10 Nov 2008 21:34:09 +0530 Subject: sched: include group statistics in /proc/sched_debug Impact: extend /proc/sched_debug info Since the statistics of a group entity isn't exported directly from the kernel, it becomes difficult to obtain some of the group statistics. For example, the current method to obtain exec time of a group entity is not always accurate. One has to read the exec times of all the tasks(/proc//sched) in the group and add them. This method fails (or becomes difficult) if we want to collect stats of a group over a duration where tasks get created and terminated. This patch makes it easier to obtain group stats by directly including them in /proc/sched_debug. Stats like group exec time would help user programs (like LTP) to accurately measure the group fairness. 
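As a rough user-space illustration of that use case, one could sample a group's exec_clock twice and subtract; this is a sketch only, it assumes the textual format shown in the example below and naively takes the first matching cfs_rq section:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /*
     * Fetch the .exec_clock value printed for a given group path such as
     * "/3/a/1". Sampling this twice and subtracting gives the CPU time
     * the group consumed in between. Deliberately naive parsing.
     */
    static double group_exec_clock(const char *group)
    {
        char line[256];
        int in_group = 0;
        double val = -1.0;
        FILE *f = fopen("/proc/sched_debug", "r");

        if (!f)
            return -1.0;
        while (fgets(line, sizeof(line), f)) {
            if (strstr(line, "cfs_rq[") && strstr(line, group))
                in_group = 1;
            else if (in_group && strstr(line, ".exec_clock")) {
                val = strtod(strchr(line, ':') + 1, NULL);
                break;
            }
        }
        fclose(f);
        return val;
    }

    int main(int argc, char **argv)
    {
        printf("%f\n", group_exec_clock(argc > 1 ? argv[1] : "/"));
        return 0;
    }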
An example output of group stats from /proc/sched_debug: cfs_rq[3]:/3/a/1 .exec_clock : 89.598007 .MIN_vruntime : 0.000001 .min_vruntime : 256300.970506 .max_vruntime : 0.000001 .spread : 0.000000 .spread0 : -25373.372248 .nr_running : 0 .load : 0 .yld_exp_empty : 0 .yld_act_empty : 0 .yld_both_empty : 0 .yld_count : 4474 .sched_switch : 0 .sched_count : 40507 .sched_goidle : 12686 .ttwu_count : 15114 .ttwu_local : 11950 .bkl_count : 67 .nr_spread_over : 0 .shares : 0 .se->exec_start : 113676.727170 .se->vruntime : 1592.612714 .se->sum_exec_runtime : 89.598007 .se->wait_start : 0.000000 .se->sleep_start : 0.000000 .se->block_start : 0.000000 .se->sleep_max : 0.000000 .se->block_max : 0.000000 .se->exec_max : 1.000282 .se->slice_max : 1.999750 .se->wait_max : 54.981093 .se->wait_sum : 217.610521 .se->wait_count : 50 .se->load.weight : 2 Signed-off-by: Bharata B Rao Acked-by: Srivatsa Vaddagiri Acked-by: Dhaval Giani Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_debug.c | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index d25cefe3f0eb..3625a6598699 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -53,6 +53,40 @@ static unsigned long nsec_low(unsigned long long nsec) #define SPLIT_NS(x) nsec_high(x), nsec_low(x) +#ifdef CONFIG_FAIR_GROUP_SCHED +static void print_cfs_group_stats(struct seq_file *m, int cpu, + struct task_group *tg) +{ + struct sched_entity *se = tg->se[cpu]; + if (!se) + return; + +#define P(F) \ + SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) +#define PN(F) \ + SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) + + PN(se->exec_start); + PN(se->vruntime); + PN(se->sum_exec_runtime); +#ifdef CONFIG_SCHEDSTATS + PN(se->wait_start); + PN(se->sleep_start); + PN(se->block_start); + PN(se->sleep_max); + PN(se->block_max); + PN(se->exec_max); + PN(se->slice_max); + PN(se->wait_max); + PN(se->wait_sum); + P(se->wait_count); +#endif + P(se->load.weight); +#undef PN +#undef P +} +#endif + static void print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) { @@ -181,6 +215,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) #ifdef CONFIG_SMP SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); #endif + print_cfs_group_stats(m, cpu, cfs_rq->tg); #endif } @@ -261,7 +296,7 @@ static int sched_debug_show(struct seq_file *m, void *v) u64 now = ktime_to_ns(ktime_get()); int cpu; - SEQ_printf(m, "Sched Debug Version: v0.07, %s %.*s\n", + SEQ_printf(m, "Sched Debug Version: v0.08, %s %.*s\n", init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); -- cgit v1.2.3 From 851f7ff56d9c21272f289dd85fb3f1b6cf7a6e10 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 11 Nov 2008 21:48:14 +1100 Subject: This patch will print cap_permitted and cap_inheritable data in the PATH records of any file that has file capabilities set. Files which do not have fcaps set will not have different PATH records. 
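The cap_fver and cap_fe fields are decoded from the capability xattr's magic_etc word. A small stand-alone sketch of that decode, reusing the VFS_CAP_* constants from the include/linux/capability.h hunk below and assuming the revision-2 magic value is 0x02000000:

    #include <stdio.h>

    #define VFS_CAP_REVISION_MASK   0xFF000000
    #define VFS_CAP_REVISION_SHIFT  24
    #define VFS_CAP_FLAGS_EFFECTIVE 0x000001

    int main(void)
    {
        /* revision-2 capability xattr with the effective bit set */
        unsigned int magic_etc = 0x02000000 | VFS_CAP_FLAGS_EFFECTIVE;

        unsigned int fver = (magic_etc & VFS_CAP_REVISION_MASK)
                                >> VFS_CAP_REVISION_SHIFT;
        int fE = !!(magic_etc & VFS_CAP_FLAGS_EFFECTIVE);

        /* matches the "cap_fe=1 cap_fver=2" in the record below */
        printf("cap_fver=%x cap_fe=%d\n", fver, fE);
        return 0;
    }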
An example audit record if you run: setcap "cap_net_admin+pie" /bin/bash /bin/bash type=SYSCALL msg=audit(1225741937.363:230): arch=c000003e syscall=59 success=yes exit=0 a0=2119230 a1=210da30 a2=20ee290 a3=8 items=2 ppid=2149 pid=2923 auid=0 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=pts0 ses=3 comm="ping" exe="/bin/ping" subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key=(null) type=EXECVE msg=audit(1225741937.363:230): argc=2 a0="ping" a1="www.google.com" type=CWD msg=audit(1225741937.363:230): cwd="/root" type=PATH msg=audit(1225741937.363:230): item=0 name="/bin/ping" inode=49256 dev=fd:00 mode=0104755 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:ping_exec_t:s0 cap_fp=0000000000002000 cap_fi=0000000000002000 cap_fe=1 cap_fver=2 type=PATH msg=audit(1225741937.363:230): item=1 name=(null) inode=507915 dev=fd:00 mode=0100755 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:ld_so_t:s0 Signed-off-by: Eric Paris Acked-by: Serge Hallyn Signed-off-by: James Morris --- include/linux/capability.h | 5 +++ kernel/auditsc.c | 82 +++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 82 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/capability.h b/include/linux/capability.h index d567af247ed8..0f1950181102 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -53,6 +53,7 @@ typedef struct __user_cap_data_struct { #define XATTR_NAME_CAPS XATTR_SECURITY_PREFIX XATTR_CAPS_SUFFIX #define VFS_CAP_REVISION_MASK 0xFF000000 +#define VFS_CAP_REVISION_SHIFT 24 #define VFS_CAP_FLAGS_MASK ~VFS_CAP_REVISION_MASK #define VFS_CAP_FLAGS_EFFECTIVE 0x000001 @@ -534,6 +535,10 @@ kernel_cap_t cap_set_effective(const kernel_cap_t pE_new); extern int capable(int cap); +/* audit system wants to get cap info from files as well */ +struct dentry; +extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); + #endif /* __KERNEL__ */ #endif /* !_LINUX_CAPABILITY_H */ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index cf5bc2f5f9c3..de7e9bcba9ae 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -65,6 +65,7 @@ #include #include #include +#include #include "audit.h" @@ -84,6 +85,15 @@ int audit_n_rules; /* determines whether we collect data for signals sent */ int audit_signals; +struct audit_cap_data { + kernel_cap_t permitted; + kernel_cap_t inheritable; + union { + unsigned int fE; /* effective bit of a file capability */ + kernel_cap_t effective; /* effective set of a process */ + }; +}; + /* When fs/namei.c:getname() is called, we store the pointer in name and * we don't let putname() free it (instead we free all of the saved * pointers at syscall exit time). 
@@ -100,6 +110,8 @@ struct audit_names { gid_t gid; dev_t rdev; u32 osid; + struct audit_cap_data fcap; + unsigned int fcap_ver; }; struct audit_aux_data { @@ -1171,6 +1183,35 @@ static void audit_log_execve_info(struct audit_context *context, kfree(buf); } +static void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) +{ + int i; + + audit_log_format(ab, " %s=", prefix); + CAP_FOR_EACH_U32(i) { + audit_log_format(ab, "%08x", cap->cap[(_KERNEL_CAPABILITY_U32S-1) - i]); + } +} + +static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) +{ + kernel_cap_t *perm = &name->fcap.permitted; + kernel_cap_t *inh = &name->fcap.inheritable; + int log = 0; + + if (!cap_isclear(*perm)) { + audit_log_cap(ab, "cap_fp", perm); + log = 1; + } + if (!cap_isclear(*inh)) { + audit_log_cap(ab, "cap_fi", inh); + log = 1; + } + + if (log) + audit_log_format(ab, " cap_fe=%d cap_fver=%x", name->fcap.fE, name->fcap_ver); +} + static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) { int i, call_panic = 0; @@ -1421,6 +1462,8 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts } } + audit_log_fcaps(ab, n); + audit_log_end(ab); } @@ -1787,8 +1830,36 @@ static int audit_inc_name_count(struct audit_context *context, return 0; } + +static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry) +{ + struct cpu_vfs_cap_data caps; + int rc; + + memset(&name->fcap.permitted, 0, sizeof(kernel_cap_t)); + memset(&name->fcap.inheritable, 0, sizeof(kernel_cap_t)); + name->fcap.fE = 0; + name->fcap_ver = 0; + + if (!dentry) + return 0; + + rc = get_vfs_caps_from_disk(dentry, &caps); + if (rc) + return rc; + + name->fcap.permitted = caps.permitted; + name->fcap.inheritable = caps.inheritable; + name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE); + name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT; + + return 0; +} + + /* Copy inode data into an audit_names. 
*/ -static void audit_copy_inode(struct audit_names *name, const struct inode *inode) +static void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, + const struct inode *inode) { name->ino = inode->i_ino; name->dev = inode->i_sb->s_dev; @@ -1797,6 +1868,7 @@ static void audit_copy_inode(struct audit_names *name, const struct inode *inode name->gid = inode->i_gid; name->rdev = inode->i_rdev; security_inode_getsecid(inode, &name->osid); + audit_copy_fcaps(name, dentry); } /** @@ -1831,7 +1903,7 @@ void __audit_inode(const char *name, const struct dentry *dentry) context->names[idx].name = NULL; } handle_path(dentry); - audit_copy_inode(&context->names[idx], inode); + audit_copy_inode(&context->names[idx], dentry, inode); } /** @@ -1892,7 +1964,7 @@ void __audit_inode_child(const char *dname, const struct dentry *dentry, if (!strcmp(dname, n->name) || !audit_compare_dname_path(dname, n->name, &dirlen)) { if (inode) - audit_copy_inode(n, inode); + audit_copy_inode(n, NULL, inode); else n->ino = (unsigned long)-1; found_child = n->name; @@ -1906,7 +1978,7 @@ add_names: return; idx = context->name_count - 1; context->names[idx].name = NULL; - audit_copy_inode(&context->names[idx], parent); + audit_copy_inode(&context->names[idx], NULL, parent); } if (!found_child) { @@ -1927,7 +1999,7 @@ add_names: } if (inode) - audit_copy_inode(&context->names[idx], inode); + audit_copy_inode(&context->names[idx], NULL, inode); else context->names[idx].ino = (unsigned long)-1; } -- cgit v1.2.3 From 3fc689e96c0c90b6fede5946d6c31075e9464f69 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 11 Nov 2008 21:48:18 +1100 Subject: Any time fcaps or a setuid app under SECURE_NOROOT is used to result in a non-zero pE we will create a new audit record which contains the entire set of known information about the executable in question (fP, fI, fE, fversion) and includes the process's pE, pI, pP both before and after the bprm capabilities are applied. This record type will only be emitted from execve syscalls.
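The long hex strings in these records are the raw capability words, emitted most-significant word first by audit_log_cap() from the previous patch. A stand-alone sketch of the same rendering, assuming a two-word capability layout (_KERNEL_CAPABILITY_U32S == 2) and CAP_NET_RAW == 13:

    #include <stdio.h>

    #define CAP_U32S    2	/* assumed _KERNEL_CAPABILITY_U32S */
    #define CAP_NET_RAW 13

    struct cap_set { unsigned int cap[CAP_U32S]; };

    /* Mirrors audit_log_cap(): high word first, each printed as %08x. */
    static void print_cap(const char *prefix, const struct cap_set *c)
    {
        int i;

        printf(" %s=", prefix);
        for (i = 0; i < CAP_U32S; i++)
            printf("%08x", c->cap[(CAP_U32S - 1) - i]);
    }

    int main(void)
    {
        struct cap_set fp = { { 1u << CAP_NET_RAW, 0 } };

        print_cap("cap_fp", &fp);	/* " cap_fp=0000000000002000" */
        printf("\n");
        return 0;
    }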
an example of making ping use fcaps instead of setuid: setcap "cat_net_raw+pe" /bin/ping type=SYSCALL msg=audit(1225742021.015:236): arch=c000003e syscall=59 success=yes exit=0 a0=1457f30 a1=14606b0 a2=1463940 a3=321b770a70 items=2 ppid=2929 pid=2963 auid=0 uid=500 gid=500 euid=500 suid=500 fsuid=500 egid=500 sgid=500 fsgid=500 tty=pts0 ses=3 comm="ping" exe="/bin/ping" subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key=(null) type=UNKNOWN[1321] msg=audit(1225742021.015:236): fver=2 fp=0000000000002000 fi=0000000000000000 fe=1 old_pp=0000000000000000 old_pi=0000000000000000 old_pe=0000000000000000 new_pp=0000000000002000 new_pi=0000000000000000 new_pe=0000000000002000 type=EXECVE msg=audit(1225742021.015:236): argc=2 a0="ping" a1="127.0.0.1" type=CWD msg=audit(1225742021.015:236): cwd="/home/test" type=PATH msg=audit(1225742021.015:236): item=0 name="/bin/ping" inode=49256 dev=fd:00 mode=0100755 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:ping_exec_t:s0 cap_fp=0000000000002000 cap_fe=1 cap_fver=2 type=PATH msg=audit(1225742021.015:236): item=1 name=(null) inode=507915 dev=fd:00 mode=0100755 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:ld_so_t:s0 Signed-off-by: Eric Paris Acked-by: Serge Hallyn Signed-off-by: James Morris --- include/linux/audit.h | 26 ++++++++++++++++++++ kernel/auditsc.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++ security/commoncap.c | 23 ++++++++++++++++- 3 files changed, 116 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 6272a395d43c..8cfb9feb2a05 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -99,6 +99,7 @@ #define AUDIT_OBJ_PID 1318 /* ptrace target */ #define AUDIT_TTY 1319 /* Input on an administrative TTY */ #define AUDIT_EOE 1320 /* End of multi-record event */ +#define AUDIT_BPRM_FCAPS 1321 /* Information about fcaps increasing perms */ #define AUDIT_AVC 1400 /* SE Linux avc denial or grant */ #define AUDIT_SELINUX_ERR 1401 /* Internal SE Linux Errors */ @@ -452,6 +453,7 @@ extern int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_pr extern int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len, unsigned int __user *u_msg_prio, const struct timespec __user *u_abs_timeout); extern int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification); extern int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat); +extern void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_cap_t *pE); static inline int audit_ipc_obj(struct kern_ipc_perm *ipcp) { @@ -501,6 +503,29 @@ static inline int audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) return __audit_mq_getsetattr(mqdes, mqstat); return 0; } + +/* + * ieieeeeee, an audit function without a return code! + * + * This function might fail! I decided that it didn't matter. We are too late + * to fail the syscall and the information isn't REQUIRED for any purpose. It's + * just nice to have. We should be able to look at past audit logs to figure + * out this process's current cap set along with the fcaps from the PATH record + * and use that to come up with the final set. Yeah, its ugly, but all the info + * is still in the audit log. So I'm not going to bother mentioning we failed + * if we couldn't allocate memory. + * + * If someone changes their mind they could create the aux record earlier and + * then search here and use that earlier allocation. But I don't wanna. 
+ * + * -Eric + */ +static inline void audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_cap_t *pE) +{ + if (unlikely(!audit_dummy_context())) + __audit_log_bprm_fcaps(bprm, pP, pE); +} + extern int audit_n_rules; extern int audit_signals; #else @@ -532,6 +557,7 @@ extern int audit_signals; #define audit_mq_timedreceive(d,l,p,t) ({ 0; }) #define audit_mq_notify(d,n) ({ 0; }) #define audit_mq_getsetattr(d,s) ({ 0; }) +#define audit_log_bprm_fcaps(b, p, e) do { ; } while (0) #define audit_ptrace(t) ((void)0) #define audit_n_rules 0 #define audit_signals 0 diff --git a/kernel/auditsc.c b/kernel/auditsc.c index de7e9bcba9ae..3229cd4206f5 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -196,6 +196,14 @@ struct audit_aux_data_pids { int pid_count; }; +struct audit_aux_data_bprm_fcaps { + struct audit_aux_data d; + struct audit_cap_data fcap; + unsigned int fcap_ver; + struct audit_cap_data old_pcap; + struct audit_cap_data new_pcap; +}; + struct audit_tree_refs { struct audit_tree_refs *next; struct audit_chunk *c[31]; @@ -1375,6 +1383,20 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_format(ab, "fd0=%d fd1=%d", axs->fd[0], axs->fd[1]); break; } + case AUDIT_BPRM_FCAPS: { + struct audit_aux_data_bprm_fcaps *axs = (void *)aux; + audit_log_format(ab, "fver=%x", axs->fcap_ver); + audit_log_cap(ab, "fp", &axs->fcap.permitted); + audit_log_cap(ab, "fi", &axs->fcap.inheritable); + audit_log_format(ab, " fe=%d", axs->fcap.fE); + audit_log_cap(ab, "old_pp", &axs->old_pcap.permitted); + audit_log_cap(ab, "old_pi", &axs->old_pcap.inheritable); + audit_log_cap(ab, "old_pe", &axs->old_pcap.effective); + audit_log_cap(ab, "new_pp", &axs->new_pcap.permitted); + audit_log_cap(ab, "new_pi", &axs->new_pcap.inheritable); + audit_log_cap(ab, "new_pe", &axs->new_pcap.effective); + break; } + } audit_log_end(ab); } @@ -2501,6 +2523,52 @@ int __audit_signal_info(int sig, struct task_struct *t) return 0; } +/** + * __audit_log_bprm_fcaps - store information about a loading bprm and relevant fcaps + * @bprm pointer to the bprm being processed + * @caps the caps read from the disk + * + * Simply check if the proc already has the caps given by the file and if not + * store the priv escalation info for later auditing at the end of the syscall + * + * this can fail and we don't care. See the note in audit.h for + * audit_log_bprm_fcaps() for my explaination.... 
+ * + * -Eric + */ +void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_cap_t *pE) +{ + struct audit_aux_data_bprm_fcaps *ax; + struct audit_context *context = current->audit_context; + struct cpu_vfs_cap_data vcaps; + struct dentry *dentry; + + ax = kmalloc(sizeof(*ax), GFP_KERNEL); + if (!ax) + return; + + ax->d.type = AUDIT_BPRM_FCAPS; + ax->d.next = context->aux; + context->aux = (void *)ax; + + dentry = dget(bprm->file->f_dentry); + get_vfs_caps_from_disk(dentry, &vcaps); + dput(dentry); + + ax->fcap.permitted = vcaps.permitted; + ax->fcap.inheritable = vcaps.inheritable; + ax->fcap.fE = !!(vcaps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE); + ax->fcap_ver = (vcaps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT; + + ax->old_pcap.permitted = *pP; + ax->old_pcap.inheritable = current->cap_inheritable; + ax->old_pcap.effective = *pE; + + ax->new_pcap.permitted = current->cap_permitted; + ax->new_pcap.inheritable = current->cap_inheritable; + ax->new_pcap.effective = current->cap_effective; +} + /** * audit_core_dumps - record information about processes that end abnormally * @signr: signal value diff --git a/security/commoncap.c b/security/commoncap.c index d7eff5797b91..d45393380997 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -8,6 +8,7 @@ */ #include +#include #include #include #include @@ -376,6 +377,9 @@ int cap_bprm_set_security (struct linux_binprm *bprm) void cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe) { + kernel_cap_t pP = current->cap_permitted; + kernel_cap_t pE = current->cap_effective; + if (bprm->e_uid != current->uid || bprm->e_gid != current->gid || !cap_issubset(bprm->cap_post_exec_permitted, current->cap_permitted)) { @@ -409,7 +413,24 @@ void cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe) cap_clear(current->cap_effective); } - /* AUD: Audit candidate if current->cap_effective is set */ + /* + * Audit candidate if current->cap_effective is set + * + * We do not bother to audit if 3 things are true: + * 1) cap_effective has all caps + * 2) we are root + * 3) root is supposed to have all caps (SECURE_NOROOT) + * Since this is just a normal root execing a process. + * + * Number 1 above might fail if you don't have a full bset, but I think + * that is interesting information to audit. + */ + if (!cap_isclear(current->cap_effective)) { + if (!cap_issubset(CAP_FULL_SET, current->cap_effective) || + (bprm->e_uid != 0) || (current->uid != 0) || + issecure(SECURE_NOROOT)) + audit_log_bprm_fcaps(bprm, &pP, &pE); + } current->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); } -- cgit v1.2.3 From e68b75a027bb94066576139ee33676264f867b87 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 11 Nov 2008 21:48:22 +1100 Subject: When the capset syscall is used, it is not possible for audit to record the actual capabilities being added/removed. This patch adds a new record type which emits the target pid and the eff, inh, and perm cap sets.
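Any capset(2) call can trigger the record; a hedged user-space sketch of one such call follows (raw syscall use is purely illustrative, real code would normally go through libcap):

    #include <linux/capability.h>
    #include <sys/syscall.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        struct __user_cap_header_struct hdr = {
            .version = _LINUX_CAPABILITY_VERSION_2,	/* 64-bit cap sets */
            .pid = 0,					/* current task */
        };
        struct __user_cap_data_struct data[2] = { { 0 } };

        /* drop everything: empty effective/permitted/inheritable sets */
        if (syscall(SYS_capset, &hdr, data))
            perror("capset");
        return 0;
    }

A call like this would presumably show up with pid=0 and all-zero cap_pi/cap_pp/cap_pe words, in contrast to the all-ones sets in the example record below.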
example output if you audit capset syscalls would be: type=SYSCALL msg=audit(1225743140.465:76): arch=c000003e syscall=126 success=yes exit=0 a0=17f2014 a1=17f201c a2=80000000 a3=7fff2ab7f060 items=0 ppid=2160 pid=2223 auid=0 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=pts0 ses=1 comm="setcap" exe="/usr/sbin/setcap" subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key=(null) type=UNKNOWN[1322] msg=audit(1225743140.465:76): pid=0 cap_pi=ffffffffffffffff cap_pp=ffffffffffffffff cap_pe=ffffffffffffffff Signed-off-by: Eric Paris Acked-by: Serge Hallyn Signed-off-by: James Morris --- include/linux/audit.h | 10 ++++++++++ kernel/auditsc.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/capability.c | 5 +++++ 3 files changed, 63 insertions(+) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 8cfb9feb2a05..6fbebac7b1bf 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -100,6 +100,7 @@ #define AUDIT_TTY 1319 /* Input on an administrative TTY */ #define AUDIT_EOE 1320 /* End of multi-record event */ #define AUDIT_BPRM_FCAPS 1321 /* Information about fcaps increasing perms */ +#define AUDIT_CAPSET 1322 /* Record showing argument to sys_capset */ #define AUDIT_AVC 1400 /* SE Linux avc denial or grant */ #define AUDIT_SELINUX_ERR 1401 /* Internal SE Linux Errors */ @@ -454,6 +455,7 @@ extern int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len, unsigned int __u extern int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification); extern int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat); extern void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_cap_t *pE); +extern int __audit_log_capset(pid_t pid, kernel_cap_t *eff, kernel_cap_t *inh, kernel_cap_t *perm); static inline int audit_ipc_obj(struct kern_ipc_perm *ipcp) { @@ -526,6 +528,13 @@ static inline void audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t __audit_log_bprm_fcaps(bprm, pP, pE); } +static inline int audit_log_capset(pid_t pid, kernel_cap_t *eff, kernel_cap_t *inh, kernel_cap_t *perm) +{ + if (unlikely(!audit_dummy_context())) + return __audit_log_capset(pid, eff, inh, perm); + return 0; +} + extern int audit_n_rules; extern int audit_signals; #else @@ -558,6 +567,7 @@ extern int audit_signals; #define audit_mq_notify(d,n) ({ 0; }) #define audit_mq_getsetattr(d,s) ({ 0; }) #define audit_log_bprm_fcaps(b, p, e) do { ; } while (0) +#define audit_log_capset(pid, e, i, p) ({ 0; }) #define audit_ptrace(t) ((void)0) #define audit_n_rules 0 #define audit_signals 0 diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3229cd4206f5..cef34235b362 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -204,6 +204,12 @@ struct audit_aux_data_bprm_fcaps { struct audit_cap_data new_pcap; }; +struct audit_aux_data_capset { + struct audit_aux_data d; + pid_t pid; + struct audit_cap_data cap; +}; + struct audit_tree_refs { struct audit_tree_refs *next; struct audit_chunk *c[31]; @@ -1397,6 +1403,14 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_cap(ab, "new_pe", &axs->new_pcap.effective); break; } + case AUDIT_CAPSET: { + struct audit_aux_data_capset *axs = (void *)aux; + audit_log_format(ab, "pid=%d", axs->pid); + audit_log_cap(ab, "cap_pi", &axs->cap.inheritable); + audit_log_cap(ab, "cap_pp", &axs->cap.permitted); + audit_log_cap(ab, "cap_pe", &axs->cap.effective); + break; } + } audit_log_end(ab); } @@ -2569,6 +2583,40 @@ void 
__audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_ ax->new_pcap.effective = current->cap_effective; } +/** + * __audit_log_capset - store information about the arguments to the capset syscall + * @pid: target pid of the capset call + * @eff: effective cap set + * @inh: inheritable cap set + * @perm: permitted cap set + * + * Record the arguments userspace sent to sys_capset for later printing by the + * audit system if applicable + */ +int __audit_log_capset(pid_t pid, kernel_cap_t *eff, kernel_cap_t *inh, kernel_cap_t *perm) +{ + struct audit_aux_data_capset *ax; + struct audit_context *context = current->audit_context; + + if (likely(!audit_enabled || !context || context->dummy)) + return 0; + + ax = kmalloc(sizeof(*ax), GFP_KERNEL); + if (!ax) + return -ENOMEM; + + ax->d.type = AUDIT_CAPSET; + ax->d.next = context->aux; + context->aux = (void *)ax; + + ax->pid = pid; + ax->cap.effective = *eff; + ax->cap.inheritable = *inh; + ax->cap.permitted = *perm; + + return 0; +} + /** * audit_core_dumps - record information about processes that end abnormally * @signr: signal value diff --git a/kernel/capability.c b/kernel/capability.c index e13a68535ad5..19f9eda89975 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -7,6 +7,7 @@ * 30 May 2002: Cleanup, Robert M. Love */ +#include #include #include #include @@ -468,6 +469,10 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) i++; } + ret = audit_log_capset(pid, &effective, &inheritable, &permitted); + if (ret) + return ret; + if (pid && (pid != task_pid_vnr(current))) ret = do_sys_capset_other_tasks(pid, &effective, &inheritable, &permitted); -- cgit v1.2.3 From 637d32dc720897616e8a1a4f9e9609e29d431800 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Wed, 29 Oct 2008 15:42:12 +1100 Subject: Capabilities: BUG when an invalid capability is requested If an invalid (large) capability is requested, the capabilities system may panic because it dereferences an array of fixed (short) length. It's possible (and actually often happens) that the capability system accidentally stumbles into a valid memory region, but it also regularly happens that it hits invalid memory and BUGs. If such an operation does get past cap_capable, then the selinux system is sure to have problems, as it already does a (simple) validity check and BUGs. This is known to happen with the broken and buggy firegl driver. This patch cleanly checks all capable calls and BUGs if a call is for an invalid capability. This will likely break the firegl driver in some situations, but it is the right thing to do. Garbage into a security system gets you killed/bugged. Signed-off-by: Eric Paris Acked-by: Arjan van de Ven Acked-by: Serge Hallyn Acked-by: Andrew G.
Morgan Signed-off-by: James Morris --- kernel/capability.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index 19f9eda89975..adb262f83de1 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -514,6 +514,11 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) */ int capable(int cap) { + if (unlikely(!cap_valid(cap))) { + printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap); + BUG(); + } + if (has_capability(current, cap)) { current->flags |= PF_SUPERPRIV; return 1; -- cgit v1.2.3 From 934352f214b3251eb0793c1209d346595a661d80 Mon Sep 17 00:00:00 2001 From: Bharata B Rao Date: Mon, 10 Nov 2008 20:41:13 +0530 Subject: sched: add hierarchical accounting to cpu accounting controller Impact: improve CPU time accounting of tasks under the cpu accounting controller Add hierarchical accounting to cpu accounting controller and include cpuacct documentation. Currently, while charging the task's cputime to its accounting group, the accounting group hierarchy isn't updated. This patch charges the cputime of a task to its accounting group and all its parent accounting groups. Reported-by: Srivatsa Vaddagiri Signed-off-by: Bharata B Rao Reviewed-by: Paul Menage Acked-by: Balbir Singh Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- Documentation/controllers/cpuacct.txt | 32 ++++++++++++++++++++++++++++++++ kernel/sched.c | 12 +++++++++--- 2 files changed, 41 insertions(+), 3 deletions(-) create mode 100644 Documentation/controllers/cpuacct.txt (limited to 'kernel') diff --git a/Documentation/controllers/cpuacct.txt b/Documentation/controllers/cpuacct.txt new file mode 100644 index 000000000000..bb775fbe43d7 --- /dev/null +++ b/Documentation/controllers/cpuacct.txt @@ -0,0 +1,32 @@ +CPU Accounting Controller +------------------------- + +The CPU accounting controller is used to group tasks using cgroups and +account the CPU usage of these groups of tasks. + +The CPU accounting controller supports multi-hierarchy groups. An accounting +group accumulates the CPU usage of all of its child groups and the tasks +directly present in its group. + +Accounting groups can be created by first mounting the cgroup filesystem. + +# mkdir /cgroups +# mount -t cgroup -ocpuacct none /cgroups + +With the above step, the initial or the parent accounting group +becomes visible at /cgroups. At bootup, this group includes all the +tasks in the system. /cgroups/tasks lists the tasks in this cgroup. +/cgroups/cpuacct.usage gives the CPU time (in nanoseconds) obtained by +this group which is essentially the CPU time obtained by all the tasks +in the system. + +New accounting groups can be created under the parent group /cgroups. + +# cd /cgroups +# mkdir g1 +# echo $$ > g1 + +The above steps create a new group g1 and move the current shell +process (bash) into it. CPU time consumed by this bash and its children +can be obtained from g1/cpuacct.usage and the same is accumulated in +/cgroups/cpuacct.usage also. diff --git a/kernel/sched.c b/kernel/sched.c index 59db86c915f9..ebaf432365f6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9196,11 +9196,12 @@ struct cgroup_subsys cpu_cgroup_subsys = { * (balbir@in.ibm.com). 
*/ -/* track cpu usage of a group of tasks */ +/* track cpu usage of a group of tasks and its child groups */ struct cpuacct { struct cgroup_subsys_state css; /* cpuusage holds pointer to a u64-type object on every cpu */ u64 *cpuusage; + struct cpuacct *parent; }; struct cgroup_subsys cpuacct_subsys; @@ -9234,6 +9235,9 @@ static struct cgroup_subsys_state *cpuacct_create( return ERR_PTR(-ENOMEM); } + if (cgrp->parent) + ca->parent = cgroup_ca(cgrp->parent); + return &ca->css; } @@ -9313,14 +9317,16 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) static void cpuacct_charge(struct task_struct *tsk, u64 cputime) { struct cpuacct *ca; + int cpu; if (!cpuacct_subsys.active) return; + cpu = task_cpu(tsk); ca = task_ca(tsk); - if (ca) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk)); + for (; ca; ca = ca->parent) { + u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); *cpuusage += cputime; } } -- cgit v1.2.3 From f83c9d0fe42a7544b4d4ffcebb2e6716fcfd95c0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 11 Nov 2008 18:47:44 +0100 Subject: ring-buffer: add reader lock Impact: serialize reader accesses to individual CPU ring buffers The code in the ring buffer expects only one reader at a time, but currently it puts that requirement on the caller. This is not strong enough, and this patch adds a "reader_lock" that serializes the access to the reader API of the ring buffer. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 109 ++++++++++++++++++++++++++++++++------------- 1 file changed, 78 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a6b8f9d7ac96..17c2ccebb567 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -154,6 +154,7 @@ static inline int test_time_stamp(u64 delta) struct ring_buffer_per_cpu { int cpu; struct ring_buffer *buffer; + spinlock_t reader_lock; /* serialize readers */ raw_spinlock_t lock; struct lock_class_key lock_key; struct list_head pages; @@ -321,6 +322,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) cpu_buffer->cpu = cpu; cpu_buffer->buffer = buffer; + spin_lock_init(&cpu_buffer->reader_lock); cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; INIT_LIST_HEAD(&cpu_buffer->pages); @@ -1476,6 +1478,9 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer) void ring_buffer_iter_reset(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + unsigned long flags; + + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); /* Iterator usage is expected to have record disabled */ if (list_empty(&cpu_buffer->reader_page->list)) { @@ -1489,6 +1494,8 @@ void ring_buffer_iter_reset(struct ring_buffer_iter *iter) iter->read_stamp = cpu_buffer->read_stamp; else iter->read_stamp = iter->head_page->time_stamp; + + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } /** @@ -1707,17 +1714,8 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) rb_advance_iter(iter); } -/** - * ring_buffer_peek - peek at the next event to be read - * @buffer: The ring buffer to read - * @cpu: The cpu to peak at - * @ts: The timestamp counter of this event. - * - * This will return the event that will be read next, but does - * not consume the data. 
- */ -struct ring_buffer_event * -ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) +static struct ring_buffer_event * +rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; @@ -1779,16 +1777,8 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) return NULL; } -/** - * ring_buffer_iter_peek - peek at the next event to be read - * @iter: The ring buffer iterator - * @ts: The timestamp counter of this event. - * - * This will return the event that will be read next, but does - * not increment the iterator. - */ -struct ring_buffer_event * -ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) +static struct ring_buffer_event * +rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) { struct ring_buffer *buffer; struct ring_buffer_per_cpu *cpu_buffer; @@ -1849,6 +1839,51 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) return NULL; } +/** + * ring_buffer_peek - peek at the next event to be read + * @buffer: The ring buffer to read + * @cpu: The cpu to peak at + * @ts: The timestamp counter of this event. + * + * This will return the event that will be read next, but does + * not consume the data. + */ +struct ring_buffer_event * +ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) +{ + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + struct ring_buffer_event *event; + unsigned long flags; + + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + event = rb_buffer_peek(buffer, cpu, ts); + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + return event; +} + +/** + * ring_buffer_iter_peek - peek at the next event to be read + * @iter: The ring buffer iterator + * @ts: The timestamp counter of this event. + * + * This will return the event that will be read next, but does + * not increment the iterator. 
+ */ +struct ring_buffer_event * +ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) +{ + struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + struct ring_buffer_event *event; + unsigned long flags; + + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + event = rb_iter_peek(iter, ts); + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + return event; +} + /** * ring_buffer_consume - return an event and consume it * @buffer: The ring buffer to get the next event from @@ -1860,19 +1895,24 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_event * ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) { - struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct ring_buffer_event *event; + unsigned long flags; if (!cpu_isset(cpu, buffer->cpumask)) return NULL; - event = ring_buffer_peek(buffer, cpu, ts); + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + + event = rb_buffer_peek(buffer, cpu, ts); if (!event) - return NULL; + goto out; - cpu_buffer = buffer->buffers[cpu]; rb_advance_reader(cpu_buffer); + out: + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + return event; } @@ -1909,11 +1949,11 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu) atomic_inc(&cpu_buffer->record_disabled); synchronize_sched(); - local_irq_save(flags); + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); __raw_spin_lock(&cpu_buffer->lock); ring_buffer_iter_reset(iter); __raw_spin_unlock(&cpu_buffer->lock); - local_irq_restore(flags); + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); return iter; } @@ -1945,12 +1985,17 @@ struct ring_buffer_event * ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) { struct ring_buffer_event *event; + struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + unsigned long flags; - event = ring_buffer_iter_peek(iter, ts); + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + event = rb_iter_peek(iter, ts); if (!event) - return NULL; + goto out; rb_advance_iter(iter); + out: + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); return event; } @@ -1999,13 +2044,15 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) if (!cpu_isset(cpu, buffer->cpumask)) return; - local_irq_save(flags); + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + __raw_spin_lock(&cpu_buffer->lock); rb_reset_cpu(cpu_buffer); __raw_spin_unlock(&cpu_buffer->lock); - local_irq_restore(flags); + + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } /** -- cgit v1.2.3 From 3e89c7bb92fc92bb964734341487798c8d497bae Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 11 Nov 2008 15:28:41 -0500 Subject: ring-buffer: clean up warn ons Impact: Restructure WARN_ONs in ring_buffer.c The current WARN_ON macros in ring_buffer.c are quite ugly. This patch cleans them up and uses a single RB_WARN_ON that returns the value of the condition. This allows the caller to abort the function if the condition is true. 
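The trick is a GCC statement expression, so the macro both warns and evaluates to the condition's value; a tiny stand-alone illustration of the pattern follows (warn_ret() and the page-count check are made up for the example, the ring-buffer specifics are stripped out):

    #include <stdio.h>

    /* Statement expression: yields the condition so callers can bail out. */
    #define warn_ret(cond)						\
        ({								\
            int _____ret = !!(cond);					\
            if (_____ret)						\
                fprintf(stderr, "warning: %s:%d: %s\n",			\
                        __FILE__, __LINE__, #cond);			\
            _____ret;							\
        })

    static int resize(int nr_pages, int cur_pages)
    {
        if (warn_ret(nr_pages <= cur_pages))
            return -1;	/* abort, much as ring_buffer_resize now does */
        return 0;
    }

    int main(void)
    {
        printf("grow=%d shrink=%d\n", resize(4, 2), resize(1, 2));
        return 0;
    }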
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 143 ++++++++++++++++++--------------------------- 1 file changed, 57 insertions(+), 86 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 17c2ccebb567..8c5cacb25a18 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -191,60 +191,14 @@ struct ring_buffer_iter { /* buffer may be either ring_buffer or ring_buffer_per_cpu */ #define RB_WARN_ON(buffer, cond) \ - do { \ - if (unlikely(cond)) { \ + ({ \ + int _____ret = unlikely(cond); \ + if (_____ret) { \ atomic_inc(&buffer->record_disabled); \ WARN_ON(1); \ } \ - } while (0) - -#define RB_WARN_ON_RET(buffer, cond) \ - do { \ - if (unlikely(cond)) { \ - atomic_inc(&buffer->record_disabled); \ - WARN_ON(1); \ - return; \ - } \ - } while (0) - -#define RB_WARN_ON_RET_INT(buffer, cond) \ - do { \ - if (unlikely(cond)) { \ - atomic_inc(&buffer->record_disabled); \ - WARN_ON(1); \ - return -1; \ - } \ - } while (0) - -#define RB_WARN_ON_RET_NULL(buffer, cond) \ - do { \ - if (unlikely(cond)) { \ - atomic_inc(&buffer->record_disabled); \ - WARN_ON(1); \ - return NULL; \ - } \ - } while (0) - -#define RB_WARN_ON_ONCE(buffer, cond) \ - do { \ - static int once; \ - if (unlikely(cond) && !once) { \ - once++; \ - atomic_inc(&buffer->record_disabled); \ - WARN_ON(1); \ - } \ - } while (0) - -/* buffer must be ring_buffer not per_cpu */ -#define RB_WARN_ON_UNLOCK(buffer, cond) \ - do { \ - if (unlikely(cond)) { \ - mutex_unlock(&buffer->mutex); \ - atomic_inc(&buffer->record_disabled); \ - WARN_ON(1); \ - return -1; \ - } \ - } while (0) + _____ret; \ + }) /** * check_pages - integrity check of buffer pages @@ -258,14 +212,18 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) struct list_head *head = &cpu_buffer->pages; struct buffer_page *page, *tmp; - RB_WARN_ON_RET_INT(cpu_buffer, head->next->prev != head); - RB_WARN_ON_RET_INT(cpu_buffer, head->prev->next != head); + if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) + return -1; + if (RB_WARN_ON(cpu_buffer, head->prev->next != head)) + return -1; list_for_each_entry_safe(page, tmp, head, list) { - RB_WARN_ON_RET_INT(cpu_buffer, - page->list.next->prev != &page->list); - RB_WARN_ON_RET_INT(cpu_buffer, - page->list.prev->next != &page->list); + if (RB_WARN_ON(cpu_buffer, + page->list.next->prev != &page->list)) + return -1; + if (RB_WARN_ON(cpu_buffer, + page->list.prev->next != &page->list)) + return -1; } return 0; @@ -472,13 +430,15 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) synchronize_sched(); for (i = 0; i < nr_pages; i++) { - RB_WARN_ON_RET(cpu_buffer, list_empty(&cpu_buffer->pages)); + if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) + return; p = cpu_buffer->pages.next; page = list_entry(p, struct buffer_page, list); list_del_init(&page->list); free_buffer_page(page); } - RB_WARN_ON_RET(cpu_buffer, list_empty(&cpu_buffer->pages)); + if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) + return; rb_reset_cpu(cpu_buffer); @@ -500,7 +460,8 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, synchronize_sched(); for (i = 0; i < nr_pages; i++) { - RB_WARN_ON_RET(cpu_buffer, list_empty(pages)); + if (RB_WARN_ON(cpu_buffer, list_empty(pages))) + return; p = pages->next; page = list_entry(p, struct buffer_page, list); list_del_init(&page->list); @@ -555,7 +516,10 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) if (size < 
buffer_size) { /* easy case, just free pages */ - RB_WARN_ON_UNLOCK(buffer, nr_pages >= buffer->pages); + if (RB_WARN_ON(buffer, nr_pages >= buffer->pages)) { + mutex_unlock(&buffer->mutex); + return -1; + } rm_pages = buffer->pages - nr_pages; @@ -574,7 +538,10 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) * add these pages to the cpu_buffers. Otherwise we just free * them all and return -ENOMEM; */ - RB_WARN_ON_UNLOCK(buffer, nr_pages <= buffer->pages); + if (RB_WARN_ON(buffer, nr_pages <= buffer->pages)) { + mutex_unlock(&buffer->mutex); + return -1; + } new_pages = nr_pages - buffer->pages; @@ -598,7 +565,10 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) rb_insert_pages(cpu_buffer, &pages, new_pages); } - RB_WARN_ON_UNLOCK(buffer, !list_empty(&pages)); + if (RB_WARN_ON(buffer, !list_empty(&pages))) { + mutex_unlock(&buffer->mutex); + return -1; + } out: buffer->pages = nr_pages; @@ -686,7 +656,8 @@ static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer) head += rb_event_length(event)) { event = __rb_page_index(cpu_buffer->head_page, head); - RB_WARN_ON_RET(cpu_buffer, rb_null_event(event)); + if (RB_WARN_ON(cpu_buffer, rb_null_event(event))) + return; /* Only count data entries */ if (event->type != RINGBUF_TYPE_DATA) continue; @@ -739,8 +710,9 @@ rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer, addr &= PAGE_MASK; while (cpu_buffer->commit_page->page != (void *)addr) { - RB_WARN_ON(cpu_buffer, - cpu_buffer->commit_page == cpu_buffer->tail_page); + if (RB_WARN_ON(cpu_buffer, + cpu_buffer->commit_page == cpu_buffer->tail_page)) + return; cpu_buffer->commit_page->commit = cpu_buffer->commit_page->write; rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); @@ -896,7 +868,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, reader_page = cpu_buffer->reader_page; /* we grabbed the lock before incrementing */ - RB_WARN_ON(cpu_buffer, next_page == reader_page); + if (RB_WARN_ON(cpu_buffer, next_page == reader_page)) + goto out_unlock; /* * If for some reason, we had an interrupt storm that made @@ -973,7 +946,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /* We reserved something on the buffer */ - RB_WARN_ON_RET_NULL(cpu_buffer, write > BUF_PAGE_SIZE); + if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE)) + return NULL; event = __rb_page_index(tail_page, tail); rb_update_event(event, type, length); @@ -1072,10 +1046,8 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, * storm or we have something buggy. * Bail! */ - if (unlikely(++nr_loops > 1000)) { - RB_WARN_ON(cpu_buffer, 1); + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) return NULL; - } ts = ring_buffer_time_stamp(cpu_buffer->cpu); @@ -1591,8 +1563,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) * a case where we will loop three times. There should be no * reason to loop four times (that I know of). 
*/ - if (unlikely(++nr_loops > 3)) { - RB_WARN_ON(cpu_buffer, 1); + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3)) { reader = NULL; goto out; } @@ -1604,8 +1575,9 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) goto out; /* Never should we have an index greater than the size */ - RB_WARN_ON(cpu_buffer, - cpu_buffer->reader_page->read > rb_page_size(reader)); + if (RB_WARN_ON(cpu_buffer, + cpu_buffer->reader_page->read > rb_page_size(reader))) + goto out; /* check if we caught up to the tail */ reader = NULL; @@ -1659,7 +1631,8 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) reader = rb_get_reader_page(cpu_buffer); /* This function should not be called when buffer is empty */ - RB_WARN_ON_RET(cpu_buffer, !reader); + if (RB_WARN_ON(cpu_buffer, !reader)) + return; event = rb_reader_event(cpu_buffer); @@ -1686,8 +1659,9 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) * Check if we are at the end of the buffer. */ if (iter->head >= rb_page_size(iter->head_page)) { - RB_WARN_ON_RET(buffer, - iter->head_page == cpu_buffer->commit_page); + if (RB_WARN_ON(buffer, + iter->head_page == cpu_buffer->commit_page)) + return; rb_inc_iter(iter); return; } @@ -1700,9 +1674,10 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) * This should not be called to advance the header if we are * at the tail of the buffer. */ - RB_WARN_ON_RET(cpu_buffer, + if (RB_WARN_ON(cpu_buffer, (iter->head_page == cpu_buffer->commit_page) && - (iter->head + length > rb_commit_index(cpu_buffer))); + (iter->head + length > rb_commit_index(cpu_buffer)))) + return; rb_update_iter_read_stamp(iter, event); @@ -1736,10 +1711,8 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) * can have. Nesting 10 deep of interrupts is clearly * an anomaly. */ - if (unlikely(++nr_loops > 10)) { - RB_WARN_ON(cpu_buffer, 1); + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10)) return NULL; - } reader = rb_get_reader_page(cpu_buffer); if (!reader) @@ -1800,10 +1773,8 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) * can have. Nesting 10 deep of interrupts is clearly * an anomaly. */ - if (unlikely(++nr_loops > 10)) { - RB_WARN_ON(cpu_buffer, 1); + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10)) return NULL; - } if (rb_per_cpu_empty(cpu_buffer)) return NULL; -- cgit v1.2.3 From 3f5ec13696fd4a33bde42f385406cbb1d3cc96fd Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 11 Nov 2008 23:21:31 +0100 Subject: tracing/fastboot: move boot tracer structs and funcs into their own header. Impact: Cleanups on the boot tracer and ftrace This patch brings some cleanups to the boot tracer headers. The functions and structures of this tracer are not related to ftrace and should therefore have their own header file. Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 41 ----------------------------------------- include/trace/boot.h | 43 +++++++++++++++++++++++++++++++++++++++++++ init/main.c | 1 + kernel/trace/trace.h | 1 + 4 files changed, 45 insertions(+), 41 deletions(-) create mode 100644 include/trace/boot.h (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index dcbbf72a88b1..4fbc4a8b86a5 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -287,45 +287,4 @@ extern trace_function_return_t ftrace_function_return; extern void unregister_ftrace_return(void); #endif -/* - * Structure which defines the trace of an initcall.
- * You don't have to fill the func field since it is - * only used internally by the tracer. - */ -struct boot_trace { - pid_t caller; - char func[KSYM_NAME_LEN]; - int result; - unsigned long long duration; /* usecs */ - ktime_t calltime; - ktime_t rettime; -}; - -#ifdef CONFIG_BOOT_TRACER -/* Append the trace on the ring-buffer */ -extern void trace_boot(struct boot_trace *it, initcall_t fn); - -/* Tells the tracer that smp_pre_initcall is finished. - * So we can start the tracing - */ -extern void start_boot_trace(void); - -/* Resume the tracing of other necessary events - * such as sched switches - */ -extern void enable_boot_trace(void); - -/* Suspend this tracing. Actually, only sched_switches tracing have - * to be suspended. Initcalls doesn't need it.) - */ -extern void disable_boot_trace(void); -#else -static inline void trace_boot(struct boot_trace *it, initcall_t fn) { } -static inline void start_boot_trace(void) { } -static inline void enable_boot_trace(void) { } -static inline void disable_boot_trace(void) { } -#endif - - - #endif /* _LINUX_FTRACE_H */ diff --git a/include/trace/boot.h b/include/trace/boot.h new file mode 100644 index 000000000000..4cbe64e46cdc --- /dev/null +++ b/include/trace/boot.h @@ -0,0 +1,43 @@ +#ifndef _LINUX_TRACE_BOOT_H +#define _LINUX_TRACE_BOOT_H + +/* + * Structure which defines the trace of an initcall. + * You don't have to fill the func field since it is + * only used internally by the tracer. + */ +struct boot_trace { + pid_t caller; + char func[KSYM_NAME_LEN]; + int result; + unsigned long long duration; /* usecs */ + ktime_t calltime; + ktime_t rettime; +}; + +#ifdef CONFIG_BOOT_TRACER +/* Append the trace on the ring-buffer */ +extern void trace_boot(struct boot_trace *it, initcall_t fn); + +/* Tells the tracer that smp_pre_initcall is finished. + * So we can start the tracing + */ +extern void start_boot_trace(void); + +/* Resume the tracing of other necessary events + * such as sched switches + */ +extern void enable_boot_trace(void); + +/* Suspend this tracing. Actually, only sched_switches tracing have + * to be suspended. Initcalls doesn't need it.) + */ +extern void disable_boot_trace(void); +#else +static inline void trace_boot(struct boot_trace *it, initcall_t fn) { } +static inline void start_boot_trace(void) { } +static inline void enable_boot_trace(void) { } +static inline void disable_boot_trace(void) { } +#endif /* CONFIG_BOOT_TRACER */ + +#endif /* __LINUX_TRACE_BOOT_H */ diff --git a/init/main.c b/init/main.c index 4b03cd5656ca..16ca1ee071c4 100644 --- a/init/main.c +++ b/init/main.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index e40ce0c14690..f69a5199596b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -8,6 +8,7 @@ #include #include #include +#include enum trace_type { __TRACE_FIRST_TYPE = 0, -- cgit v1.2.3 From 74239072830ef3f1398edeb1bc1076fc330fd4a2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 11 Nov 2008 23:24:42 +0100 Subject: tracing/fastboot: Use the ring-buffer timestamp for initcall entries Impact: Split the boot tracer entries in two parts: call and return Now that we are using the sched tracer from the boot tracer, we want to use the same timestamp as the ring-buffer to have consistent time captures between sched events and initcall events. So we get rid of the old time capture by the boot tracer and split the initcall events in two parts: call and return.
This way we have the ring buffer timestamp of both. An example trace: [ 27.904149584] calling net_ns_init+0x0/0x1c0 @ 1 [ 27.904429624] initcall net_ns_init+0x0/0x1c0 returned 0 after 0 msecs [ 27.904575926] calling reboot_init+0x0/0x20 @ 1 [ 27.904655399] initcall reboot_init+0x0/0x20 returned 0 after 0 msecs [ 27.904800228] calling sysctl_init+0x0/0x30 @ 1 [ 27.905142914] initcall sysctl_init+0x0/0x30 returned 0 after 0 msecs [ 27.905287211] calling ksysfs_init+0x0/0xb0 @ 1 ##### CPU 0 buffer started #### init-1 [000] 27.905395: 1:120:R + [001] 11:115:S ##### CPU 1 buffer started #### -0 [001] 27.905425: 0:140:R ==> [001] 11:115:R init-1 [000] 27.905426: 1:120:D ==> [000] 0:140:R -0 [000] 27.905431: 0:140:R + [000] 4:115:S -0 [000] 27.905451: 0:140:R ==> [000] 4:115:R ksoftirqd/0-4 [000] 27.905456: 4:115:S ==> [000] 0:140:R udevd-11 [001] 27.905458: 11:115:R + [001] 14:115:R -0 [000] 27.905459: 0:140:R + [000] 4:115:S -0 [000] 27.905462: 0:140:R ==> [000] 4:115:R udevd-11 [001] 27.905462: 11:115:R ==> [001] 14:115:R ksoftirqd/0-4 [000] 27.905467: 4:115:S ==> [000] 0:140:R -0 [000] 27.905470: 0:140:R + [000] 4:115:S -0 [000] 27.905473: 0:140:R ==> [000] 4:115:R ksoftirqd/0-4 [000] 27.905476: 4:115:S ==> [000] 0:140:R -0 [000] 27.905479: 0:140:R + [000] 4:115:S -0 [000] 27.905482: 0:140:R ==> [000] 4:115:R ksoftirqd/0-4 [000] 27.905486: 4:115:S ==> [000] 0:140:R udevd-14 [001] 27.905499: 14:120:X ==> [001] 11:115:R udevd-11 [001] 27.905506: 11:115:R + [000] 1:120:D -0 [000] 27.905515: 0:140:R ==> [000] 1:120:R udevd-11 [001] 27.905517: 11:115:S ==> [001] 0:140:R [ 27.905557107] initcall ksysfs_init+0x0/0xb0 returned 0 after 3906 msecs [ 27.905705736] calling init_jiffies_clocksource+0x0/0x10 @ 1 [ 27.905779239] initcall init_jiffies_clocksource+0x0/0x10 returned 0 after 0 msecs [ 27.906769814] calling pm_init+0x0/0x30 @ 1 [ 27.906853627] initcall pm_init+0x0/0x30 returned 0 after 0 msecs [ 27.906997803] calling pm_disk_init+0x0/0x20 @ 1 [ 27.907076946] initcall pm_disk_init+0x0/0x20 returned 0 after 0 msecs [ 27.907222556] calling swsusp_header_init+0x0/0x30 @ 1 [ 27.907294325] initcall swsusp_header_init+0x0/0x30 returned 0 after 0 msecs [ 27.907439620] calling stop_machine_init+0x0/0x50 @ 1 init-1 [000] 27.907485: 1:120:R + [000] 2:115:S init-1 [000] 27.907490: 1:120:D ==> [000] 2:115:R kthreadd-2 [000] 27.907507: 2:115:R + [001] 15:115:R -0 [001] 27.907517: 0:140:R ==> [001] 15:115:R kthreadd-2 [000] 27.907517: 2:115:D ==> [000] 0:140:R -0 [000] 27.907521: 0:140:R + [000] 4:115:S -0 [000] 27.907524: 0:140:R ==> [000] 4:115:R udevd-15 [001] 27.907527: 15:115:D + [000] 2:115:D ksoftirqd/0-4 [000] 27.907537: 4:115:S ==> [000] 2:115:R udevd-15 [001] 27.907537: 15:115:D ==> [001] 0:140:R kthreadd-2 [000] 27.907546: 2:115:R + [000] 1:120:D kthreadd-2 [000] 27.907550: 2:115:S ==> [000] 1:120:R init-1 [000] 27.907584: 1:120:R + [000] 15: 0:D init-1 [000] 27.907589: 1:120:R + [000] 2:115:S init-1 [000] 27.907593: 1:120:D ==> [000] 15: 0:R udevd-15 [000] 27.907601: 15: 0:S ==> [000] 2:115:R ##### CPU 0 buffer started #### kthreadd-2 [000] 27.907616: 2:115:R + [001] 16:115:R ##### CPU 1 buffer started #### -0 [001] 27.907620: 0:140:R ==> [001] 16:115:R kthreadd-2 [000] 27.907621: 2:115:D ==> [000] 0:140:R udevd-16 [001] 27.907625: 16:115:D + [000] 2:115:D -0 [000] 27.907628: 0:140:R + [000] 4:115:S udevd-16 [001] 27.907629: 16:115:D ==> [001] 0:140:R -0 [000] 27.907631: 0:140:R ==> [000] 4:115:R ksoftirqd/0-4 [000] 27.907636: 4:115:S ==> [000] 2:115:R kthreadd-2 [000] 27.907644: 2:115:R + [000] 
1:120:D kthreadd-2 [000] 27.907647: 2:115:S ==> [000] 1:120:R init-1 [000] 27.907657: 1:120:R + [001] 16: 0:D -0 [001] 27.907666: 0:140:R ==> [001] 16: 0:R [ 27.907703862] initcall stop_machine_init+0x0/0x50 returned 0 after 0 msecs [ 27.907850704] calling filelock_init+0x0/0x30 @ 1 [ 27.907926573] initcall filelock_init+0x0/0x30 returned 0 after 0 msecs [ 27.908071327] calling init_script_binfmt+0x0/0x10 @ 1 [ 27.908165195] initcall init_script_binfmt+0x0/0x10 returned 0 after 0 msecs [ 27.908309461] calling init_elf_binfmt+0x0/0x10 @ 1 Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/trace/boot.h | 31 ++++++++---- init/main.c | 32 ++++++------ kernel/trace/trace.h | 17 +++++-- kernel/trace/trace_boot.c | 123 +++++++++++++++++++++++++++++++++++----------- 4 files changed, 144 insertions(+), 59 deletions(-) (limited to 'kernel') diff --git a/include/trace/boot.h b/include/trace/boot.h index 4cbe64e46cdc..6b54537eab02 100644 --- a/include/trace/boot.h +++ b/include/trace/boot.h @@ -2,22 +2,30 @@ #define _LINUX_TRACE_BOOT_H /* - * Structure which defines the trace of an initcall. + * Structure which defines the trace of an initcall + * while it is called. * You don't have to fill the func field since it is * only used internally by the tracer. */ -struct boot_trace { +struct boot_trace_call { pid_t caller; char func[KSYM_NAME_LEN]; - int result; - unsigned long long duration; /* usecs */ - ktime_t calltime; - ktime_t rettime; +}; + +/* + * Structure which defines the trace of an initcall + * while it returns. + */ +struct boot_trace_ret { + char func[KSYM_NAME_LEN]; + int result; + unsigned long long duration; /* nsecs */ }; #ifdef CONFIG_BOOT_TRACER -/* Append the trace on the ring-buffer */ -extern void trace_boot(struct boot_trace *it, initcall_t fn); +/* Append the traces on the ring-buffer */ +extern void trace_boot_call(struct boot_trace_call *bt, initcall_t fn); +extern void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn); /* Tells the tracer that smp_pre_initcall is finished. 
* So we can start the tracing @@ -34,7 +42,12 @@ extern void enable_boot_trace(void); */ extern void disable_boot_trace(void); #else -static inline void trace_boot(struct boot_trace *it, initcall_t fn) { } +static inline +void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) { } + +static inline +void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) { } + static inline void start_boot_trace(void) { } static inline void enable_boot_trace(void) { } static inline void disable_boot_trace(void) { } diff --git a/init/main.c b/init/main.c index 16ca1ee071c4..e810196bf2f2 100644 --- a/init/main.c +++ b/init/main.c @@ -704,33 +704,35 @@ core_param(initcall_debug, initcall_debug, bool, 0644); int do_one_initcall(initcall_t fn) { int count = preempt_count(); - ktime_t delta; + ktime_t calltime, delta, rettime; char msgbuf[64]; - struct boot_trace it; + struct boot_trace_call call; + struct boot_trace_ret ret; if (initcall_debug) { - it.caller = task_pid_nr(current); - printk("calling %pF @ %i\n", fn, it.caller); - it.calltime = ktime_get(); + call.caller = task_pid_nr(current); + printk("calling %pF @ %i\n", fn, call.caller); + calltime = ktime_get(); + trace_boot_call(&call, fn); enable_boot_trace(); } - it.result = fn(); + ret.result = fn(); if (initcall_debug) { - it.rettime = ktime_get(); - delta = ktime_sub(it.rettime, it.calltime); - it.duration = (unsigned long long) delta.tv64 >> 10; - printk("initcall %pF returned %d after %Ld usecs\n", fn, - it.result, it.duration); - trace_boot(&it, fn); disable_boot_trace(); + rettime = ktime_get(); + delta = ktime_sub(rettime, calltime); + ret.duration = (unsigned long long) delta.tv64 >> 10; + trace_boot_ret(&ret, fn); + printk("initcall %pF returned %d after %Ld usecs\n", fn, + ret.result, ret.duration); } msgbuf[0] = 0; - if (it.result && it.result != -ENODEV && initcall_debug) - sprintf(msgbuf, "error code %d ", it.result); + if (ret.result && ret.result != -ENODEV && initcall_debug) + sprintf(msgbuf, "error code %d ", ret.result); if (preempt_count() != count) { strlcat(msgbuf, "preemption imbalance ", sizeof(msgbuf)); @@ -744,7 +746,7 @@ int do_one_initcall(initcall_t fn) printk("initcall %pF returned with %s\n", fn, msgbuf); } - return it.result; + return ret.result; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f69a5199596b..b5f91f198fd4 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -22,7 +22,8 @@ enum trace_type { TRACE_SPECIAL, TRACE_MMIO_RW, TRACE_MMIO_MAP, - TRACE_BOOT, + TRACE_BOOT_CALL, + TRACE_BOOT_RET, TRACE_FN_RET, __TRACE_LAST_TYPE @@ -123,9 +124,14 @@ struct trace_mmiotrace_map { struct mmiotrace_map map; }; -struct trace_boot { +struct trace_boot_call { struct trace_entry ent; - struct boot_trace initcall; + struct boot_trace_call boot_call; +}; + +struct trace_boot_ret { + struct trace_entry ent; + struct boot_trace_ret boot_ret; }; /* @@ -228,8 +234,9 @@ extern void __ftrace_bad_type(void); TRACE_MMIO_RW); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ TRACE_MMIO_MAP); \ - IF_ASSIGN(var, ent, struct trace_boot, TRACE_BOOT); \ - IF_ASSIGN(var, ent, struct ftrace_ret_entry, TRACE_FN_RET); \ + IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\ + IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\ + IF_ASSIGN(var, ent, struct ftrace_ret_entry, TRACE_FN_RET);\ __ftrace_bad_type(); \ } while (0) diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 8f71915e8bb4..cb333b7fd113 100644 --- a/kernel/trace/trace_boot.c +++ 
b/kernel/trace/trace_boot.c @@ -58,35 +58,71 @@ static void boot_trace_init(struct trace_array *tr) tracing_sched_switch_assign_trace(tr); } -static enum print_line_t initcall_print_line(struct trace_iterator *iter) +static enum print_line_t +initcall_call_print_line(struct trace_iterator *iter) { + struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; + struct trace_boot_call *field; + struct boot_trace_call *call; + u64 ts; + unsigned long nsec_rem; int ret; + + trace_assign_type(field, entry); + call = &field->boot_call; + ts = iter->ts; + nsec_rem = do_div(ts, 1000000000); + + ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n", + (unsigned long)ts, nsec_rem, call->func, call->caller); + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + else + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t +initcall_ret_print_line(struct trace_iterator *iter) +{ struct trace_entry *entry = iter->ent; - struct trace_boot *field = (struct trace_boot *)entry; - struct boot_trace *it = &field->initcall; struct trace_seq *s = &iter->seq; - struct timespec calltime = ktime_to_timespec(it->calltime); - struct timespec rettime = ktime_to_timespec(it->rettime); - - if (entry->type == TRACE_BOOT) { - ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n", - calltime.tv_sec, - calltime.tv_nsec, - it->func, it->caller); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s " - "returned %d after %lld msecs\n", - rettime.tv_sec, - rettime.tv_nsec, - it->func, it->result, it->duration); - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + struct trace_boot_ret *field; + struct boot_trace_ret *init_ret; + u64 ts; + unsigned long nsec_rem; + int ret; + + trace_assign_type(field, entry); + init_ret = &field->boot_ret; + ts = iter->ts; + nsec_rem = do_div(ts, 1000000000); + + ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s " + "returned %d after %llu msecs\n", + (unsigned long) ts, + nsec_rem, + init_ret->func, init_ret->result, init_ret->duration); + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + else return TRACE_TYPE_HANDLED; +} + +static enum print_line_t initcall_print_line(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + + switch (entry->type) { + case TRACE_BOOT_CALL: + return initcall_call_print_line(iter); + case TRACE_BOOT_RET: + return initcall_ret_print_line(iter); + default: + return TRACE_TYPE_UNHANDLED; } - return TRACE_TYPE_UNHANDLED; } struct tracer boot_tracer __read_mostly = @@ -97,11 +133,10 @@ struct tracer boot_tracer __read_mostly = .print_line = initcall_print_line, }; -void trace_boot(struct boot_trace *it, initcall_t fn) +void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) { struct ring_buffer_event *event; - struct trace_boot *entry; - struct trace_array_cpu *data; + struct trace_boot_call *entry; unsigned long irq_flags; struct trace_array *tr = boot_trace; @@ -111,9 +146,37 @@ void trace_boot(struct boot_trace *it, initcall_t fn) /* Get its name now since this function could * disappear because it is in the .init section. 
*/ - sprint_symbol(it->func, (unsigned long)fn); + sprint_symbol(bt->func, (unsigned long)fn); + preempt_disable(); + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + goto out; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, 0); + entry->ent.type = TRACE_BOOT_CALL; + entry->boot_call = *bt; + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + + trace_wake_up(); + + out: + preempt_enable(); +} + +void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) +{ + struct ring_buffer_event *event; + struct trace_boot_ret *entry; + unsigned long irq_flags; + struct trace_array *tr = boot_trace; + + if (!pre_initcalls_finished) + return; + + sprint_symbol(bt->func, (unsigned long)fn); preempt_disable(); - data = tr->data[smp_processor_id()]; event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq_flags); @@ -121,8 +184,8 @@ void trace_boot(struct boot_trace *it, initcall_t fn) goto out; entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, 0, 0); - entry->ent.type = TRACE_BOOT; - entry->initcall = *it; + entry->ent.type = TRACE_BOOT_RET; + entry->boot_ret = *bt; ring_buffer_unlock_commit(tr->buffer, event, irq_flags); trace_wake_up(); -- cgit v1.2.3 From 642edba5f5c545772b89907cf96134c73d6073c7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 12 Nov 2008 00:01:26 -0500 Subject: ring-buffer: fix deadlock from reader_lock in read_start Impact: deadlock fix in ring_buffer_read_start The ring_buffer_iter_reset was called from ring_buffer_read_start where both grabbed the reader_lock. This patch separates out the internals of ring_buffer_iter_reset to its own function so that both APIs may grab the reader_lock. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c04c433fbc59..86dc353f89b9 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1475,19 +1475,9 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer) return overruns; } -/** - * ring_buffer_iter_reset - reset an iterator - * @iter: The iterator to reset - * - * Resets the iterator, so that it will start from the beginning - * again. - */ -void ring_buffer_iter_reset(struct ring_buffer_iter *iter) +static void rb_iter_reset(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; - unsigned long flags; - - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); /* Iterator usage is expected to have record disabled */ if (list_empty(&cpu_buffer->reader_page->list)) { @@ -1501,7 +1491,22 @@ void ring_buffer_iter_reset(struct ring_buffer_iter *iter) iter->read_stamp = cpu_buffer->read_stamp; else iter->read_stamp = iter->head_page->time_stamp; +} +/** + * ring_buffer_iter_reset - reset an iterator + * @iter: The iterator to reset + * + * Resets the iterator, so that it will start from the beginning + * again. 
+ */ +void ring_buffer_iter_reset(struct ring_buffer_iter *iter) +{ + struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + unsigned long flags; + + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + rb_iter_reset(iter); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } @@ -1957,7 +1962,7 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu) spin_lock_irqsave(&cpu_buffer->reader_lock, flags); __raw_spin_lock(&cpu_buffer->lock); - ring_buffer_iter_reset(iter); + rb_iter_reset(iter); __raw_spin_unlock(&cpu_buffer->lock); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); -- cgit v1.2.3 From 1f0d69a9fc815db82f15722bf05227190b1d714d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 12 Nov 2008 00:14:39 -0500 Subject: tracing: profile likely and unlikely annotations Impact: new unlikely/likely profiler Andrew Morton recently suggested having an in-kernel way to profile likely and unlikely macros. This patch achieves that goal. When configured, every(*) likely and unlikely macro gets a counter attached to it. When the condition is hit, the hit and misses of that condition are recorded. These numbers can later be retrieved by: /debugfs/tracing/profile_likely - All likely markers /debugfs/tracing/profile_unlikely - All unlikely markers. # cat /debug/tracing/profile_unlikely | head correct incorrect % Function File Line ------- --------- - -------- ---- ---- 2167 0 0 do_arch_prctl process_64.c 832 0 0 0 do_arch_prctl process_64.c 804 2670 0 0 IS_ERR err.h 34 71230 5693 7 __switch_to process_64.c 673 76919 0 0 __switch_to process_64.c 639 43184 33743 43 __switch_to process_64.c 624 12740 64181 83 __switch_to process_64.c 594 12740 64174 83 __switch_to process_64.c 590 # cat /debug/tracing/profile_unlikely | \ awk '{ if ($3 > 25) print $0; }' |head -20 44963 35259 43 __switch_to process_64.c 624 12762 67454 84 __switch_to process_64.c 594 12762 67447 84 __switch_to process_64.c 590 1478 595 28 syscall_get_error syscall.h 51 0 2821 100 syscall_trace_leave ptrace.c 1567 0 1 100 native_smp_prepare_cpus smpboot.c 1237 86338 265881 75 calc_delta_fair sched_fair.c 408 210410 108540 34 calc_delta_mine sched.c 1267 0 54550 100 sched_info_queued sched_stats.h 222 51899 66435 56 pick_next_task_fair sched_fair.c 1422 6 10 62 yield_task_fair sched_fair.c 982 7325 2692 26 rt_policy sched.c 144 0 1270 100 pre_schedule_rt sched_rt.c 1261 1268 48073 97 pick_next_task_rt sched_rt.c 884 0 45181 100 sched_info_dequeued sched_stats.h 177 0 15 100 sched_move_task sched.c 8700 0 15 100 sched_move_task sched.c 8690 53167 33217 38 schedule sched.c 4457 0 80208 100 sched_info_switch sched_stats.h 270 30585 49631 61 context_switch sched.c 2619 # cat /debug/tracing/profile_likely | awk '{ if ($3 > 25) print $0; }' 39900 36577 47 pick_next_task sched.c 4397 20824 15233 42 switch_mm mmu_context_64.h 18 0 7 100 __cancel_work_timer workqueue.c 560 617 66484 99 clocksource_adjust timekeeping.c 456 0 346340 100 audit_syscall_exit auditsc.c 1570 38 347350 99 audit_get_context auditsc.c 732 0 345244 100 audit_syscall_entry auditsc.c 1541 38 1017 96 audit_free auditsc.c 1446 0 1090 100 audit_alloc auditsc.c 862 2618 1090 29 audit_alloc auditsc.c 858 0 6 100 move_masked_irq migration.c 9 1 198 99 probe_sched_wakeup trace_sched_switch.c 58 2 2 50 probe_wakeup trace_sched_wakeup.c 227 0 2 100 probe_wakeup_sched_switch trace_sched_wakeup.c 144 4514 2090 31 __grab_cache_page filemap.c 2149 12882 228786 94 mapping_unevictable pagemap.h 50 4 11 73 __flush_cpu_slab slub.c 1466 627757 330451 34 slab_free 
slub.c 1731 2959 61245 95 dentry_lru_del_init dcache.c 153 946 1217 56 load_elf_binary binfmt_elf.c 904 102 82 44 disk_put_part genhd.h 206 1 1 50 dst_gc_task dst.c 82 0 19 100 tcp_mss_split_point tcp_output.c 1126 As you can see by the above, there's a bit of work to do in rethinking the use of some unlikelys and likelys. Note: the unlikely case had 71 hits that were more than 25%. Note: After submitting my first version of this patch, Andrew Morton showed me a version written by Daniel Walker, from which I picked up the following ideas: 1) Using __builtin_constant_p to avoid profiling fixed values. 2) Using __FILE__ instead of instruction pointers. 3) Using the preprocessor to stop all profiling of likely annotations from vsyscall_64.c. Thanks to Andrew Morton, Arjan van de Ven, Theodore Tso and Ingo Molnar for their feedback on this patch. (*) Not every unlikely is recorded; those that are used by vsyscalls (a few of them) had to have profiling disabled. Signed-off-by: Steven Rostedt Cc: Andrew Morton Cc: Frederic Weisbecker Cc: Theodore Tso Cc: Arjan van de Ven Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/vsyscall_64.c | 8 ++ include/asm-generic/vmlinux.lds.h | 14 +++- include/linux/compiler.h | 61 +++++++++++++- kernel/trace/Kconfig | 16 ++++ kernel/trace/Makefile | 1 + kernel/trace/trace_unlikely.c | 164 ++++++++++++++++++++++++++++++++++++++ 6 files changed, 261 insertions(+), 3 deletions(-) create mode 100644 kernel/trace/trace_unlikely.c (limited to 'kernel') diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 0b8b6690a86d..2f90202e59b3 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -17,6 +17,14 @@ * want per guest time just set the kernel.vsyscall64 sysctl to 0. */ +/* Protect userspace from profiling */ +#ifdef CONFIG_TRACE_UNLIKELY_PROFILE +# undef likely +# undef unlikely +# define likely(x) likely_notrace(x) +# define unlikely(x) unlikely_notrace(x) +#endif + #include #include #include diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 80744606bad1..e10beb5335c9 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -45,6 +45,17 @@ #define MCOUNT_REC() #endif +#ifdef CONFIG_TRACE_UNLIKELY_PROFILE +#define LIKELY_PROFILE() VMLINUX_SYMBOL(__start_likely_profile) = .; \ + *(_ftrace_likely) \ + VMLINUX_SYMBOL(__stop_likely_profile) = .; \ + VMLINUX_SYMBOL(__start_unlikely_profile) = .; \ + *(_ftrace_unlikely) \ + VMLINUX_SYMBOL(__stop_unlikely_profile) = .; +#else +#define LIKELY_PROFILE() +#endif + /* .data section */ #define DATA_DATA \ *(.data) \ @@ -62,7 +73,8 @@ VMLINUX_SYMBOL(__stop___markers) = .; \ VMLINUX_SYMBOL(__start___tracepoints) = .; \ *(__tracepoints) \ - VMLINUX_SYMBOL(__stop___tracepoints) = .; + VMLINUX_SYMBOL(__stop___tracepoints) = .; \ + LIKELY_PROFILE() #define RO_DATA(align) \ . 
= ALIGN((align)); \ diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 98115d9d04da..935e30cfaf3c 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -59,8 +59,65 @@ extern void __chk_io_ptr(const volatile void __iomem *); * specific implementations come from the above header files */ -#define likely(x) __builtin_expect(!!(x), 1) -#define unlikely(x) __builtin_expect(!!(x), 0) +#ifdef CONFIG_TRACE_UNLIKELY_PROFILE +struct ftrace_likely_data { + const char *func; + const char *file; + unsigned line; + unsigned long correct; + unsigned long incorrect; +}; +void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect); + +#define likely_notrace(x) __builtin_expect(!!(x), 1) +#define unlikely_notrace(x) __builtin_expect(!!(x), 0) + +#define likely_check(x) ({ \ + int ______r; \ + static struct ftrace_likely_data \ + __attribute__((__aligned__(4))) \ + __attribute__((section("_ftrace_likely"))) \ + ______f = { \ + .func = __func__, \ + .file = __FILE__, \ + .line = __LINE__, \ + }; \ + ______f.line = __LINE__; \ + ______r = likely_notrace(x); \ + ftrace_likely_update(&______f, ______r, 1); \ + ______r; \ + }) +#define unlikely_check(x) ({ \ + int ______r; \ + static struct ftrace_likely_data \ + __attribute__((__aligned__(4))) \ + __attribute__((section("_ftrace_unlikely"))) \ + ______f = { \ + .func = __func__, \ + .file = __FILE__, \ + .line = __LINE__, \ + }; \ + ______f.line = __LINE__; \ + ______r = unlikely_notrace(x); \ + ftrace_likely_update(&______f, ______r, 0); \ + ______r; \ + }) + +/* + * Using __builtin_constant_p(x) to ignore cases where the return + * value is always the same. This idea is taken from a similar patch + * written by Daniel Walker. + */ +# ifndef likely +# define likely(x) (__builtin_constant_p(x) ? !!(x) : likely_check(x)) +# endif +# ifndef unlikely +# define unlikely(x) (__builtin_constant_p(x) ? !!(x) : unlikely_check(x)) +# endif +#else +# define likely(x) __builtin_expect(!!(x), 1) +# define unlikely(x) __builtin_expect(!!(x), 0) +#endif /* Optimization barrier */ #ifndef barrier diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d986216c8327..a604f24c755f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -159,6 +159,22 @@ config BOOT_TRACER selected, because the self-tests are an initcall as well and that would invalidate the boot trace. ) +config TRACE_UNLIKELY_PROFILE + bool "Trace likely/unlikely profiler" + depends on DEBUG_KERNEL + select TRACING + help + This tracer profiles all the likely and unlikely macros + in the kernel. It will display the results in: + + /debugfs/tracing/profile_likely + /debugfs/tracing/profile_unlikely + + Note: this will add a significant overhead, only turn this + on if you need to profile the system's use of these macros. + + Say N if unsure. 
+ config STACK_TRACER bool "Trace max stack" depends on HAVE_FUNCTION_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 3e1f361bbc17..98e70ee27986 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -25,5 +25,6 @@ obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o obj-$(CONFIG_BOOT_TRACER) += trace_boot.o obj-$(CONFIG_FUNCTION_RET_TRACER) += trace_functions_return.o +obj-$(CONFIG_TRACE_UNLIKELY_PROFILE) += trace_unlikely.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace_unlikely.c b/kernel/trace/trace_unlikely.c new file mode 100644 index 000000000000..94932696069f --- /dev/null +++ b/kernel/trace/trace_unlikely.c @@ -0,0 +1,164 @@ +/* + * unlikely profiler + * + * Copyright (C) 2008 Steven Rostedt + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "trace.h" + +void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect) +{ + /* FIXME: Make this atomic! */ + if (val == expect) + f->correct++; + else + f->incorrect++; +} +EXPORT_SYMBOL(ftrace_likely_update); + +struct ftrace_pointer { + void *start; + void *stop; +}; + +static void * +t_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct ftrace_pointer *f = m->private; + struct ftrace_likely_data *p = v; + + (*pos)++; + + if (v == (void *)1) + return f->start; + + ++p; + + if ((void *)p >= (void *)f->stop) + return NULL; + + return p; +} + +static void *t_start(struct seq_file *m, loff_t *pos) +{ + void *t = (void *)1; + loff_t l = 0; + + for (; t && l < *pos; t = t_next(m, t, &l)) + ; + + return t; +} + +static void t_stop(struct seq_file *m, void *p) +{ +} + +static int t_show(struct seq_file *m, void *v) +{ + struct ftrace_likely_data *p = v; + const char *f; + unsigned long percent; + + if (v == (void *)1) { + seq_printf(m, " correct incorrect %% " + " Function " + " File Line\n" + " ------- --------- - " + " -------- " + " ---- ----\n"); + return 0; + } + + /* Only print the file, not the path */ + f = p->file + strlen(p->file); + while (f >= p->file && *f != '/') + f--; + f++; + + if (p->correct) { + percent = p->incorrect * 100; + percent /= p->correct + p->incorrect; + } else + percent = p->incorrect ? 
100 : 0; + + seq_printf(m, "%8lu %8lu %3lu ", p->correct, p->incorrect, percent); + seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line); + return 0; +} + +static struct seq_operations tracing_likely_seq_ops = { + .start = t_start, + .next = t_next, + .stop = t_stop, + .show = t_show, +}; + +static int tracing_likely_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = seq_open(file, &tracing_likely_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = (void *)inode->i_private; + } + + return ret; +} + +static struct file_operations tracing_likely_fops = { + .open = tracing_likely_open, + .read = seq_read, + .llseek = seq_lseek, +}; + +extern unsigned long __start_likely_profile[]; +extern unsigned long __stop_likely_profile[]; +extern unsigned long __start_unlikely_profile[]; +extern unsigned long __stop_unlikely_profile[]; + +static struct ftrace_pointer ftrace_likely_pos = { + .start = __start_likely_profile, + .stop = __stop_likely_profile, +}; + +static struct ftrace_pointer ftrace_unlikely_pos = { + .start = __start_unlikely_profile, + .stop = __stop_unlikely_profile, +}; + +static __init int ftrace_unlikely_init(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + + entry = debugfs_create_file("profile_likely", 0444, d_tracer, + &ftrace_likely_pos, + &tracing_likely_fops); + if (!entry) + pr_warning("Could not create debugfs 'profile_likely' entry\n"); + + entry = debugfs_create_file("profile_unlikely", 0444, d_tracer, + &ftrace_unlikely_pos, + &tracing_likely_fops); + if (!entry) + pr_warning("Could not create debugfs" + " 'profile_unlikely' entry\n"); + + return 0; +} + +device_initcall(ftrace_unlikely_init); -- cgit v1.2.3 From 52f232cb720a7babb752849cbc2cab2d24021209 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 12 Nov 2008 00:14:40 -0500 Subject: tracing: likely/unlikely branch annotation tracer Impact: new likely/unlikely branch tracer This patch adds a way to record the instances of the likely() and unlikely() branch condition annotations. When "unlikely" is set in /debugfs/tracing/iter_ctrl the unlikely conditions will be added to any of the ftrace tracers. The change takes effect when a new tracer is passed into the current_tracer file. For example: bash-3471 [003] 357.014755: [INCORRECT] sched_info_dequeued:sched_stats.h:177 bash-3471 [003] 357.014756: [correct] update_curr:sched_fair.c:489 bash-3471 [003] 357.014758: [correct] calc_delta_fair:sched_fair.c:411 bash-3471 [003] 357.014759: [correct] account_group_exec_runtime:sched_stats.h:356 bash-3471 [003] 357.014761: [correct] update_curr:sched_fair.c:489 bash-3471 [003] 357.014763: [INCORRECT] calc_delta_fair:sched_fair.c:411 bash-3471 [003] 357.014765: [correct] calc_delta_mine:sched.c:1279 Which shows the normal tracer heading, as well as whether the condition was correct "[correct]" or was mistaken "[INCORRECT]", followed by the function, file name and line number. 
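Based on the description above, a session would look roughly like this (paths assume debugfs is mounted at /debugfs, as in the other examples in this series; the choice of sched_switch as the tracer is only an example):

	# echo unlikely > /debugfs/tracing/iter_ctrl
	# echo sched_switch > /debugfs/tracing/current_tracer
	# cat /debugfs/tracing/trace | head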
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 22 ++++++++ kernel/trace/Makefile | 6 +++ kernel/trace/trace.c | 29 +++++++++++ kernel/trace/trace.h | 39 +++++++++++++++ kernel/trace/trace_unlikely.c | 114 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 210 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a604f24c755f..8abcaf821beb 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -175,6 +175,28 @@ config TRACE_UNLIKELY_PROFILE Say N if unsure. +config TRACING_UNLIKELY + bool + help + Selected by tracers that will trace the likely and unlikely + conditions. This prevents the tracers themselves from being + profiled. Profiling the tracing infrastructure can only happen + when the likelys and unlikelys are not being traced. + +config UNLIKELY_TRACER + bool "Trace likely/unlikely instances" + depends on TRACE_UNLIKELY_PROFILE + select TRACING_UNLIKELY + help + This traces the events of likely and unlikely condition + calls in the kernel. The difference between this and the + "Trace likely/unlikely profiler" is that this is not a + histogram of the callers, but actually places the calling + events into a running trace buffer to see when and where the + events happened, as well as their results. + + Say N if unsure. + config STACK_TRACER bool "Trace max stack" depends on HAVE_FUNCTION_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 98e70ee27986..c938d03516c0 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -10,6 +10,12 @@ CFLAGS_trace_selftest_dynamic.o = -pg obj-y += trace_selftest_dynamic.o endif +# If unlikely tracing is enabled, do not trace these files +ifdef CONFIG_TRACING_UNLIKELY +KBUILD_CFLAGS += '-Dlikely(x)=likely_notrace(x)' +KBUILD_CFLAGS += '-Dunlikely(x)=unlikely_notrace(x)' +endif + obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o obj-$(CONFIG_RING_BUFFER) += ring_buffer.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a3f7ae9cd8e1..83d38634bc90 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -258,6 +258,9 @@ static const char *trace_options[] = { "sched-tree", "ftrace_printk", "ftrace_preempt", +#ifdef CONFIG_UNLIKELY_TRACER + "unlikely", +#endif NULL }; @@ -1648,6 +1651,18 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) trace_seq_print_cont(s, iter); break; } + case TRACE_UNLIKELY: { + struct trace_unlikely *field; + + trace_assign_type(field, entry); + + trace_seq_printf(s, "[%s] %s:%s:%d\n", + field->correct ? "correct" : "INCORRECT", + field->func, + field->file, + field->line); + break; + } default: trace_seq_printf(s, "Unknown type %d\n", entry->type); } @@ -1787,6 +1802,18 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) return print_return_function(iter); break; } + case TRACE_UNLIKELY: { + struct trace_unlikely *field; + + trace_assign_type(field, entry); + + trace_seq_printf(s, "[%s] %s:%s:%d\n", + field->correct ? 
"correct" : "INCORRECT", + field->func, + field->file, + field->line); + break; + } } return TRACE_TYPE_HANDLED; } @@ -2592,6 +2619,7 @@ static int tracing_set_tracer(char *buf) if (t == current_trace) goto out; + trace_unlikely_disable(); if (current_trace && current_trace->reset) current_trace->reset(tr); @@ -2599,6 +2627,7 @@ static int tracing_set_tracer(char *buf) if (t->init) t->init(tr); + trace_unlikely_enable(tr); out: mutex_unlock(&trace_types_lock); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b5f91f198fd4..9635aa2c4fc1 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -22,6 +22,7 @@ enum trace_type { TRACE_SPECIAL, TRACE_MMIO_RW, TRACE_MMIO_MAP, + TRACE_UNLIKELY, TRACE_BOOT_CALL, TRACE_BOOT_RET, TRACE_FN_RET, @@ -134,6 +135,16 @@ struct trace_boot_ret { struct boot_trace_ret boot_ret; }; +#define TRACE_FUNC_SIZE 30 +#define TRACE_FILE_SIZE 20 +struct trace_unlikely { + struct trace_entry ent; + unsigned line; + char func[TRACE_FUNC_SIZE+1]; + char file[TRACE_FILE_SIZE+1]; + char correct; +}; + /* * trace_flag_type is an enumeration that holds different * states when a trace occurs. These are: @@ -236,6 +247,7 @@ extern void __ftrace_bad_type(void); TRACE_MMIO_MAP); \ IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\ IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\ + IF_ASSIGN(var, ent, struct trace_unlikely, TRACE_UNLIKELY); \ IF_ASSIGN(var, ent, struct ftrace_ret_entry, TRACE_FN_RET);\ __ftrace_bad_type(); \ } while (0) @@ -456,6 +468,9 @@ enum trace_iterator_flags { TRACE_ITER_SCHED_TREE = 0x200, TRACE_ITER_PRINTK = 0x400, TRACE_ITER_PREEMPTONLY = 0x800, +#ifdef CONFIG_UNLIKELY_TRACER + TRACE_ITER_UNLIKELY = 0x1000, +#endif }; /* @@ -515,4 +530,28 @@ static inline void ftrace_preempt_enable(int resched) preempt_enable_notrace(); } +#ifdef CONFIG_UNLIKELY_TRACER +extern int enable_unlikely_tracing(struct trace_array *tr); +extern void disable_unlikely_tracing(void); +static inline int trace_unlikely_enable(struct trace_array *tr) +{ + if (trace_flags & TRACE_ITER_UNLIKELY) + return enable_unlikely_tracing(tr); + return 0; +} +static inline void trace_unlikely_disable(void) +{ + /* due to races, always disable */ + disable_unlikely_tracing(); +} +#else +static inline int trace_unlikely_enable(struct trace_array *tr) +{ + return 0; +} +static inline void trace_unlikely_disable(void) +{ +} +#endif /* CONFIG_UNLIKELY_TRACER */ + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_unlikely.c b/kernel/trace/trace_unlikely.c index 94932696069f..7290e0e7b4e3 100644 --- a/kernel/trace/trace_unlikely.c +++ b/kernel/trace/trace_unlikely.c @@ -15,8 +15,122 @@ #include #include "trace.h" +#ifdef CONFIG_UNLIKELY_TRACER + +static int unlikely_tracing_enabled __read_mostly; +static DEFINE_MUTEX(unlikely_tracing_mutex); +static struct trace_array *unlikely_tracer; + +static void +probe_likely_condition(struct ftrace_likely_data *f, int val, int expect) +{ + struct trace_array *tr = unlikely_tracer; + struct ring_buffer_event *event; + struct trace_unlikely *entry; + unsigned long flags, irq_flags; + int cpu, pc; + const char *p; + + /* + * I would love to save just the ftrace_likely_data pointer, but + * this code can also be used by modules. Ugly things can happen + * if the module is unloaded, and then we go and read the + * pointer. This is slower, but much safer. 
+ */ + + if (unlikely(!tr)) + return; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) + goto out; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + goto out; + + pc = preempt_count(); + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, flags, pc); + entry->ent.type = TRACE_UNLIKELY; + + /* Strip off the path, only save the file */ + p = f->file + strlen(f->file); + while (p >= f->file && *p != '/') + p--; + p++; + + strncpy(entry->func, f->func, TRACE_FUNC_SIZE); + strncpy(entry->file, p, TRACE_FILE_SIZE); + entry->func[TRACE_FUNC_SIZE] = 0; + entry->file[TRACE_FILE_SIZE] = 0; + entry->line = f->line; + entry->correct = val == expect; + + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + + out: + atomic_dec(&tr->data[cpu]->disabled); + local_irq_restore(flags); +} + +static inline +void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect) +{ + if (!unlikely_tracing_enabled) + return; + + probe_likely_condition(f, val, expect); +} + +int enable_unlikely_tracing(struct trace_array *tr) +{ + int ret = 0; + + mutex_lock(&unlikely_tracing_mutex); + unlikely_tracer = tr; + /* + * Must be seen before enabling. The reader is a condition + * where we do not need a matching rmb() + */ + smp_wmb(); + unlikely_tracing_enabled++; + mutex_unlock(&unlikely_tracing_mutex); + + return ret; +} + +void disable_unlikely_tracing(void) +{ + mutex_lock(&unlikely_tracing_mutex); + + if (!unlikely_tracing_enabled) + goto out_unlock; + + unlikely_tracing_enabled--; + + out_unlock: + mutex_unlock(&unlikely_tracing_mutex); +} +#else +static inline +void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect) +{ +} +#endif /* CONFIG_UNLIKELY_TRACER */ + void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect) { + /* + * I would love to have a trace point here instead, but the + * trace point code is so inundated with unlikely and likely + * conditions that the recursive nightmare that exists is too + * much to try to get working. At least for now. + */ + trace_likely_condition(f, val, expect); + /* FIXME: Make this atomic! */ if (val == expect) f->correct++; -- cgit v1.2.3 From f88c4ae9f8c3939bee4337c75c7a673b5de7a8a7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 12 Nov 2008 11:55:41 +0100 Subject: tracing: branch tracer, tweak output Impact: modify the tracer output, to make it a bit easier to read Change the output from: > bash-3471 [003] 357.014755: [INCORRECT] sched_info_dequeued:sched_stats.h:177 > bash-3471 [003] 357.014756: [correct] update_curr:sched_fair.c:489 > bash-3471 [003] 357.014758: [correct] calc_delta_fair:sched_fair.c:411 to: > bash-3471 [003] 357.014755: [ MISS ] sched_info_dequeued:sched_stats.h:177 > bash-3471 [003] 357.014756: [ .... ] update_curr:sched_fair.c:489 > bash-3471 [003] 357.014758: [ .... ] calc_delta_fair:sched_fair.c:411 it's good to have fields aligned vertically, and the only important information is a prediction miss, so display only that information. 
Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 83d38634bc90..728a46ec6b06 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1657,7 +1657,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) trace_assign_type(field, entry); trace_seq_printf(s, "[%s] %s:%s:%d\n", - field->correct ? "correct" : "INCORRECT", + field->correct ? " .... " : " MISS ", field->func, field->file, field->line); @@ -1808,7 +1808,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) trace_assign_type(field, entry); trace_seq_printf(s, "[%s] %s:%s:%d\n", - field->correct ? "correct" : "INCORRECT", + field->correct ? " .... " : " MISS ", field->func, field->file, field->line); -- cgit v1.2.3 From 68d119f0a66f7e3663304343b072e56a2693446b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 12 Nov 2008 14:09:30 +0100 Subject: tracing: finetune branch-tracer output Steve suggested changing the output from this: > bash-3471 [003] 357.014755: [ MISS ] sched_info_dequeued:sched_stats.h:177 > bash-3471 [003] 357.014756: [ .... ] update_curr:sched_fair.c:489 > bash-3471 [003] 357.014758: [ .... ] calc_delta_fair:sched_fair.c:411 to this: > bash-3471 [003] 357.014755: [ MISS ] sched_info_dequeued:sched_stats.h:177 > bash-3471 [003] 357.014756: [ ok ] update_curr:sched_fair.c:489 > bash-3471 [003] 357.014758: [ ok ] calc_delta_fair:sched_fair.c:411 as it makes it clearer to the user what it means exactly. Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 728a46ec6b06..d842db14a59b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1657,7 +1657,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) trace_assign_type(field, entry); trace_seq_printf(s, "[%s] %s:%s:%d\n", - field->correct ? " .... " : " MISS ", + field->correct ? " ok " : " MISS ", field->func, field->file, field->line); @@ -1808,7 +1808,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) trace_assign_type(field, entry); trace_seq_printf(s, "[%s] %s:%s:%d\n", - field->correct ? " .... " : " MISS ", + field->correct ? " ok " : " MISS ", field->func, field->file, field->line); -- cgit v1.2.3 From 2ed84eeb8808cf3c9f039213ca137ffd7d753f0e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 12 Nov 2008 15:24:24 -0500 Subject: trace: rename unlikely profiler to branch profiler Impact: name change of unlikely tracer and profiler Ingo Molnar suggested changing the config from UNLIKELY_PROFILE to BRANCH_PROFILING. I never did like the "unlikely" name so I went one step further, and renamed all the unlikely configurations to a "BRANCH" variant.
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/vsyscall_64.c | 2 +- arch/x86/vdso/vclock_gettime.c | 2 +- include/asm-generic/vmlinux.lds.h | 2 +- include/linux/compiler.h | 19 ++++++++++--------- kernel/trace/Kconfig | 10 +++++----- kernel/trace/Makefile | 7 +++---- kernel/trace/trace.c | 2 +- kernel/trace/trace.h | 6 +++--- kernel/trace/trace_unlikely.c | 4 ++-- 9 files changed, 27 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index ece02932ea57..6f3d3d4cd973 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -18,7 +18,7 @@ */ /* Disable profiling for userspace code: */ -#define DISABLE_UNLIKELY_PROFILE +#define DISABLE_BRANCH_PROFILING #include #include diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 6e667631e7dc..d9d35824c56f 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -10,7 +10,7 @@ */ /* Disable profiling for userspace code: */ -#define DISABLE_UNLIKELY_PROFILE +#define DISABLE_BRANCH_PROFILING #include #include diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index e10beb5335c9..a5e4ed9baec8 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -45,7 +45,7 @@ #define MCOUNT_REC() #endif -#ifdef CONFIG_TRACE_UNLIKELY_PROFILE +#ifdef CONFIG_TRACE_BRANCH_PROFILING #define LIKELY_PROFILE() VMLINUX_SYMBOL(__start_likely_profile) = .; \ *(_ftrace_likely) \ VMLINUX_SYMBOL(__stop_likely_profile) = .; \ diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 63b7d9089d6e..c7d804a7a4d6 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -59,26 +59,27 @@ extern void __chk_io_ptr(const volatile void __iomem *); * specific implementations come from the above header files */ -/* - * Note: DISABLE_UNLIKELY_PROFILE can be used by special lowlevel code - * to disable branch tracing on a per file basis. - */ -#if defined(CONFIG_TRACE_UNLIKELY_PROFILE) && !defined(DISABLE_UNLIKELY_PROFILE) -struct ftrace_likely_data { +struct ftrace_branch_data { const char *func; const char *file; unsigned line; unsigned long correct; unsigned long incorrect; }; -void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect); + +/* + * Note: DISABLE_BRANCH_PROFILING can be used by special lowlevel code + * to disable branch tracing on a per file basis. 
+ */ +#if defined(CONFIG_TRACE_BRANCH_PROFILING) && !defined(DISABLE_BRANCH_PROFILING) +void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); #define likely_notrace(x) __builtin_expect(!!(x), 1) #define unlikely_notrace(x) __builtin_expect(!!(x), 0) #define likely_check(x) ({ \ int ______r; \ - static struct ftrace_likely_data \ + static struct ftrace_branch_data \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_likely"))) \ ______f = { \ @@ -93,7 +94,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect); }) #define unlikely_check(x) ({ \ int ______r; \ - static struct ftrace_likely_data \ + static struct ftrace_branch_data \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_unlikely"))) \ ______f = { \ diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8abcaf821beb..9c89526b6b7c 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -159,7 +159,7 @@ config BOOT_TRACER selected, because the self-tests are an initcall as well and that would invalidate the boot trace. ) -config TRACE_UNLIKELY_PROFILE +config TRACE_BRANCH_PROFILING bool "Trace likely/unlikely profiler" depends on DEBUG_KERNEL select TRACING @@ -175,7 +175,7 @@ config TRACE_UNLIKELY_PROFILE Say N if unsure. -config TRACING_UNLIKELY +config TRACING_BRANCHES bool help Selected by tracers that will trace the likely and unlikely @@ -183,10 +183,10 @@ config TRACING_UNLIKELY profiled. Profiling the tracing infrastructure can only happen when the likelys and unlikelys are not being traced. -config UNLIKELY_TRACER +config BRANCH_TRACER bool "Trace likely/unlikely instances" - depends on TRACE_UNLIKELY_PROFILE - select TRACING_UNLIKELY + depends on TRACE_BRANCH_PROFILING + select TRACING_BRANCHES help This traces the events of likely and unlikely condition calls in the kernel. 
The difference between this and the diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index c938d03516c0..0087df7ba44e 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -11,9 +11,8 @@ obj-y += trace_selftest_dynamic.o endif # If unlikely tracing is enabled, do not trace these files -ifdef CONFIG_TRACING_UNLIKELY -KBUILD_CFLAGS += '-Dlikely(x)=likely_notrace(x)' -KBUILD_CFLAGS += '-Dunlikely(x)=unlikely_notrace(x)' +ifdef CONFIG_TRACING_BRANCHES +KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING endif obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o @@ -31,6 +30,6 @@ obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o obj-$(CONFIG_BOOT_TRACER) += trace_boot.o obj-$(CONFIG_FUNCTION_RET_TRACER) += trace_functions_return.o -obj-$(CONFIG_TRACE_UNLIKELY_PROFILE) += trace_unlikely.o +obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_unlikely.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d842db14a59b..bad59d32a4a9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -258,7 +258,7 @@ static const char *trace_options[] = { "sched-tree", "ftrace_printk", "ftrace_preempt", -#ifdef CONFIG_UNLIKELY_TRACER +#ifdef CONFIG_BRANCH_TRACER "unlikely", #endif NULL diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9635aa2c4fc1..dccae6312941 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -468,7 +468,7 @@ enum trace_iterator_flags { TRACE_ITER_SCHED_TREE = 0x200, TRACE_ITER_PRINTK = 0x400, TRACE_ITER_PREEMPTONLY = 0x800, -#ifdef CONFIG_UNLIKELY_TRACER +#ifdef CONFIG_BRANCH_TRACER TRACE_ITER_UNLIKELY = 0x1000, #endif }; @@ -530,7 +530,7 @@ static inline void ftrace_preempt_enable(int resched) preempt_enable_notrace(); } -#ifdef CONFIG_UNLIKELY_TRACER +#ifdef CONFIG_BRANCH_TRACER extern int enable_unlikely_tracing(struct trace_array *tr); extern void disable_unlikely_tracing(void); static inline int trace_unlikely_enable(struct trace_array *tr) @@ -552,6 +552,6 @@ static inline int trace_unlikely_enable(struct trace_array *tr) static inline void trace_unlikely_disable(void) { } -#endif /* CONFIG_UNLIKELY_TRACER */ +#endif /* CONFIG_BRANCH_TRACER */ #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_unlikely.c b/kernel/trace/trace_unlikely.c index 7290e0e7b4e3..856eb3b7f694 100644 --- a/kernel/trace/trace_unlikely.c +++ b/kernel/trace/trace_unlikely.c @@ -15,7 +15,7 @@ #include #include "trace.h" -#ifdef CONFIG_UNLIKELY_TRACER +#ifdef CONFIG_BRANCH_TRACER static int unlikely_tracing_enabled __read_mostly; static DEFINE_MUTEX(unlikely_tracing_mutex); @@ -119,7 +119,7 @@ static inline void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect) { } -#endif /* CONFIG_UNLIKELY_TRACER */ +#endif /* CONFIG_BRANCH_TRACER */ void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect) { -- cgit v1.2.3 From 9f029e83e968e5661d7be045bbcb620dbb909938 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 12 Nov 2008 15:24:24 -0500 Subject: ftrace: rename unlikely iter_ctrl to branch Impact: rename of iter_ctrl unlikely to branch The unlikely name is ugly. This patch converts the iter_ctrl command "unlikely" and "nounlikely" to "branch" and "nobranch" respectively. It also renames a lot of internal functions to use "branch" instead of "unlikely". 
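For instance (a hypothetical debugfs session, following the conventions of the Documentation/ftrace.txt excerpts later in this series), the renamed knobs would be toggled with:

# echo branch > /debug/tracing/iter_ctrl
# echo nobranch > /debug/tracing/iter_ctrl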
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 14 ++++++------ kernel/trace/trace.h | 26 +++++++++++----------- kernel/trace/trace_unlikely.c | 50 +++++++++++++++++++++---------------------- 3 files changed, 45 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bad59d32a4a9..4bf070bb5272 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -259,7 +259,7 @@ static const char *trace_options[] = { "ftrace_printk", "ftrace_preempt", #ifdef CONFIG_BRANCH_TRACER - "unlikely", + "branch", #endif NULL }; @@ -1651,8 +1651,8 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) trace_seq_print_cont(s, iter); break; } - case TRACE_UNLIKELY: { - struct trace_unlikely *field; + case TRACE_BRANCH: { + struct trace_branch *field; trace_assign_type(field, entry); @@ -1802,8 +1802,8 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) return print_return_function(iter); break; } - case TRACE_UNLIKELY: { - struct trace_unlikely *field; + case TRACE_BRANCH: { + struct trace_branch *field; trace_assign_type(field, entry); @@ -2619,7 +2619,7 @@ static int tracing_set_tracer(char *buf) if (t == current_trace) goto out; - trace_unlikely_disable(); + trace_branch_disable(); if (current_trace && current_trace->reset) current_trace->reset(tr); @@ -2627,7 +2627,7 @@ static int tracing_set_tracer(char *buf) if (t->init) t->init(tr); - trace_unlikely_enable(tr); + trace_branch_enable(tr); out: mutex_unlock(&trace_types_lock); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index dccae6312941..7fbf37b27453 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -22,7 +22,7 @@ enum trace_type { TRACE_SPECIAL, TRACE_MMIO_RW, TRACE_MMIO_MAP, - TRACE_UNLIKELY, + TRACE_BRANCH, TRACE_BOOT_CALL, TRACE_BOOT_RET, TRACE_FN_RET, @@ -137,7 +137,7 @@ struct trace_boot_ret { #define TRACE_FUNC_SIZE 30 #define TRACE_FILE_SIZE 20 -struct trace_unlikely { +struct trace_branch { struct trace_entry ent; unsigned line; char func[TRACE_FUNC_SIZE+1]; @@ -247,7 +247,7 @@ extern void __ftrace_bad_type(void); TRACE_MMIO_MAP); \ IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\ IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\ - IF_ASSIGN(var, ent, struct trace_unlikely, TRACE_UNLIKELY); \ + IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \ IF_ASSIGN(var, ent, struct ftrace_ret_entry, TRACE_FN_RET);\ __ftrace_bad_type(); \ } while (0) @@ -469,7 +469,7 @@ enum trace_iterator_flags { TRACE_ITER_PRINTK = 0x400, TRACE_ITER_PREEMPTONLY = 0x800, #ifdef CONFIG_BRANCH_TRACER - TRACE_ITER_UNLIKELY = 0x1000, + TRACE_ITER_BRANCH = 0x1000, #endif }; @@ -531,25 +531,25 @@ static inline void ftrace_preempt_enable(int resched) } #ifdef CONFIG_BRANCH_TRACER -extern int enable_unlikely_tracing(struct trace_array *tr); -extern void disable_unlikely_tracing(void); -static inline int trace_unlikely_enable(struct trace_array *tr) +extern int enable_branch_tracing(struct trace_array *tr); +extern void disable_branch_tracing(void); +static inline int trace_branch_enable(struct trace_array *tr) { - if (trace_flags & TRACE_ITER_UNLIKELY) - return enable_unlikely_tracing(tr); + if (trace_flags & TRACE_ITER_BRANCH) + return enable_branch_tracing(tr); return 0; } -static inline void trace_unlikely_disable(void) +static inline void trace_branch_disable(void) { /* due to races, always disable */ - disable_unlikely_tracing(); + disable_branch_tracing(); } #else -static 
inline int trace_unlikely_enable(struct trace_array *tr) +static inline int trace_branch_enable(struct trace_array *tr) { return 0; } -static inline void trace_unlikely_disable(void) +static inline void trace_branch_disable(void) { } #endif /* CONFIG_BRANCH_TRACER */ diff --git a/kernel/trace/trace_unlikely.c b/kernel/trace/trace_unlikely.c index 856eb3b7f694..e5d5969853a3 100644 --- a/kernel/trace/trace_unlikely.c +++ b/kernel/trace/trace_unlikely.c @@ -17,16 +17,16 @@ #ifdef CONFIG_BRANCH_TRACER -static int unlikely_tracing_enabled __read_mostly; -static DEFINE_MUTEX(unlikely_tracing_mutex); -static struct trace_array *unlikely_tracer; +static int branch_tracing_enabled __read_mostly; +static DEFINE_MUTEX(branch_tracing_mutex); +static struct trace_array *branch_tracer; static void -probe_likely_condition(struct ftrace_likely_data *f, int val, int expect) +probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) { - struct trace_array *tr = unlikely_tracer; + struct trace_array *tr = branch_tracer; struct ring_buffer_event *event; - struct trace_unlikely *entry; + struct trace_branch *entry; unsigned long flags, irq_flags; int cpu, pc; const char *p; @@ -54,7 +54,7 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect) pc = preempt_count(); entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, flags, pc); - entry->ent.type = TRACE_UNLIKELY; + entry->ent.type = TRACE_BRANCH; /* Strip off the path, only save the file */ p = f->file + strlen(f->file); @@ -77,51 +77,51 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect) } static inline -void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect) +void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) { - if (!unlikely_tracing_enabled) + if (!branch_tracing_enabled) return; probe_likely_condition(f, val, expect); } -int enable_unlikely_tracing(struct trace_array *tr) +int enable_branch_tracing(struct trace_array *tr) { int ret = 0; - mutex_lock(&unlikely_tracing_mutex); - unlikely_tracer = tr; + mutex_lock(&branch_tracing_mutex); + branch_tracer = tr; /* * Must be seen before enabling. 
The reader is a condition * where we do not need a matching rmb() */ smp_wmb(); - unlikely_tracing_enabled++; - mutex_unlock(&unlikely_tracing_mutex); + branch_tracing_enabled++; + mutex_unlock(&branch_tracing_mutex); return ret; } -void disable_unlikely_tracing(void) +void disable_branch_tracing(void) { - mutex_lock(&unlikely_tracing_mutex); + mutex_lock(&branch_tracing_mutex); - if (!unlikely_tracing_enabled) + if (!branch_tracing_enabled) goto out_unlock; - unlikely_tracing_enabled--; + branch_tracing_enabled--; out_unlock: - mutex_unlock(&unlikely_tracing_mutex); + mutex_unlock(&branch_tracing_mutex); } #else static inline -void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect) +void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) { } #endif /* CONFIG_BRANCH_TRACER */ -void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect) +void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect) { /* * I would love to have a trace point here instead, but the @@ -148,7 +148,7 @@ static void * t_next(struct seq_file *m, void *v, loff_t *pos) { struct ftrace_pointer *f = m->private; - struct ftrace_likely_data *p = v; + struct ftrace_branch_data *p = v; (*pos)++; @@ -180,7 +180,7 @@ static void t_stop(struct seq_file *m, void *p) static int t_show(struct seq_file *m, void *v) { - struct ftrace_likely_data *p = v; + struct ftrace_branch_data *p = v; const char *f; unsigned long percent; @@ -252,7 +252,7 @@ static struct ftrace_pointer ftrace_unlikely_pos = { .stop = __stop_unlikely_profile, }; -static __init int ftrace_unlikely_init(void) +static __init int ftrace_branch_init(void) { struct dentry *d_tracer; struct dentry *entry; @@ -275,4 +275,4 @@ static __init int ftrace_unlikely_init(void) return 0; } -device_initcall(ftrace_unlikely_init); +device_initcall(ftrace_branch_init); -- cgit v1.2.3 From 80e5ea4506791af206266c5921c97f11d3b17866 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 12 Nov 2008 15:24:24 -0500 Subject: ftrace: add tracer called branch Impact: added new branch tracer Currently the tracing of branch profiling (unlikelys and likelys hit) is only activated by the iter_ctrl. This patch adds a tracer called "branch" that will just trace the branch profiling. The advantage of adding this tracer is that it can be added to the ftrace selftests on startup. 
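As a usage sketch (hypothetical session; debugfs paths as used throughout this document), the new tracer is selected and read like any other:

# echo branch > /debug/tracing/current_tracer
# cat /debug/tracing/trace
bash-3471 [003] 357.014756: [ ok ] update_curr:sched_fair.c:489
bash-3471 [003] 357.014758: [ ok ] calc_delta_fair:sched_fair.c:411

(the sample lines reuse the output format finetuned earlier in this series)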
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 2 ++ kernel/trace/trace_selftest.c | 23 +++++++++++++++++++++++ kernel/trace/trace_unlikely.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 67 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 7fbf37b27453..9e015f5bea1d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -420,6 +420,8 @@ extern int trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr); +extern int trace_selftest_startup_branch(struct tracer *trace, + struct trace_array *tr); #endif /* CONFIG_FTRACE_STARTUP_TEST */ extern void *head_page(struct trace_array_cpu *data); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 0728a105dcc1..24e6e075e6d6 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -13,6 +13,7 @@ static inline int trace_valid_entry(struct trace_entry *entry) case TRACE_STACK: case TRACE_PRINT: case TRACE_SPECIAL: + case TRACE_BRANCH: return 1; } return 0; @@ -544,3 +545,25 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr) return ret; } #endif /* CONFIG_SYSPROF_TRACER */ + +#ifdef CONFIG_BRANCH_TRACER +int +trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) +{ + unsigned long count; + int ret; + + /* start the tracing */ + trace->init(tr); + /* Sleep for a 1/10 of a second */ + msleep(100); + /* stop the tracing. */ + tracing_stop(); + /* check the trace buffer */ + ret = trace_test_buffer(tr, &count); + trace->reset(tr); + tracing_start(); + + return ret; +} +#endif /* CONFIG_BRANCH_TRACER */ diff --git a/kernel/trace/trace_unlikely.c b/kernel/trace/trace_unlikely.c index e5d5969853a3..85265553918f 100644 --- a/kernel/trace/trace_unlikely.c +++ b/kernel/trace/trace_unlikely.c @@ -114,6 +114,48 @@ void disable_branch_tracing(void) out_unlock: mutex_unlock(&branch_tracing_mutex); } + +static void start_branch_trace(struct trace_array *tr) +{ + enable_branch_tracing(tr); +} + +static void stop_branch_trace(struct trace_array *tr) +{ + disable_branch_tracing(); +} + +static void branch_trace_init(struct trace_array *tr) +{ + int cpu; + + for_each_online_cpu(cpu) + tracing_reset(tr, cpu); + + start_branch_trace(tr); +} + +static void branch_trace_reset(struct trace_array *tr) +{ + stop_branch_trace(tr); +} + +struct tracer branch_trace __read_mostly = +{ + .name = "branch", + .init = branch_trace_init, + .reset = branch_trace_reset, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_branch, +#endif +}; + +__init static int init_branch_trace(void) +{ + return register_tracer(&branch_trace); +} + +device_initcall(init_branch_trace); #else static inline void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) -- cgit v1.2.3 From 94b80ffd650b22e1fd493ccf6bad7efda4b8ea85 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 12 Nov 2008 16:18:45 -0500 Subject: ftrace: rename trace_unlikely.c file Impact: File name change of trace_unlikely.c The "unlikely" name for the tracer is quite ugly. We renamed all the parts of it to "branch" and now it is time to rename the file too. 
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/Makefile | 2 +- kernel/trace/trace_branch.c | 320 ++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace_unlikely.c | 320 ------------------------------------------ 3 files changed, 321 insertions(+), 321 deletions(-) create mode 100644 kernel/trace/trace_branch.c delete mode 100644 kernel/trace/trace_unlikely.c (limited to 'kernel') diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 0087df7ba44e..1a8c9259dc69 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -30,6 +30,6 @@ obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o obj-$(CONFIG_BOOT_TRACER) += trace_boot.o obj-$(CONFIG_FUNCTION_RET_TRACER) += trace_functions_return.o -obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_unlikely.o +obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c new file mode 100644 index 000000000000..85265553918f --- /dev/null +++ b/kernel/trace/trace_branch.c @@ -0,0 +1,320 @@ +/* + * unlikely profiler + * + * Copyright (C) 2008 Steven Rostedt + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "trace.h" + +#ifdef CONFIG_BRANCH_TRACER + +static int branch_tracing_enabled __read_mostly; +static DEFINE_MUTEX(branch_tracing_mutex); +static struct trace_array *branch_tracer; + +static void +probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) +{ + struct trace_array *tr = branch_tracer; + struct ring_buffer_event *event; + struct trace_branch *entry; + unsigned long flags, irq_flags; + int cpu, pc; + const char *p; + + /* + * I would love to save just the ftrace_likely_data pointer, but + * this code can also be used by modules. Ugly things can happen + * if the module is unloaded, and then we go and read the + * pointer. This is slower, but much safer. + */ + + if (unlikely(!tr)) + return; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) + goto out; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + goto out; + + pc = preempt_count(); + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, flags, pc); + entry->ent.type = TRACE_BRANCH; + + /* Strip off the path, only save the file */ + p = f->file + strlen(f->file); + while (p >= f->file && *p != '/') + p--; + p++; + + strncpy(entry->func, f->func, TRACE_FUNC_SIZE); + strncpy(entry->file, p, TRACE_FILE_SIZE); + entry->func[TRACE_FUNC_SIZE] = 0; + entry->file[TRACE_FILE_SIZE] = 0; + entry->line = f->line; + entry->correct = val == expect; + + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + + out: + atomic_dec(&tr->data[cpu]->disabled); + local_irq_restore(flags); +} + +static inline +void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) +{ + if (!branch_tracing_enabled) + return; + + probe_likely_condition(f, val, expect); +} + +int enable_branch_tracing(struct trace_array *tr) +{ + int ret = 0; + + mutex_lock(&branch_tracing_mutex); + branch_tracer = tr; + /* + * Must be seen before enabling. 
The reader is a condition + * where we do not need a matching rmb() + */ + smp_wmb(); + branch_tracing_enabled++; + mutex_unlock(&branch_tracing_mutex); + + return ret; +} + +void disable_branch_tracing(void) +{ + mutex_lock(&branch_tracing_mutex); + + if (!branch_tracing_enabled) + goto out_unlock; + + branch_tracing_enabled--; + + out_unlock: + mutex_unlock(&branch_tracing_mutex); +} + +static void start_branch_trace(struct trace_array *tr) +{ + enable_branch_tracing(tr); +} + +static void stop_branch_trace(struct trace_array *tr) +{ + disable_branch_tracing(); +} + +static void branch_trace_init(struct trace_array *tr) +{ + int cpu; + + for_each_online_cpu(cpu) + tracing_reset(tr, cpu); + + start_branch_trace(tr); +} + +static void branch_trace_reset(struct trace_array *tr) +{ + stop_branch_trace(tr); +} + +struct tracer branch_trace __read_mostly = +{ + .name = "branch", + .init = branch_trace_init, + .reset = branch_trace_reset, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_branch, +#endif +}; + +__init static int init_branch_trace(void) +{ + return register_tracer(&branch_trace); +} + +device_initcall(init_branch_trace); +#else +static inline +void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) +{ +} +#endif /* CONFIG_BRANCH_TRACER */ + +void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect) +{ + /* + * I would love to have a trace point here instead, but the + * trace point code is so inundated with unlikely and likely + * conditions that the recursive nightmare that exists is too + * much to try to get working. At least for now. + */ + trace_likely_condition(f, val, expect); + + /* FIXME: Make this atomic! */ + if (val == expect) + f->correct++; + else + f->incorrect++; +} +EXPORT_SYMBOL(ftrace_likely_update); + +struct ftrace_pointer { + void *start; + void *stop; +}; + +static void * +t_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct ftrace_pointer *f = m->private; + struct ftrace_branch_data *p = v; + + (*pos)++; + + if (v == (void *)1) + return f->start; + + ++p; + + if ((void *)p >= (void *)f->stop) + return NULL; + + return p; +} + +static void *t_start(struct seq_file *m, loff_t *pos) +{ + void *t = (void *)1; + loff_t l = 0; + + for (; t && l < *pos; t = t_next(m, t, &l)) + ; + + return t; +} + +static void t_stop(struct seq_file *m, void *p) +{ +} + +static int t_show(struct seq_file *m, void *v) +{ + struct ftrace_branch_data *p = v; + const char *f; + unsigned long percent; + + if (v == (void *)1) { + seq_printf(m, " correct incorrect %% " + " Function " + " File Line\n" + " ------- --------- - " + " -------- " + " ---- ----\n"); + return 0; + } + + /* Only print the file, not the path */ + f = p->file + strlen(p->file); + while (f >= p->file && *f != '/') + f--; + f++; + + if (p->correct) { + percent = p->incorrect * 100; + percent /= p->correct + p->incorrect; + } else + percent = p->incorrect ? 
100 : 0; + + seq_printf(m, "%8lu %8lu %3lu ", p->correct, p->incorrect, percent); + seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line); + return 0; +} + +static struct seq_operations tracing_likely_seq_ops = { + .start = t_start, + .next = t_next, + .stop = t_stop, + .show = t_show, +}; + +static int tracing_likely_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = seq_open(file, &tracing_likely_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = (void *)inode->i_private; + } + + return ret; +} + +static struct file_operations tracing_likely_fops = { + .open = tracing_likely_open, + .read = seq_read, + .llseek = seq_lseek, +}; + +extern unsigned long __start_likely_profile[]; +extern unsigned long __stop_likely_profile[]; +extern unsigned long __start_unlikely_profile[]; +extern unsigned long __stop_unlikely_profile[]; + +static struct ftrace_pointer ftrace_likely_pos = { + .start = __start_likely_profile, + .stop = __stop_likely_profile, +}; + +static struct ftrace_pointer ftrace_unlikely_pos = { + .start = __start_unlikely_profile, + .stop = __stop_unlikely_profile, +}; + +static __init int ftrace_branch_init(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + + entry = debugfs_create_file("profile_likely", 0444, d_tracer, + &ftrace_likely_pos, + &tracing_likely_fops); + if (!entry) + pr_warning("Could not create debugfs 'profile_likely' entry\n"); + + entry = debugfs_create_file("profile_unlikely", 0444, d_tracer, + &ftrace_unlikely_pos, + &tracing_likely_fops); + if (!entry) + pr_warning("Could not create debugfs" + " 'profile_unlikely' entry\n"); + + return 0; +} + +device_initcall(ftrace_branch_init); diff --git a/kernel/trace/trace_unlikely.c b/kernel/trace/trace_unlikely.c deleted file mode 100644 index 85265553918f..000000000000 --- a/kernel/trace/trace_unlikely.c +++ /dev/null @@ -1,320 +0,0 @@ -/* - * unlikely profiler - * - * Copyright (C) 2008 Steven Rostedt - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "trace.h" - -#ifdef CONFIG_BRANCH_TRACER - -static int branch_tracing_enabled __read_mostly; -static DEFINE_MUTEX(branch_tracing_mutex); -static struct trace_array *branch_tracer; - -static void -probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) -{ - struct trace_array *tr = branch_tracer; - struct ring_buffer_event *event; - struct trace_branch *entry; - unsigned long flags, irq_flags; - int cpu, pc; - const char *p; - - /* - * I would love to save just the ftrace_likely_data pointer, but - * this code can also be used by modules. Ugly things can happen - * if the module is unloaded, and then we go and read the - * pointer. This is slower, but much safer. 
- */ - - if (unlikely(!tr)) - return; - - local_irq_save(flags); - cpu = raw_smp_processor_id(); - if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) - goto out; - - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); - if (!event) - goto out; - - pc = preempt_count(); - entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, flags, pc); - entry->ent.type = TRACE_BRANCH; - - /* Strip off the path, only save the file */ - p = f->file + strlen(f->file); - while (p >= f->file && *p != '/') - p--; - p++; - - strncpy(entry->func, f->func, TRACE_FUNC_SIZE); - strncpy(entry->file, p, TRACE_FILE_SIZE); - entry->func[TRACE_FUNC_SIZE] = 0; - entry->file[TRACE_FILE_SIZE] = 0; - entry->line = f->line; - entry->correct = val == expect; - - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); - - out: - atomic_dec(&tr->data[cpu]->disabled); - local_irq_restore(flags); -} - -static inline -void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) -{ - if (!branch_tracing_enabled) - return; - - probe_likely_condition(f, val, expect); -} - -int enable_branch_tracing(struct trace_array *tr) -{ - int ret = 0; - - mutex_lock(&branch_tracing_mutex); - branch_tracer = tr; - /* - * Must be seen before enabling. The reader is a condition - * where we do not need a matching rmb() - */ - smp_wmb(); - branch_tracing_enabled++; - mutex_unlock(&branch_tracing_mutex); - - return ret; -} - -void disable_branch_tracing(void) -{ - mutex_lock(&branch_tracing_mutex); - - if (!branch_tracing_enabled) - goto out_unlock; - - branch_tracing_enabled--; - - out_unlock: - mutex_unlock(&branch_tracing_mutex); -} - -static void start_branch_trace(struct trace_array *tr) -{ - enable_branch_tracing(tr); -} - -static void stop_branch_trace(struct trace_array *tr) -{ - disable_branch_tracing(); -} - -static void branch_trace_init(struct trace_array *tr) -{ - int cpu; - - for_each_online_cpu(cpu) - tracing_reset(tr, cpu); - - start_branch_trace(tr); -} - -static void branch_trace_reset(struct trace_array *tr) -{ - stop_branch_trace(tr); -} - -struct tracer branch_trace __read_mostly = -{ - .name = "branch", - .init = branch_trace_init, - .reset = branch_trace_reset, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_branch, -#endif -}; - -__init static int init_branch_trace(void) -{ - return register_tracer(&branch_trace); -} - -device_initcall(init_branch_trace); -#else -static inline -void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) -{ -} -#endif /* CONFIG_BRANCH_TRACER */ - -void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect) -{ - /* - * I would love to have a trace point here instead, but the - * trace point code is so inundated with unlikely and likely - * conditions that the recursive nightmare that exists is too - * much to try to get working. At least for now. - */ - trace_likely_condition(f, val, expect); - - /* FIXME: Make this atomic! 
*/ - if (val == expect) - f->correct++; - else - f->incorrect++; -} -EXPORT_SYMBOL(ftrace_likely_update); - -struct ftrace_pointer { - void *start; - void *stop; -}; - -static void * -t_next(struct seq_file *m, void *v, loff_t *pos) -{ - struct ftrace_pointer *f = m->private; - struct ftrace_branch_data *p = v; - - (*pos)++; - - if (v == (void *)1) - return f->start; - - ++p; - - if ((void *)p >= (void *)f->stop) - return NULL; - - return p; -} - -static void *t_start(struct seq_file *m, loff_t *pos) -{ - void *t = (void *)1; - loff_t l = 0; - - for (; t && l < *pos; t = t_next(m, t, &l)) - ; - - return t; -} - -static void t_stop(struct seq_file *m, void *p) -{ -} - -static int t_show(struct seq_file *m, void *v) -{ - struct ftrace_branch_data *p = v; - const char *f; - unsigned long percent; - - if (v == (void *)1) { - seq_printf(m, " correct incorrect %% " - " Function " - " File Line\n" - " ------- --------- - " - " -------- " - " ---- ----\n"); - return 0; - } - - /* Only print the file, not the path */ - f = p->file + strlen(p->file); - while (f >= p->file && *f != '/') - f--; - f++; - - if (p->correct) { - percent = p->incorrect * 100; - percent /= p->correct + p->incorrect; - } else - percent = p->incorrect ? 100 : 0; - - seq_printf(m, "%8lu %8lu %3lu ", p->correct, p->incorrect, percent); - seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line); - return 0; -} - -static struct seq_operations tracing_likely_seq_ops = { - .start = t_start, - .next = t_next, - .stop = t_stop, - .show = t_show, -}; - -static int tracing_likely_open(struct inode *inode, struct file *file) -{ - int ret; - - ret = seq_open(file, &tracing_likely_seq_ops); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = (void *)inode->i_private; - } - - return ret; -} - -static struct file_operations tracing_likely_fops = { - .open = tracing_likely_open, - .read = seq_read, - .llseek = seq_lseek, -}; - -extern unsigned long __start_likely_profile[]; -extern unsigned long __stop_likely_profile[]; -extern unsigned long __start_unlikely_profile[]; -extern unsigned long __stop_unlikely_profile[]; - -static struct ftrace_pointer ftrace_likely_pos = { - .start = __start_likely_profile, - .stop = __stop_likely_profile, -}; - -static struct ftrace_pointer ftrace_unlikely_pos = { - .start = __start_unlikely_profile, - .stop = __stop_unlikely_profile, -}; - -static __init int ftrace_branch_init(void) -{ - struct dentry *d_tracer; - struct dentry *entry; - - d_tracer = tracing_init_dentry(); - - entry = debugfs_create_file("profile_likely", 0444, d_tracer, - &ftrace_likely_pos, - &tracing_likely_fops); - if (!entry) - pr_warning("Could not create debugfs 'profile_likely' entry\n"); - - entry = debugfs_create_file("profile_unlikely", 0444, d_tracer, - &ftrace_unlikely_pos, - &tracing_likely_fops); - if (!entry) - pr_warning("Could not create debugfs" - " 'profile_unlikely' entry\n"); - - return 0; -} - -device_initcall(ftrace_branch_init); -- cgit v1.2.3 From a94c80e78bc9f4493ffc25a02d5d7bcd93c399d0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 12 Nov 2008 17:52:36 -0500 Subject: ftrace: rename trace_entries to buffer_size_kb Impact: rename of debugfs file trace_entries to buffer_size_kb The original ftrace had fixed size entries, and the number of entries was shown and modified via the file called trace_entries. By converting to the unified trace buffer, we now allow for variable size entries which makes the meaning of trace_entries pointless. 
Since trace_size might be confused to the size of the trace, this patch names it "buffer_size_kb" (thanks to Arjan van de Ven for this idea). [ mingo@elte.hu: changed from buffer_size to buffer_size_kb ] ( Note, the units are still bytes - the next patch changes that, to keep the wide rename patch separate from the unit-change patch. ) Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- Documentation/ftrace.txt | 18 +++++++++--------- kernel/trace/trace.c | 4 ++-- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/Documentation/ftrace.txt b/Documentation/ftrace.txt index 9cc4d685dde5..a1b58777839b 100644 --- a/Documentation/ftrace.txt +++ b/Documentation/ftrace.txt @@ -94,7 +94,7 @@ of ftrace. Here is a list of some of the key files: only be recorded if the latency is greater than the value in this file. (in microseconds) - trace_entries: This sets or displays the number of bytes each CPU + buffer_size_kb: This sets or displays the number of bytes each CPU buffer can hold. The tracer buffers are the same size for each CPU. The displayed number is the size of the CPU buffer and not total size of all buffers. The @@ -1299,13 +1299,13 @@ trace entries ------------- Having too much or not enough data can be troublesome in diagnosing -an issue in the kernel. The file trace_entries is used to modify +an issue in the kernel. The file buffer_size_kb is used to modify the size of the internal trace buffers. The number listed is the number of entries that can be recorded per CPU. To know the full size, multiply the number of possible CPUS with the number of entries. - # cat /debug/tracing/trace_entries + # cat /debug/tracing/buffer_size_kb 65620 Note, to modify this, you must have tracing completely disabled. To do that, @@ -1313,8 +1313,8 @@ echo "nop" into the current_tracer. If the current_tracer is not set to "nop", an EINVAL error will be returned. # echo nop > /debug/tracing/current_tracer - # echo 100000 > /debug/tracing/trace_entries - # cat /debug/tracing/trace_entries + # echo 100000 > /debug/tracing/buffer_size_kb + # cat /debug/tracing/buffer_size_kb 100045 @@ -1323,8 +1323,8 @@ are held in individual pages. It allocates the number of pages it takes to fulfill the request. If more entries may fit on the last page then they will be added. - # echo 1 > /debug/tracing/trace_entries - # cat /debug/tracing/trace_entries + # echo 1 > /debug/tracing/buffer_size_kb + # cat /debug/tracing/buffer_size_kb 85 This shows us that 85 entries can fit in a single page. @@ -1332,8 +1332,8 @@ This shows us that 85 entries can fit in a single page. The number of pages which will be allocated is limited to a percentage of available memory. Allocating too much will produce an error. 
- # echo 1000000000000 > /debug/tracing/trace_entries + # echo 1000000000000 > /debug/tracing/buffer_size_kb -bash: echo: write error: Cannot allocate memory - # cat /debug/tracing/trace_entries + # cat /debug/tracing/buffer_size_kb 85 diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4bf070bb5272..b42d42056fa4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3198,11 +3198,11 @@ static __init int tracer_init_debugfs(void) pr_warning("Could not create debugfs " "'trace_pipe' entry\n"); - entry = debugfs_create_file("trace_entries", 0644, d_tracer, + entry = debugfs_create_file("buffer_size_kb", 0644, d_tracer, &global_trace, &tracing_entries_fops); if (!entry) pr_warning("Could not create debugfs " - "'trace_entries' entry\n"); + "'buffer_size_kb' entry\n"); entry = debugfs_create_file("trace_marker", 0220, d_tracer, NULL, &tracing_mark_fops); -- cgit v1.2.3 From 1696b2b0f44a8d42f3e6b1ea90c21790871c04d9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 13 Nov 2008 00:09:35 -0500 Subject: ftrace: show buffer size in kilobytes Impact: change the units of buffer_size_kb to kilobytes This patch changes the units of the buffer_size_kb file to kilobytes. Reading and writing to the file uses kilobytes as units. To help users to know what units are used, the output of the file now looks like: # cat /debug/tracing/buffer_size_kb 1408 Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- Documentation/ftrace.txt | 22 +++++----------------- kernel/trace/trace.c | 5 ++++- 2 files changed, 9 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/Documentation/ftrace.txt b/Documentation/ftrace.txt index a1b58777839b..6d3fe4cdf921 100644 --- a/Documentation/ftrace.txt +++ b/Documentation/ftrace.txt @@ -94,10 +94,10 @@ of ftrace. Here is a list of some of the key files: only be recorded if the latency is greater than the value in this file. (in microseconds) - buffer_size_kb: This sets or displays the number of bytes each CPU + buffer_size_kb: This sets or displays the number of kilobytes each CPU buffer can hold. The tracer buffers are the same size for each CPU. The displayed number is the size of the - CPU buffer and not total size of all buffers. The + CPU buffer and not total size of all buffers. The trace buffers are allocated in pages (blocks of memory that the kernel uses for allocation, usually 4 KB in size). If the last page allocated has room for more bytes @@ -1306,28 +1306,16 @@ the full size, multiply the number of possible CPUS with the number of entries. # cat /debug/tracing/buffer_size_kb -65620 +1408 (units kilobytes) Note, to modify this, you must have tracing completely disabled. To do that, echo "nop" into the current_tracer. If the current_tracer is not set to "nop", an EINVAL error will be returned. # echo nop > /debug/tracing/current_tracer - # echo 100000 > /debug/tracing/buffer_size_kb + # echo 10000 > /debug/tracing/buffer_size_kb # cat /debug/tracing/buffer_size_kb -100045 - - -Notice that we echoed in 100,000 but the size is 100,045. The entries -are held in individual pages. It allocates the number of pages it takes -to fulfill the request. If more entries may fit on the last page -then they will be added. - - # echo 1 > /debug/tracing/buffer_size_kb - # cat /debug/tracing/buffer_size_kb -85 - -This shows us that 85 entries can fit in a single page. +10000 (units kilobytes) The number of pages which will be allocated is limited to a percentage of available memory. Allocating too much will produce an error. 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b42d42056fa4..d664aae2e10a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2905,7 +2905,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf, char buf[64]; int r; - r = sprintf(buf, "%lu\n", tr->entries); + r = sprintf(buf, "%lu\n", tr->entries >> 10); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } @@ -2945,6 +2945,9 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, atomic_inc(&max_tr.data[cpu]->disabled); } + /* value is in KB */ + val <<= 10; + if (val != global_trace.entries) { ret = ring_buffer_resize(global_trace.buffer, val); if (ret < 0) { -- cgit v1.2.3 From ee6bce52276c0717ed3e63296e5d9465d339e923 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 12 Nov 2008 17:52:37 -0500 Subject: ftrace: rename iter_ctrl to trace_options Impact: rename file /debug/tracing/iter_ctrl to /debug/tracing/trace_options The original ftrace had a file called "iter_ctrl" that would control the way the output was iterated. But this file grew into a catch all for different trace options. This patch renames the file from iter_ctrl to trace_options to reflect this change. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- Documentation/ftrace.txt | 14 +++++++------- kernel/trace/trace.c | 18 +++++++++--------- 2 files changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/Documentation/ftrace.txt b/Documentation/ftrace.txt index 6d3fe4cdf921..753f4de4b175 100644 --- a/Documentation/ftrace.txt +++ b/Documentation/ftrace.txt @@ -82,7 +82,7 @@ of ftrace. Here is a list of some of the key files: tracer is not adding more data, they will display the same information every time they are read. - iter_ctrl: This file lets the user control the amount of data + trace_options: This file lets the user control the amount of data that is displayed in one of the above output files. @@ -316,23 +316,23 @@ The above is mostly meaningful for kernel developers. The rest is the same as the 'trace' file. -iter_ctrl ---------- +trace_options +------------- -The iter_ctrl file is used to control what gets printed in the trace +The trace_options file is used to control what gets printed in the trace output. To see what is available, simply cat the file: - cat /debug/tracing/iter_ctrl + cat /debug/tracing/trace_options print-parent nosym-offset nosym-addr noverbose noraw nohex nobin \ noblock nostacktrace nosched-tree To disable one of the options, echo in the option prepended with "no". - echo noprint-parent > /debug/tracing/iter_ctrl + echo noprint-parent > /debug/tracing/trace_options To enable an option, leave off the "no". 
- echo sym-offset > /debug/tracing/iter_ctrl + echo sym-offset > /debug/tracing/trace_options Here are the available options: diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d664aae2e10a..240423a9d1af 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -204,7 +204,7 @@ static DEFINE_MUTEX(trace_types_lock); /* trace_wait is a waitqueue for tasks blocked on trace_poll */ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); -/* trace_flags holds iter_ctrl options */ +/* trace_flags holds trace_options default values */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK; /** @@ -2411,7 +2411,7 @@ static struct file_operations tracing_cpumask_fops = { }; static ssize_t -tracing_iter_ctrl_read(struct file *filp, char __user *ubuf, +tracing_trace_options_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { char *buf; @@ -2448,7 +2448,7 @@ tracing_iter_ctrl_read(struct file *filp, char __user *ubuf, } static ssize_t -tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf, +tracing_trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { char buf[64]; @@ -2493,8 +2493,8 @@ tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf, static struct file_operations tracing_iter_fops = { .open = tracing_open_generic, - .read = tracing_iter_ctrl_read, - .write = tracing_iter_ctrl_write, + .read = tracing_trace_options_read, + .write = tracing_trace_options_write, }; static const char readme_msg[] = @@ -2508,9 +2508,9 @@ static const char readme_msg[] = "# echo sched_switch > /debug/tracing/current_tracer\n" "# cat /debug/tracing/current_tracer\n" "sched_switch\n" - "# cat /debug/tracing/iter_ctrl\n" + "# cat /debug/tracing/trace_options\n" "noprint-parent nosym-offset nosym-addr noverbose\n" - "# echo print-parent > /debug/tracing/iter_ctrl\n" + "# echo print-parent > /debug/tracing/trace_options\n" "# echo 1 > /debug/tracing/tracing_enabled\n" "# cat /debug/tracing/trace > /tmp/trace.txt\n" "echo 0 > /debug/tracing/tracing_enabled\n" @@ -3148,10 +3148,10 @@ static __init int tracer_init_debugfs(void) if (!entry) pr_warning("Could not create debugfs 'tracing_enabled' entry\n"); - entry = debugfs_create_file("iter_ctrl", 0644, d_tracer, + entry = debugfs_create_file("trace_options", 0644, d_tracer, NULL, &tracing_iter_fops); if (!entry) - pr_warning("Could not create debugfs 'iter_ctrl' entry\n"); + pr_warning("Could not create debugfs 'trace_options' entry\n"); entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer, NULL, &tracing_cpumask_fops); -- cgit v1.2.3 From 12ef7d448613ead2babd41c3ebfa1fe03c20edef Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 12 Nov 2008 17:52:38 -0500 Subject: ftrace: CPU buffer start annotation clean ups Impact: better handling of CPU buffer start annotation Because the per-CPU buffers wrap, one CPU might be more active at the end of the trace than the other CPUs, leaving that one CPU with a shorter history. Kernel developers were confused by the "missing" data of that one CPU at the beginning of the trace output. An annotation was added to the trace output to show that the buffer had started: # tracer: function # # TASK-PID CPU# TIMESTAMP FUNCTION # | | | | | ##### CPU 3 buffer started #### <idle>-0 [003] 158.192959: smp_apic_timer_interrupt [...]
<idle>-0 [003] 161.556520: default_idle ##### CPU 1 buffer started #### <idle>-0 [001] 161.592494: hrtimer_force_reprogram [etc] But this annotation gets a bit messy when tracers do not fill the buffers. This patch does a couple of things: One) it adds a flag to trace_options to disable these annotations. Two) it does not annotate if the tracer did not overflow its buffer. This makes the output much cleaner. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 16 +++++++++++++++- kernel/trace/trace.h | 1 + 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 240423a9d1af..4a904623e05d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -205,7 +205,8 @@ static DEFINE_MUTEX(trace_types_lock); static DECLARE_WAIT_QUEUE_HEAD(trace_wait); /* trace_flags holds trace_options default values */ -unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK; +unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | + TRACE_ITER_ANNOTATE; /** * trace_wake_up - wake up tasks waiting for trace input @@ -261,6 +262,7 @@ static const char *trace_options[] = { #ifdef CONFIG_BRANCH_TRACER "branch", #endif + "annotate", NULL }; @@ -1113,6 +1115,7 @@ void tracing_stop_function_trace(void) enum trace_file_type { TRACE_FILE_LAT_FMT = 1, + TRACE_FILE_ANNOTATE = 2, }; static void trace_iterator_increment(struct trace_iterator *iter, int cpu) @@ -1532,6 +1535,12 @@ static void test_cpu_buff_start(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; + if (!(trace_flags & TRACE_ITER_ANNOTATE)) + return; + + if (!(iter->iter_flags & TRACE_FILE_ANNOTATE)) + return; + if (cpu_isset(iter->cpu, iter->started)) return; @@ -2132,6 +2141,11 @@ __tracing_open(struct inode *inode, struct file *file, int *ret) iter->trace = current_trace; iter->pos = -1; + /* Annotate start of buffers if we had overruns */ + if (ring_buffer_overruns(iter->tr->buffer)) + iter->iter_flags |= TRACE_FILE_ANNOTATE; + + for_each_tracing_cpu(cpu) { iter->buffer_iter[cpu] = diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9e015f5bea1d..790ea8c0e1f3 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -473,6 +473,7 @@ enum trace_iterator_flags { #ifdef CONFIG_BRANCH_TRACER TRACE_ITER_BRANCH = 0x1000, #endif + TRACE_ITER_ANNOTATE = 0x2000, }; /* -- cgit v1.2.3 From b3535c6390f27d04273e4eee0bc687f171fbf5f4 Mon Sep 17 00:00:00 2001 From: walimis Date: Fri, 14 Nov 2008 00:21:02 +0800 Subject: ftrace: remove unnecessary if condition of __unregister_ftrace_function Because the function takes "goto out" before ftrace_list can ever equal &ftrace_list_end, that condition is never met.
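For orientation, a condensed sketch of the control flow (an editor's illustration distilled from the hunk below, not the verbatim source; error handling elided):

static int __unregister_ftrace_function(struct ftrace_ops *ops)
{
	/* removing the only registered ops: reset and leave early */
	if (ftrace_list == ops && ops->next == &ftrace_list_end) {
		ftrace_trace_function = ftrace_stub;
		ftrace_list = &ftrace_list_end;
		goto out;
	}

	/* ... otherwise unlink ops; lookup failures also take goto out ... */

	if (ftrace_enabled) {
		/* ftrace_list cannot equal &ftrace_list_end here, so only
		 * the ->next test below carries any information */
		if (ftrace_list->next == &ftrace_list_end)
			ftrace_trace_function = ftrace_list->func;
	}
 out:
	return 0; /* error paths elided */
}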
Signed-off-by: walimis Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index beb21a51e1ef..54cb9a7d15e5 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -179,8 +179,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) if (ftrace_enabled) { /* If we only have one func left, then call that directly */ - if (ftrace_list == &ftrace_list_end || - ftrace_list->next == &ftrace_list_end) + if (ftrace_list->next == &ftrace_list_end) ftrace_trace_function = ftrace_list->func; } -- cgit v1.2.3 From 76aac0e9a17742e60d408be1a706e9aaad370891 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:12 +1100 Subject: CRED: Wrap task credential accesses in the core kernel Wrap access to task credentials so that they can be separated more easily from the task_struct during the introduction of COW creds. Change most current->(|e|s|fs)[ug]id to current_(|e|s|fs)[ug]id(). Change some task->e?[ug]id to task_e?[ug]id(). In some places it makes more sense to use RCU directly rather than a convenient wrapper; these will be addressed by later patches. Signed-off-by: David Howells Reviewed-by: James Morris Acked-by: Serge Hallyn Cc: Al Viro Cc: linux-audit@redhat.com Cc: containers@lists.linux-foundation.org Cc: linux-mm@kvack.org Signed-off-by: James Morris --- kernel/acct.c | 7 +++---- kernel/auditsc.c | 6 ++++-- kernel/cgroup.c | 9 +++++---- kernel/futex.c | 8 +++++--- kernel/futex_compat.c | 3 ++- kernel/ptrace.c | 15 +++++++++------ kernel/sched.c | 11 +++++++---- kernel/signal.c | 15 +++++++++------ kernel/sys.c | 16 ++++++++-------- kernel/sysctl.c | 2 +- kernel/timer.c | 8 ++++---- kernel/user_namespace.c | 2 +- mm/mempolicy.c | 7 +++++-- mm/migrate.c | 7 +++++-- mm/shmem.c | 8 ++++---- 15 files changed, 72 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index f6006a60df5d..d57b7cbb98b6 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -530,15 +530,14 @@ static void do_acct_process(struct bsd_acct_struct *acct, do_div(elapsed, AHZ); ac.ac_btime = get_seconds() - elapsed; /* we really need to bite the bullet and change layout */ - ac.ac_uid = current->uid; - ac.ac_gid = current->gid; + current_uid_gid(&ac.ac_uid, &ac.ac_gid); #if ACCT_VERSION==2 ac.ac_ahz = AHZ; #endif #if ACCT_VERSION==1 || ACCT_VERSION==2 /* backward-compatible 16 bit fields */ - ac.ac_uid16 = current->uid; - ac.ac_gid16 = current->gid; + ac.ac_uid16 = ac.ac_uid; + ac.ac_gid16 = ac.ac_gid; #endif #if ACCT_VERSION==3 ac.ac_pid = task_tgid_nr_ns(current, ns); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index cef34235b362..9c7e47ae4576 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2628,7 +2628,8 @@ void audit_core_dumps(long signr) { struct audit_buffer *ab; u32 sid; - uid_t auid = audit_get_loginuid(current); + uid_t auid = audit_get_loginuid(current), uid; + gid_t gid; unsigned int sessionid = audit_get_sessionid(current); if (!audit_enabled) @@ -2638,8 +2639,9 @@ void audit_core_dumps(long signr) return; ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); + current_uid_gid(&uid, &gid); audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", - auid, current->uid, current->gid, sessionid); + auid, uid, gid, sessionid); security_task_getsecid(current, &sid); if (sid) { char *ctx = NULL; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 35eebd5510c2..78f9b310c4f3 
100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -571,8 +571,8 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) if (inode) { inode->i_mode = mode; - inode->i_uid = current->fsuid; - inode->i_gid = current->fsgid; + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); inode->i_blocks = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info; @@ -1279,6 +1279,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) static int attach_task_by_pid(struct cgroup *cgrp, u64 pid) { struct task_struct *tsk; + uid_t euid; int ret; if (pid) { @@ -1291,8 +1292,8 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid) get_task_struct(tsk); rcu_read_unlock(); - if ((current->euid) && (current->euid != tsk->uid) - && (current->euid != tsk->suid)) { + euid = current_euid(); + if (euid && euid != tsk->uid && euid != tsk->suid) { put_task_struct(tsk); return -EACCES; } diff --git a/kernel/futex.c b/kernel/futex.c index 8af10027514b..e06962132aaf 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -439,10 +439,11 @@ static void free_pi_state(struct futex_pi_state *pi_state) static struct task_struct * futex_find_get_task(pid_t pid) { struct task_struct *p; + uid_t euid = current_euid(); rcu_read_lock(); p = find_task_by_vpid(pid); - if (!p || ((current->euid != p->euid) && (current->euid != p->uid))) + if (!p || (euid != p->euid && euid != p->uid)) p = ERR_PTR(-ESRCH); else get_task_struct(p); @@ -1829,6 +1830,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr, { struct robust_list_head __user *head; unsigned long ret; + uid_t euid = current_euid(); if (!futex_cmpxchg_enabled) return -ENOSYS; @@ -1844,8 +1846,8 @@ sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr, if (!p) goto err_unlock; ret = -EPERM; - if ((current->euid != p->euid) && (current->euid != p->uid) && - !capable(CAP_SYS_PTRACE)) + if (euid != p->euid && euid != p->uid && + !capable(CAP_SYS_PTRACE)) goto err_unlock; head = p->robust_list; rcu_read_unlock(); diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 04ac3a9e42cf..3254d4e41e88 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -135,6 +135,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, { struct compat_robust_list_head __user *head; unsigned long ret; + uid_t euid = current_euid(); if (!futex_cmpxchg_enabled) return -ENOSYS; @@ -150,7 +151,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, if (!p) goto err_unlock; ret = -EPERM; - if ((current->euid != p->euid) && (current->euid != p->uid) && + if (euid != p->euid && euid != p->uid && !capable(CAP_SYS_PTRACE)) goto err_unlock; head = p->compat_robust_list; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 1e68e4c39e2c..937f6b5b2008 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -123,16 +123,19 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode) * because setting up the necessary parent/child relationship * or halting the specified task is impossible. 
*/ + uid_t uid; + gid_t gid; int dumpable = 0; /* Don't let security modules deny introspection */ if (task == current) return 0; - if (((current->uid != task->euid) || - (current->uid != task->suid) || - (current->uid != task->uid) || - (current->gid != task->egid) || - (current->gid != task->sgid) || - (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) + current_uid_gid(&uid, &gid); + if ((uid != task->euid || + uid != task->suid || + uid != task->uid || + gid != task->egid || + gid != task->sgid || + gid != task->gid) && !capable(CAP_SYS_PTRACE)) return -EPERM; smp_rmb(); if (task->mm) diff --git a/kernel/sched.c b/kernel/sched.c index e8819bc6f462..c3b8b1fcde0d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5128,6 +5128,7 @@ static int __sched_setscheduler(struct task_struct *p, int policy, unsigned long flags; const struct sched_class *prev_class = p->sched_class; struct rq *rq; + uid_t euid; /* may grab non-irq protected spin_locks */ BUG_ON(in_interrupt()); @@ -5180,8 +5181,9 @@ recheck: return -EPERM; /* can't change other user's priorities */ - if ((current->euid != p->euid) && - (current->euid != p->uid)) + euid = current_euid(); + if (euid != p->euid && + euid != p->uid) return -EPERM; } @@ -5392,6 +5394,7 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) cpumask_t cpus_allowed; cpumask_t new_mask = *in_mask; struct task_struct *p; + uid_t euid; int retval; get_online_cpus(); @@ -5412,9 +5415,9 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) get_task_struct(p); read_unlock(&tasklist_lock); + euid = current_euid(); retval = -EPERM; - if ((current->euid != p->euid) && (current->euid != p->uid) && - !capable(CAP_SYS_NICE)) + if (euid != p->euid && euid != p->uid && !capable(CAP_SYS_NICE)) goto out_unlock; retval = security_task_setscheduler(p, 0, NULL); diff --git a/kernel/signal.c b/kernel/signal.c index 4530fc654455..167b535fe1a9 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -567,6 +567,7 @@ static int check_kill_permission(int sig, struct siginfo *info, struct task_struct *t) { struct pid *sid; + uid_t uid, euid; int error; if (!valid_signal(sig)) @@ -579,8 +580,10 @@ static int check_kill_permission(int sig, struct siginfo *info, if (error) return error; - if ((current->euid ^ t->suid) && (current->euid ^ t->uid) && - (current->uid ^ t->suid) && (current->uid ^ t->uid) && + uid = current_uid(); + euid = current_euid(); + if ((euid ^ t->suid) && (euid ^ t->uid) && + (uid ^ t->suid) && (uid ^ t->uid) && !capable(CAP_KILL)) { switch (sig) { case SIGCONT: @@ -844,7 +847,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, q->info.si_errno = 0; q->info.si_code = SI_USER; q->info.si_pid = task_pid_vnr(current); - q->info.si_uid = current->uid; + q->info.si_uid = current_uid(); break; case (unsigned long) SEND_SIG_PRIV: q->info.si_signo = sig; @@ -1598,7 +1601,7 @@ void ptrace_notify(int exit_code) info.si_signo = SIGTRAP; info.si_code = exit_code; info.si_pid = task_pid_vnr(current); - info.si_uid = current->uid; + info.si_uid = current_uid(); /* Let the debugger run. 
*/ spin_lock_irq(¤t->sighand->siglock); @@ -2211,7 +2214,7 @@ sys_kill(pid_t pid, int sig) info.si_errno = 0; info.si_code = SI_USER; info.si_pid = task_tgid_vnr(current); - info.si_uid = current->uid; + info.si_uid = current_uid(); return kill_something_info(sig, &info, pid); } @@ -2228,7 +2231,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig) info.si_errno = 0; info.si_code = SI_TKILL; info.si_pid = task_tgid_vnr(current); - info.si_uid = current->uid; + info.si_uid = current_uid(); rcu_read_lock(); p = find_task_by_vpid(pid); diff --git a/kernel/sys.c b/kernel/sys.c index 31deba8f7d16..ed5c29c748ac 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -114,10 +114,10 @@ void (*pm_power_off_prepare)(void); static int set_one_prio(struct task_struct *p, int niceval, int error) { + uid_t euid = current_euid(); int no_nice; - if (p->uid != current->euid && - p->euid != current->euid && !capable(CAP_SYS_NICE)) { + if (p->uid != euid && p->euid != euid && !capable(CAP_SYS_NICE)) { error = -EPERM; goto out; } @@ -176,16 +176,16 @@ asmlinkage long sys_setpriority(int which, int who, int niceval) case PRIO_USER: user = current->user; if (!who) - who = current->uid; + who = current_uid(); else - if ((who != current->uid) && !(user = find_user(who))) + if (who != current_uid() && !(user = find_user(who))) goto out_unlock; /* No processes for this user */ do_each_thread(g, p) if (p->uid == who) error = set_one_prio(p, niceval, error); while_each_thread(g, p); - if (who != current->uid) + if (who != current_uid()) free_uid(user); /* For find_user() */ break; } @@ -238,9 +238,9 @@ asmlinkage long sys_getpriority(int which, int who) case PRIO_USER: user = current->user; if (!who) - who = current->uid; + who = current_uid(); else - if ((who != current->uid) && !(user = find_user(who))) + if (who != current_uid() && !(user = find_user(who))) goto out_unlock; /* No processes for this user */ do_each_thread(g, p) @@ -250,7 +250,7 @@ asmlinkage long sys_getpriority(int which, int who) retval = niceval; } while_each_thread(g, p); - if (who != current->uid) + if (who != current_uid()) free_uid(user); /* for find_user() */ break; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9d048fa2d902..511031381c33 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1641,7 +1641,7 @@ out: static int test_perm(int mode, int op) { - if (!current->euid) + if (!current_euid()) mode >>= 6; else if (in_egroup_p(0)) mode >>= 3; diff --git a/kernel/timer.c b/kernel/timer.c index 56becf373c58..b54e4646cee7 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1123,25 +1123,25 @@ asmlinkage long sys_getppid(void) asmlinkage long sys_getuid(void) { /* Only we change this so SMP safe */ - return current->uid; + return current_uid(); } asmlinkage long sys_geteuid(void) { /* Only we change this so SMP safe */ - return current->euid; + return current_euid(); } asmlinkage long sys_getgid(void) { /* Only we change this so SMP safe */ - return current->gid; + return current_gid(); } asmlinkage long sys_getegid(void) { /* Only we change this so SMP safe */ - return current->egid; + return current_egid(); } #endif diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 532858fa5b88..f82730adea00 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -38,7 +38,7 @@ static struct user_namespace *clone_user_ns(struct user_namespace *old_ns) } /* Reset current->user with a new one */ - new_user = alloc_uid(ns, current->uid); + new_user = alloc_uid(ns, current_uid()); if (!new_user) { 
free_uid(ns->root_user); kfree(ns); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 36f42573a335..07a96474077d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1115,6 +1115,7 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, nodemask_t old; nodemask_t new; nodemask_t task_nodes; + uid_t uid, euid; int err; err = get_nodes(&old, old_nodes, maxnode); @@ -1144,8 +1145,10 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, * capabilities, superuser privileges or the same * userid as the target process. */ - if ((current->euid != task->suid) && (current->euid != task->uid) && - (current->uid != task->suid) && (current->uid != task->uid) && + uid = current_uid(); + euid = current_euid(); + if (euid != task->suid && euid != task->uid && + uid != task->suid && uid != task->uid && !capable(CAP_SYS_NICE)) { err = -EPERM; goto out; diff --git a/mm/migrate.c b/mm/migrate.c index 6602941bfab0..6263c24c4afe 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1048,6 +1048,7 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, struct task_struct *task; struct mm_struct *mm; int err; + uid_t uid, euid; /* Check flags */ if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) @@ -1075,8 +1076,10 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, * capabilities, superuser privileges or the same * userid as the target process. */ - if ((current->euid != task->suid) && (current->euid != task->uid) && - (current->uid != task->suid) && (current->uid != task->uid) && + uid = current_uid(); + euid = current_euid(); + if (euid != task->suid && euid != task->uid && + uid != task->suid && uid != task->uid && !capable(CAP_SYS_NICE)) { err = -EPERM; goto out; diff --git a/mm/shmem.c b/mm/shmem.c index 0ed075215e5f..f1b0d4871f3a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1513,8 +1513,8 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) inode = new_inode(sb); if (inode) { inode->i_mode = mode; - inode->i_uid = current->fsuid; - inode->i_gid = current->fsgid; + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); inode->i_blocks = 0; inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; @@ -2278,8 +2278,8 @@ static int shmem_fill_super(struct super_block *sb, sbinfo->max_blocks = 0; sbinfo->max_inodes = 0; sbinfo->mode = S_IRWXUGO | S_ISVTX; - sbinfo->uid = current->fsuid; - sbinfo->gid = current->fsgid; + sbinfo->uid = current_fsuid(); + sbinfo->gid = current_fsgid(); sbinfo->mpol = NULL; sb->s_fs_info = sbinfo; -- cgit v1.2.3 From 8bbf4976b59fc9fc2861e79cab7beb3f6d647640 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:14 +1100 Subject: KEYS: Alter use of key instantiation link-to-keyring argument Alter the use of the key instantiation and negation functions' link-to-keyring arguments. Currently this specifies a keyring in the target process to link the key into, creating the keyring if it doesn't exist. This, however, can be a problem for copy-on-write credentials as it means that the instantiating process can alter the credentials of the requesting process. This patch alters the behaviour such that: (1) If keyctl_instantiate_key() or keyctl_negate_key() are given a specific keyring by ID (ringid >= 0), then that keyring will be used. 
(2) If keyctl_instantiate_key() or keyctl_negate_key() are given one of the special constants that refer to the requesting process's keyrings (KEY_SPEC_*_KEYRING, all <= 0), then: (a) If sys_request_key() was given a keyring to use (destringid) then the key will be attached to that keyring. (b) If sys_request_key() was given a NULL keyring, then the key being instantiated will be attached to the default keyring as set by keyctl_set_reqkey_keyring(). (3) No extra link will be made. Decision point (1) follows current behaviour, and allows those instantiators who've searched for a specifically named keyring in the requestor's keyring so as to partition the keys by type to still have their named keyrings. Decision point (2) allows the requestor to make sure that the key or keys that get produced by request_key() go where they want, whilst allowing the instantiator to request that the key is retained. This is mainly useful for situations where the instantiator makes a secondary request, the key for which should be retained by the initial requestor: +-----------+ +--------------+ +--------------+ | | | | | | | Requestor |------->| Instantiator |------->| Instantiator | | | | | | | +-----------+ +--------------+ +--------------+ request_key() request_key() This might be useful, for example, in Kerberos, where the requestor requests a ticket, and then the ticket instantiator requests the TGT, which someone else then has to go and fetch. The TGT, however, should be retained in the keyrings of the requestor, not the first instantiator. To make this explict an extra special keyring constant is also added. Signed-off-by: David Howells Reviewed-by: James Morris Signed-off-by: James Morris --- include/linux/key.h | 16 +++--- include/linux/keyctl.h | 4 +- kernel/kmod.c | 2 +- security/keys/internal.h | 12 ++-- security/keys/keyctl.c | 118 +++++++++++++++++++++++---------------- security/keys/process_keys.c | 88 ++++++++++++++++++----------- security/keys/request_key.c | 75 +++++++++++++++++-------- security/keys/request_key_auth.c | 5 +- 8 files changed, 199 insertions(+), 121 deletions(-) (limited to 'kernel') diff --git a/include/linux/key.h b/include/linux/key.h index 1b70e35a71e3..df709e1af3cd 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -287,11 +287,11 @@ extern void key_fsuid_changed(struct task_struct *tsk); extern void key_fsgid_changed(struct task_struct *tsk); extern void key_init(void); -#define __install_session_keyring(tsk, keyring) \ -({ \ - struct key *old_session = tsk->signal->session_keyring; \ - tsk->signal->session_keyring = keyring; \ - old_session; \ +#define __install_session_keyring(keyring) \ +({ \ + struct key *old_session = current->signal->session_keyring; \ + current->signal->session_keyring = keyring; \ + old_session; \ }) #else /* CONFIG_KEYS */ @@ -302,11 +302,11 @@ extern void key_init(void); #define key_revoke(k) do { } while(0) #define key_put(k) do { } while(0) #define key_ref_put(k) do { } while(0) -#define make_key_ref(k, p) ({ NULL; }) -#define key_ref_to_ptr(k) ({ NULL; }) +#define make_key_ref(k, p) NULL +#define key_ref_to_ptr(k) NULL #define is_key_possessed(k) 0 #define switch_uid_keyring(u) do { } while(0) -#define __install_session_keyring(t, k) ({ NULL; }) +#define __install_session_keyring(k) ({ NULL; }) #define copy_keys(f,t) 0 #define copy_thread_group_keys(t) 0 #define exit_keys(t) do { } while(0) diff --git a/include/linux/keyctl.h b/include/linux/keyctl.h index 656ee6b77a4a..c0688eb72093 100644 --- a/include/linux/keyctl.h +++ 
b/include/linux/keyctl.h @@ -1,6 +1,6 @@ /* keyctl.h: keyctl command IDs * - * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2004, 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or @@ -20,6 +20,7 @@ #define KEY_SPEC_USER_SESSION_KEYRING -5 /* - key ID for UID-session keyring */ #define KEY_SPEC_GROUP_KEYRING -6 /* - key ID for GID-specific keyring */ #define KEY_SPEC_REQKEY_AUTH_KEY -7 /* - key ID for assumed request_key auth key */ +#define KEY_SPEC_REQUESTOR_KEYRING -8 /* - key ID for request_key() dest keyring */ /* request-key default keyrings */ #define KEY_REQKEY_DEFL_NO_CHANGE -1 @@ -30,6 +31,7 @@ #define KEY_REQKEY_DEFL_USER_KEYRING 4 #define KEY_REQKEY_DEFL_USER_SESSION_KEYRING 5 #define KEY_REQKEY_DEFL_GROUP_KEYRING 6 +#define KEY_REQKEY_DEFL_REQUESTOR_KEYRING 7 /* keyctl commands */ #define KEYCTL_GET_KEYRING_ID 0 /* ask for a keyring's ID */ diff --git a/kernel/kmod.c b/kernel/kmod.c index 3d3c3ea3a023..f044f8f57703 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -140,7 +140,7 @@ static int ____call_usermodehelper(void *data) /* Unblock all signals and set the session keyring. */ new_session = key_get(sub_info->ring); spin_lock_irq(¤t->sighand->siglock); - old_session = __install_session_keyring(current, new_session); + old_session = __install_session_keyring(new_session); flush_signal_handlers(current, 1); sigemptyset(¤t->blocked); recalc_sigpending(); diff --git a/security/keys/internal.h b/security/keys/internal.h index a60c68138b4d..d1586c629788 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -109,8 +109,9 @@ extern key_ref_t search_process_keyrings(struct key_type *type, extern struct key *find_keyring_by_name(const char *name, bool skip_perm_check); -extern int install_thread_keyring(struct task_struct *tsk); -extern int install_process_keyring(struct task_struct *tsk); +extern int install_user_keyrings(void); +extern int install_thread_keyring(void); +extern int install_process_keyring(void); extern struct key *request_key_and_link(struct key_type *type, const char *description, @@ -120,8 +121,7 @@ extern struct key *request_key_and_link(struct key_type *type, struct key *dest_keyring, unsigned long flags); -extern key_ref_t lookup_user_key(struct task_struct *context, - key_serial_t id, int create, int partial, +extern key_ref_t lookup_user_key(key_serial_t id, int create, int partial, key_perm_t perm); extern long join_session_keyring(const char *name); @@ -152,6 +152,7 @@ static inline int key_permission(const key_ref_t key_ref, key_perm_t perm) */ struct request_key_auth { struct key *target_key; + struct key *dest_keyring; struct task_struct *context; void *callout_info; size_t callout_len; @@ -161,7 +162,8 @@ struct request_key_auth { extern struct key_type key_type_request_key_auth; extern struct key *request_key_auth_new(struct key *target, const void *callout_info, - size_t callout_len); + size_t callout_len, + struct key *dest_keyring); extern struct key *key_get_instantiation_authkey(key_serial_t target_id); diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 3f09e5b2a784..fcce331eca72 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -103,7 +103,7 @@ asmlinkage long sys_add_key(const char __user *_type, } /* find the target keyring (which must be writable) */ - keyring_ref = lookup_user_key(NULL, ringid, 1, 0, KEY_WRITE); + keyring_ref = lookup_user_key(ringid, 1, 0, 
KEY_WRITE); if (IS_ERR(keyring_ref)) { ret = PTR_ERR(keyring_ref); goto error3; @@ -185,7 +185,7 @@ asmlinkage long sys_request_key(const char __user *_type, /* get the destination keyring if specified */ dest_ref = NULL; if (destringid) { - dest_ref = lookup_user_key(NULL, destringid, 1, 0, KEY_WRITE); + dest_ref = lookup_user_key(destringid, 1, 0, KEY_WRITE); if (IS_ERR(dest_ref)) { ret = PTR_ERR(dest_ref); goto error3; @@ -235,7 +235,7 @@ long keyctl_get_keyring_ID(key_serial_t id, int create) key_ref_t key_ref; long ret; - key_ref = lookup_user_key(NULL, id, create, 0, KEY_SEARCH); + key_ref = lookup_user_key(id, create, 0, KEY_SEARCH); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); goto error; @@ -308,7 +308,7 @@ long keyctl_update_key(key_serial_t id, } /* find the target key (which must be writable) */ - key_ref = lookup_user_key(NULL, id, 0, 0, KEY_WRITE); + key_ref = lookup_user_key(id, 0, 0, KEY_WRITE); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); goto error2; @@ -336,7 +336,7 @@ long keyctl_revoke_key(key_serial_t id) key_ref_t key_ref; long ret; - key_ref = lookup_user_key(NULL, id, 0, 0, KEY_WRITE); + key_ref = lookup_user_key(id, 0, 0, KEY_WRITE); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); goto error; @@ -362,7 +362,7 @@ long keyctl_keyring_clear(key_serial_t ringid) key_ref_t keyring_ref; long ret; - keyring_ref = lookup_user_key(NULL, ringid, 1, 0, KEY_WRITE); + keyring_ref = lookup_user_key(ringid, 1, 0, KEY_WRITE); if (IS_ERR(keyring_ref)) { ret = PTR_ERR(keyring_ref); goto error; @@ -388,13 +388,13 @@ long keyctl_keyring_link(key_serial_t id, key_serial_t ringid) key_ref_t keyring_ref, key_ref; long ret; - keyring_ref = lookup_user_key(NULL, ringid, 1, 0, KEY_WRITE); + keyring_ref = lookup_user_key(ringid, 1, 0, KEY_WRITE); if (IS_ERR(keyring_ref)) { ret = PTR_ERR(keyring_ref); goto error; } - key_ref = lookup_user_key(NULL, id, 1, 0, KEY_LINK); + key_ref = lookup_user_key(id, 1, 0, KEY_LINK); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); goto error2; @@ -422,13 +422,13 @@ long keyctl_keyring_unlink(key_serial_t id, key_serial_t ringid) key_ref_t keyring_ref, key_ref; long ret; - keyring_ref = lookup_user_key(NULL, ringid, 0, 0, KEY_WRITE); + keyring_ref = lookup_user_key(ringid, 0, 0, KEY_WRITE); if (IS_ERR(keyring_ref)) { ret = PTR_ERR(keyring_ref); goto error; } - key_ref = lookup_user_key(NULL, id, 0, 0, 0); + key_ref = lookup_user_key(id, 0, 0, 0); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); goto error2; @@ -464,7 +464,7 @@ long keyctl_describe_key(key_serial_t keyid, char *tmpbuf; long ret; - key_ref = lookup_user_key(NULL, keyid, 0, 1, KEY_VIEW); + key_ref = lookup_user_key(keyid, 0, 1, KEY_VIEW); if (IS_ERR(key_ref)) { /* viewing a key under construction is permitted if we have the * authorisation token handy */ @@ -472,7 +472,7 @@ long keyctl_describe_key(key_serial_t keyid, instkey = key_get_instantiation_authkey(keyid); if (!IS_ERR(instkey)) { key_put(instkey); - key_ref = lookup_user_key(NULL, keyid, + key_ref = lookup_user_key(keyid, 0, 1, 0); if (!IS_ERR(key_ref)) goto okay; @@ -557,7 +557,7 @@ long keyctl_keyring_search(key_serial_t ringid, } /* get the keyring at which to begin the search */ - keyring_ref = lookup_user_key(NULL, ringid, 0, 0, KEY_SEARCH); + keyring_ref = lookup_user_key(ringid, 0, 0, KEY_SEARCH); if (IS_ERR(keyring_ref)) { ret = PTR_ERR(keyring_ref); goto error2; @@ -566,7 +566,7 @@ long keyctl_keyring_search(key_serial_t ringid, /* get the destination keyring if specified */ dest_ref = NULL; if (destringid) { - dest_ref = 
lookup_user_key(NULL, destringid, 1, 0, KEY_WRITE); + dest_ref = lookup_user_key(destringid, 1, 0, KEY_WRITE); if (IS_ERR(dest_ref)) { ret = PTR_ERR(dest_ref); goto error3; @@ -636,7 +636,7 @@ long keyctl_read_key(key_serial_t keyid, char __user *buffer, size_t buflen) long ret; /* find the key first */ - key_ref = lookup_user_key(NULL, keyid, 0, 0, 0); + key_ref = lookup_user_key(keyid, 0, 0, 0); if (IS_ERR(key_ref)) { ret = -ENOKEY; goto error; @@ -699,7 +699,7 @@ long keyctl_chown_key(key_serial_t id, uid_t uid, gid_t gid) if (uid == (uid_t) -1 && gid == (gid_t) -1) goto error; - key_ref = lookup_user_key(NULL, id, 1, 1, KEY_SETATTR); + key_ref = lookup_user_key(id, 1, 1, KEY_SETATTR); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); goto error; @@ -804,7 +804,7 @@ long keyctl_setperm_key(key_serial_t id, key_perm_t perm) if (perm & ~(KEY_POS_ALL | KEY_USR_ALL | KEY_GRP_ALL | KEY_OTH_ALL)) goto error; - key_ref = lookup_user_key(NULL, id, 1, 1, KEY_SETATTR); + key_ref = lookup_user_key(id, 1, 1, KEY_SETATTR); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); goto error; @@ -829,6 +829,43 @@ error: } /* end keyctl_setperm_key() */ +/* + * get the destination keyring for instantiation + */ +static long get_instantiation_keyring(key_serial_t ringid, + struct request_key_auth *rka, + struct key **_dest_keyring) +{ + key_ref_t dkref; + + /* just return a NULL pointer if we weren't asked to make a link */ + if (ringid == 0) { + *_dest_keyring = NULL; + return 0; + } + + /* if a specific keyring is nominated by ID, then use that */ + if (ringid > 0) { + dkref = lookup_user_key(ringid, 1, 0, KEY_WRITE); + if (IS_ERR(dkref)) + return PTR_ERR(dkref); + *_dest_keyring = key_ref_to_ptr(dkref); + return 0; + } + + if (ringid == KEY_SPEC_REQKEY_AUTH_KEY) + return -EINVAL; + + /* otherwise specify the destination keyring recorded in the + * authorisation key (any KEY_SPEC_*_KEYRING) */ + if (ringid >= KEY_SPEC_REQUESTOR_KEYRING) { + *_dest_keyring = rka->dest_keyring; + return 0; + } + + return -ENOKEY; +} + /*****************************************************************************/ /* * instantiate the key with the specified payload, and, if one is given, link @@ -840,8 +877,7 @@ long keyctl_instantiate_key(key_serial_t id, key_serial_t ringid) { struct request_key_auth *rka; - struct key *instkey; - key_ref_t keyring_ref; + struct key *instkey, *dest_keyring; void *payload; long ret; bool vm = false; @@ -883,21 +919,15 @@ long keyctl_instantiate_key(key_serial_t id, /* find the destination keyring amongst those belonging to the * requesting task */ - keyring_ref = NULL; - if (ringid) { - keyring_ref = lookup_user_key(rka->context, ringid, 1, 0, - KEY_WRITE); - if (IS_ERR(keyring_ref)) { - ret = PTR_ERR(keyring_ref); - goto error2; - } - } + ret = get_instantiation_keyring(ringid, rka, &dest_keyring); + if (ret < 0) + goto error2; /* instantiate the key and link it into a keyring */ ret = key_instantiate_and_link(rka->target_key, payload, plen, - key_ref_to_ptr(keyring_ref), instkey); + dest_keyring, instkey); - key_ref_put(keyring_ref); + key_put(dest_keyring); /* discard the assumed authority if it's just been disabled by * instantiation of the key */ @@ -924,8 +954,7 @@ error: long keyctl_negate_key(key_serial_t id, unsigned timeout, key_serial_t ringid) { struct request_key_auth *rka; - struct key *instkey; - key_ref_t keyring_ref; + struct key *instkey, *dest_keyring; long ret; /* the appropriate instantiation authorisation key must have been @@ -941,20 +970,15 @@ long 
keyctl_negate_key(key_serial_t id, unsigned timeout, key_serial_t ringid) /* find the destination keyring if present (which must also be * writable) */ - keyring_ref = NULL; - if (ringid) { - keyring_ref = lookup_user_key(NULL, ringid, 1, 0, KEY_WRITE); - if (IS_ERR(keyring_ref)) { - ret = PTR_ERR(keyring_ref); - goto error; - } - } + ret = get_instantiation_keyring(ringid, rka, &dest_keyring); + if (ret < 0) + goto error; /* instantiate the key and link it into a keyring */ ret = key_negate_and_link(rka->target_key, timeout, - key_ref_to_ptr(keyring_ref), instkey); + dest_keyring, instkey); - key_ref_put(keyring_ref); + key_put(dest_keyring); /* discard the assumed authority if it's just been disabled by * instantiation of the key */ @@ -979,13 +1003,13 @@ long keyctl_set_reqkey_keyring(int reqkey_defl) switch (reqkey_defl) { case KEY_REQKEY_DEFL_THREAD_KEYRING: - ret = install_thread_keyring(current); + ret = install_thread_keyring(); if (ret < 0) return ret; goto set; case KEY_REQKEY_DEFL_PROCESS_KEYRING: - ret = install_process_keyring(current); + ret = install_process_keyring(); if (ret < 0) return ret; @@ -1018,7 +1042,7 @@ long keyctl_set_timeout(key_serial_t id, unsigned timeout) time_t expiry; long ret; - key_ref = lookup_user_key(NULL, id, 1, 1, KEY_SETATTR); + key_ref = lookup_user_key(id, 1, 1, KEY_SETATTR); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); goto error; @@ -1105,7 +1129,7 @@ long keyctl_get_security(key_serial_t keyid, char *context; long ret; - key_ref = lookup_user_key(NULL, keyid, 0, 1, KEY_VIEW); + key_ref = lookup_user_key(keyid, 0, 1, KEY_VIEW); if (IS_ERR(key_ref)) { if (PTR_ERR(key_ref) != -EACCES) return PTR_ERR(key_ref); @@ -1117,7 +1141,7 @@ long keyctl_get_security(key_serial_t keyid, return PTR_ERR(key_ref); key_put(instkey); - key_ref = lookup_user_key(NULL, keyid, 0, 1, 0); + key_ref = lookup_user_key(keyid, 0, 1, 0); if (IS_ERR(key_ref)) return PTR_ERR(key_ref); } diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index 5be6d018759a..1c793b7090a7 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -40,9 +40,9 @@ struct key_user root_key_user = { /* * install user and user session keyrings for a particular UID */ -static int install_user_keyrings(struct task_struct *tsk) +int install_user_keyrings(void) { - struct user_struct *user = tsk->user; + struct user_struct *user = current->user; struct key *uid_keyring, *session_keyring; char buf[20]; int ret; @@ -67,7 +67,7 @@ static int install_user_keyrings(struct task_struct *tsk) uid_keyring = find_keyring_by_name(buf, true); if (IS_ERR(uid_keyring)) { uid_keyring = keyring_alloc(buf, user->uid, (gid_t) -1, - tsk, KEY_ALLOC_IN_QUOTA, + current, KEY_ALLOC_IN_QUOTA, NULL); if (IS_ERR(uid_keyring)) { ret = PTR_ERR(uid_keyring); @@ -83,7 +83,8 @@ static int install_user_keyrings(struct task_struct *tsk) if (IS_ERR(session_keyring)) { session_keyring = keyring_alloc(buf, user->uid, (gid_t) -1, - tsk, KEY_ALLOC_IN_QUOTA, NULL); + current, KEY_ALLOC_IN_QUOTA, + NULL); if (IS_ERR(session_keyring)) { ret = PTR_ERR(session_keyring); goto error_release; @@ -146,8 +147,9 @@ void switch_uid_keyring(struct user_struct *new_user) /* * install a fresh thread keyring, discarding the old one */ -int install_thread_keyring(struct task_struct *tsk) +int install_thread_keyring(void) { + struct task_struct *tsk = current; struct key *keyring, *old; char buf[20]; int ret; @@ -178,8 +180,9 @@ error: /* * make sure a process keyring is installed */ -int 
install_process_keyring(struct task_struct *tsk) +int install_process_keyring(void) { + struct task_struct *tsk = current; struct key *keyring; char buf[20]; int ret; @@ -218,9 +221,9 @@ error: * install a session keyring, discarding the old one * - if a keyring is not supplied, an empty one is invented */ -static int install_session_keyring(struct task_struct *tsk, - struct key *keyring) +static int install_session_keyring(struct key *keyring) { + struct task_struct *tsk = current; unsigned long flags; struct key *old; char buf[20]; @@ -572,93 +575,91 @@ static int lookup_user_key_possessed(const struct key *key, const void *target) * - don't create special keyrings unless so requested * - partially constructed keys aren't found unless requested */ -key_ref_t lookup_user_key(struct task_struct *context, key_serial_t id, - int create, int partial, key_perm_t perm) +key_ref_t lookup_user_key(key_serial_t id, int create, int partial, + key_perm_t perm) { + struct request_key_auth *rka; + struct task_struct *t = current; key_ref_t key_ref, skey_ref; struct key *key; int ret; - if (!context) - context = current; - key_ref = ERR_PTR(-ENOKEY); switch (id) { case KEY_SPEC_THREAD_KEYRING: - if (!context->thread_keyring) { + if (!t->thread_keyring) { if (!create) goto error; - ret = install_thread_keyring(context); + ret = install_thread_keyring(); if (ret < 0) { key = ERR_PTR(ret); goto error; } } - key = context->thread_keyring; + key = t->thread_keyring; atomic_inc(&key->usage); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_PROCESS_KEYRING: - if (!context->signal->process_keyring) { + if (!t->signal->process_keyring) { if (!create) goto error; - ret = install_process_keyring(context); + ret = install_process_keyring(); if (ret < 0) { key = ERR_PTR(ret); goto error; } } - key = context->signal->process_keyring; + key = t->signal->process_keyring; atomic_inc(&key->usage); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_SESSION_KEYRING: - if (!context->signal->session_keyring) { + if (!t->signal->session_keyring) { /* always install a session keyring upon access if one * doesn't exist yet */ - ret = install_user_keyrings(context); + ret = install_user_keyrings(); if (ret < 0) goto error; - ret = install_session_keyring( - context, context->user->session_keyring); + ret = install_session_keyring(t->user->session_keyring); if (ret < 0) goto error; } rcu_read_lock(); - key = rcu_dereference(context->signal->session_keyring); + key = rcu_dereference(t->signal->session_keyring); atomic_inc(&key->usage); rcu_read_unlock(); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_USER_KEYRING: - if (!context->user->uid_keyring) { - ret = install_user_keyrings(context); + if (!t->user->uid_keyring) { + ret = install_user_keyrings(); if (ret < 0) goto error; } - key = context->user->uid_keyring; + key = t->user->uid_keyring; atomic_inc(&key->usage); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_USER_SESSION_KEYRING: - if (!context->user->session_keyring) { - ret = install_user_keyrings(context); + if (!t->user->session_keyring) { + ret = install_user_keyrings(); if (ret < 0) goto error; } - key = context->user->session_keyring; + key = t->user->session_keyring; atomic_inc(&key->usage); key_ref = make_key_ref(key, 1); break; @@ -669,7 +670,7 @@ key_ref_t lookup_user_key(struct task_struct *context, key_serial_t id, goto error; case KEY_SPEC_REQKEY_AUTH_KEY: - key = context->request_key_auth; + key = t->request_key_auth; if (!key) goto error; @@ -677,6 +678,25 @@ key_ref_t 
lookup_user_key(struct task_struct *context, key_serial_t id, key_ref = make_key_ref(key, 1); break; + case KEY_SPEC_REQUESTOR_KEYRING: + if (!t->request_key_auth) + goto error; + + down_read(&t->request_key_auth->sem); + if (t->request_key_auth->flags & KEY_FLAG_REVOKED) { + key_ref = ERR_PTR(-EKEYREVOKED); + key = NULL; + } else { + rka = t->request_key_auth->payload.data; + key = rka->dest_keyring; + atomic_inc(&key->usage); + } + up_read(&t->request_key_auth->sem); + if (!key) + goto error; + key_ref = make_key_ref(key, 1); + break; + default: key_ref = ERR_PTR(-EINVAL); if (id < 1) @@ -725,7 +745,7 @@ key_ref_t lookup_user_key(struct task_struct *context, key_serial_t id, goto invalid_key; /* check the permissions */ - ret = key_task_permission(key_ref, context, perm); + ret = key_task_permission(key_ref, t, perm); if (ret < 0) goto invalid_key; @@ -754,7 +774,7 @@ long join_session_keyring(const char *name) /* if no name is provided, install an anonymous keyring */ if (!name) { - ret = install_session_keyring(tsk, NULL); + ret = install_session_keyring(NULL); if (ret < 0) goto error; @@ -784,7 +804,7 @@ long join_session_keyring(const char *name) } /* we've got a keyring - now to install it */ - ret = install_session_keyring(tsk, keyring); + ret = install_session_keyring(keyring); if (ret < 0) goto error2; diff --git a/security/keys/request_key.c b/security/keys/request_key.c index 91953c814497..8e9d93b4a402 100644 --- a/security/keys/request_key.c +++ b/security/keys/request_key.c @@ -76,6 +76,10 @@ static int call_sbin_request_key(struct key_construction *cons, kenter("{%d},{%d},%s", key->serial, authkey->serial, op); + ret = install_user_keyrings(); + if (ret < 0) + goto error_alloc; + /* allocate a new session keyring */ sprintf(desc, "_req.%u", key->serial); @@ -165,7 +169,8 @@ error_alloc: * - we ignore program failure and go on key status instead */ static int construct_key(struct key *key, const void *callout_info, - size_t callout_len, void *aux) + size_t callout_len, void *aux, + struct key *dest_keyring) { struct key_construction *cons; request_key_actor_t actor; @@ -179,7 +184,8 @@ static int construct_key(struct key *key, const void *callout_info, return -ENOMEM; /* allocate an authorisation key */ - authkey = request_key_auth_new(key, callout_info, callout_len); + authkey = request_key_auth_new(key, callout_info, callout_len, + dest_keyring); if (IS_ERR(authkey)) { kfree(cons); ret = PTR_ERR(authkey); @@ -207,27 +213,48 @@ static int construct_key(struct key *key, const void *callout_info, } /* - * link a key to the appropriate destination keyring - * - the caller must hold a write lock on the destination keyring + * get the appropriate destination keyring for the request + * - we return whatever keyring we select with an extra reference upon it which + * the caller must release */ -static void construct_key_make_link(struct key *key, struct key *dest_keyring) +static void construct_get_dest_keyring(struct key **_dest_keyring) { + struct request_key_auth *rka; struct task_struct *tsk = current; - struct key *drop = NULL; + struct key *dest_keyring = *_dest_keyring, *authkey; - kenter("{%d},%p", key->serial, dest_keyring); + kenter("%p", dest_keyring); /* find the appropriate keyring */ - if (!dest_keyring) { + if (dest_keyring) { + /* the caller supplied one */ + key_get(dest_keyring); + } else { + /* use a default keyring; falling through the cases until we + * find one that we actually have */ switch (tsk->jit_keyring) { case KEY_REQKEY_DEFL_DEFAULT: + case 
KEY_REQKEY_DEFL_REQUESTOR_KEYRING: + if (tsk->request_key_auth) { + authkey = tsk->request_key_auth; + down_read(&authkey->sem); + rka = authkey->payload.data; + if (!test_bit(KEY_FLAG_REVOKED, + &authkey->flags)) + dest_keyring = + key_get(rka->dest_keyring); + up_read(&authkey->sem); + if (dest_keyring) + break; + } + case KEY_REQKEY_DEFL_THREAD_KEYRING: - dest_keyring = tsk->thread_keyring; + dest_keyring = key_get(tsk->thread_keyring); if (dest_keyring) break; case KEY_REQKEY_DEFL_PROCESS_KEYRING: - dest_keyring = tsk->signal->process_keyring; + dest_keyring = key_get(tsk->signal->process_keyring); if (dest_keyring) break; @@ -236,17 +263,16 @@ static void construct_key_make_link(struct key *key, struct key *dest_keyring) dest_keyring = key_get( rcu_dereference(tsk->signal->session_keyring)); rcu_read_unlock(); - drop = dest_keyring; if (dest_keyring) break; case KEY_REQKEY_DEFL_USER_SESSION_KEYRING: - dest_keyring = tsk->user->session_keyring; + dest_keyring = key_get(tsk->user->session_keyring); break; case KEY_REQKEY_DEFL_USER_KEYRING: - dest_keyring = tsk->user->uid_keyring; + dest_keyring = key_get(tsk->user->uid_keyring); break; case KEY_REQKEY_DEFL_GROUP_KEYRING: @@ -255,10 +281,9 @@ static void construct_key_make_link(struct key *key, struct key *dest_keyring) } } - /* and attach the key to it */ - __key_link(dest_keyring, key); - key_put(drop); - kleave(""); + *_dest_keyring = dest_keyring; + kleave(" [dk %d]", key_serial(dest_keyring)); + return; } /* @@ -288,8 +313,7 @@ static int construct_alloc_key(struct key_type *type, set_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags); - if (dest_keyring) - down_write(&dest_keyring->sem); + down_write(&dest_keyring->sem); /* attach the key to the destination keyring under lock, but we do need * to do another check just in case someone beat us to it whilst we @@ -301,12 +325,10 @@ static int construct_alloc_key(struct key_type *type, if (!IS_ERR(key_ref)) goto key_already_present; - if (dest_keyring) - construct_key_make_link(key, dest_keyring); + __key_link(dest_keyring, key); mutex_unlock(&key_construction_mutex); - if (dest_keyring) - up_write(&dest_keyring->sem); + up_write(&dest_keyring->sem); mutex_unlock(&user->cons_lock); *_key = key; kleave(" = 0 [%d]", key_serial(key)); @@ -348,21 +370,26 @@ static struct key *construct_key_and_link(struct key_type *type, if (!user) return ERR_PTR(-ENOMEM); + construct_get_dest_keyring(&dest_keyring); + ret = construct_alloc_key(type, description, dest_keyring, flags, user, &key); key_user_put(user); if (ret == 0) { - ret = construct_key(key, callout_info, callout_len, aux); + ret = construct_key(key, callout_info, callout_len, aux, + dest_keyring); if (ret < 0) goto construction_failed; } + key_put(dest_keyring); return key; construction_failed: key_negate_and_link(key, key_negative_timeout, NULL, NULL); key_put(key); + key_put(dest_keyring); return ERR_PTR(ret); } diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c index 729156b3485e..1762d44711d5 100644 --- a/security/keys/request_key_auth.c +++ b/security/keys/request_key_auth.c @@ -128,6 +128,7 @@ static void request_key_auth_destroy(struct key *key) } key_put(rka->target_key); + key_put(rka->dest_keyring); kfree(rka->callout_info); kfree(rka); @@ -139,7 +140,7 @@ static void request_key_auth_destroy(struct key *key) * access to the caller's security data */ struct key *request_key_auth_new(struct key *target, const void *callout_info, - size_t callout_len) + size_t callout_len, struct key *dest_keyring) { struct 
request_key_auth *rka, *irka; struct key *authkey = NULL; @@ -188,6 +189,7 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info, } rka->target_key = key_get(target); + rka->dest_keyring = key_get(dest_keyring); memcpy(rka->callout_info, callout_info, callout_len); rka->callout_len = callout_len; @@ -223,6 +225,7 @@ error_inst: key_put(authkey); error_alloc: key_put(rka->target_key); + key_put(rka->dest_keyring); kfree(rka->callout_info); kfree(rka); kleave("= %d", ret); -- cgit v1.2.3 From 1cdcbec1a3372c0c49c59d292e708fd07b509f18 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:14 +1100 Subject: CRED: Neuter sys_capset() Take away the ability for sys_capset() to affect processes other than current. This means that current will not need to lock its own credentials when reading them against interference by other processes. This has effectively been the case for a while anyway, since: (1) Without LSM enabled, sys_capset() is disallowed. (2) With file-based capabilities, sys_capset() is neutered. Signed-off-by: David Howells Acked-by: Serge Hallyn Acked-by: Andrew G. Morgan Acked-by: James Morris Signed-off-by: James Morris --- fs/open.c | 12 +-- include/linux/security.h | 48 ++++------ kernel/capability.c | 227 +++++------------------------------------------ security/commoncap.c | 29 ++---- security/security.c | 18 ++-- security/selinux/hooks.c | 10 +-- 6 files changed, 61 insertions(+), 283 deletions(-) (limited to 'kernel') diff --git a/fs/open.c b/fs/open.c index 83cdb9dee0c1..500cc0c54762 100644 --- a/fs/open.c +++ b/fs/open.c @@ -441,17 +441,7 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode) current->fsgid = current->gid; if (!issecure(SECURE_NO_SETUID_FIXUP)) { - /* - * Clear the capabilities if we switch to a non-root user - */ -#ifndef CONFIG_SECURITY_FILE_CAPABILITIES - /* - * FIXME: There is a race here against sys_capset. The - * capabilities can change yet we will restore the old - * value below. We should hold task_capabilities_lock, - * but we cannot because user_path_at can sleep. 
-		 */
-#endif /* ndef CONFIG_SECURITY_FILE_CAPABILITIES */
+		/* Clear the capabilities if we switch to a non-root user */
 		if (current->uid)
 			old_cap = cap_set_effective(__cap_empty_set);
 		else
diff --git a/include/linux/security.h b/include/linux/security.h
index 5fe28a671cd3..d1ce8beddbd7 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -53,8 +53,8 @@ extern int cap_settime(struct timespec *ts, struct timezone *tz);
 extern int cap_ptrace_may_access(struct task_struct *child, unsigned int mode);
 extern int cap_ptrace_traceme(struct task_struct *parent);
 extern int cap_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted);
-extern int cap_capset_check(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted);
-extern void cap_capset_set(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted);
+extern int cap_capset_check(kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted);
+extern void cap_capset_set(kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted);
 extern int cap_bprm_set_security(struct linux_binprm *bprm);
 extern void cap_bprm_apply_creds(struct linux_binprm *bprm, int unsafe);
 extern int cap_bprm_secureexec(struct linux_binprm *bprm);
@@ -1191,24 +1191,14 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	Return 0 if the capability sets were successfully obtained.
  * @capset_check:
  *	Check permission before setting the @effective, @inheritable, and
- *	@permitted capability sets for the @target process.
- *	Caveat: @target is also set to current if a set of processes is
- *	specified (i.e. all processes other than current and init or a
- *	particular process group).  Hence, the capset_set hook may need to
- *	revalidate permission to the actual target process.
- *	@target contains the task_struct structure for target process.
+ *	@permitted capability sets for the current process.
  *	@effective contains the effective capability set.
 *	@inheritable contains the inheritable capability set.
 *	@permitted contains the permitted capability set.
 *	Return 0 if permission is granted.
 * @capset_set:
 *	Set the @effective, @inheritable, and @permitted capability sets for
- *	the @target process.  Since capset_check cannot always check permission
- *	to the real @target process, this hook may also perform permission
- *	checking to determine if the current process is allowed to set the
- *	capability sets of the @target process.  However, this hook has no way
- *	of returning an error due to the structure of the sys_capset code.
- *	@target contains the task_struct structure for target process.
+ *	the current process.
 *	@effective contains the effective capability set.
 *	@inheritable contains the inheritable capability set.
 *	@permitted contains the permitted capability set.
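
[The user-visible effect of the hook change above can be shown from user space. The following is an illustrative sketch, not part of the patch; it assumes a kernel with this change applied and uses only standard uapi names (SYS_capset, _LINUX_CAPABILITY_VERSION_2). Nominating any pid other than 0 or the caller's own is now refused with EPERM before the capability sets are even examined.]

	#include <stdio.h>
	#include <errno.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/capability.h>

	int main(void)
	{
		/* Nominate a task other than ourselves (init, here). */
		struct __user_cap_header_struct hdr = {
			.version = _LINUX_CAPABILITY_VERSION_2,
			.pid = 1,
		};
		struct __user_cap_data_struct data[2] = { { 0 } };

		/* The patched sys_capset() rejects this up front:
		 * pid != 0 and pid != current now means EPERM. */
		if (syscall(SYS_capset, &hdr, data) == -1 && errno == EPERM)
			printf("capset() may no longer affect other tasks\n");
		return 0;
	}

[Note that capget() is deliberately left alone: reading another task's capability sets is still possible, only modification is confined to current.]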
@@ -1303,12 +1293,10 @@ struct security_operations { int (*capget) (struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); - int (*capset_check) (struct task_struct *target, - kernel_cap_t *effective, + int (*capset_check) (kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); - void (*capset_set) (struct task_struct *target, - kernel_cap_t *effective, + void (*capset_set) (kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); int (*capable) (struct task_struct *tsk, int cap, int audit); @@ -1572,12 +1560,10 @@ int security_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); -int security_capset_check(struct task_struct *target, - kernel_cap_t *effective, +int security_capset_check(kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); -void security_capset_set(struct task_struct *target, - kernel_cap_t *effective, +void security_capset_set(kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); int security_capable(struct task_struct *tsk, int cap); @@ -1769,20 +1755,18 @@ static inline int security_capget(struct task_struct *target, return cap_capget(target, effective, inheritable, permitted); } -static inline int security_capset_check(struct task_struct *target, - kernel_cap_t *effective, - kernel_cap_t *inheritable, - kernel_cap_t *permitted) +static inline int security_capset_check(kernel_cap_t *effective, + kernel_cap_t *inheritable, + kernel_cap_t *permitted) { - return cap_capset_check(target, effective, inheritable, permitted); + return cap_capset_check(effective, inheritable, permitted); } -static inline void security_capset_set(struct task_struct *target, - kernel_cap_t *effective, - kernel_cap_t *inheritable, - kernel_cap_t *permitted) +static inline void security_capset_set(kernel_cap_t *effective, + kernel_cap_t *inheritable, + kernel_cap_t *permitted) { - cap_capset_set(target, effective, inheritable, permitted); + cap_capset_set(effective, inheritable, permitted); } static inline int security_capable(struct task_struct *tsk, int cap) diff --git a/kernel/capability.c b/kernel/capability.c index adb262f83de1..58b00519624a 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -127,160 +127,6 @@ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy) return 0; } -#ifndef CONFIG_SECURITY_FILE_CAPABILITIES - -/* - * Without filesystem capability support, we nominally support one process - * setting the capabilities of another - */ -static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, - kernel_cap_t *pIp, kernel_cap_t *pPp) -{ - struct task_struct *target; - int ret; - - spin_lock(&task_capability_lock); - read_lock(&tasklist_lock); - - if (pid && pid != task_pid_vnr(current)) { - target = find_task_by_vpid(pid); - if (!target) { - ret = -ESRCH; - goto out; - } - } else - target = current; - - ret = security_capget(target, pEp, pIp, pPp); - -out: - read_unlock(&tasklist_lock); - spin_unlock(&task_capability_lock); - - return ret; -} - -/* - * cap_set_pg - set capabilities for all processes in a given process - * group. We call this holding task_capability_lock and tasklist_lock. 
- */ -static inline int cap_set_pg(int pgrp_nr, kernel_cap_t *effective, - kernel_cap_t *inheritable, - kernel_cap_t *permitted) -{ - struct task_struct *g, *target; - int ret = -EPERM; - int found = 0; - struct pid *pgrp; - - spin_lock(&task_capability_lock); - read_lock(&tasklist_lock); - - pgrp = find_vpid(pgrp_nr); - do_each_pid_task(pgrp, PIDTYPE_PGID, g) { - target = g; - while_each_thread(g, target) { - if (!security_capset_check(target, effective, - inheritable, permitted)) { - security_capset_set(target, effective, - inheritable, permitted); - ret = 0; - } - found = 1; - } - } while_each_pid_task(pgrp, PIDTYPE_PGID, g); - - read_unlock(&tasklist_lock); - spin_unlock(&task_capability_lock); - - if (!found) - ret = 0; - return ret; -} - -/* - * cap_set_all - set capabilities for all processes other than init - * and self. We call this holding task_capability_lock and tasklist_lock. - */ -static inline int cap_set_all(kernel_cap_t *effective, - kernel_cap_t *inheritable, - kernel_cap_t *permitted) -{ - struct task_struct *g, *target; - int ret = -EPERM; - int found = 0; - - spin_lock(&task_capability_lock); - read_lock(&tasklist_lock); - - do_each_thread(g, target) { - if (target == current - || is_container_init(target->group_leader)) - continue; - found = 1; - if (security_capset_check(target, effective, inheritable, - permitted)) - continue; - ret = 0; - security_capset_set(target, effective, inheritable, permitted); - } while_each_thread(g, target); - - read_unlock(&tasklist_lock); - spin_unlock(&task_capability_lock); - - if (!found) - ret = 0; - - return ret; -} - -/* - * Given the target pid does not refer to the current process we - * need more elaborate support... (This support is not present when - * filesystem capabilities are configured.) - */ -static inline int do_sys_capset_other_tasks(pid_t pid, kernel_cap_t *effective, - kernel_cap_t *inheritable, - kernel_cap_t *permitted) -{ - struct task_struct *target; - int ret; - - if (!capable(CAP_SETPCAP)) - return -EPERM; - - if (pid == -1) /* all procs other than current and init */ - return cap_set_all(effective, inheritable, permitted); - - else if (pid < 0) /* all procs in process group */ - return cap_set_pg(-pid, effective, inheritable, permitted); - - /* target != current */ - spin_lock(&task_capability_lock); - read_lock(&tasklist_lock); - - target = find_task_by_vpid(pid); - if (!target) - ret = -ESRCH; - else { - ret = security_capset_check(target, effective, inheritable, - permitted); - - /* having verified that the proposed changes are legal, - we now put them into effect. */ - if (!ret) - security_capset_set(target, effective, inheritable, - permitted); - } - - read_unlock(&tasklist_lock); - spin_unlock(&task_capability_lock); - - return ret; -} - -#else /* ie., def CONFIG_SECURITY_FILE_CAPABILITIES */ - /* * If we have configured with filesystem capability support, then the * only thing that can change the capabilities of the current process @@ -314,22 +160,6 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, return ret; } -/* - * With filesystem capability support configured, the kernel does not - * permit the changing of capabilities in one process by another - * process. (CAP_SETPCAP has much less broad semantics when configured - * this way.) 
- */ -static inline int do_sys_capset_other_tasks(pid_t pid, - kernel_cap_t *effective, - kernel_cap_t *inheritable, - kernel_cap_t *permitted) -{ - return -EPERM; -} - -#endif /* ie., ndef CONFIG_SECURITY_FILE_CAPABILITIES */ - /* * Atomically modify the effective capabilities returning the original * value. No permission check is performed here - it is assumed that the @@ -424,16 +254,14 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr) * @data: pointer to struct that contains the effective, permitted, * and inheritable capabilities * - * Set capabilities for a given process, all processes, or all - * processes in a given process group. + * Set capabilities for the current process only. The ability to any other + * process(es) has been deprecated and removed. * * The restrictions on setting capabilities are specified as: * - * [pid is for the 'target' task. 'current' is the calling task.] - * - * I: any raised capabilities must be a subset of the (old current) permitted - * P: any raised capabilities must be a subset of the (old current) permitted - * E: must be set to a subset of (new target) permitted + * I: any raised capabilities must be a subset of the old permitted + * P: any raised capabilities must be a subset of the old permitted + * E: must be set to a subset of new permitted * * Returns 0 on success and < 0 on error. */ @@ -452,10 +280,13 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) if (get_user(pid, &header->pid)) return -EFAULT; + /* may only affect current now */ + if (pid != 0 && pid != task_pid_vnr(current)) + return -EPERM; + if (copy_from_user(&kdata, data, tocopy - * sizeof(struct __user_cap_data_struct))) { + * sizeof(struct __user_cap_data_struct))) return -EFAULT; - } for (i = 0; i < tocopy; i++) { effective.cap[i] = kdata[i].effective; @@ -473,32 +304,20 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) if (ret) return ret; - if (pid && (pid != task_pid_vnr(current))) - ret = do_sys_capset_other_tasks(pid, &effective, &inheritable, - &permitted); - else { - /* - * This lock is required even when filesystem - * capability support is configured - it protects the - * sys_capget() call from returning incorrect data in - * the case that the targeted process is not the - * current one. - */ - spin_lock(&task_capability_lock); - - ret = security_capset_check(current, &effective, &inheritable, - &permitted); - /* - * Having verified that the proposed changes are - * legal, we now put them into effect. - */ - if (!ret) - security_capset_set(current, &effective, &inheritable, - &permitted); - spin_unlock(&task_capability_lock); - } - + /* This lock is required even when filesystem capability support is + * configured - it protects the sys_capget() call from returning + * incorrect data in the case that the targeted process is not the + * current one. + */ + spin_lock(&task_capability_lock); + ret = security_capset_check(&effective, &inheritable, &permitted); + /* Having verified that the proposed changes are legal, we now put them + * into effect. 
+ */ + if (!ret) + security_capset_set(&effective, &inheritable, &permitted); + spin_unlock(&task_capability_lock); return ret; } diff --git a/security/commoncap.c b/security/commoncap.c index 8283271f0768..e3f36ef629fa 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -96,15 +96,6 @@ int cap_capget (struct task_struct *target, kernel_cap_t *effective, #ifdef CONFIG_SECURITY_FILE_CAPABILITIES -static inline int cap_block_setpcap(struct task_struct *target) -{ - /* - * No support for remote process capability manipulation with - * filesystem capability support. - */ - return (target != current); -} - static inline int cap_inh_is_capped(void) { /* @@ -119,7 +110,6 @@ static inline int cap_limit_ptraced_target(void) { return 1; } #else /* ie., ndef CONFIG_SECURITY_FILE_CAPABILITIES */ -static inline int cap_block_setpcap(struct task_struct *t) { return 0; } static inline int cap_inh_is_capped(void) { return 1; } static inline int cap_limit_ptraced_target(void) { @@ -128,21 +118,18 @@ static inline int cap_limit_ptraced_target(void) #endif /* def CONFIG_SECURITY_FILE_CAPABILITIES */ -int cap_capset_check (struct task_struct *target, kernel_cap_t *effective, +int cap_capset_check (kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) { - if (cap_block_setpcap(target)) { - return -EPERM; - } if (cap_inh_is_capped() && !cap_issubset(*inheritable, - cap_combine(target->cap_inheritable, + cap_combine(current->cap_inheritable, current->cap_permitted))) { /* incapable of using this inheritable set */ return -EPERM; } if (!cap_issubset(*inheritable, - cap_combine(target->cap_inheritable, + cap_combine(current->cap_inheritable, current->cap_bset))) { /* no new pI capabilities outside bounding set */ return -EPERM; @@ -150,7 +137,7 @@ int cap_capset_check (struct task_struct *target, kernel_cap_t *effective, /* verify restrictions on target's new Permitted set */ if (!cap_issubset (*permitted, - cap_combine (target->cap_permitted, + cap_combine (current->cap_permitted, current->cap_permitted))) { return -EPERM; } @@ -163,12 +150,12 @@ int cap_capset_check (struct task_struct *target, kernel_cap_t *effective, return 0; } -void cap_capset_set (struct task_struct *target, kernel_cap_t *effective, +void cap_capset_set (kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) { - target->cap_effective = *effective; - target->cap_inheritable = *inheritable; - target->cap_permitted = *permitted; + current->cap_effective = *effective; + current->cap_inheritable = *inheritable; + current->cap_permitted = *permitted; } static inline void bprm_clear_caps(struct linux_binprm *bprm) diff --git a/security/security.c b/security/security.c index 346f21e0ec2c..dca37381e2a7 100644 --- a/security/security.c +++ b/security/security.c @@ -145,20 +145,18 @@ int security_capget(struct task_struct *target, return security_ops->capget(target, effective, inheritable, permitted); } -int security_capset_check(struct task_struct *target, - kernel_cap_t *effective, - kernel_cap_t *inheritable, - kernel_cap_t *permitted) +int security_capset_check(kernel_cap_t *effective, + kernel_cap_t *inheritable, + kernel_cap_t *permitted) { - return security_ops->capset_check(target, effective, inheritable, permitted); + return security_ops->capset_check(effective, inheritable, permitted); } -void security_capset_set(struct task_struct *target, - kernel_cap_t *effective, - kernel_cap_t *inheritable, - kernel_cap_t *permitted) +void security_capset_set(kernel_cap_t *effective, + 
kernel_cap_t *inheritable,
+			 kernel_cap_t *permitted)
 {
-	security_ops->capset_set(target, effective, inheritable, permitted);
+	security_ops->capset_set(effective, inheritable, permitted);
 }
 
 int security_capable(struct task_struct *tsk, int cap)
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 378dc53c08e8..df9986940e9c 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1790,22 +1790,22 @@ static int selinux_capget(struct task_struct *target, kernel_cap_t *effective,
 	return secondary_ops->capget(target, effective, inheritable, permitted);
 }
 
-static int selinux_capset_check(struct task_struct *target, kernel_cap_t *effective,
+static int selinux_capset_check(kernel_cap_t *effective,
 				kernel_cap_t *inheritable, kernel_cap_t *permitted)
 {
 	int error;
 
-	error = secondary_ops->capset_check(target, effective, inheritable, permitted);
+	error = secondary_ops->capset_check(effective, inheritable, permitted);
 	if (error)
 		return error;
 
-	return task_has_perm(current, target, PROCESS__SETCAP);
+	return task_has_perm(current, current, PROCESS__SETCAP);
 }
 
-static void selinux_capset_set(struct task_struct *target, kernel_cap_t *effective,
+static void selinux_capset_set(kernel_cap_t *effective,
 				kernel_cap_t *inheritable, kernel_cap_t *permitted)
 {
-	secondary_ops->capset_set(target, effective, inheritable, permitted);
+	secondary_ops->capset_set(effective, inheritable, permitted);
 }
 
 static int selinux_capable(struct task_struct *tsk, int cap, int audit)
--
cgit v1.2.3

From b6dff3ec5e116e3af6f537d4caedcad6b9e5082a Mon Sep 17 00:00:00 2001
From: David Howells
Date: Fri, 14 Nov 2008 10:39:16 +1100
Subject: CRED: Separate task security context from task_struct

Separate the task security context from task_struct.  At this point, the
security data is temporarily embedded in the task_struct with two pointers
pointing to it.

Note that the Alpha arch is altered as it refers to (E)UID and (E)GID in
entry.S via asm-offsets.
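
[Before the sign-offs and per-file summary, a brief editorial sketch of the pattern this patch applies. The structures below are abbreviated stand-ins, and task_euid() is a hypothetical helper used purely for illustration; the patch itself open-codes current->cred-><field> at every call site.]

	#include <sys/types.h>

	/* Abbreviated stand-in for the struct cred this patch introduces. */
	struct cred {
		uid_t	uid, euid, suid, fsuid;	/* real, effective, saved, fs UIDs */
		gid_t	gid, egid, sgid, fsgid;	/* and the matching GIDs */
		/* ... group_info, capability sets, keyrings, LSM blob ... */
	};

	struct task_struct_fragment {		/* stand-in for the real task_struct */
		struct cred *cred;		/* the task's credentials */
	};

	/* Call sites convert mechanically, e.g.: */
	static inline uid_t task_euid(struct task_struct_fragment *tsk)
	{
		return tsk->cred->euid;		/* was: tsk->euid */
	}

[Funnelling every access through one pointer is what later lets this series make credentials copy-on-write: a fresh set can be swapped in atomically instead of each field being rewritten in place.]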
With comment fixes Signed-off-by: Marc Dionne Signed-off-by: David Howells Acked-by: James Morris Acked-by: Serge Hallyn Signed-off-by: James Morris --- arch/alpha/kernel/asm-offsets.c | 11 +- arch/alpha/kernel/entry.S | 10 +- arch/ia64/ia32/sys_ia32.c | 8 +- arch/mips/kernel/kspd.c | 4 +- arch/s390/kernel/compat_linux.c | 28 ++--- drivers/connector/cn_proc.c | 8 +- fs/binfmt_elf.c | 12 +- fs/binfmt_elf_fdpic.c | 12 +- fs/exec.c | 4 +- fs/fcntl.c | 4 +- fs/file_table.c | 4 +- fs/fuse/dir.c | 12 +- fs/hugetlbfs/inode.c | 4 +- fs/ioprio.c | 12 +- fs/nfsd/auth.c | 22 ++-- fs/nfsd/nfs4recover.c | 12 +- fs/nfsd/nfsfh.c | 6 +- fs/open.c | 17 +-- fs/proc/array.c | 18 +-- fs/proc/base.c | 16 +-- fs/xfs/linux-2.6/xfs_cred.h | 6 +- fs/xfs/linux-2.6/xfs_globals.h | 2 +- fs/xfs/xfs_inode.h | 2 +- fs/xfs/xfs_vnodeops.h | 10 +- include/linux/cred.h | 155 +++++++++++++++++++---- include/linux/init_task.h | 24 ++-- include/linux/sched.h | 52 +------- include/linux/securebits.h | 2 +- ipc/mqueue.c | 2 +- ipc/shm.c | 4 +- kernel/auditsc.c | 52 ++++---- kernel/capability.c | 4 +- kernel/cgroup.c | 4 +- kernel/exit.c | 10 +- kernel/fork.c | 24 ++-- kernel/futex.c | 6 +- kernel/futex_compat.c | 5 +- kernel/ptrace.c | 19 +-- kernel/sched.c | 10 +- kernel/signal.c | 16 +-- kernel/sys.c | 266 ++++++++++++++++++++++----------------- kernel/trace/trace.c | 2 +- kernel/tsacct.c | 4 +- kernel/uid16.c | 28 ++--- kernel/user.c | 4 +- mm/mempolicy.c | 10 +- mm/migrate.c | 10 +- mm/oom_kill.c | 2 +- net/core/scm.c | 10 +- net/sunrpc/auth.c | 2 +- security/commoncap.c | 161 +++++++++++++----------- security/keys/keyctl.c | 25 ++-- security/keys/permission.c | 11 +- security/keys/process_keys.c | 98 ++++++++------- security/keys/request_key.c | 18 +-- security/keys/request_key_auth.c | 12 +- security/selinux/exports.c | 2 +- security/selinux/hooks.c | 116 ++++++++--------- security/selinux/selinuxfs.c | 2 +- security/selinux/xfrm.c | 6 +- security/smack/smack_access.c | 4 +- security/smack/smack_lsm.c | 77 ++++++------ security/smack/smackfs.c | 6 +- 63 files changed, 832 insertions(+), 677 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/asm-offsets.c b/arch/alpha/kernel/asm-offsets.c index 4b18cd94d59d..6ff8886e7e22 100644 --- a/arch/alpha/kernel/asm-offsets.c +++ b/arch/alpha/kernel/asm-offsets.c @@ -19,15 +19,18 @@ void foo(void) BLANK(); DEFINE(TASK_BLOCKED, offsetof(struct task_struct, blocked)); - DEFINE(TASK_UID, offsetof(struct task_struct, uid)); - DEFINE(TASK_EUID, offsetof(struct task_struct, euid)); - DEFINE(TASK_GID, offsetof(struct task_struct, gid)); - DEFINE(TASK_EGID, offsetof(struct task_struct, egid)); + DEFINE(TASK_CRED, offsetof(struct task_struct, cred)); DEFINE(TASK_REAL_PARENT, offsetof(struct task_struct, real_parent)); DEFINE(TASK_GROUP_LEADER, offsetof(struct task_struct, group_leader)); DEFINE(TASK_TGID, offsetof(struct task_struct, tgid)); BLANK(); + DEFINE(CRED_UID, offsetof(struct cred, uid)); + DEFINE(CRED_EUID, offsetof(struct cred, euid)); + DEFINE(CRED_GID, offsetof(struct cred, gid)); + DEFINE(CRED_EGID, offsetof(struct cred, egid)); + BLANK(); + DEFINE(SIZEOF_PT_REGS, sizeof(struct pt_regs)); DEFINE(PT_PTRACED, PT_PTRACED); DEFINE(CLONE_VM, CLONE_VM); diff --git a/arch/alpha/kernel/entry.S b/arch/alpha/kernel/entry.S index 5fc61e281ac7..f77345bc66a9 100644 --- a/arch/alpha/kernel/entry.S +++ b/arch/alpha/kernel/entry.S @@ -850,8 +850,9 @@ osf_getpriority: sys_getxuid: .prologue 0 ldq $2, TI_TASK($8) - ldl $0, TASK_UID($2) - ldl $1, TASK_EUID($2) + ldq $3, TASK_CRED($2) 
+ ldl $0, CRED_UID($3) + ldl $1, CRED_EUID($3) stq $1, 80($sp) ret .end sys_getxuid @@ -862,8 +863,9 @@ sys_getxuid: sys_getxgid: .prologue 0 ldq $2, TI_TASK($8) - ldl $0, TASK_GID($2) - ldl $1, TASK_EGID($2) + ldq $3, TASK_CRED($2) + ldl $0, CRED_GID($3) + ldl $1, CRED_EGID($3) stq $1, 80($sp) ret .end sys_getxgid diff --git a/arch/ia64/ia32/sys_ia32.c b/arch/ia64/ia32/sys_ia32.c index 5e92ae00bdbb..2445a9d3488e 100644 --- a/arch/ia64/ia32/sys_ia32.c +++ b/arch/ia64/ia32/sys_ia32.c @@ -1772,20 +1772,20 @@ sys32_getgroups16 (int gidsetsize, short __user *grouplist) if (gidsetsize < 0) return -EINVAL; - get_group_info(current->group_info); - i = current->group_info->ngroups; + get_group_info(current->cred->group_info); + i = current->cred->group_info->ngroups; if (gidsetsize) { if (i > gidsetsize) { i = -EINVAL; goto out; } - if (groups16_to_user(grouplist, current->group_info)) { + if (groups16_to_user(grouplist, current->cred->group_info)) { i = -EFAULT; goto out; } } out: - put_group_info(current->group_info); + put_group_info(current->cred->group_info); return i; } diff --git a/arch/mips/kernel/kspd.c b/arch/mips/kernel/kspd.c index b0591ae0ce56..fd6e51224034 100644 --- a/arch/mips/kernel/kspd.c +++ b/arch/mips/kernel/kspd.c @@ -174,8 +174,8 @@ static unsigned int translate_open_flags(int flags) static void sp_setfsuidgid( uid_t uid, gid_t gid) { - current->fsuid = uid; - current->fsgid = gid; + current->cred->fsuid = uid; + current->cred->fsgid = gid; key_fsuid_changed(current); key_fsgid_changed(current); diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c index 4646382af34f..6cc87d8c8682 100644 --- a/arch/s390/kernel/compat_linux.c +++ b/arch/s390/kernel/compat_linux.c @@ -148,9 +148,9 @@ asmlinkage long sys32_getresuid16(u16 __user *ruid, u16 __user *euid, u16 __user { int retval; - if (!(retval = put_user(high2lowuid(current->uid), ruid)) && - !(retval = put_user(high2lowuid(current->euid), euid))) - retval = put_user(high2lowuid(current->suid), suid); + if (!(retval = put_user(high2lowuid(current->cred->uid), ruid)) && + !(retval = put_user(high2lowuid(current->cred->euid), euid))) + retval = put_user(high2lowuid(current->cred->suid), suid); return retval; } @@ -165,9 +165,9 @@ asmlinkage long sys32_getresgid16(u16 __user *rgid, u16 __user *egid, u16 __user { int retval; - if (!(retval = put_user(high2lowgid(current->gid), rgid)) && - !(retval = put_user(high2lowgid(current->egid), egid))) - retval = put_user(high2lowgid(current->sgid), sgid); + if (!(retval = put_user(high2lowgid(current->cred->gid), rgid)) && + !(retval = put_user(high2lowgid(current->cred->egid), egid))) + retval = put_user(high2lowgid(current->cred->sgid), sgid); return retval; } @@ -217,20 +217,20 @@ asmlinkage long sys32_getgroups16(int gidsetsize, u16 __user *grouplist) if (gidsetsize < 0) return -EINVAL; - get_group_info(current->group_info); - i = current->group_info->ngroups; + get_group_info(current->cred->group_info); + i = current->cred->group_info->ngroups; if (gidsetsize) { if (i > gidsetsize) { i = -EINVAL; goto out; } - if (groups16_to_user(grouplist, current->group_info)) { + if (groups16_to_user(grouplist, current->cred->group_info)) { i = -EFAULT; goto out; } } out: - put_group_info(current->group_info); + put_group_info(current->cred->group_info); return i; } @@ -261,22 +261,22 @@ asmlinkage long sys32_setgroups16(int gidsetsize, u16 __user *grouplist) asmlinkage long sys32_getuid16(void) { - return high2lowuid(current->uid); + return high2lowuid(current->cred->uid); 
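+	/*
+	 * (Illustrative note, not part of the original change: every 16-bit
+	 * wrapper in this file follows the same mechanical pattern - read the
+	 * 32-bit value from current->cred instead of task_struct and narrow
+	 * it with high2lowuid()/high2lowgid().)
+	 */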
} asmlinkage long sys32_geteuid16(void) { - return high2lowuid(current->euid); + return high2lowuid(current->cred->euid); } asmlinkage long sys32_getgid16(void) { - return high2lowgid(current->gid); + return high2lowgid(current->cred->gid); } asmlinkage long sys32_getegid16(void) { - return high2lowgid(current->egid); + return high2lowgid(current->cred->egid); } /* diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c index 5c9f67f98d10..354c1ff17159 100644 --- a/drivers/connector/cn_proc.c +++ b/drivers/connector/cn_proc.c @@ -116,11 +116,11 @@ void proc_id_connector(struct task_struct *task, int which_id) ev->event_data.id.process_pid = task->pid; ev->event_data.id.process_tgid = task->tgid; if (which_id == PROC_EVENT_UID) { - ev->event_data.id.r.ruid = task->uid; - ev->event_data.id.e.euid = task->euid; + ev->event_data.id.r.ruid = task->cred->uid; + ev->event_data.id.e.euid = task->cred->euid; } else if (which_id == PROC_EVENT_GID) { - ev->event_data.id.r.rgid = task->gid; - ev->event_data.id.e.egid = task->egid; + ev->event_data.id.r.rgid = task->cred->gid; + ev->event_data.id.e.egid = task->cred->egid; } else return; get_seq(&msg->seq, &ev->cpu); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 8fcfa398d350..7a52477ce493 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -223,10 +223,10 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, NEW_AUX_ENT(AT_BASE, interp_load_addr); NEW_AUX_ENT(AT_FLAGS, 0); NEW_AUX_ENT(AT_ENTRY, exec->e_entry); - NEW_AUX_ENT(AT_UID, tsk->uid); - NEW_AUX_ENT(AT_EUID, tsk->euid); - NEW_AUX_ENT(AT_GID, tsk->gid); - NEW_AUX_ENT(AT_EGID, tsk->egid); + NEW_AUX_ENT(AT_UID, tsk->cred->uid); + NEW_AUX_ENT(AT_EUID, tsk->cred->euid); + NEW_AUX_ENT(AT_GID, tsk->cred->gid); + NEW_AUX_ENT(AT_EGID, tsk->cred->egid); NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); NEW_AUX_ENT(AT_EXECFN, bprm->exec); if (k_platform) { @@ -1388,8 +1388,8 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, psinfo->pr_zomb = psinfo->pr_sname == 'Z'; psinfo->pr_nice = task_nice(p); psinfo->pr_flag = p->flags; - SET_UID(psinfo->pr_uid, p->uid); - SET_GID(psinfo->pr_gid, p->gid); + SET_UID(psinfo->pr_uid, p->cred->uid); + SET_GID(psinfo->pr_gid, p->cred->gid); strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname)); return 0; diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 488584c87512..9f67054c2c4e 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -623,10 +623,10 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, NEW_AUX_ENT(AT_BASE, interp_params->elfhdr_addr); NEW_AUX_ENT(AT_FLAGS, 0); NEW_AUX_ENT(AT_ENTRY, exec_params->entry_addr); - NEW_AUX_ENT(AT_UID, (elf_addr_t) current_uid()); - NEW_AUX_ENT(AT_EUID, (elf_addr_t) current_euid()); - NEW_AUX_ENT(AT_GID, (elf_addr_t) current_gid()); - NEW_AUX_ENT(AT_EGID, (elf_addr_t) current_egid()); + NEW_AUX_ENT(AT_UID, (elf_addr_t) current->cred->uid); + NEW_AUX_ENT(AT_EUID, (elf_addr_t) current->cred->euid); + NEW_AUX_ENT(AT_GID, (elf_addr_t) current->cred->gid); + NEW_AUX_ENT(AT_EGID, (elf_addr_t) current->cred->egid); NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); NEW_AUX_ENT(AT_EXECFN, bprm->exec); @@ -1440,8 +1440,8 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, psinfo->pr_zomb = psinfo->pr_sname == 'Z'; psinfo->pr_nice = task_nice(p); psinfo->pr_flag = p->flags; - SET_UID(psinfo->pr_uid, p->uid); - SET_GID(psinfo->pr_gid, p->gid); + SET_UID(psinfo->pr_uid, p->cred->uid); + 
SET_GID(psinfo->pr_gid, p->cred->gid); strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname)); return 0; diff --git a/fs/exec.c b/fs/exec.c index 604834f3b208..31149e430a89 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1738,7 +1738,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) */ if (get_dumpable(mm) == 2) { /* Setuid core dump mode */ flag = O_EXCL; /* Stop rewrite attacks */ - current->fsuid = 0; /* Dump root private */ + current->cred->fsuid = 0; /* Dump root private */ } retval = coredump_wait(exit_code, &core_state); @@ -1834,7 +1834,7 @@ fail_unlock: if (helper_argv) argv_free(helper_argv); - current->fsuid = fsuid; + current->cred->fsuid = fsuid; coredump_finish(mm); fail: return retval; diff --git a/fs/fcntl.c b/fs/fcntl.c index bf049a805e59..63964d863ad6 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -401,8 +401,8 @@ static inline int sigio_perm(struct task_struct *p, struct fown_struct *fown, int sig) { return (((fown->euid == 0) || - (fown->euid == p->suid) || (fown->euid == p->uid) || - (fown->uid == p->suid) || (fown->uid == p->uid)) && + (fown->euid == p->cred->suid) || (fown->euid == p->cred->uid) || + (fown->uid == p->cred->suid) || (fown->uid == p->cred->uid)) && !security_file_send_sigiotask(p, fown, sig)); } diff --git a/fs/file_table.c b/fs/file_table.c index 5ad0eca6eea2..3152b53cfab0 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -122,8 +122,8 @@ struct file *get_empty_filp(void) INIT_LIST_HEAD(&f->f_u.fu_list); atomic_long_set(&f->f_count, 1); rwlock_init(&f->f_owner.lock); - f->f_uid = tsk->fsuid; - f->f_gid = tsk->fsgid; + f->f_uid = tsk->cred->fsuid; + f->f_gid = tsk->cred->fsgid; eventpoll_init_file(f); /* f->f_version: 0 */ return f; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index fd03330cadeb..e97a98981862 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -872,12 +872,12 @@ int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task) if (fc->flags & FUSE_ALLOW_OTHER) return 1; - if (task->euid == fc->user_id && - task->suid == fc->user_id && - task->uid == fc->user_id && - task->egid == fc->group_id && - task->sgid == fc->group_id && - task->gid == fc->group_id) + if (task->cred->euid == fc->user_id && + task->cred->suid == fc->user_id && + task->cred->uid == fc->user_id && + task->cred->egid == fc->group_id && + task->cred->sgid == fc->group_id && + task->cred->gid == fc->group_id) return 1; return 0; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 08ad76c79b49..870a721b8bd2 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -958,7 +958,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size) if (!can_do_hugetlb_shm()) return ERR_PTR(-EPERM); - if (!user_shm_lock(size, current->user)) + if (!user_shm_lock(size, current->cred->user)) return ERR_PTR(-ENOMEM); root = hugetlbfs_vfsmount->mnt_root; @@ -998,7 +998,7 @@ out_inode: out_dentry: dput(dentry); out_shm_unlock: - user_shm_unlock(size, current->user); + user_shm_unlock(size, current->cred->user); return ERR_PTR(error); } diff --git a/fs/ioprio.c b/fs/ioprio.c index 68d2cd807118..bb5210af77c2 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -32,8 +32,8 @@ static int set_task_ioprio(struct task_struct *task, int ioprio) int err; struct io_context *ioc; - if (task->uid != current_euid() && - task->uid != current_uid() && !capable(CAP_SYS_NICE)) + if (task->cred->uid != current_euid() && + task->cred->uid != current_uid() && !capable(CAP_SYS_NICE)) return -EPERM; err = security_task_setioprio(task, ioprio); @@ -123,7 +123,7 @@ 
asmlinkage long sys_ioprio_set(int which, int who, int ioprio) break; case IOPRIO_WHO_USER: if (!who) - user = current->user; + user = current->cred->user; else user = find_user(who); @@ -131,7 +131,7 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio) break; do_each_thread(g, p) { - if (p->uid != who) + if (p->cred->uid != who) continue; ret = set_task_ioprio(p, ioprio); if (ret) @@ -216,7 +216,7 @@ asmlinkage long sys_ioprio_get(int which, int who) break; case IOPRIO_WHO_USER: if (!who) - user = current->user; + user = current->cred->user; else user = find_user(who); @@ -224,7 +224,7 @@ asmlinkage long sys_ioprio_get(int which, int who) break; do_each_thread(g, p) { - if (p->uid != user->uid) + if (p->cred->uid != user->uid) continue; tmpio = get_task_ioprio(p); if (tmpio < 0) diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 294992e9bf69..808fc03a6fbd 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -27,6 +27,7 @@ int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp) int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) { + struct cred *act_as = current->cred ; struct svc_cred cred = rqstp->rq_cred; int i; int flags = nfsexp_flags(rqstp, exp); @@ -55,25 +56,26 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) get_group_info(cred.cr_group_info); if (cred.cr_uid != (uid_t) -1) - current->fsuid = cred.cr_uid; + act_as->fsuid = cred.cr_uid; else - current->fsuid = exp->ex_anon_uid; + act_as->fsuid = exp->ex_anon_uid; if (cred.cr_gid != (gid_t) -1) - current->fsgid = cred.cr_gid; + act_as->fsgid = cred.cr_gid; else - current->fsgid = exp->ex_anon_gid; + act_as->fsgid = exp->ex_anon_gid; if (!cred.cr_group_info) return -ENOMEM; - ret = set_current_groups(cred.cr_group_info); + ret = set_groups(act_as, cred.cr_group_info); put_group_info(cred.cr_group_info); if ((cred.cr_uid)) { - current->cap_effective = - cap_drop_nfsd_set(current->cap_effective); + act_as->cap_effective = + cap_drop_nfsd_set(act_as->cap_effective); } else { - current->cap_effective = - cap_raise_nfsd_set(current->cap_effective, - current->cap_permitted); + act_as->cap_effective = + cap_raise_nfsd_set(act_as->cap_effective, + act_as->cap_permitted); } return ret; } + diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index bb93946ace22..a5e14e8695ea 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -57,17 +57,17 @@ static int rec_dir_init = 0; static void nfs4_save_user(uid_t *saveuid, gid_t *savegid) { - *saveuid = current->fsuid; - *savegid = current->fsgid; - current->fsuid = 0; - current->fsgid = 0; + *saveuid = current->cred->fsuid; + *savegid = current->cred->fsgid; + current->cred->fsuid = 0; + current->cred->fsgid = 0; } static void nfs4_reset_user(uid_t saveuid, gid_t savegid) { - current->fsuid = saveuid; - current->fsgid = savegid; + current->cred->fsuid = saveuid; + current->cred->fsgid = savegid; } static void diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index cd25d91895a1..e67cfaea0865 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -186,9 +186,9 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) * access control settings being in effect, we cannot * fix that case easily. 
*/ - current->cap_effective = - cap_raise_nfsd_set(current->cap_effective, - current->cap_permitted); + current->cred->cap_effective = + cap_raise_nfsd_set(current->cred->cap_effective, + current->cred->cap_permitted); } else { error = nfsd_setuser_and_check_port(rqstp, exp); if (error) diff --git a/fs/open.c b/fs/open.c index 500cc0c54762..b1238e195e7e 100644 --- a/fs/open.c +++ b/fs/open.c @@ -425,6 +425,7 @@ out: */ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode) { + struct cred *cred = current->cred; struct path path; struct inode *inode; int old_fsuid, old_fsgid; @@ -434,18 +435,18 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode) if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ return -EINVAL; - old_fsuid = current->fsuid; - old_fsgid = current->fsgid; + old_fsuid = cred->fsuid; + old_fsgid = cred->fsgid; - current->fsuid = current->uid; - current->fsgid = current->gid; + cred->fsuid = cred->uid; + cred->fsgid = cred->gid; if (!issecure(SECURE_NO_SETUID_FIXUP)) { /* Clear the capabilities if we switch to a non-root user */ - if (current->uid) + if (current->cred->uid) old_cap = cap_set_effective(__cap_empty_set); else - old_cap = cap_set_effective(current->cap_permitted); + old_cap = cap_set_effective(cred->cap_permitted); } res = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path); @@ -484,8 +485,8 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode) out_path_release: path_put(&path); out: - current->fsuid = old_fsuid; - current->fsgid = old_fsgid; + cred->fsuid = old_fsuid; + cred->fsgid = old_fsgid; if (!issecure(SECURE_NO_SETUID_FIXUP)) cap_set_effective(old_cap); diff --git a/fs/proc/array.c b/fs/proc/array.c index 6af7fba7abb1..62fe9b2009b6 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -182,8 +182,8 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, task_tgid_nr_ns(p, ns), pid_nr_ns(pid, ns), ppid, tpid, - p->uid, p->euid, p->suid, p->fsuid, - p->gid, p->egid, p->sgid, p->fsgid); + p->cred->uid, p->cred->euid, p->cred->suid, p->cred->fsuid, + p->cred->gid, p->cred->egid, p->cred->sgid, p->cred->fsgid); task_lock(p); if (p->files) @@ -194,7 +194,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, fdt ? 
fdt->max_fds : 0); rcu_read_unlock(); - group_info = p->group_info; + group_info = p->cred->group_info; get_group_info(group_info); task_unlock(p); @@ -262,7 +262,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p) blocked = p->blocked; collect_sigign_sigcatch(p, &ignored, &caught); num_threads = atomic_read(&p->signal->count); - qsize = atomic_read(&p->user->sigpending); + qsize = atomic_read(&p->cred->user->sigpending); qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur; unlock_task_sighand(p, &flags); } @@ -293,10 +293,12 @@ static void render_cap_t(struct seq_file *m, const char *header, static inline void task_cap(struct seq_file *m, struct task_struct *p) { - render_cap_t(m, "CapInh:\t", &p->cap_inheritable); - render_cap_t(m, "CapPrm:\t", &p->cap_permitted); - render_cap_t(m, "CapEff:\t", &p->cap_effective); - render_cap_t(m, "CapBnd:\t", &p->cap_bset); + struct cred *cred = p->cred; + + render_cap_t(m, "CapInh:\t", &cred->cap_inheritable); + render_cap_t(m, "CapPrm:\t", &cred->cap_permitted); + render_cap_t(m, "CapEff:\t", &cred->cap_effective); + render_cap_t(m, "CapBnd:\t", &cred->cap_bset); } static inline void task_context_switch_counts(struct seq_file *m, diff --git a/fs/proc/base.c b/fs/proc/base.c index 486cf3fe7139..6862b360c36c 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1428,8 +1428,8 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st inode->i_uid = 0; inode->i_gid = 0; if (task_dumpable(task)) { - inode->i_uid = task->euid; - inode->i_gid = task->egid; + inode->i_uid = task->cred->euid; + inode->i_gid = task->cred->egid; } security_task_to_inode(task, inode); @@ -1454,8 +1454,8 @@ static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat if (task) { if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || task_dumpable(task)) { - stat->uid = task->euid; - stat->gid = task->egid; + stat->uid = task->cred->euid; + stat->gid = task->cred->egid; } } rcu_read_unlock(); @@ -1486,8 +1486,8 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) if (task) { if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || task_dumpable(task)) { - inode->i_uid = task->euid; - inode->i_gid = task->egid; + inode->i_uid = task->cred->euid; + inode->i_gid = task->cred->egid; } else { inode->i_uid = 0; inode->i_gid = 0; @@ -1658,8 +1658,8 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) rcu_read_unlock(); put_files_struct(files); if (task_dumpable(task)) { - inode->i_uid = task->euid; - inode->i_gid = task->egid; + inode->i_uid = task->cred->euid; + inode->i_gid = task->cred->egid; } else { inode->i_uid = 0; inode->i_gid = 0; diff --git a/fs/xfs/linux-2.6/xfs_cred.h b/fs/xfs/linux-2.6/xfs_cred.h index 293043a5573a..8c022cd0ad67 100644 --- a/fs/xfs/linux-2.6/xfs_cred.h +++ b/fs/xfs/linux-2.6/xfs_cred.h @@ -23,11 +23,9 @@ /* * Credentials */ -typedef struct cred { - /* EMPTY */ -} cred_t; +typedef const struct cred cred_t; -extern struct cred *sys_cred; +extern cred_t *sys_cred; /* this is a hack.. 
(assumes sys_cred is the only cred_t in the system) */ static inline int capable_cred(cred_t *cr, int cid) diff --git a/fs/xfs/linux-2.6/xfs_globals.h b/fs/xfs/linux-2.6/xfs_globals.h index 2770b0085ee8..6eda8a3eb6f1 100644 --- a/fs/xfs/linux-2.6/xfs_globals.h +++ b/fs/xfs/linux-2.6/xfs_globals.h @@ -19,6 +19,6 @@ #define __XFS_GLOBALS_H__ extern uint64_t xfs_panic_mask; /* set to cause more panics */ -extern struct cred *sys_cred; +extern cred_t *sys_cred; #endif /* __XFS_GLOBALS_H__ */ diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 1420c49674d7..6be310d41daf 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -497,7 +497,7 @@ int xfs_iread(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, xfs_inode_t **, xfs_daddr_t, uint); int xfs_iread_extents(struct xfs_trans *, xfs_inode_t *, int); int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t, - xfs_nlink_t, xfs_dev_t, struct cred *, xfs_prid_t, + xfs_nlink_t, xfs_dev_t, cred_t *, xfs_prid_t, int, struct xfs_buf **, boolean_t *, xfs_inode_t **); void xfs_dinode_from_disk(struct xfs_icdinode *, struct xfs_dinode_core *); diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index e932a96bec54..7b0c2ab88333 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -16,7 +16,7 @@ struct xfs_iomap; int xfs_open(struct xfs_inode *ip); int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags, - struct cred *credp); + cred_t *credp); #define XFS_ATTR_DMI 0x01 /* invocation from a DMI function */ #define XFS_ATTR_NONBLOCK 0x02 /* return EAGAIN if operation would block */ #define XFS_ATTR_NOLOCK 0x04 /* Don't grab any conflicting locks */ @@ -28,24 +28,24 @@ int xfs_inactive(struct xfs_inode *ip); int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode **ipp, struct xfs_name *ci_name); int xfs_create(struct xfs_inode *dp, struct xfs_name *name, mode_t mode, - xfs_dev_t rdev, struct xfs_inode **ipp, struct cred *credp); + xfs_dev_t rdev, struct xfs_inode **ipp, cred_t *credp); int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode *ip); int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, struct xfs_name *target_name); int xfs_mkdir(struct xfs_inode *dp, struct xfs_name *dir_name, - mode_t mode, struct xfs_inode **ipp, struct cred *credp); + mode_t mode, struct xfs_inode **ipp, cred_t *credp); int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize, xfs_off_t *offset, filldir_t filldir); int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, const char *target_path, mode_t mode, struct xfs_inode **ipp, - struct cred *credp); + cred_t *credp); int xfs_inode_flush(struct xfs_inode *ip, int flags); int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state); int xfs_reclaim(struct xfs_inode *ip); int xfs_change_file_space(struct xfs_inode *ip, int cmd, xfs_flock64_t *bf, xfs_off_t offset, - struct cred *credp, int attr_flags); + cred_t *credp, int attr_flags); int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name, struct xfs_inode *src_ip, struct xfs_inode *target_dp, struct xfs_name *target_name, struct xfs_inode *target_ip); diff --git a/include/linux/cred.h b/include/linux/cred.h index b69222cc1fd2..3e65587a72e5 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -12,39 +12,150 @@ #ifndef _LINUX_CRED_H #define _LINUX_CRED_H -#define get_current_user() (get_uid(current->user)) - -#define task_uid(task) ((task)->uid) -#define task_gid(task) ((task)->gid) -#define task_euid(task) ((task)->euid) 
-#define task_egid(task) ((task)->egid) - -#define current_uid() (current->uid) -#define current_gid() (current->gid) -#define current_euid() (current->euid) -#define current_egid() (current->egid) -#define current_suid() (current->suid) -#define current_sgid() (current->sgid) -#define current_fsuid() (current->fsuid) -#define current_fsgid() (current->fsgid) -#define current_cap() (current->cap_effective) +#include +#include +#include + +struct user_struct; +struct cred; + +/* + * COW Supplementary groups list + */ +#define NGROUPS_SMALL 32 +#define NGROUPS_PER_BLOCK ((unsigned int)(PAGE_SIZE / sizeof(gid_t))) + +struct group_info { + atomic_t usage; + int ngroups; + int nblocks; + gid_t small_block[NGROUPS_SMALL]; + gid_t *blocks[0]; +}; + +/** + * get_group_info - Get a reference to a group info structure + * @group_info: The group info to reference + * + * This must be called with the owning task locked (via task_lock()) when task + * != current. The reason being that the vast majority of callers are looking + * at current->group_info, which can not be changed except by the current task. + * Changing current->group_info requires the task lock, too. + */ +#define get_group_info(group_info) \ +do { \ + atomic_inc(&(group_info)->usage); \ +} while (0) + +/** + * put_group_info - Release a reference to a group info structure + * @group_info: The group info to release + */ +#define put_group_info(group_info) \ +do { \ + if (atomic_dec_and_test(&(group_info)->usage)) \ + groups_free(group_info); \ +} while (0) + +extern struct group_info *groups_alloc(int); +extern void groups_free(struct group_info *); +extern int set_current_groups(struct group_info *); +extern int set_groups(struct cred *, struct group_info *); +extern int groups_search(struct group_info *, gid_t); + +/* access the groups "array" with this macro */ +#define GROUP_AT(gi, i) \ + ((gi)->blocks[(i) / NGROUPS_PER_BLOCK][(i) % NGROUPS_PER_BLOCK]) + +extern int in_group_p(gid_t); +extern int in_egroup_p(gid_t); + +/* + * The security context of a task + * + * The parts of the context break down into two categories: + * + * (1) The objective context of a task. These parts are used when some other + * task is attempting to affect this one. + * + * (2) The subjective context. These details are used when the task is acting + * upon another object, be that a file, a task, a key or whatever. + * + * Note that some members of this structure belong to both categories - the + * LSM security pointer for instance. + * + * A task has two security pointers. task->real_cred points to the objective + * context that defines that task's actual details. The objective part of this + * context is used whenever that task is acted upon. + * + * task->cred points to the subjective context that defines the details of how + * that task is going to act upon another object. This may be overridden + * temporarily to point to another security context, but normally points to the + * same context as task->real_cred. 
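+ *
+ * As a concrete illustration of the split (hypothetical scenario, not code
+ * from this patch): when this task performs a VFS operation, permission is
+ * checked against its subjective context (current->cred->fsuid and
+ * current->cred->fsgid); when another task signals or ptraces it, the
+ * check is made against this task's own credentials as the objective
+ * context.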
+ */ +struct cred { + atomic_t usage; + uid_t uid; /* real UID of the task */ + gid_t gid; /* real GID of the task */ + uid_t suid; /* saved UID of the task */ + gid_t sgid; /* saved GID of the task */ + uid_t euid; /* effective UID of the task */ + gid_t egid; /* effective GID of the task */ + uid_t fsuid; /* UID for VFS ops */ + gid_t fsgid; /* GID for VFS ops */ + unsigned securebits; /* SUID-less security management */ + kernel_cap_t cap_inheritable; /* caps our children can inherit */ + kernel_cap_t cap_permitted; /* caps we're permitted */ + kernel_cap_t cap_effective; /* caps we can actually use */ + kernel_cap_t cap_bset; /* capability bounding set */ +#ifdef CONFIG_KEYS + unsigned char jit_keyring; /* default keyring to attach requested + * keys to */ + struct key *thread_keyring; /* keyring private to this thread */ + struct key *request_key_auth; /* assumed request_key authority */ +#endif +#ifdef CONFIG_SECURITY + void *security; /* subjective LSM security */ +#endif + struct user_struct *user; /* real user ID subscription */ + struct group_info *group_info; /* supplementary groups for euid/fsgid */ + struct rcu_head rcu; /* RCU deletion hook */ + spinlock_t lock; /* lock for pointer changes */ +}; + +#define get_current_user() (get_uid(current->cred->user)) + +#define task_uid(task) ((task)->cred->uid) +#define task_gid(task) ((task)->cred->gid) +#define task_euid(task) ((task)->cred->euid) +#define task_egid(task) ((task)->cred->egid) + +#define current_uid() (current->cred->uid) +#define current_gid() (current->cred->gid) +#define current_euid() (current->cred->euid) +#define current_egid() (current->cred->egid) +#define current_suid() (current->cred->suid) +#define current_sgid() (current->cred->sgid) +#define current_fsuid() (current->cred->fsuid) +#define current_fsgid() (current->cred->fsgid) +#define current_cap() (current->cred->cap_effective) #define current_uid_gid(_uid, _gid) \ do { \ - *(_uid) = current->uid; \ - *(_gid) = current->gid; \ + *(_uid) = current->cred->uid; \ + *(_gid) = current->cred->gid; \ } while(0) #define current_euid_egid(_uid, _gid) \ do { \ - *(_uid) = current->euid; \ - *(_gid) = current->egid; \ + *(_uid) = current->cred->euid; \ + *(_gid) = current->cred->egid; \ } while(0) #define current_fsuid_fsgid(_uid, _gid) \ do { \ - *(_uid) = current->fsuid; \ - *(_gid) = current->fsgid; \ + *(_uid) = current->cred->fsuid; \ + *(_gid) = current->cred->fsgid; \ } while(0) #endif /* _LINUX_CRED_H */ diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 23fd8909b9e5..9de41ccd67b5 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -113,6 +113,21 @@ extern struct group_info init_groups; # define CAP_INIT_BSET CAP_INIT_EFF_SET #endif +extern struct cred init_cred; + +#define INIT_CRED(p) \ +{ \ + .usage = ATOMIC_INIT(3), \ + .securebits = SECUREBITS_DEFAULT, \ + .cap_inheritable = CAP_INIT_INH_SET, \ + .cap_permitted = CAP_FULL_SET, \ + .cap_effective = CAP_INIT_EFF_SET, \ + .cap_bset = CAP_INIT_BSET, \ + .user = INIT_USER, \ + .group_info = &init_groups, \ + .lock = __SPIN_LOCK_UNLOCKED(p.lock), \ +} + /* * INIT_TASK is used to set up the first task table, touch at * your own risk!. 
Base=0, limit=0x1fffff (=2MB) @@ -147,13 +162,8 @@ extern struct group_info init_groups; .children = LIST_HEAD_INIT(tsk.children), \ .sibling = LIST_HEAD_INIT(tsk.sibling), \ .group_leader = &tsk, \ - .group_info = &init_groups, \ - .cap_effective = CAP_INIT_EFF_SET, \ - .cap_inheritable = CAP_INIT_INH_SET, \ - .cap_permitted = CAP_FULL_SET, \ - .cap_bset = CAP_INIT_BSET, \ - .securebits = SECUREBITS_DEFAULT, \ - .user = INIT_USER, \ + .__temp_cred = INIT_CRED(tsk.__temp_cred), \ + .cred = &tsk.__temp_cred, \ .comm = "swapper", \ .thread = INIT_THREAD, \ .fs = &init_fs, \ diff --git a/include/linux/sched.h b/include/linux/sched.h index b483f39a7112..c8b92502354d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -660,6 +660,7 @@ extern struct user_struct *find_user(uid_t); extern struct user_struct root_user; #define INIT_USER (&root_user) + struct backing_dev_info; struct reclaim_state; @@ -883,38 +884,7 @@ partition_sched_domains(int ndoms_new, cpumask_t *doms_new, #endif /* !CONFIG_SMP */ struct io_context; /* See blkdev.h */ -#define NGROUPS_SMALL 32 -#define NGROUPS_PER_BLOCK ((unsigned int)(PAGE_SIZE / sizeof(gid_t))) -struct group_info { - int ngroups; - atomic_t usage; - gid_t small_block[NGROUPS_SMALL]; - int nblocks; - gid_t *blocks[0]; -}; - -/* - * get_group_info() must be called with the owning task locked (via task_lock()) - * when task != current. The reason being that the vast majority of callers are - * looking at current->group_info, which can not be changed except by the - * current task. Changing current->group_info requires the task lock, too. - */ -#define get_group_info(group_info) do { \ - atomic_inc(&(group_info)->usage); \ -} while (0) -#define put_group_info(group_info) do { \ - if (atomic_dec_and_test(&(group_info)->usage)) \ - groups_free(group_info); \ -} while (0) - -extern struct group_info *groups_alloc(int gidsetsize); -extern void groups_free(struct group_info *group_info); -extern int set_current_groups(struct group_info *group_info); -extern int groups_search(struct group_info *group_info, gid_t grp); -/* access the groups "array" with this macro */ -#define GROUP_AT(gi, i) \ - ((gi)->blocks[(i)/NGROUPS_PER_BLOCK][(i)%NGROUPS_PER_BLOCK]) #ifdef ARCH_HAS_PREFETCH_SWITCH_STACK extern void prefetch_stack(struct task_struct *t); @@ -1181,17 +1151,9 @@ struct task_struct { struct list_head cpu_timers[3]; /* process credentials */ - uid_t uid,euid,suid,fsuid; - gid_t gid,egid,sgid,fsgid; - struct group_info *group_info; - kernel_cap_t cap_effective, cap_inheritable, cap_permitted, cap_bset; - struct user_struct *user; - unsigned securebits; -#ifdef CONFIG_KEYS - unsigned char jit_keyring; /* default keyring to attach requested keys to */ - struct key *request_key_auth; /* assumed request_key authority */ - struct key *thread_keyring; /* keyring private to this thread */ -#endif + struct cred __temp_cred __deprecated; /* temporary credentials to be removed */ + struct cred *cred; /* actual/objective task credentials */ + char comm[TASK_COMM_LEN]; /* executable name excluding path - access with [gs]et_task_comm (which lock it with task_lock()) @@ -1228,9 +1190,6 @@ struct task_struct { int (*notifier)(void *priv); void *notifier_data; sigset_t *notifier_mask; -#ifdef CONFIG_SECURITY - void *security; -#endif struct audit_context *audit_context; #ifdef CONFIG_AUDITSYSCALL uid_t loginuid; @@ -1787,9 +1746,6 @@ extern void wake_up_new_task(struct task_struct *tsk, extern void sched_fork(struct task_struct *p, int clone_flags); extern void 
sched_dead(struct task_struct *p); -extern int in_group_p(gid_t); -extern int in_egroup_p(gid_t); - extern void proc_caches_init(void); extern void flush_signals(struct task_struct *); extern void ignore_signals(struct task_struct *); diff --git a/include/linux/securebits.h b/include/linux/securebits.h index 92f09bdf1175..6d389491bfa2 100644 --- a/include/linux/securebits.h +++ b/include/linux/securebits.h @@ -32,7 +32,7 @@ setting is locked or not. A setting which is locked cannot be changed from user-level. */ #define issecure_mask(X) (1 << (X)) -#define issecure(X) (issecure_mask(X) & current->securebits) +#define issecure(X) (issecure_mask(X) & current->cred->securebits) #define SECURE_ALL_BITS (issecure_mask(SECURE_NOROOT) | \ issecure_mask(SECURE_NO_SETUID_FIXUP) | \ diff --git a/ipc/mqueue.c b/ipc/mqueue.c index abda5991d7e3..e1885b494bac 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -126,7 +126,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb, int mode, if (S_ISREG(mode)) { struct mqueue_inode_info *info; struct task_struct *p = current; - struct user_struct *u = p->user; + struct user_struct *u = p->cred->user; unsigned long mq_bytes, mq_msg_tblsz; inode->i_fop = &mqueue_file_operations; diff --git a/ipc/shm.c b/ipc/shm.c index 0c3debbe32d5..264a9d33c5dd 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -366,7 +366,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) if (shmflg & SHM_HUGETLB) { /* hugetlb_file_setup takes care of mlock user accounting */ file = hugetlb_file_setup(name, size); - shp->mlock_user = current->user; + shp->mlock_user = current->cred->user; } else { int acctflag = VM_ACCOUNT; /* @@ -767,7 +767,7 @@ asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf) goto out_unlock; if(cmd==SHM_LOCK) { - struct user_struct * user = current->user; + struct user_struct *user = current->cred->user; if (!is_file_hugepages(shp->shm_file)) { err = shmem_lock(shp->shm_file, 1, user); if (!err && !(shp->shm_perm.mode & SHM_LOCKED)){ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 9c7e47ae4576..2febf5165fad 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -447,6 +447,7 @@ static int audit_filter_rules(struct task_struct *tsk, struct audit_names *name, enum audit_state *state) { + struct cred *cred = tsk->cred; int i, j, need_sid = 1; u32 sid; @@ -466,28 +467,28 @@ static int audit_filter_rules(struct task_struct *tsk, } break; case AUDIT_UID: - result = audit_comparator(tsk->uid, f->op, f->val); + result = audit_comparator(cred->uid, f->op, f->val); break; case AUDIT_EUID: - result = audit_comparator(tsk->euid, f->op, f->val); + result = audit_comparator(cred->euid, f->op, f->val); break; case AUDIT_SUID: - result = audit_comparator(tsk->suid, f->op, f->val); + result = audit_comparator(cred->suid, f->op, f->val); break; case AUDIT_FSUID: - result = audit_comparator(tsk->fsuid, f->op, f->val); + result = audit_comparator(cred->fsuid, f->op, f->val); break; case AUDIT_GID: - result = audit_comparator(tsk->gid, f->op, f->val); + result = audit_comparator(cred->gid, f->op, f->val); break; case AUDIT_EGID: - result = audit_comparator(tsk->egid, f->op, f->val); + result = audit_comparator(cred->egid, f->op, f->val); break; case AUDIT_SGID: - result = audit_comparator(tsk->sgid, f->op, f->val); + result = audit_comparator(cred->sgid, f->op, f->val); break; case AUDIT_FSGID: - result = audit_comparator(tsk->fsgid, f->op, f->val); + result = audit_comparator(cred->fsgid, f->op, f->val); break; case AUDIT_PERS: result 
= audit_comparator(tsk->personality, f->op, f->val); @@ -1228,6 +1229,7 @@ static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) { + struct cred *cred = tsk->cred; int i, call_panic = 0; struct audit_buffer *ab; struct audit_aux_data *aux; @@ -1237,14 +1239,14 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts context->pid = tsk->pid; if (!context->ppid) context->ppid = sys_getppid(); - context->uid = tsk->uid; - context->gid = tsk->gid; - context->euid = tsk->euid; - context->suid = tsk->suid; - context->fsuid = tsk->fsuid; - context->egid = tsk->egid; - context->sgid = tsk->sgid; - context->fsgid = tsk->fsgid; + context->uid = cred->uid; + context->gid = cred->gid; + context->euid = cred->euid; + context->suid = cred->suid; + context->fsuid = cred->fsuid; + context->egid = cred->egid; + context->sgid = cred->sgid; + context->fsgid = cred->fsgid; context->personality = tsk->personality; ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL); @@ -2086,7 +2088,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid) audit_log_format(ab, "login pid=%d uid=%u " "old auid=%u new auid=%u" " old ses=%u new ses=%u", - task->pid, task->uid, + task->pid, task->cred->uid, task->loginuid, loginuid, task->sessionid, sessionid); audit_log_end(ab); @@ -2469,7 +2471,7 @@ void __audit_ptrace(struct task_struct *t) context->target_pid = t->pid; context->target_auid = audit_get_loginuid(t); - context->target_uid = t->uid; + context->target_uid = t->cred->uid; context->target_sessionid = audit_get_sessionid(t); security_task_getsecid(t, &context->target_sid); memcpy(context->target_comm, t->comm, TASK_COMM_LEN); @@ -2495,7 +2497,7 @@ int __audit_signal_info(int sig, struct task_struct *t) if (tsk->loginuid != -1) audit_sig_uid = tsk->loginuid; else - audit_sig_uid = tsk->uid; + audit_sig_uid = tsk->cred->uid; security_task_getsecid(tsk, &audit_sig_sid); } if (!audit_signals || audit_dummy_context()) @@ -2507,7 +2509,7 @@ int __audit_signal_info(int sig, struct task_struct *t) if (!ctx->target_pid) { ctx->target_pid = t->tgid; ctx->target_auid = audit_get_loginuid(t); - ctx->target_uid = t->uid; + ctx->target_uid = t->cred->uid; ctx->target_sessionid = audit_get_sessionid(t); security_task_getsecid(t, &ctx->target_sid); memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN); @@ -2528,7 +2530,7 @@ int __audit_signal_info(int sig, struct task_struct *t) axp->target_pid[axp->pid_count] = t->tgid; axp->target_auid[axp->pid_count] = audit_get_loginuid(t); - axp->target_uid[axp->pid_count] = t->uid; + axp->target_uid[axp->pid_count] = t->cred->uid; axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t); security_task_getsecid(t, &axp->target_sid[axp->pid_count]); memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN); @@ -2575,12 +2577,12 @@ void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_ ax->fcap_ver = (vcaps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT; ax->old_pcap.permitted = *pP; - ax->old_pcap.inheritable = current->cap_inheritable; + ax->old_pcap.inheritable = current->cred->cap_inheritable; ax->old_pcap.effective = *pE; - ax->new_pcap.permitted = current->cap_permitted; - ax->new_pcap.inheritable = current->cap_inheritable; - ax->new_pcap.effective = current->cap_effective; + ax->new_pcap.permitted = current->cred->cap_permitted; + ax->new_pcap.inheritable = current->cred->cap_inheritable; + 
ax->new_pcap.effective = current->cred->cap_effective; } /** diff --git a/kernel/capability.c b/kernel/capability.c index 58b00519624a..a404b980b1bd 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -171,8 +171,8 @@ kernel_cap_t cap_set_effective(const kernel_cap_t pE_new) spin_lock(&task_capability_lock); - pE_old = current->cap_effective; - current->cap_effective = pE_new; + pE_old = current->cred->cap_effective; + current->cred->cap_effective = pE_new; spin_unlock(&task_capability_lock); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 78f9b310c4f3..e210526e6401 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1293,7 +1293,9 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid) rcu_read_unlock(); euid = current_euid(); - if (euid && euid != tsk->uid && euid != tsk->suid) { + if (euid && + euid != tsk->cred->uid && + euid != tsk->cred->suid) { put_task_struct(tsk); return -EACCES; } diff --git a/kernel/exit.c b/kernel/exit.c index 80137a5d9467..e0f6e1892fb9 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -160,7 +160,7 @@ void release_task(struct task_struct * p) int zap_leader; repeat: tracehook_prepare_release_task(p); - atomic_dec(&p->user->processes); + atomic_dec(&p->cred->user->processes); proc_flush_task(p); write_lock_irq(&tasklist_lock); tracehook_finish_release_task(p); @@ -1272,7 +1272,7 @@ static int wait_task_zombie(struct task_struct *p, int options, return 0; if (unlikely(options & WNOWAIT)) { - uid_t uid = p->uid; + uid_t uid = p->cred->uid; int exit_code = p->exit_code; int why, status; @@ -1393,7 +1393,7 @@ static int wait_task_zombie(struct task_struct *p, int options, if (!retval && infop) retval = put_user(pid, &infop->si_pid); if (!retval && infop) - retval = put_user(p->uid, &infop->si_uid); + retval = put_user(p->cred->uid, &infop->si_uid); if (!retval) retval = pid; @@ -1458,7 +1458,7 @@ static int wait_task_stopped(int ptrace, struct task_struct *p, if (!unlikely(options & WNOWAIT)) p->exit_code = 0; - uid = p->uid; + uid = p->cred->uid; unlock_sig: spin_unlock_irq(&p->sighand->siglock); if (!exit_code) @@ -1535,7 +1535,7 @@ static int wait_task_continued(struct task_struct *p, int options, spin_unlock_irq(&p->sighand->siglock); pid = task_pid_vnr(p); - uid = p->uid; + uid = p->cred->uid; get_task_struct(p); read_unlock(&tasklist_lock); diff --git a/kernel/fork.c b/kernel/fork.c index f6083561dfe0..81fdc7733908 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -147,8 +147,8 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(tsk == current); security_task_free(tsk); - free_uid(tsk->user); - put_group_info(tsk->group_info); + free_uid(tsk->__temp_cred.user); + put_group_info(tsk->__temp_cred.group_info); delayacct_tsk_free(tsk); if (!profile_handoff_task(tsk)) @@ -969,17 +969,18 @@ static struct task_struct *copy_process(unsigned long clone_flags, DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif + p->cred = &p->__temp_cred; retval = -EAGAIN; - if (atomic_read(&p->user->processes) >= + if (atomic_read(&p->cred->user->processes) >= p->signal->rlim[RLIMIT_NPROC].rlim_cur) { if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && - p->user != current->nsproxy->user_ns->root_user) + p->cred->user != current->nsproxy->user_ns->root_user) goto bad_fork_free; } - atomic_inc(&p->user->__count); - atomic_inc(&p->user->processes); - get_group_info(p->group_info); + atomic_inc(&p->cred->user->__count); + atomic_inc(&p->cred->user->processes); + get_group_info(p->cred->group_info); /* * If 
multiple threads are within copy_process(), then this check @@ -1035,9 +1036,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->real_start_time = p->start_time; monotonic_to_bootbased(&p->real_start_time); #ifdef CONFIG_SECURITY - p->security = NULL; + p->cred->security = NULL; #endif - p->cap_bset = current->cap_bset; p->io_context = NULL; p->audit_context = NULL; cgroup_fork(p); @@ -1298,9 +1298,9 @@ bad_fork_cleanup_cgroup: bad_fork_cleanup_put_domain: module_put(task_thread_info(p)->exec_domain->module); bad_fork_cleanup_count: - put_group_info(p->group_info); - atomic_dec(&p->user->processes); - free_uid(p->user); + put_group_info(p->cred->group_info); + atomic_dec(&p->cred->user->processes); + free_uid(p->cred->user); bad_fork_free: free_task(p); fork_out: diff --git a/kernel/futex.c b/kernel/futex.c index e06962132aaf..28421d8210b8 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -443,7 +443,8 @@ static struct task_struct * futex_find_get_task(pid_t pid) rcu_read_lock(); p = find_task_by_vpid(pid); - if (!p || (euid != p->euid && euid != p->uid)) + if (!p || (euid != p->cred->euid && + euid != p->cred->uid)) p = ERR_PTR(-ESRCH); else get_task_struct(p); @@ -1846,7 +1847,8 @@ sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr, if (!p) goto err_unlock; ret = -EPERM; - if (euid != p->euid && euid != p->uid && + if (euid != p->cred->euid && + euid != p->cred->uid && !capable(CAP_SYS_PTRACE)) goto err_unlock; head = p->robust_list; diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 3254d4e41e88..2c3fd5ed34f5 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -151,8 +151,9 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, if (!p) goto err_unlock; ret = -EPERM; - if (euid != p->euid && euid != p->uid && - !capable(CAP_SYS_PTRACE)) + if (euid != p->cred->euid && + euid != p->cred->uid && + !capable(CAP_SYS_PTRACE)) goto err_unlock; head = p->compat_robust_list; read_unlock(&tasklist_lock); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 937f6b5b2008..49849d12dd12 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -115,6 +115,8 @@ int ptrace_check_attach(struct task_struct *child, int kill) int __ptrace_may_access(struct task_struct *task, unsigned int mode) { + struct cred *cred = current->cred, *tcred = task->cred; + /* May we inspect the given task? * This check is used both for attaching with ptrace * and for allowing access to sensitive information in /proc. @@ -123,19 +125,18 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode) * because setting up the necessary parent/child relationship * or halting the specified task is impossible. 
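+	 * (Illustrative note: under the new scheme the comparison below reads
+	 * the caller's IDs once from current->cred into uid/gid and the
+	 * target's from task->cred via tcred, e.g. uid != tcred->euid, rather
+	 * than dereferencing task_struct fields directly.)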
*/ - uid_t uid; - gid_t gid; + uid_t uid = cred->uid; + gid_t gid = cred->gid; int dumpable = 0; /* Don't let security modules deny introspection */ if (task == current) return 0; - current_uid_gid(&uid, &gid); - if ((uid != task->euid || - uid != task->suid || - uid != task->uid || - gid != task->egid || - gid != task->sgid || - gid != task->gid) && !capable(CAP_SYS_PTRACE)) + if ((uid != tcred->euid || + uid != tcred->suid || + uid != tcred->uid || + gid != tcred->egid || + gid != tcred->sgid || + gid != tcred->gid) && !capable(CAP_SYS_PTRACE)) return -EPERM; smp_rmb(); if (task->mm) diff --git a/kernel/sched.c b/kernel/sched.c index c3b8b1fcde0d..733c59e645aa 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -345,7 +345,7 @@ static inline struct task_group *task_group(struct task_struct *p) struct task_group *tg; #ifdef CONFIG_USER_SCHED - tg = p->user->tg; + tg = p->cred->user->tg; #elif defined(CONFIG_CGROUP_SCHED) tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), struct task_group, css); @@ -5182,8 +5182,8 @@ recheck: /* can't change other user's priorities */ euid = current_euid(); - if (euid != p->euid && - euid != p->uid) + if (euid != p->cred->euid && + euid != p->cred->uid) return -EPERM; } @@ -5417,7 +5417,9 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) euid = current_euid(); retval = -EPERM; - if (euid != p->euid && euid != p->uid && !capable(CAP_SYS_NICE)) + if (euid != p->cred->euid && + euid != p->cred->uid && + !capable(CAP_SYS_NICE)) goto out_unlock; retval = security_task_setscheduler(p, 0, NULL); diff --git a/kernel/signal.c b/kernel/signal.c index 167b535fe1a9..80e8a6489f97 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -187,7 +187,7 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, * In order to avoid problems with "switch_user()", we want to make * sure that the compiler doesn't re-load "t->user" */ - user = t->user; + user = t->cred->user; barrier(); atomic_inc(&user->sigpending); if (override_rlimit || @@ -582,8 +582,8 @@ static int check_kill_permission(int sig, struct siginfo *info, uid = current_uid(); euid = current_euid(); - if ((euid ^ t->suid) && (euid ^ t->uid) && - (uid ^ t->suid) && (uid ^ t->uid) && + if ((euid ^ t->cred->suid) && (euid ^ t->cred->uid) && + (uid ^ t->cred->suid) && (uid ^ t->cred->uid) && !capable(CAP_KILL)) { switch (sig) { case SIGCONT: @@ -1100,8 +1100,8 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid, goto out_unlock; } if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) - && (euid != p->suid) && (euid != p->uid) - && (uid != p->suid) && (uid != p->uid)) { + && (euid != p->cred->suid) && (euid != p->cred->uid) + && (uid != p->cred->suid) && (uid != p->cred->uid)) { ret = -EPERM; goto out_unlock; } @@ -1374,7 +1374,7 @@ int do_notify_parent(struct task_struct *tsk, int sig) info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); rcu_read_unlock(); - info.si_uid = tsk->uid; + info.si_uid = tsk->cred->uid; thread_group_cputime(tsk, &cputime); info.si_utime = cputime_to_jiffies(cputime.utime); @@ -1445,7 +1445,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why) info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); rcu_read_unlock(); - info.si_uid = tsk->uid; + info.si_uid = tsk->cred->uid; info.si_utime = cputime_to_clock_t(tsk->utime); info.si_stime = cputime_to_clock_t(tsk->stime); @@ -1713,7 +1713,7 @@ static int ptrace_signal(int signr, siginfo_t *info, info->si_errno = 0; 
info->si_code = SI_USER; info->si_pid = task_pid_vnr(current->parent); - info->si_uid = current->parent->uid; + info->si_uid = current->parent->cred->uid; } /* If the (new) signal is now blocked, requeue it. */ diff --git a/kernel/sys.c b/kernel/sys.c index ed5c29c748ac..5d81f07c0150 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -117,7 +117,9 @@ static int set_one_prio(struct task_struct *p, int niceval, int error) uid_t euid = current_euid(); int no_nice; - if (p->uid != euid && p->euid != euid && !capable(CAP_SYS_NICE)) { + if (p->cred->uid != euid && + p->cred->euid != euid && + !capable(CAP_SYS_NICE)) { error = -EPERM; goto out; } @@ -174,7 +176,7 @@ asmlinkage long sys_setpriority(int which, int who, int niceval) } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case PRIO_USER: - user = current->user; + user = current->cred->user; if (!who) who = current_uid(); else @@ -182,7 +184,7 @@ asmlinkage long sys_setpriority(int which, int who, int niceval) goto out_unlock; /* No processes for this user */ do_each_thread(g, p) - if (p->uid == who) + if (p->cred->uid == who) error = set_one_prio(p, niceval, error); while_each_thread(g, p); if (who != current_uid()) @@ -236,7 +238,7 @@ asmlinkage long sys_getpriority(int which, int who) } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case PRIO_USER: - user = current->user; + user = current->cred->user; if (!who) who = current_uid(); else @@ -244,7 +246,7 @@ asmlinkage long sys_getpriority(int which, int who) goto out_unlock; /* No processes for this user */ do_each_thread(g, p) - if (p->uid == who) { + if (p->cred->uid == who) { niceval = 20 - task_nice(p); if (niceval > retval) retval = niceval; @@ -472,8 +474,9 @@ void ctrl_alt_del(void) */ asmlinkage long sys_setregid(gid_t rgid, gid_t egid) { - int old_rgid = current->gid; - int old_egid = current->egid; + struct cred *cred = current->cred; + int old_rgid = cred->gid; + int old_egid = cred->egid; int new_rgid = old_rgid; int new_egid = old_egid; int retval; @@ -484,7 +487,7 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid) if (rgid != (gid_t) -1) { if ((old_rgid == rgid) || - (current->egid==rgid) || + (cred->egid == rgid) || capable(CAP_SETGID)) new_rgid = rgid; else @@ -492,8 +495,8 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid) } if (egid != (gid_t) -1) { if ((old_rgid == egid) || - (current->egid == egid) || - (current->sgid == egid) || + (cred->egid == egid) || + (cred->sgid == egid) || capable(CAP_SETGID)) new_egid = egid; else @@ -505,10 +508,10 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid) } if (rgid != (gid_t) -1 || (egid != (gid_t) -1 && egid != old_rgid)) - current->sgid = new_egid; - current->fsgid = new_egid; - current->egid = new_egid; - current->gid = new_rgid; + cred->sgid = new_egid; + cred->fsgid = new_egid; + cred->egid = new_egid; + cred->gid = new_rgid; key_fsgid_changed(current); proc_id_connector(current, PROC_EVENT_GID); return 0; @@ -521,7 +524,8 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid) */ asmlinkage long sys_setgid(gid_t gid) { - int old_egid = current->egid; + struct cred *cred = current->cred; + int old_egid = cred->egid; int retval; retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID); @@ -533,13 +537,13 @@ asmlinkage long sys_setgid(gid_t gid) set_dumpable(current->mm, suid_dumpable); smp_wmb(); } - current->gid = current->egid = current->sgid = current->fsgid = gid; - } else if ((gid == current->gid) || (gid == current->sgid)) { + cred->gid = cred->egid = cred->sgid = cred->fsgid = gid; + 
} else if ((gid == cred->gid) || (gid == cred->sgid)) { if (old_egid != gid) { set_dumpable(current->mm, suid_dumpable); smp_wmb(); } - current->egid = current->fsgid = gid; + cred->egid = cred->fsgid = gid; } else return -EPERM; @@ -570,7 +574,7 @@ static int set_user(uid_t new_ruid, int dumpclear) set_dumpable(current->mm, suid_dumpable); smp_wmb(); } - current->uid = new_ruid; + current->cred->uid = new_ruid; return 0; } @@ -591,6 +595,7 @@ static int set_user(uid_t new_ruid, int dumpclear) */ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) { + struct cred *cred = current->cred; int old_ruid, old_euid, old_suid, new_ruid, new_euid; int retval; @@ -598,14 +603,14 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) if (retval) return retval; - new_ruid = old_ruid = current->uid; - new_euid = old_euid = current->euid; - old_suid = current->suid; + new_ruid = old_ruid = cred->uid; + new_euid = old_euid = cred->euid; + old_suid = cred->suid; if (ruid != (uid_t) -1) { new_ruid = ruid; if ((old_ruid != ruid) && - (current->euid != ruid) && + (cred->euid != ruid) && !capable(CAP_SETUID)) return -EPERM; } @@ -613,8 +618,8 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) if (euid != (uid_t) -1) { new_euid = euid; if ((old_ruid != euid) && - (current->euid != euid) && - (current->suid != euid) && + (cred->euid != euid) && + (cred->suid != euid) && !capable(CAP_SETUID)) return -EPERM; } @@ -626,11 +631,11 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) set_dumpable(current->mm, suid_dumpable); smp_wmb(); } - current->fsuid = current->euid = new_euid; + cred->fsuid = cred->euid = new_euid; if (ruid != (uid_t) -1 || (euid != (uid_t) -1 && euid != old_ruid)) - current->suid = current->euid; - current->fsuid = current->euid; + cred->suid = cred->euid; + cred->fsuid = cred->euid; key_fsuid_changed(current); proc_id_connector(current, PROC_EVENT_UID); @@ -653,7 +658,8 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) */ asmlinkage long sys_setuid(uid_t uid) { - int old_euid = current->euid; + struct cred *cred = current->cred; + int old_euid = cred->euid; int old_ruid, old_suid, new_suid; int retval; @@ -661,23 +667,23 @@ asmlinkage long sys_setuid(uid_t uid) if (retval) return retval; - old_ruid = current->uid; - old_suid = current->suid; + old_ruid = cred->uid; + old_suid = cred->suid; new_suid = old_suid; if (capable(CAP_SETUID)) { if (uid != old_ruid && set_user(uid, old_euid != uid) < 0) return -EAGAIN; new_suid = uid; - } else if ((uid != current->uid) && (uid != new_suid)) + } else if ((uid != cred->uid) && (uid != new_suid)) return -EPERM; if (old_euid != uid) { set_dumpable(current->mm, suid_dumpable); smp_wmb(); } - current->fsuid = current->euid = uid; - current->suid = new_suid; + cred->fsuid = cred->euid = uid; + cred->suid = new_suid; key_fsuid_changed(current); proc_id_connector(current, PROC_EVENT_UID); @@ -692,9 +698,10 @@ asmlinkage long sys_setuid(uid_t uid) */ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) { - int old_ruid = current->uid; - int old_euid = current->euid; - int old_suid = current->suid; + struct cred *cred = current->cred; + int old_ruid = cred->uid; + int old_euid = cred->euid; + int old_suid = cred->suid; int retval; retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES); @@ -702,30 +709,31 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) return retval; if (!capable(CAP_SETUID)) { - if ((ruid != (uid_t) -1) && (ruid != current->uid) && - (ruid != current->euid) && (ruid != current->suid)) + 
if ((ruid != (uid_t) -1) && (ruid != cred->uid) && + (ruid != cred->euid) && (ruid != cred->suid)) return -EPERM; - if ((euid != (uid_t) -1) && (euid != current->uid) && - (euid != current->euid) && (euid != current->suid)) + if ((euid != (uid_t) -1) && (euid != cred->uid) && + (euid != cred->euid) && (euid != cred->suid)) return -EPERM; - if ((suid != (uid_t) -1) && (suid != current->uid) && - (suid != current->euid) && (suid != current->suid)) + if ((suid != (uid_t) -1) && (suid != cred->uid) && + (suid != cred->euid) && (suid != cred->suid)) return -EPERM; } if (ruid != (uid_t) -1) { - if (ruid != current->uid && set_user(ruid, euid != current->euid) < 0) + if (ruid != cred->uid && + set_user(ruid, euid != cred->euid) < 0) return -EAGAIN; } if (euid != (uid_t) -1) { - if (euid != current->euid) { + if (euid != cred->euid) { set_dumpable(current->mm, suid_dumpable); smp_wmb(); } - current->euid = euid; + cred->euid = euid; } - current->fsuid = current->euid; + cred->fsuid = cred->euid; if (suid != (uid_t) -1) - current->suid = suid; + cred->suid = suid; key_fsuid_changed(current); proc_id_connector(current, PROC_EVENT_UID); @@ -735,11 +743,12 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid) { + struct cred *cred = current->cred; int retval; - if (!(retval = put_user(current->uid, ruid)) && - !(retval = put_user(current->euid, euid))) - retval = put_user(current->suid, suid); + if (!(retval = put_user(cred->uid, ruid)) && + !(retval = put_user(cred->euid, euid))) + retval = put_user(cred->suid, suid); return retval; } @@ -749,6 +758,7 @@ asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __us */ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) { + struct cred *cred = current->cred; int retval; retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES); @@ -756,28 +766,28 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) return retval; if (!capable(CAP_SETGID)) { - if ((rgid != (gid_t) -1) && (rgid != current->gid) && - (rgid != current->egid) && (rgid != current->sgid)) + if ((rgid != (gid_t) -1) && (rgid != cred->gid) && + (rgid != cred->egid) && (rgid != cred->sgid)) return -EPERM; - if ((egid != (gid_t) -1) && (egid != current->gid) && - (egid != current->egid) && (egid != current->sgid)) + if ((egid != (gid_t) -1) && (egid != cred->gid) && + (egid != cred->egid) && (egid != cred->sgid)) return -EPERM; - if ((sgid != (gid_t) -1) && (sgid != current->gid) && - (sgid != current->egid) && (sgid != current->sgid)) + if ((sgid != (gid_t) -1) && (sgid != cred->gid) && + (sgid != cred->egid) && (sgid != cred->sgid)) return -EPERM; } if (egid != (gid_t) -1) { - if (egid != current->egid) { + if (egid != cred->egid) { set_dumpable(current->mm, suid_dumpable); smp_wmb(); } - current->egid = egid; + cred->egid = egid; } - current->fsgid = current->egid; + cred->fsgid = cred->egid; if (rgid != (gid_t) -1) - current->gid = rgid; + cred->gid = rgid; if (sgid != (gid_t) -1) - current->sgid = sgid; + cred->sgid = sgid; key_fsgid_changed(current); proc_id_connector(current, PROC_EVENT_GID); @@ -786,11 +796,12 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid) { + struct cred *cred = current->cred; int retval; - if (!(retval = put_user(current->gid, rgid)) && - !(retval = put_user(current->egid, egid))) - retval = 
put_user(current->sgid, sgid); + if (!(retval = put_user(cred->gid, rgid)) && + !(retval = put_user(cred->egid, egid))) + retval = put_user(cred->sgid, sgid); return retval; } @@ -804,20 +815,21 @@ asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __us */ asmlinkage long sys_setfsuid(uid_t uid) { + struct cred *cred = current->cred; int old_fsuid; - old_fsuid = current->fsuid; + old_fsuid = cred->fsuid; if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS)) return old_fsuid; - if (uid == current->uid || uid == current->euid || - uid == current->suid || uid == current->fsuid || + if (uid == cred->uid || uid == cred->euid || + uid == cred->suid || uid == cred->fsuid || capable(CAP_SETUID)) { if (uid != old_fsuid) { set_dumpable(current->mm, suid_dumpable); smp_wmb(); } - current->fsuid = uid; + cred->fsuid = uid; } key_fsuid_changed(current); @@ -833,20 +845,21 @@ asmlinkage long sys_setfsuid(uid_t uid) */ asmlinkage long sys_setfsgid(gid_t gid) { + struct cred *cred = current->cred; int old_fsgid; - old_fsgid = current->fsgid; + old_fsgid = cred->fsgid; if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS)) return old_fsgid; - if (gid == current->gid || gid == current->egid || - gid == current->sgid || gid == current->fsgid || + if (gid == cred->gid || gid == cred->egid || + gid == cred->sgid || gid == cred->fsgid || capable(CAP_SETGID)) { if (gid != old_fsgid) { set_dumpable(current->mm, suid_dumpable); smp_wmb(); } - current->fsgid = gid; + cred->fsgid = gid; key_fsgid_changed(current); proc_id_connector(current, PROC_EVENT_GID); } @@ -1208,8 +1221,15 @@ int groups_search(struct group_info *group_info, gid_t grp) return 0; } -/* validate and set current->group_info */ -int set_current_groups(struct group_info *group_info) +/** + * set_groups - Change a group subscription in a security record + * @cred: The security record to alter + * @group_info: The group list to impose + * + * Validate a group subscription and, if valid, impose it upon a task security + * record. + */ +int set_groups(struct cred *cred, struct group_info *group_info) { int retval; struct group_info *old_info; @@ -1221,20 +1241,34 @@ int set_current_groups(struct group_info *group_info) groups_sort(group_info); get_group_info(group_info); - task_lock(current); - old_info = current->group_info; - current->group_info = group_info; - task_unlock(current); + spin_lock(&cred->lock); + old_info = cred->group_info; + cred->group_info = group_info; + spin_unlock(&cred->lock); put_group_info(old_info); - return 0; } +EXPORT_SYMBOL(set_groups); + +/** + * set_current_groups - Change current's group subscription + * @group_info: The group list to impose + * + * Validate a group subscription and, if valid, impose it upon current's task + * security record.
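
For reference, a typical caller of the new interface pairs groups_alloc() with put_group_info() around set_current_groups(); set_groups() takes its own reference via get_group_info() before swapping the pointer under cred->lock. A minimal sketch of such a caller, modelled on sys_setgroups() (install_gids() and gids[] are illustrative names, not part of the patch):

	static int install_gids(const gid_t *gids, int n)
	{
		struct group_info *info;
		int i, err;

		info = groups_alloc(n);
		if (!info)
			return -ENOMEM;
		for (i = 0; i < n; i++)
			GROUP_AT(info, i) = gids[i];
		err = set_current_groups(info);	/* validates, sorts, installs */
		put_group_info(info);		/* drop the allocation ref */
		return err;
	}
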
+ */ +int set_current_groups(struct group_info *group_info) +{ + return set_groups(current->cred, group_info); +} + EXPORT_SYMBOL(set_current_groups); asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist) { + struct cred *cred = current->cred; int i = 0; /* @@ -1246,13 +1280,13 @@ asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist) return -EINVAL; /* no need to grab task_lock here; it cannot change */ - i = current->group_info->ngroups; + i = cred->group_info->ngroups; if (gidsetsize) { if (i > gidsetsize) { i = -EINVAL; goto out; } - if (groups_to_user(grouplist, current->group_info)) { + if (groups_to_user(grouplist, cred->group_info)) { i = -EFAULT; goto out; } @@ -1296,9 +1330,10 @@ asmlinkage long sys_setgroups(int gidsetsize, gid_t __user *grouplist) */ int in_group_p(gid_t grp) { + struct cred *cred = current->cred; int retval = 1; - if (grp != current->fsgid) - retval = groups_search(current->group_info, grp); + if (grp != cred->fsgid) + retval = groups_search(cred->group_info, grp); return retval; } @@ -1306,9 +1341,10 @@ EXPORT_SYMBOL(in_group_p); int in_egroup_p(gid_t grp) { + struct cred *cred = current->cred; int retval = 1; - if (grp != current->egid) - retval = groups_search(current->group_info, grp); + if (grp != cred->egid) + retval = groups_search(cred->group_info, grp); return retval; } @@ -1624,7 +1660,9 @@ asmlinkage long sys_umask(int mask) asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { - long error = 0; + struct task_struct *me = current; + unsigned char comm[sizeof(me->comm)]; + long error; if (security_task_prctl(option, arg2, arg3, arg4, arg5, &error)) return error; @@ -1635,39 +1673,41 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, error = -EINVAL; break; } - current->pdeath_signal = arg2; + me->pdeath_signal = arg2; + error = 0; break; case PR_GET_PDEATHSIG: - error = put_user(current->pdeath_signal, (int __user *)arg2); + error = put_user(me->pdeath_signal, (int __user *)arg2); break; case PR_GET_DUMPABLE: - error = get_dumpable(current->mm); + error = get_dumpable(me->mm); break; case PR_SET_DUMPABLE: if (arg2 < 0 || arg2 > 1) { error = -EINVAL; break; } - set_dumpable(current->mm, arg2); + set_dumpable(me->mm, arg2); + error = 0; break; case PR_SET_UNALIGN: - error = SET_UNALIGN_CTL(current, arg2); + error = SET_UNALIGN_CTL(me, arg2); break; case PR_GET_UNALIGN: - error = GET_UNALIGN_CTL(current, arg2); + error = GET_UNALIGN_CTL(me, arg2); break; case PR_SET_FPEMU: - error = SET_FPEMU_CTL(current, arg2); + error = SET_FPEMU_CTL(me, arg2); break; case PR_GET_FPEMU: - error = GET_FPEMU_CTL(current, arg2); + error = GET_FPEMU_CTL(me, arg2); break; case PR_SET_FPEXC: - error = SET_FPEXC_CTL(current, arg2); + error = SET_FPEXC_CTL(me, arg2); break; case PR_GET_FPEXC: - error = GET_FPEXC_CTL(current, arg2); + error = GET_FPEXC_CTL(me, arg2); break; case PR_GET_TIMING: error = PR_TIMING_STATISTICAL; @@ -1675,33 +1715,28 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, case PR_SET_TIMING: if (arg2 != PR_TIMING_STATISTICAL) error = -EINVAL; + else + error = 0; break; - case PR_SET_NAME: { - struct task_struct *me = current; - unsigned char ncomm[sizeof(me->comm)]; - - ncomm[sizeof(me->comm)-1] = 0; - if (strncpy_from_user(ncomm, (char __user *)arg2, - sizeof(me->comm)-1) < 0) + case PR_SET_NAME: + comm[sizeof(me->comm)-1] = 0; + if (strncpy_from_user(comm, (char __user *)arg2, + sizeof(me->comm) 
- 1) < 0) return -EFAULT; - set_task_comm(me, ncomm); + set_task_comm(me, comm); return 0; - } - case PR_GET_NAME: { - struct task_struct *me = current; - unsigned char tcomm[sizeof(me->comm)]; - - get_task_comm(tcomm, me); - if (copy_to_user((char __user *)arg2, tcomm, sizeof(tcomm))) + case PR_GET_NAME: + get_task_comm(comm, me); + if (copy_to_user((char __user *)arg2, comm, + sizeof(comm))) return -EFAULT; return 0; - } case PR_GET_ENDIAN: - error = GET_ENDIAN(current, arg2); + error = GET_ENDIAN(me, arg2); break; case PR_SET_ENDIAN: - error = SET_ENDIAN(current, arg2); + error = SET_ENDIAN(me, arg2); break; case PR_GET_SECCOMP: @@ -1725,6 +1760,7 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, current->default_timer_slack_ns; else current->timer_slack_ns = arg2; + error = 0; break; default: error = -EINVAL; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9f3b478f9171..5c97c5b4ea8f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -246,7 +246,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) memcpy(data->comm, tsk->comm, TASK_COMM_LEN); data->pid = tsk->pid; - data->uid = tsk->uid; + data->uid = task_uid(tsk); data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; data->policy = tsk->policy; data->rt_priority = tsk->rt_priority; diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 8ebcd8532dfb..6d1ed07bf312 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -53,8 +53,8 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) stats->ac_flag |= AXSIG; stats->ac_nice = task_nice(tsk); stats->ac_sched = tsk->policy; - stats->ac_uid = tsk->uid; - stats->ac_gid = tsk->gid; + stats->ac_uid = tsk->cred->uid; + stats->ac_gid = tsk->cred->gid; stats->ac_pid = tsk->pid; rcu_read_lock(); stats->ac_ppid = pid_alive(tsk) ? 
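
The kernel/uid16.c hunks below route the legacy 16-bit syscalls through the same credential fields. As a reminder of what those wrappers do with modern 32-bit IDs, high2lowuid() (include/linux/highuid.h) reports anything that does not fit in 16 bits as the overflow UID, 65534 by default. A minimal sketch of that mapping (demo16() is an illustrative name):

	static old_uid_t demo16(uid_t uid)
	{
		/* the same test high2lowuid() performs */
		return uid & ~0xFFFF ? (old_uid_t)overflowuid : (old_uid_t)uid;
	}

	/* demo16(1000)   == 1000
	 * demo16(100000) == 65534, the "nobody" overflow UID */
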
diff --git a/kernel/uid16.c b/kernel/uid16.c index 3e41c1673e2f..71f07fc39fea 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -86,9 +86,9 @@ asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid, { int retval; - if (!(retval = put_user(high2lowuid(current->uid), ruid)) && - !(retval = put_user(high2lowuid(current->euid), euid))) - retval = put_user(high2lowuid(current->suid), suid); + if (!(retval = put_user(high2lowuid(current->cred->uid), ruid)) && + !(retval = put_user(high2lowuid(current->cred->euid), euid))) + retval = put_user(high2lowuid(current->cred->suid), suid); return retval; } @@ -106,9 +106,9 @@ asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid, { int retval; - if (!(retval = put_user(high2lowgid(current->gid), rgid)) && - !(retval = put_user(high2lowgid(current->egid), egid))) - retval = put_user(high2lowgid(current->sgid), sgid); + if (!(retval = put_user(high2lowgid(current->cred->gid), rgid)) && + !(retval = put_user(high2lowgid(current->cred->egid), egid))) + retval = put_user(high2lowgid(current->cred->sgid), sgid); return retval; } @@ -166,20 +166,20 @@ asmlinkage long sys_getgroups16(int gidsetsize, old_gid_t __user *grouplist) if (gidsetsize < 0) return -EINVAL; - get_group_info(current->group_info); - i = current->group_info->ngroups; + get_group_info(current->cred->group_info); + i = current->cred->group_info->ngroups; if (gidsetsize) { if (i > gidsetsize) { i = -EINVAL; goto out; } - if (groups16_to_user(grouplist, current->group_info)) { + if (groups16_to_user(grouplist, current->cred->group_info)) { i = -EFAULT; goto out; } } out: - put_group_info(current->group_info); + put_group_info(current->cred->group_info); return i; } @@ -210,20 +210,20 @@ asmlinkage long sys_setgroups16(int gidsetsize, old_gid_t __user *grouplist) asmlinkage long sys_getuid16(void) { - return high2lowuid(current->uid); + return high2lowuid(current->cred->uid); } asmlinkage long sys_geteuid16(void) { - return high2lowuid(current->euid); + return high2lowuid(current->cred->euid); } asmlinkage long sys_getgid16(void) { - return high2lowgid(current->gid); + return high2lowgid(current->cred->gid); } asmlinkage long sys_getegid16(void) { - return high2lowgid(current->egid); + return high2lowgid(current->cred->egid); } diff --git a/kernel/user.c b/kernel/user.c index 39d6159fae43..104d22ac84d5 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -457,11 +457,11 @@ void switch_uid(struct user_struct *new_user) * cheaply with the new uid cache, so if it matters * we should be checking for it. -DaveM */ - old_user = current->user; + old_user = current->cred->user; atomic_inc(&new_user->processes); atomic_dec(&old_user->processes); switch_uid_keyring(new_user); - current->user = new_user; + current->cred->user = new_user; sched_switch_user(current); /* diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 07a96474077d..b23492ee3e50 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1110,12 +1110,12 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, const unsigned long __user *old_nodes, const unsigned long __user *new_nodes) { + struct cred *cred, *tcred; struct mm_struct *mm; struct task_struct *task; nodemask_t old; nodemask_t new; nodemask_t task_nodes; - uid_t uid, euid; int err; err = get_nodes(&old, old_nodes, maxnode); @@ -1145,10 +1145,10 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, * capabilities, superuser privileges or the same * userid as the target process. 
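
The test that follows (and its twin in mm/migrate.c below) grants access when the caller's real or effective UID matches the target's real or saved UID, or when the caller holds CAP_SYS_NICE. A condensed sketch of that rule, assuming both cred pointers are valid (may_touch_mm() is an illustrative helper, not part of the patch):

	static bool may_touch_mm(const struct cred *cred,
				 const struct cred *tcred)
	{
		return cred->euid == tcred->suid || cred->euid == tcred->uid ||
		       cred->uid  == tcred->suid || cred->uid  == tcred->uid ||
		       capable(CAP_SYS_NICE);
	}
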
*/ - uid = current_uid(); - euid = current_euid(); - if (euid != task->suid && euid != task->uid && - uid != task->suid && uid != task->uid && + cred = current->cred; + tcred = task->cred; + if (cred->euid != tcred->suid && cred->euid != tcred->uid && + cred->uid != tcred->suid && cred->uid != tcred->uid && !capable(CAP_SYS_NICE)) { err = -EPERM; goto out; diff --git a/mm/migrate.c b/mm/migrate.c index 6263c24c4afe..794443da1b4f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1045,10 +1045,10 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, const int __user *nodes, int __user *status, int flags) { + struct cred *cred, *tcred; struct task_struct *task; struct mm_struct *mm; int err; - uid_t uid, euid; /* Check flags */ if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) @@ -1076,10 +1076,10 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, * capabilities, superuser privileges or the same * userid as the target process. */ - uid = current_uid(); - euid = current_euid(); - if (euid != task->suid && euid != task->uid && - uid != task->suid && uid != task->uid && + cred = current->cred; + tcred = task->cred; + if (cred->euid != tcred->suid && cred->euid != tcred->uid && + cred->uid != tcred->suid && cred->uid != tcred->uid && !capable(CAP_SYS_NICE)) { err = -EPERM; goto out; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 34a458aa7997..3af787ba2077 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -298,7 +298,7 @@ static void dump_tasks(const struct mem_cgroup *mem) task_lock(p); printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", - p->pid, p->uid, p->tgid, p->mm->total_vm, + p->pid, p->cred->uid, p->tgid, p->mm->total_vm, get_mm_rss(p->mm), (int)task_cpu(p), p->oomkilladj, p->comm); task_unlock(p); diff --git a/net/core/scm.c b/net/core/scm.c index 4681d8f9b45b..c28ca32a7d93 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -44,11 +44,13 @@ static __inline__ int scm_check_creds(struct ucred *creds) { + struct cred *cred = current->cred; + if ((creds->pid == task_tgid_vnr(current) || capable(CAP_SYS_ADMIN)) && - ((creds->uid == current_uid() || creds->uid == current_euid() || - creds->uid == current_suid()) || capable(CAP_SETUID)) && - ((creds->gid == current_gid() || creds->gid == current_egid() || - creds->gid == current_sgid()) || capable(CAP_SETGID))) { + ((creds->uid == cred->uid || creds->uid == cred->euid || + creds->uid == cred->suid) || capable(CAP_SETUID)) && + ((creds->gid == cred->gid || creds->gid == cred->egid || + creds->gid == cred->sgid) || capable(CAP_SETGID))) { return 0; } return -EPERM; diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 8fc380578807..c79543212602 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -353,7 +353,7 @@ rpcauth_lookupcred(struct rpc_auth *auth, int flags) struct auth_cred acred = { .uid = current_fsuid(), .gid = current_fsgid(), - .group_info = current->group_info, + .group_info = current->cred->group_info, }; struct rpc_cred *ret; diff --git a/security/commoncap.c b/security/commoncap.c index fb4e240720d8..fa61679f8c73 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -30,7 +30,7 @@ int cap_netlink_send(struct sock *sk, struct sk_buff *skb) { - NETLINK_CB(skb).eff_cap = current->cap_effective; + NETLINK_CB(skb).eff_cap = current_cap(); return 0; } @@ -52,7 +52,7 @@ EXPORT_SYMBOL(cap_netlink_recv); int cap_capable(struct task_struct *tsk, int cap, int audit) { /* Derived from include/linux/sched.h:capable. 
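
cap_raised() below is a pure bit test on a kernel_cap_t, so cap_capable() reduces to one mask-and-test on the effective set. A minimal sketch of the primitives involved (the demo function is illustrative):

	static void cap_bitops_demo(void)
	{
		kernel_cap_t caps = CAP_EMPTY_SET;

		cap_raise(caps, CAP_NET_ADMIN);		   /* set one bit */
		WARN_ON(!cap_raised(caps, CAP_NET_ADMIN)); /* the test used here */
		WARN_ON(cap_raised(caps, CAP_SYS_ADMIN));  /* other bits stay clear */
	}
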
*/ - if (cap_raised(tsk->cap_effective, cap)) + if (cap_raised(tsk->cred->cap_effective, cap)) return 0; return -EPERM; } @@ -67,7 +67,8 @@ int cap_settime(struct timespec *ts, struct timezone *tz) int cap_ptrace_may_access(struct task_struct *child, unsigned int mode) { /* Derived from arch/i386/kernel/ptrace.c:sys_ptrace. */ - if (cap_issubset(child->cap_permitted, current->cap_permitted)) + if (cap_issubset(child->cred->cap_permitted, + current->cred->cap_permitted)) return 0; if (capable(CAP_SYS_PTRACE)) return 0; @@ -76,8 +77,8 @@ int cap_ptrace_may_access(struct task_struct *child, unsigned int mode) int cap_ptrace_traceme(struct task_struct *parent) { - /* Derived from arch/i386/kernel/ptrace.c:sys_ptrace. */ - if (cap_issubset(current->cap_permitted, parent->cap_permitted)) + if (cap_issubset(current->cred->cap_permitted, + parent->cred->cap_permitted)) return 0; if (has_capability(parent, CAP_SYS_PTRACE)) return 0; @@ -87,10 +88,12 @@ int cap_ptrace_traceme(struct task_struct *parent) int cap_capget (struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) { + struct cred *cred = target->cred; + /* Derived from kernel/capability.c:sys_capget. */ - *effective = target->cap_effective; - *inheritable = target->cap_inheritable; - *permitted = target->cap_permitted; + *effective = cred->cap_effective; + *inheritable = cred->cap_inheritable; + *permitted = cred->cap_permitted; return 0; } @@ -122,24 +125,26 @@ int cap_capset_check(const kernel_cap_t *effective, const kernel_cap_t *inheritable, const kernel_cap_t *permitted) { + const struct cred *cred = current->cred; + if (cap_inh_is_capped() && !cap_issubset(*inheritable, - cap_combine(current->cap_inheritable, - current->cap_permitted))) { + cap_combine(cred->cap_inheritable, + cred->cap_permitted))) { /* incapable of using this inheritable set */ return -EPERM; } if (!cap_issubset(*inheritable, - cap_combine(current->cap_inheritable, - current->cap_bset))) { + cap_combine(cred->cap_inheritable, + cred->cap_bset))) { /* no new pI capabilities outside bounding set */ return -EPERM; } /* verify restrictions on target's new Permitted set */ if (!cap_issubset (*permitted, - cap_combine (current->cap_permitted, - current->cap_permitted))) { + cap_combine (cred->cap_permitted, + cred->cap_permitted))) { return -EPERM; } @@ -155,9 +160,11 @@ void cap_capset_set(const kernel_cap_t *effective, const kernel_cap_t *inheritable, const kernel_cap_t *permitted) { - current->cap_effective = *effective; - current->cap_inheritable = *inheritable; - current->cap_permitted = *permitted; + struct cred *cred = current->cred; + + cred->cap_effective = *effective; + cred->cap_inheritable = *inheritable; + cred->cap_permitted = *permitted; } static inline void bprm_clear_caps(struct linux_binprm *bprm) @@ -211,8 +218,8 @@ static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps, * pP' = (X & fP) | (pI & fI) */ bprm->cap_post_exec_permitted.cap[i] = - (current->cap_bset.cap[i] & permitted) | - (current->cap_inheritable.cap[i] & inheritable); + (current->cred->cap_bset.cap[i] & permitted) | + (current->cred->cap_inheritable.cap[i] & inheritable); if (permitted & ~bprm->cap_post_exec_permitted.cap[i]) { /* @@ -354,8 +361,8 @@ int cap_bprm_set_security (struct linux_binprm *bprm) if (bprm->e_uid == 0 || current_uid() == 0) { /* pP' = (cap_bset & ~0) | (pI & ~0) */ bprm->cap_post_exec_permitted = cap_combine( - current->cap_bset, current->cap_inheritable - ); + current->cred->cap_bset, + 
current->cred->cap_inheritable); bprm->cap_effective = (bprm->e_uid == 0); ret = 0; } @@ -366,44 +373,39 @@ int cap_bprm_set_security (struct linux_binprm *bprm) void cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe) { - kernel_cap_t pP = current->cap_permitted; - kernel_cap_t pE = current->cap_effective; - uid_t uid; - gid_t gid; + struct cred *cred = current->cred; - current_uid_gid(&uid, &gid); - - if (bprm->e_uid != uid || bprm->e_gid != gid || + if (bprm->e_uid != cred->uid || bprm->e_gid != cred->gid || !cap_issubset(bprm->cap_post_exec_permitted, - current->cap_permitted)) { + cred->cap_permitted)) { set_dumpable(current->mm, suid_dumpable); current->pdeath_signal = 0; if (unsafe & ~LSM_UNSAFE_PTRACE_CAP) { if (!capable(CAP_SETUID)) { - bprm->e_uid = uid; - bprm->e_gid = gid; + bprm->e_uid = cred->uid; + bprm->e_gid = cred->gid; } if (cap_limit_ptraced_target()) { bprm->cap_post_exec_permitted = cap_intersect( bprm->cap_post_exec_permitted, - current->cap_permitted); + cred->cap_permitted); } } } - current->suid = current->euid = current->fsuid = bprm->e_uid; - current->sgid = current->egid = current->fsgid = bprm->e_gid; + cred->suid = cred->euid = cred->fsuid = bprm->e_uid; + cred->sgid = cred->egid = cred->fsgid = bprm->e_gid; /* For init, we want to retain the capabilities set * in the init_task struct. Thus we skip the usual * capability rules */ if (!is_global_init(current)) { - current->cap_permitted = bprm->cap_post_exec_permitted; + cred->cap_permitted = bprm->cap_post_exec_permitted; if (bprm->cap_effective) - current->cap_effective = bprm->cap_post_exec_permitted; + cred->cap_effective = bprm->cap_post_exec_permitted; else - cap_clear(current->cap_effective); + cap_clear(cred->cap_effective); } /* @@ -418,27 +420,30 @@ void cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe) * Number 1 above might fail if you don't have a full bset, but I think * that is interesting information to audit. 
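
A worked example for cap_bprm_secureexec() below, assuming uid 1000 executes a setuid-root binary with no file capabilities (so bprm->e_uid == 0 and, per cap_bprm_set_security() above, bprm->cap_effective == 1):

	/* cred->uid == 1000, so the non-root branch is taken:
	 *   bprm->cap_effective == 1  =>  return 1
	 * even without that, euid (0) != uid (1000) would return 1.
	 * A non-zero result makes the loader set AT_SECURE, which tells
	 * glibc to ignore LD_PRELOAD and similar variables. */
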
*/ - if (!cap_isclear(current->cap_effective)) { - if (!cap_issubset(CAP_FULL_SET, current->cap_effective) || - (bprm->e_uid != 0) || (current->uid != 0) || + if (!cap_isclear(cred->cap_effective)) { + if (!cap_issubset(CAP_FULL_SET, cred->cap_effective) || + (bprm->e_uid != 0) || (cred->uid != 0) || issecure(SECURE_NOROOT)) - audit_log_bprm_fcaps(bprm, &pP, &pE); + audit_log_bprm_fcaps(bprm, &cred->cap_permitted, + &cred->cap_effective); } - current->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); + cred->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); } int cap_bprm_secureexec (struct linux_binprm *bprm) { - if (current_uid() != 0) { + const struct cred *cred = current->cred; + + if (cred->uid != 0) { if (bprm->cap_effective) return 1; if (!cap_isclear(bprm->cap_post_exec_permitted)) return 1; } - return (current_euid() != current_uid() || - current_egid() != current_gid()); + return (cred->euid != cred->uid || + cred->egid != cred->gid); } int cap_inode_setxattr(struct dentry *dentry, const char *name, @@ -501,25 +506,27 @@ int cap_inode_removexattr(struct dentry *dentry, const char *name) static inline void cap_emulate_setxuid (int old_ruid, int old_euid, int old_suid) { - uid_t euid = current_euid(); + struct cred *cred = current->cred; if ((old_ruid == 0 || old_euid == 0 || old_suid == 0) && - (current_uid() != 0 && euid != 0 && current_suid() != 0) && + (cred->uid != 0 && cred->euid != 0 && cred->suid != 0) && !issecure(SECURE_KEEP_CAPS)) { - cap_clear (current->cap_permitted); - cap_clear (current->cap_effective); + cap_clear (cred->cap_permitted); + cap_clear (cred->cap_effective); } - if (old_euid == 0 && euid != 0) { - cap_clear (current->cap_effective); + if (old_euid == 0 && cred->euid != 0) { + cap_clear (cred->cap_effective); } - if (old_euid != 0 && euid == 0) { - current->cap_effective = current->cap_permitted; + if (old_euid != 0 && cred->euid == 0) { + cred->cap_effective = cred->cap_permitted; } } int cap_task_post_setuid (uid_t old_ruid, uid_t old_euid, uid_t old_suid, int flags) { + struct cred *cred = current->cred; + switch (flags) { case LSM_SETID_RE: case LSM_SETID_ID: @@ -541,16 +548,16 @@ int cap_task_post_setuid (uid_t old_ruid, uid_t old_euid, uid_t old_suid, */ if (!issecure (SECURE_NO_SETUID_FIXUP)) { - if (old_fsuid == 0 && current_fsuid() != 0) { - current->cap_effective = + if (old_fsuid == 0 && cred->fsuid != 0) { + cred->cap_effective = cap_drop_fs_set( - current->cap_effective); + cred->cap_effective); } - if (old_fsuid != 0 && current_fsuid() == 0) { - current->cap_effective = + if (old_fsuid != 0 && cred->fsuid == 0) { + cred->cap_effective = cap_raise_fs_set( - current->cap_effective, - current->cap_permitted); + cred->cap_effective, + cred->cap_permitted); } } break; @@ -575,7 +582,8 @@ int cap_task_post_setuid (uid_t old_ruid, uid_t old_euid, uid_t old_suid, */ static int cap_safe_nice(struct task_struct *p) { - if (!cap_issubset(p->cap_permitted, current->cap_permitted) && + if (!cap_issubset(p->cred->cap_permitted, + current->cred->cap_permitted) && !capable(CAP_SYS_NICE)) return -EPERM; return 0; @@ -610,7 +618,7 @@ static long cap_prctl_drop(unsigned long cap) return -EPERM; if (!cap_valid(cap)) return -EINVAL; - cap_lower(current->cap_bset, cap); + cap_lower(current->cred->cap_bset, cap); return 0; } @@ -633,6 +641,7 @@ int cap_task_setnice (struct task_struct *p, int nice) int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5, long *rc_p) { + struct cred *cred = current->cred; long 
error = 0; switch (option) { @@ -640,7 +649,7 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, if (!cap_valid(arg2)) error = -EINVAL; else - error = !!cap_raised(current->cap_bset, arg2); + error = !!cap_raised(cred->cap_bset, arg2); break; #ifdef CONFIG_SECURITY_FILE_CAPABILITIES case PR_CAPBSET_DROP: @@ -667,9 +676,9 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, * capability-based-privilege environment. */ case PR_SET_SECUREBITS: - if ((((current->securebits & SECURE_ALL_LOCKS) >> 1) - & (current->securebits ^ arg2)) /*[1]*/ - || ((current->securebits & SECURE_ALL_LOCKS + if ((((cred->securebits & SECURE_ALL_LOCKS) >> 1) + & (cred->securebits ^ arg2)) /*[1]*/ + || ((cred->securebits & SECURE_ALL_LOCKS & ~arg2)) /*[2]*/ || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS)) /*[3]*/ || (cap_capable(current, CAP_SETPCAP, SECURITY_CAP_AUDIT) != 0)) { /*[4]*/ @@ -682,11 +691,11 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, */ error = -EPERM; /* cannot change a locked bit */ } else { - current->securebits = arg2; + cred->securebits = arg2; } break; case PR_GET_SECUREBITS: - error = current->securebits; + error = cred->securebits; break; #endif /* def CONFIG_SECURITY_FILE_CAPABILITIES */ @@ -701,10 +710,9 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, else if (issecure(SECURE_KEEP_CAPS_LOCKED)) error = -EPERM; else if (arg2) - current->securebits |= issecure_mask(SECURE_KEEP_CAPS); + cred->securebits |= issecure_mask(SECURE_KEEP_CAPS); else - current->securebits &= - ~issecure_mask(SECURE_KEEP_CAPS); + cred->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); break; default: @@ -719,11 +727,12 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, void cap_task_reparent_to_init (struct task_struct *p) { - cap_set_init_eff(p->cap_effective); - cap_clear(p->cap_inheritable); - cap_set_full(p->cap_permitted); - p->securebits = SECUREBITS_DEFAULT; - return; + struct cred *cred = p->cred; + + cap_set_init_eff(cred->cap_effective); + cap_clear(cred->cap_inheritable); + cap_set_full(cred->cap_permitted); + p->cred->securebits = SECUREBITS_DEFAULT; } int cap_syslog (int type) diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index fcce331eca72..8833b447adef 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -889,7 +889,7 @@ long keyctl_instantiate_key(key_serial_t id, /* the appropriate instantiation authorisation key must have been * assumed before calling this */ ret = -EPERM; - instkey = current->request_key_auth; + instkey = current->cred->request_key_auth; if (!instkey) goto error; @@ -932,8 +932,8 @@ long keyctl_instantiate_key(key_serial_t id, /* discard the assumed authority if it's just been disabled by * instantiation of the key */ if (ret == 0) { - key_put(current->request_key_auth); - current->request_key_auth = NULL; + key_put(current->cred->request_key_auth); + current->cred->request_key_auth = NULL; } error2: @@ -960,7 +960,7 @@ long keyctl_negate_key(key_serial_t id, unsigned timeout, key_serial_t ringid) /* the appropriate instantiation authorisation key must have been * assumed before calling this */ ret = -EPERM; - instkey = current->request_key_auth; + instkey = current->cred->request_key_auth; if (!instkey) goto error; @@ -983,8 +983,8 @@ long keyctl_negate_key(key_serial_t id, unsigned timeout, key_serial_t ringid) /* discard the assumed authority if it's just been disabled by * instantiation of the key */ if (ret == 0) { - 
key_put(current->request_key_auth); - current->request_key_auth = NULL; + key_put(current->cred->request_key_auth); + current->cred->request_key_auth = NULL; } error: @@ -999,6 +999,7 @@ error: */ long keyctl_set_reqkey_keyring(int reqkey_defl) { + struct cred *cred = current->cred; int ret; switch (reqkey_defl) { @@ -1018,10 +1019,10 @@ long keyctl_set_reqkey_keyring(int reqkey_defl) case KEY_REQKEY_DEFL_USER_KEYRING: case KEY_REQKEY_DEFL_USER_SESSION_KEYRING: set: - current->jit_keyring = reqkey_defl; + cred->jit_keyring = reqkey_defl; case KEY_REQKEY_DEFL_NO_CHANGE: - return current->jit_keyring; + return cred->jit_keyring; case KEY_REQKEY_DEFL_GROUP_KEYRING: default: @@ -1086,8 +1087,8 @@ long keyctl_assume_authority(key_serial_t id) /* we divest ourselves of authority if given an ID of 0 */ if (id == 0) { - key_put(current->request_key_auth); - current->request_key_auth = NULL; + key_put(current->cred->request_key_auth); + current->cred->request_key_auth = NULL; ret = 0; goto error; } @@ -1103,8 +1104,8 @@ long keyctl_assume_authority(key_serial_t id) goto error; } - key_put(current->request_key_auth); - current->request_key_auth = authkey; + key_put(current->cred->request_key_auth); + current->cred->request_key_auth = authkey; ret = authkey->serial; error: diff --git a/security/keys/permission.c b/security/keys/permission.c index 3b41f9b52537..baf3d5f31e71 100644 --- a/security/keys/permission.c +++ b/security/keys/permission.c @@ -22,6 +22,7 @@ int key_task_permission(const key_ref_t key_ref, struct task_struct *context, key_perm_t perm) { + struct cred *cred = context->cred; struct key *key; key_perm_t kperm; int ret; @@ -29,7 +30,7 @@ int key_task_permission(const key_ref_t key_ref, key = key_ref_to_ptr(key_ref); /* use the second 8-bits of permissions for keys the caller owns */ - if (key->uid == context->fsuid) { + if (key->uid == cred->fsuid) { kperm = key->perm >> 16; goto use_these_perms; } @@ -37,14 +38,14 @@ int key_task_permission(const key_ref_t key_ref, /* use the third 8-bits of permissions for keys the caller has a group * membership in common with */ if (key->gid != -1 && key->perm & KEY_GRP_ALL) { - if (key->gid == context->fsgid) { + if (key->gid == cred->fsgid) { kperm = key->perm >> 8; goto use_these_perms; } - task_lock(context); - ret = groups_search(context->group_info, key->gid); - task_unlock(context); + spin_lock(&cred->lock); + ret = groups_search(cred->group_info, key->gid); + spin_unlock(&cred->lock); if (ret) { kperm = key->perm >> 8; diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index 1c793b7090a7..b0904cdda2e7 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -42,7 +42,7 @@ struct key_user root_key_user = { */ int install_user_keyrings(void) { - struct user_struct *user = current->user; + struct user_struct *user = current->cred->user; struct key *uid_keyring, *session_keyring; char buf[20]; int ret; @@ -156,7 +156,7 @@ int install_thread_keyring(void) sprintf(buf, "_tid.%u", tsk->pid); - keyring = keyring_alloc(buf, tsk->uid, tsk->gid, tsk, + keyring = keyring_alloc(buf, tsk->cred->uid, tsk->cred->gid, tsk, KEY_ALLOC_QUOTA_OVERRUN, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); @@ -164,8 +164,8 @@ int install_thread_keyring(void) } task_lock(tsk); - old = tsk->thread_keyring; - tsk->thread_keyring = keyring; + old = tsk->cred->thread_keyring; + tsk->cred->thread_keyring = keyring; task_unlock(tsk); ret = 0; @@ -192,7 +192,7 @@ int install_process_keyring(void) if 
(!tsk->signal->process_keyring) { sprintf(buf, "_pid.%u", tsk->tgid); - keyring = keyring_alloc(buf, tsk->uid, tsk->gid, tsk, + keyring = keyring_alloc(buf, tsk->cred->uid, tsk->cred->gid, tsk, KEY_ALLOC_QUOTA_OVERRUN, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); @@ -238,7 +238,7 @@ static int install_session_keyring(struct key *keyring) if (tsk->signal->session_keyring) flags = KEY_ALLOC_IN_QUOTA; - keyring = keyring_alloc(buf, tsk->uid, tsk->gid, tsk, + keyring = keyring_alloc(buf, tsk->cred->uid, tsk->cred->gid, tsk, flags, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); @@ -292,14 +292,14 @@ int copy_thread_group_keys(struct task_struct *tsk) */ int copy_keys(unsigned long clone_flags, struct task_struct *tsk) { - key_check(tsk->thread_keyring); - key_check(tsk->request_key_auth); + key_check(tsk->cred->thread_keyring); + key_check(tsk->cred->request_key_auth); /* no thread keyring yet */ - tsk->thread_keyring = NULL; + tsk->cred->thread_keyring = NULL; /* copy the request_key() authorisation for this thread */ - key_get(tsk->request_key_auth); + key_get(tsk->cred->request_key_auth); return 0; @@ -322,8 +322,8 @@ void exit_thread_group_keys(struct signal_struct *tg) */ void exit_keys(struct task_struct *tsk) { - key_put(tsk->thread_keyring); - key_put(tsk->request_key_auth); + key_put(tsk->cred->thread_keyring); + key_put(tsk->cred->request_key_auth); } /* end exit_keys() */ @@ -337,8 +337,8 @@ int exec_keys(struct task_struct *tsk) /* newly exec'd tasks don't get a thread keyring */ task_lock(tsk); - old = tsk->thread_keyring; - tsk->thread_keyring = NULL; + old = tsk->cred->thread_keyring; + tsk->cred->thread_keyring = NULL; task_unlock(tsk); key_put(old); @@ -373,10 +373,11 @@ int suid_keys(struct task_struct *tsk) void key_fsuid_changed(struct task_struct *tsk) { /* update the ownership of the thread keyring */ - if (tsk->thread_keyring) { - down_write(&tsk->thread_keyring->sem); - tsk->thread_keyring->uid = tsk->fsuid; - up_write(&tsk->thread_keyring->sem); + BUG_ON(!tsk->cred); + if (tsk->cred->thread_keyring) { + down_write(&tsk->cred->thread_keyring->sem); + tsk->cred->thread_keyring->uid = tsk->cred->fsuid; + up_write(&tsk->cred->thread_keyring->sem); } } /* end key_fsuid_changed() */ @@ -388,10 +389,11 @@ void key_fsuid_changed(struct task_struct *tsk) void key_fsgid_changed(struct task_struct *tsk) { /* update the ownership of the thread keyring */ - if (tsk->thread_keyring) { - down_write(&tsk->thread_keyring->sem); - tsk->thread_keyring->gid = tsk->fsgid; - up_write(&tsk->thread_keyring->sem); + BUG_ON(!tsk->cred); + if (tsk->cred->thread_keyring) { + down_write(&tsk->cred->thread_keyring->sem); + tsk->cred->thread_keyring->gid = tsk->cred->fsgid; + up_write(&tsk->cred->thread_keyring->sem); } } /* end key_fsgid_changed() */ @@ -426,9 +428,9 @@ key_ref_t search_process_keyrings(struct key_type *type, err = ERR_PTR(-EAGAIN); /* search the thread keyring first */ - if (context->thread_keyring) { + if (context->cred->thread_keyring) { key_ref = keyring_search_aux( - make_key_ref(context->thread_keyring, 1), + make_key_ref(context->cred->thread_keyring, 1), context, type, description, match); if (!IS_ERR(key_ref)) goto found; @@ -493,9 +495,9 @@ key_ref_t search_process_keyrings(struct key_type *type, } } /* or search the user-session keyring */ - else if (context->user->session_keyring) { + else if (context->cred->user->session_keyring) { key_ref = keyring_search_aux( - make_key_ref(context->user->session_keyring, 1), + 
make_key_ref(context->cred->user->session_keyring, 1), context, type, description, match); if (!IS_ERR(key_ref)) goto found; @@ -517,20 +519,20 @@ key_ref_t search_process_keyrings(struct key_type *type, * search the keyrings of the process mentioned there * - we don't permit access to request_key auth keys via this method */ - if (context->request_key_auth && + if (context->cred->request_key_auth && context == current && type != &key_type_request_key_auth ) { /* defend against the auth key being revoked */ - down_read(&context->request_key_auth->sem); + down_read(&context->cred->request_key_auth->sem); - if (key_validate(context->request_key_auth) == 0) { - rka = context->request_key_auth->payload.data; + if (key_validate(context->cred->request_key_auth) == 0) { + rka = context->cred->request_key_auth->payload.data; key_ref = search_process_keyrings(type, description, match, rka->context); - up_read(&context->request_key_auth->sem); + up_read(&context->cred->request_key_auth->sem); if (!IS_ERR(key_ref)) goto found; @@ -547,7 +549,7 @@ key_ref_t search_process_keyrings(struct key_type *type, break; } } else { - up_read(&context->request_key_auth->sem); + up_read(&context->cred->request_key_auth->sem); } } @@ -580,15 +582,16 @@ key_ref_t lookup_user_key(key_serial_t id, int create, int partial, { struct request_key_auth *rka; struct task_struct *t = current; - key_ref_t key_ref, skey_ref; + struct cred *cred = t->cred; struct key *key; + key_ref_t key_ref, skey_ref; int ret; key_ref = ERR_PTR(-ENOKEY); switch (id) { case KEY_SPEC_THREAD_KEYRING: - if (!t->thread_keyring) { + if (!cred->thread_keyring) { if (!create) goto error; @@ -599,7 +602,7 @@ key_ref_t lookup_user_key(key_serial_t id, int create, int partial, } } - key = t->thread_keyring; + key = cred->thread_keyring; atomic_inc(&key->usage); key_ref = make_key_ref(key, 1); break; @@ -628,7 +631,8 @@ key_ref_t lookup_user_key(key_serial_t id, int create, int partial, ret = install_user_keyrings(); if (ret < 0) goto error; - ret = install_session_keyring(t->user->session_keyring); + ret = install_session_keyring( + cred->user->session_keyring); if (ret < 0) goto error; } @@ -641,25 +645,25 @@ key_ref_t lookup_user_key(key_serial_t id, int create, int partial, break; case KEY_SPEC_USER_KEYRING: - if (!t->user->uid_keyring) { + if (!cred->user->uid_keyring) { ret = install_user_keyrings(); if (ret < 0) goto error; } - key = t->user->uid_keyring; + key = cred->user->uid_keyring; atomic_inc(&key->usage); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_USER_SESSION_KEYRING: - if (!t->user->session_keyring) { + if (!cred->user->session_keyring) { ret = install_user_keyrings(); if (ret < 0) goto error; } - key = t->user->session_keyring; + key = cred->user->session_keyring; atomic_inc(&key->usage); key_ref = make_key_ref(key, 1); break; @@ -670,7 +674,7 @@ key_ref_t lookup_user_key(key_serial_t id, int create, int partial, goto error; case KEY_SPEC_REQKEY_AUTH_KEY: - key = t->request_key_auth; + key = cred->request_key_auth; if (!key) goto error; @@ -679,19 +683,19 @@ key_ref_t lookup_user_key(key_serial_t id, int create, int partial, break; case KEY_SPEC_REQUESTOR_KEYRING: - if (!t->request_key_auth) + if (!cred->request_key_auth) goto error; - down_read(&t->request_key_auth->sem); - if (t->request_key_auth->flags & KEY_FLAG_REVOKED) { + down_read(&cred->request_key_auth->sem); + if (cred->request_key_auth->flags & KEY_FLAG_REVOKED) { key_ref = ERR_PTR(-EKEYREVOKED); key = NULL; } else { - rka = t->request_key_auth->payload.data; + rka 
= cred->request_key_auth->payload.data; key = rka->dest_keyring; atomic_inc(&key->usage); } - up_read(&t->request_key_auth->sem); + up_read(&cred->request_key_auth->sem); if (!key) goto error; key_ref = make_key_ref(key, 1); @@ -791,7 +795,7 @@ long join_session_keyring(const char *name) keyring = find_keyring_by_name(name, false); if (PTR_ERR(keyring) == -ENOKEY) { /* not found - try and create a new one */ - keyring = keyring_alloc(name, tsk->uid, tsk->gid, tsk, + keyring = keyring_alloc(name, tsk->cred->uid, tsk->cred->gid, tsk, KEY_ALLOC_IN_QUOTA, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); diff --git a/security/keys/request_key.c b/security/keys/request_key.c index 8e9d93b4a402..3e9b9eb1dd28 100644 --- a/security/keys/request_key.c +++ b/security/keys/request_key.c @@ -104,7 +104,8 @@ static int call_sbin_request_key(struct key_construction *cons, /* we specify the process's default keyrings */ sprintf(keyring_str[0], "%d", - tsk->thread_keyring ? tsk->thread_keyring->serial : 0); + tsk->cred->thread_keyring ? + tsk->cred->thread_keyring->serial : 0); prkey = 0; if (tsk->signal->process_keyring) @@ -117,7 +118,7 @@ static int call_sbin_request_key(struct key_construction *cons, sskey = rcu_dereference(tsk->signal->session_keyring)->serial; rcu_read_unlock(); } else { - sskey = tsk->user->session_keyring->serial; + sskey = tsk->cred->user->session_keyring->serial; } sprintf(keyring_str[2], "%d", sskey); @@ -232,11 +233,11 @@ static void construct_get_dest_keyring(struct key **_dest_keyring) } else { /* use a default keyring; falling through the cases until we * find one that we actually have */ - switch (tsk->jit_keyring) { + switch (tsk->cred->jit_keyring) { case KEY_REQKEY_DEFL_DEFAULT: case KEY_REQKEY_DEFL_REQUESTOR_KEYRING: - if (tsk->request_key_auth) { - authkey = tsk->request_key_auth; + if (tsk->cred->request_key_auth) { + authkey = tsk->cred->request_key_auth; down_read(&authkey->sem); rka = authkey->payload.data; if (!test_bit(KEY_FLAG_REVOKED, @@ -249,7 +250,7 @@ static void construct_get_dest_keyring(struct key **_dest_keyring) } case KEY_REQKEY_DEFL_THREAD_KEYRING: - dest_keyring = key_get(tsk->thread_keyring); + dest_keyring = key_get(tsk->cred->thread_keyring); if (dest_keyring) break; @@ -268,11 +269,12 @@ static void construct_get_dest_keyring(struct key **_dest_keyring) break; case KEY_REQKEY_DEFL_USER_SESSION_KEYRING: - dest_keyring = key_get(tsk->user->session_keyring); + dest_keyring = + key_get(tsk->cred->user->session_keyring); break; case KEY_REQKEY_DEFL_USER_KEYRING: - dest_keyring = key_get(tsk->user->uid_keyring); + dest_keyring = key_get(tsk->cred->user->uid_keyring); break; case KEY_REQKEY_DEFL_GROUP_KEYRING: diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c index 1762d44711d5..2125579d5d73 100644 --- a/security/keys/request_key_auth.c +++ b/security/keys/request_key_auth.c @@ -164,22 +164,22 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info, /* see if the calling process is already servicing the key request of * another process */ - if (current->request_key_auth) { + if (current->cred->request_key_auth) { /* it is - use that instantiation context here too */ - down_read(¤t->request_key_auth->sem); + down_read(¤t->cred->request_key_auth->sem); /* if the auth key has been revoked, then the key we're * servicing is already instantiated */ if (test_bit(KEY_FLAG_REVOKED, - ¤t->request_key_auth->flags)) + ¤t->cred->request_key_auth->flags)) goto auth_key_revoked; - irka = 
current->request_key_auth->payload.data; + irka = current->cred->request_key_auth->payload.data; rka->context = irka->context; rka->pid = irka->pid; get_task_struct(rka->context); - up_read(¤t->request_key_auth->sem); + up_read(¤t->cred->request_key_auth->sem); } else { /* it isn't - use this process as the context */ @@ -214,7 +214,7 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info, return authkey; auth_key_revoked: - up_read(¤t->request_key_auth->sem); + up_read(¤t->cred->request_key_auth->sem); kfree(rka->callout_info); kfree(rka); kleave("= -EKEYREVOKED"); diff --git a/security/selinux/exports.c b/security/selinux/exports.c index 64af2d3409ef..cf02490cd1eb 100644 --- a/security/selinux/exports.c +++ b/security/selinux/exports.c @@ -39,7 +39,7 @@ EXPORT_SYMBOL_GPL(selinux_string_to_sid); int selinux_secmark_relabel_packet_permission(u32 sid) { if (selinux_enabled) { - struct task_security_struct *tsec = current->security; + struct task_security_struct *tsec = current->cred->security; return avc_has_perm(tsec->sid, sid, SECCLASS_PACKET, PACKET__RELABELTO, NULL); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 9f6da154cc82..328308f2882a 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -167,21 +167,21 @@ static int task_alloc_security(struct task_struct *task) return -ENOMEM; tsec->osid = tsec->sid = SECINITSID_UNLABELED; - task->security = tsec; + task->cred->security = tsec; return 0; } static void task_free_security(struct task_struct *task) { - struct task_security_struct *tsec = task->security; - task->security = NULL; + struct task_security_struct *tsec = task->cred->security; + task->cred->security = NULL; kfree(tsec); } static int inode_alloc_security(struct inode *inode) { - struct task_security_struct *tsec = current->security; + struct task_security_struct *tsec = current->cred->security; struct inode_security_struct *isec; isec = kmem_cache_zalloc(sel_inode_cache, GFP_NOFS); @@ -215,7 +215,7 @@ static void inode_free_security(struct inode *inode) static int file_alloc_security(struct file *file) { - struct task_security_struct *tsec = current->security; + struct task_security_struct *tsec = current->cred->security; struct file_security_struct *fsec; fsec = kzalloc(sizeof(struct file_security_struct), GFP_KERNEL); @@ -554,7 +554,7 @@ static int selinux_set_mnt_opts(struct super_block *sb, struct security_mnt_opts *opts) { int rc = 0, i; - struct task_security_struct *tsec = current->security; + struct task_security_struct *tsec = current->cred->security; struct superblock_security_struct *sbsec = sb->s_security; const char *name = sb->s_type->name; struct inode *inode = sbsec->sb->s_root->d_inode; @@ -1353,8 +1353,8 @@ static int task_has_perm(struct task_struct *tsk1, { struct task_security_struct *tsec1, *tsec2; - tsec1 = tsk1->security; - tsec2 = tsk2->security; + tsec1 = tsk1->cred->security; + tsec2 = tsk2->cred->security; return avc_has_perm(tsec1->sid, tsec2->sid, SECCLASS_PROCESS, perms, NULL); } @@ -1374,7 +1374,7 @@ static int task_has_capability(struct task_struct *tsk, u32 av = CAP_TO_MASK(cap); int rc; - tsec = tsk->security; + tsec = tsk->cred->security; AVC_AUDIT_DATA_INIT(&ad, CAP); ad.tsk = tsk; @@ -1405,7 +1405,7 @@ static int task_has_system(struct task_struct *tsk, { struct task_security_struct *tsec; - tsec = tsk->security; + tsec = tsk->cred->security; return avc_has_perm(tsec->sid, SECINITSID_KERNEL, SECCLASS_SYSTEM, perms, NULL); @@ -1426,7 +1426,7 @@ static int 
inode_has_perm(struct task_struct *tsk, if (unlikely(IS_PRIVATE(inode))) return 0; - tsec = tsk->security; + tsec = tsk->cred->security; isec = inode->i_security; if (!adp) { @@ -1466,7 +1466,7 @@ static int file_has_perm(struct task_struct *tsk, struct file *file, u32 av) { - struct task_security_struct *tsec = tsk->security; + struct task_security_struct *tsec = tsk->cred->security; struct file_security_struct *fsec = file->f_security; struct inode *inode = file->f_path.dentry->d_inode; struct avc_audit_data ad; @@ -1503,7 +1503,7 @@ static int may_create(struct inode *dir, struct avc_audit_data ad; int rc; - tsec = current->security; + tsec = current->cred->security; dsec = dir->i_security; sbsec = dir->i_sb->s_security; @@ -1540,7 +1540,7 @@ static int may_create_key(u32 ksid, { struct task_security_struct *tsec; - tsec = ctx->security; + tsec = ctx->cred->security; return avc_has_perm(tsec->sid, ksid, SECCLASS_KEY, KEY__CREATE, NULL); } @@ -1561,7 +1561,7 @@ static int may_link(struct inode *dir, u32 av; int rc; - tsec = current->security; + tsec = current->cred->security; dsec = dir->i_security; isec = dentry->d_inode->i_security; @@ -1606,7 +1606,7 @@ static inline int may_rename(struct inode *old_dir, int old_is_dir, new_is_dir; int rc; - tsec = current->security; + tsec = current->cred->security; old_dsec = old_dir->i_security; old_isec = old_dentry->d_inode->i_security; old_is_dir = S_ISDIR(old_dentry->d_inode->i_mode); @@ -1659,7 +1659,7 @@ static int superblock_has_perm(struct task_struct *tsk, struct task_security_struct *tsec; struct superblock_security_struct *sbsec; - tsec = tsk->security; + tsec = tsk->cred->security; sbsec = sb->s_security; return avc_has_perm(tsec->sid, sbsec->sid, SECCLASS_FILESYSTEM, perms, ad); @@ -1758,8 +1758,8 @@ static int selinux_ptrace_may_access(struct task_struct *child, return rc; if (mode == PTRACE_MODE_READ) { - struct task_security_struct *tsec = current->security; - struct task_security_struct *csec = child->security; + struct task_security_struct *tsec = current->cred->security; + struct task_security_struct *csec = child->cred->security; return avc_has_perm(tsec->sid, csec->sid, SECCLASS_FILE, FILE__READ, NULL); } @@ -1874,7 +1874,7 @@ static int selinux_sysctl(ctl_table *table, int op) if (rc) return rc; - tsec = current->security; + tsec = current->cred->security; rc = selinux_sysctl_get_sid(table, (op == 0001) ? SECCLASS_DIR : SECCLASS_FILE, &tsid); @@ -2025,7 +2025,7 @@ static int selinux_bprm_set_security(struct linux_binprm *bprm) if (bsec->set) return 0; - tsec = current->security; + tsec = current->cred->security; isec = inode->i_security; /* Default to the current task SID. 
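
The pattern repeated throughout this file is unchanged except that the SELinux blob now hangs off current->cred rather than the task. A condensed sketch (may_setsched() and target_sid are illustrative, not introduced by the patch):

	static int may_setsched(u32 target_sid)
	{
		struct task_security_struct *tsec = current->cred->security;

		return avc_has_perm(tsec->sid, target_sid, SECCLASS_PROCESS,
				    PROCESS__SETSCHED, NULL);
	}
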
*/ @@ -2090,7 +2090,7 @@ static int selinux_bprm_check_security(struct linux_binprm *bprm) static int selinux_bprm_secureexec(struct linux_binprm *bprm) { - struct task_security_struct *tsec = current->security; + struct task_security_struct *tsec = current->cred->security; int atsecure = 0; if (tsec->osid != tsec->sid) { @@ -2214,7 +2214,7 @@ static void selinux_bprm_apply_creds(struct linux_binprm *bprm, int unsafe) secondary_ops->bprm_apply_creds(bprm, unsafe); - tsec = current->security; + tsec = current->cred->security; bsec = bprm->security; sid = bsec->sid; @@ -2243,7 +2243,7 @@ static void selinux_bprm_apply_creds(struct linux_binprm *bprm, int unsafe) rcu_read_lock(); tracer = tracehook_tracer_task(current); if (likely(tracer != NULL)) { - sec = tracer->security; + sec = tracer->cred->security; ptsid = sec->sid; } rcu_read_unlock(); @@ -2274,7 +2274,7 @@ static void selinux_bprm_post_apply_creds(struct linux_binprm *bprm) int rc, i; unsigned long flags; - tsec = current->security; + tsec = current->cred->security; bsec = bprm->security; if (bsec->unsafe) { @@ -2521,7 +2521,7 @@ static int selinux_inode_init_security(struct inode *inode, struct inode *dir, int rc; char *namep = NULL, *context; - tsec = current->security; + tsec = current->cred->security; dsec = dir->i_security; sbsec = dir->i_sb->s_security; @@ -2706,7 +2706,7 @@ static int selinux_inode_setotherxattr(struct dentry *dentry, const char *name) static int selinux_inode_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - struct task_security_struct *tsec = current->security; + struct task_security_struct *tsec = current->cred->security; struct inode *inode = dentry->d_inode; struct inode_security_struct *isec = inode->i_security; struct superblock_security_struct *sbsec; @@ -2918,7 +2918,7 @@ static int selinux_revalidate_file_permission(struct file *file, int mask) static int selinux_file_permission(struct file *file, int mask) { struct inode *inode = file->f_path.dentry->d_inode; - struct task_security_struct *tsec = current->security; + struct task_security_struct *tsec = current->cred->security; struct file_security_struct *fsec = file->f_security; struct inode_security_struct *isec = inode->i_security; @@ -2995,7 +2995,8 @@ static int selinux_file_mmap(struct file *file, unsigned long reqprot, unsigned long addr, unsigned long addr_only) { int rc = 0; - u32 sid = ((struct task_security_struct *)(current->security))->sid; + u32 sid = ((struct task_security_struct *) + (current->cred->security))->sid; if (addr < mmap_min_addr) rc = avc_has_perm(sid, sid, SECCLASS_MEMPROTECT, @@ -3107,7 +3108,7 @@ static int selinux_file_set_fowner(struct file *file) struct task_security_struct *tsec; struct file_security_struct *fsec; - tsec = current->security; + tsec = current->cred->security; fsec = file->f_security; fsec->fown_sid = tsec->sid; @@ -3125,7 +3126,7 @@ static int selinux_file_send_sigiotask(struct task_struct *tsk, /* struct fown_struct is never outside the context of a struct file */ file = container_of(fown, struct file, f_owner); - tsec = tsk->security; + tsec = tsk->cred->security; fsec = file->f_security; if (!signum) @@ -3188,12 +3189,12 @@ static int selinux_task_alloc_security(struct task_struct *tsk) struct task_security_struct *tsec1, *tsec2; int rc; - tsec1 = current->security; + tsec1 = current->cred->security; rc = task_alloc_security(tsk); if (rc) return rc; - tsec2 = tsk->security; + tsec2 = tsk->cred->security; tsec2->osid = tsec1->osid; tsec2->sid = 
tsec1->sid; @@ -3251,7 +3252,7 @@ static int selinux_task_getsid(struct task_struct *p) static void selinux_task_getsecid(struct task_struct *p, u32 *secid) { - struct task_security_struct *tsec = p->security; + struct task_security_struct *tsec = p->cred->security; *secid = tsec->sid; } @@ -3343,7 +3344,7 @@ static int selinux_task_kill(struct task_struct *p, struct siginfo *info, perm = PROCESS__SIGNULL; /* null signal; existence test */ else perm = signal_to_av(sig); - tsec = p->security; + tsec = p->cred->security; if (secid) rc = avc_has_perm(secid, tsec->sid, SECCLASS_PROCESS, perm, NULL); else @@ -3375,7 +3376,7 @@ static void selinux_task_reparent_to_init(struct task_struct *p) secondary_ops->task_reparent_to_init(p); - tsec = p->security; + tsec = p->cred->security; tsec->osid = tsec->sid; tsec->sid = SECINITSID_KERNEL; return; @@ -3384,7 +3385,7 @@ static void selinux_task_reparent_to_init(struct task_struct *p) static void selinux_task_to_inode(struct task_struct *p, struct inode *inode) { - struct task_security_struct *tsec = p->security; + struct task_security_struct *tsec = p->cred->security; struct inode_security_struct *isec = inode->i_security; isec->sid = tsec->sid; @@ -3632,7 +3633,7 @@ static int socket_has_perm(struct task_struct *task, struct socket *sock, struct avc_audit_data ad; int err = 0; - tsec = task->security; + tsec = task->cred->security; isec = SOCK_INODE(sock)->i_security; if (isec->sid == SECINITSID_KERNEL) @@ -3656,7 +3657,7 @@ static int selinux_socket_create(int family, int type, if (kern) goto out; - tsec = current->security; + tsec = current->cred->security; newsid = tsec->sockcreate_sid ? : tsec->sid; err = avc_has_perm(tsec->sid, newsid, socket_type_to_security_class(family, type, @@ -3677,7 +3678,7 @@ static int selinux_socket_post_create(struct socket *sock, int family, isec = SOCK_INODE(sock)->i_security; - tsec = current->security; + tsec = current->cred->security; newsid = tsec->sockcreate_sid ? : tsec->sid; isec->sclass = socket_type_to_security_class(family, type, protocol); isec->sid = kern ? 
SECINITSID_KERNEL : newsid; @@ -3723,7 +3724,7 @@ static int selinux_socket_bind(struct socket *sock, struct sockaddr *address, in struct sock *sk = sock->sk; u32 sid, node_perm; - tsec = current->security; + tsec = current->cred->security; isec = SOCK_INODE(sock)->i_security; if (family == PF_INET) { @@ -4764,7 +4765,7 @@ static int ipc_alloc_security(struct task_struct *task, struct kern_ipc_perm *perm, u16 sclass) { - struct task_security_struct *tsec = task->security; + struct task_security_struct *tsec = task->cred->security; struct ipc_security_struct *isec; isec = kzalloc(sizeof(struct ipc_security_struct), GFP_KERNEL); @@ -4814,7 +4815,7 @@ static int ipc_has_perm(struct kern_ipc_perm *ipc_perms, struct ipc_security_struct *isec; struct avc_audit_data ad; - tsec = current->security; + tsec = current->cred->security; isec = ipc_perms->security; AVC_AUDIT_DATA_INIT(&ad, IPC); @@ -4845,7 +4846,7 @@ static int selinux_msg_queue_alloc_security(struct msg_queue *msq) if (rc) return rc; - tsec = current->security; + tsec = current->cred->security; isec = msq->q_perm.security; AVC_AUDIT_DATA_INIT(&ad, IPC); @@ -4871,7 +4872,7 @@ static int selinux_msg_queue_associate(struct msg_queue *msq, int msqflg) struct ipc_security_struct *isec; struct avc_audit_data ad; - tsec = current->security; + tsec = current->cred->security; isec = msq->q_perm.security; AVC_AUDIT_DATA_INIT(&ad, IPC); @@ -4917,7 +4918,7 @@ static int selinux_msg_queue_msgsnd(struct msg_queue *msq, struct msg_msg *msg, struct avc_audit_data ad; int rc; - tsec = current->security; + tsec = current->cred->security; isec = msq->q_perm.security; msec = msg->security; @@ -4965,7 +4966,7 @@ static int selinux_msg_queue_msgrcv(struct msg_queue *msq, struct msg_msg *msg, struct avc_audit_data ad; int rc; - tsec = target->security; + tsec = target->cred->security; isec = msq->q_perm.security; msec = msg->security; @@ -4992,7 +4993,7 @@ static int selinux_shm_alloc_security(struct shmid_kernel *shp) if (rc) return rc; - tsec = current->security; + tsec = current->cred->security; isec = shp->shm_perm.security; AVC_AUDIT_DATA_INIT(&ad, IPC); @@ -5018,7 +5019,7 @@ static int selinux_shm_associate(struct shmid_kernel *shp, int shmflg) struct ipc_security_struct *isec; struct avc_audit_data ad; - tsec = current->security; + tsec = current->cred->security; isec = shp->shm_perm.security; AVC_AUDIT_DATA_INIT(&ad, IPC); @@ -5091,7 +5092,7 @@ static int selinux_sem_alloc_security(struct sem_array *sma) if (rc) return rc; - tsec = current->security; + tsec = current->cred->security; isec = sma->sem_perm.security; AVC_AUDIT_DATA_INIT(&ad, IPC); @@ -5117,7 +5118,7 @@ static int selinux_sem_associate(struct sem_array *sma, int semflg) struct ipc_security_struct *isec; struct avc_audit_data ad; - tsec = current->security; + tsec = current->cred->security; isec = sma->sem_perm.security; AVC_AUDIT_DATA_INIT(&ad, IPC); @@ -5224,7 +5225,7 @@ static int selinux_getprocattr(struct task_struct *p, return error; } - tsec = p->security; + tsec = p->cred->security; if (!strcmp(name, "current")) sid = tsec->sid; @@ -5308,7 +5309,7 @@ static int selinux_setprocattr(struct task_struct *p, operation. See selinux_bprm_set_security for the execve checks and may_create for the file creation checks. The operation will then fail if the context is not permitted. 
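
For context, the setprocattr path below is what userspace reaches by writing to /proc/<pid>/attr/*; writing "exec" stores the SID that the next execve() will check. A minimal userspace sketch, error handling trimmed (the context string passed in would be site-specific):

	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>

	static int set_exec_context(const char *ctx)
	{
		ssize_t n;
		int fd;

		fd = open("/proc/self/attr/exec", O_WRONLY);
		if (fd < 0)
			return -1;
		n = write(fd, ctx, strlen(ctx) + 1);
		close(fd);
		return n < 0 ? -1 : 0;
	}
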
*/ - tsec = p->security; + tsec = p->cred->security; if (!strcmp(name, "exec")) tsec->exec_sid = sid; else if (!strcmp(name, "fscreate")) @@ -5361,7 +5362,8 @@ boundary_ok: rcu_read_lock(); tracer = tracehook_tracer_task(p); if (tracer != NULL) { - struct task_security_struct *ptsec = tracer->security; + struct task_security_struct *ptsec = + tracer->cred->security; u32 ptsid = ptsec->sid; rcu_read_unlock(); error = avc_has_perm_noaudit(ptsid, sid, @@ -5405,7 +5407,7 @@ static void selinux_release_secctx(char *secdata, u32 seclen) static int selinux_key_alloc(struct key *k, struct task_struct *tsk, unsigned long flags) { - struct task_security_struct *tsec = tsk->security; + struct task_security_struct *tsec = tsk->cred->security; struct key_security_struct *ksec; ksec = kzalloc(sizeof(struct key_security_struct), GFP_KERNEL); @@ -5439,7 +5441,7 @@ static int selinux_key_permission(key_ref_t key_ref, key = key_ref_to_ptr(key_ref); - tsec = ctx->security; + tsec = ctx->cred->security; ksec = key->security; /* if no specific permissions are requested, we skip the @@ -5683,7 +5685,7 @@ static __init int selinux_init(void) /* Set the security state for the initial task. */ if (task_alloc_security(current)) panic("SELinux: Failed to initialize initial task.\n"); - tsec = current->security; + tsec = current->cred->security; tsec->osid = tsec->sid = SECINITSID_KERNEL; sel_inode_cache = kmem_cache_create("selinux_inode_security", diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 69c9dccc8cf0..10715d1330b9 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -97,7 +97,7 @@ static int task_has_security(struct task_struct *tsk, { struct task_security_struct *tsec; - tsec = tsk->security; + tsec = tsk->cred->security; if (!tsec) return -EACCES; diff --git a/security/selinux/xfrm.c b/security/selinux/xfrm.c index 8f17f542a116..d7db76617b0e 100644 --- a/security/selinux/xfrm.c +++ b/security/selinux/xfrm.c @@ -197,7 +197,7 @@ static int selinux_xfrm_sec_ctx_alloc(struct xfrm_sec_ctx **ctxp, struct xfrm_user_sec_ctx *uctx, u32 sid) { int rc = 0; - struct task_security_struct *tsec = current->security; + struct task_security_struct *tsec = current->cred->security; struct xfrm_sec_ctx *ctx = NULL; char *ctx_str = NULL; u32 str_len; @@ -333,7 +333,7 @@ void selinux_xfrm_policy_free(struct xfrm_sec_ctx *ctx) */ int selinux_xfrm_policy_delete(struct xfrm_sec_ctx *ctx) { - struct task_security_struct *tsec = current->security; + struct task_security_struct *tsec = current->cred->security; int rc = 0; if (ctx) { @@ -378,7 +378,7 @@ void selinux_xfrm_state_free(struct xfrm_state *x) */ int selinux_xfrm_state_delete(struct xfrm_state *x) { - struct task_security_struct *tsec = current->security; + struct task_security_struct *tsec = current->cred->security; struct xfrm_sec_ctx *ctx = x->security; int rc = 0; diff --git a/security/smack/smack_access.c b/security/smack/smack_access.c index 79ff21ed4c3b..b6dd4fc0fb0b 100644 --- a/security/smack/smack_access.c +++ b/security/smack/smack_access.c @@ -164,7 +164,7 @@ int smk_curacc(char *obj_label, u32 mode) { int rc; - rc = smk_access(current->security, obj_label, mode); + rc = smk_access(current->cred->security, obj_label, mode); if (rc == 0) return 0; @@ -173,7 +173,7 @@ int smk_curacc(char *obj_label, u32 mode) * only one that gets privilege and current does not * have that label. 
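 *
 * Consolidated view of smk_curacc() after this change (a sketch for
 * orientation, not itself part of the patch):
 *
 *	rc = smk_access(current->cred->security, obj_label, mode);
 *	if (rc == 0)
 *		return 0;
 *	if (smack_onlycap != NULL &&
 *	    smack_onlycap != current->cred->security)
 *		return rc;
 *	if (capable(CAP_MAC_OVERRIDE))
 *		return 0;
 *	return rc;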
*/ - if (smack_onlycap != NULL && smack_onlycap != current->security) + if (smack_onlycap != NULL && smack_onlycap != current->cred->security) return rc; if (capable(CAP_MAC_OVERRIDE)) diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 6e2dc0bab70d..791da238d049 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -102,7 +102,8 @@ static int smack_ptrace_may_access(struct task_struct *ctp, unsigned int mode) if (rc != 0) return rc; - rc = smk_access(current->security, ctp->security, MAY_READWRITE); + rc = smk_access(current->cred->security, ctp->cred->security, + MAY_READWRITE); if (rc != 0 && capable(CAP_MAC_OVERRIDE)) return 0; return rc; @@ -124,7 +125,8 @@ static int smack_ptrace_traceme(struct task_struct *ptp) if (rc != 0) return rc; - rc = smk_access(ptp->security, current->security, MAY_READWRITE); + rc = smk_access(ptp->cred->security, current->cred->security, + MAY_READWRITE); if (rc != 0 && has_capability(ptp, CAP_MAC_OVERRIDE)) return 0; return rc; @@ -141,7 +143,7 @@ static int smack_ptrace_traceme(struct task_struct *ptp) static int smack_syslog(int type) { int rc; - char *sp = current->security; + char *sp = current->cred->security; rc = cap_syslog(type); if (rc != 0) @@ -373,7 +375,7 @@ static int smack_sb_umount(struct vfsmount *mnt, int flags) */ static int smack_inode_alloc_security(struct inode *inode) { - inode->i_security = new_inode_smack(current->security); + inode->i_security = new_inode_smack(current->cred->security); if (inode->i_security == NULL) return -ENOMEM; return 0; @@ -818,7 +820,7 @@ static int smack_file_permission(struct file *file, int mask) */ static int smack_file_alloc_security(struct file *file) { - file->f_security = current->security; + file->f_security = current->cred->security; return 0; } @@ -916,7 +918,7 @@ static int smack_file_fcntl(struct file *file, unsigned int cmd, */ static int smack_file_set_fowner(struct file *file) { - file->f_security = current->security; + file->f_security = current->cred->security; return 0; } @@ -941,7 +943,7 @@ static int smack_file_send_sigiotask(struct task_struct *tsk, * struct fown_struct is never outside the context of a struct file */ file = container_of(fown, struct file, f_owner); - rc = smk_access(file->f_security, tsk->security, MAY_WRITE); + rc = smk_access(file->f_security, tsk->cred->security, MAY_WRITE); if (rc != 0 && has_capability(tsk, CAP_MAC_OVERRIDE)) return 0; return rc; @@ -984,7 +986,7 @@ static int smack_file_receive(struct file *file) */ static int smack_task_alloc_security(struct task_struct *tsk) { - tsk->security = current->security; + tsk->cred->security = current->cred->security; return 0; } @@ -999,7 +1001,7 @@ static int smack_task_alloc_security(struct task_struct *tsk) */ static void smack_task_free_security(struct task_struct *task) { - task->security = NULL; + task->cred->security = NULL; } /** @@ -1011,7 +1013,7 @@ static void smack_task_free_security(struct task_struct *task) */ static int smack_task_setpgid(struct task_struct *p, pid_t pgid) { - return smk_curacc(p->security, MAY_WRITE); + return smk_curacc(p->cred->security, MAY_WRITE); } /** @@ -1022,7 +1024,7 @@ static int smack_task_setpgid(struct task_struct *p, pid_t pgid) */ static int smack_task_getpgid(struct task_struct *p) { - return smk_curacc(p->security, MAY_READ); + return smk_curacc(p->cred->security, MAY_READ); } /** @@ -1033,7 +1035,7 @@ static int smack_task_getpgid(struct task_struct *p) */ static int smack_task_getsid(struct task_struct *p) { - return 
smk_curacc(p->security, MAY_READ); + return smk_curacc(p->cred->security, MAY_READ); } /** @@ -1045,7 +1047,7 @@ static int smack_task_getsid(struct task_struct *p) */ static void smack_task_getsecid(struct task_struct *p, u32 *secid) { - *secid = smack_to_secid(p->security); + *secid = smack_to_secid(p->cred->security); } /** @@ -1061,7 +1063,7 @@ static int smack_task_setnice(struct task_struct *p, int nice) rc = cap_task_setnice(p, nice); if (rc == 0) - rc = smk_curacc(p->security, MAY_WRITE); + rc = smk_curacc(p->cred->security, MAY_WRITE); return rc; } @@ -1078,7 +1080,7 @@ static int smack_task_setioprio(struct task_struct *p, int ioprio) rc = cap_task_setioprio(p, ioprio); if (rc == 0) - rc = smk_curacc(p->security, MAY_WRITE); + rc = smk_curacc(p->cred->security, MAY_WRITE); return rc; } @@ -1090,7 +1092,7 @@ static int smack_task_setioprio(struct task_struct *p, int ioprio) */ static int smack_task_getioprio(struct task_struct *p) { - return smk_curacc(p->security, MAY_READ); + return smk_curacc(p->cred->security, MAY_READ); } /** @@ -1108,7 +1110,7 @@ static int smack_task_setscheduler(struct task_struct *p, int policy, rc = cap_task_setscheduler(p, policy, lp); if (rc == 0) - rc = smk_curacc(p->security, MAY_WRITE); + rc = smk_curacc(p->cred->security, MAY_WRITE); return rc; } @@ -1120,7 +1122,7 @@ static int smack_task_setscheduler(struct task_struct *p, int policy, */ static int smack_task_getscheduler(struct task_struct *p) { - return smk_curacc(p->security, MAY_READ); + return smk_curacc(p->cred->security, MAY_READ); } /** @@ -1131,7 +1133,7 @@ static int smack_task_getscheduler(struct task_struct *p) */ static int smack_task_movememory(struct task_struct *p) { - return smk_curacc(p->security, MAY_WRITE); + return smk_curacc(p->cred->security, MAY_WRITE); } /** @@ -1154,13 +1156,13 @@ static int smack_task_kill(struct task_struct *p, struct siginfo *info, * can write the receiver. */ if (secid == 0) - return smk_curacc(p->security, MAY_WRITE); + return smk_curacc(p->cred->security, MAY_WRITE); /* * If the secid isn't 0 we're dealing with some USB IO * specific behavior. This is not clean. For one thing * we can't take privilege into account. 
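 *
 * (In other words: the subject label is recovered from the secid with
 * smack_from_secid() and checked for MAY_WRITE against the target
 * task's label, without the CAP_MAC_OVERRIDE escape that smk_curacc()
 * would apply.)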
*/ - return smk_access(smack_from_secid(secid), p->security, MAY_WRITE); + return smk_access(smack_from_secid(secid), p->cred->security, MAY_WRITE); } /** @@ -1173,7 +1175,7 @@ static int smack_task_wait(struct task_struct *p) { int rc; - rc = smk_access(current->security, p->security, MAY_WRITE); + rc = smk_access(current->cred->security, p->cred->security, MAY_WRITE); if (rc == 0) return 0; @@ -1204,7 +1206,7 @@ static int smack_task_wait(struct task_struct *p) static void smack_task_to_inode(struct task_struct *p, struct inode *inode) { struct inode_smack *isp = inode->i_security; - isp->smk_inode = p->security; + isp->smk_inode = p->cred->security; } /* @@ -1223,7 +1225,7 @@ static void smack_task_to_inode(struct task_struct *p, struct inode *inode) */ static int smack_sk_alloc_security(struct sock *sk, int family, gfp_t gfp_flags) { - char *csp = current->security; + char *csp = current->cred->security; struct socket_smack *ssp; ssp = kzalloc(sizeof(struct socket_smack), gfp_flags); @@ -1448,7 +1450,7 @@ static int smack_flags_to_may(int flags) */ static int smack_msg_msg_alloc_security(struct msg_msg *msg) { - msg->security = current->security; + msg->security = current->cred->security; return 0; } @@ -1484,7 +1486,7 @@ static int smack_shm_alloc_security(struct shmid_kernel *shp) { struct kern_ipc_perm *isp = &shp->shm_perm; - isp->security = current->security; + isp->security = current->cred->security; return 0; } @@ -1593,7 +1595,7 @@ static int smack_sem_alloc_security(struct sem_array *sma) { struct kern_ipc_perm *isp = &sma->sem_perm; - isp->security = current->security; + isp->security = current->cred->security; return 0; } @@ -1697,7 +1699,7 @@ static int smack_msg_queue_alloc_security(struct msg_queue *msq) { struct kern_ipc_perm *kisp = &msq->q_perm; - kisp->security = current->security; + kisp->security = current->cred->security; return 0; } @@ -1852,7 +1854,7 @@ static void smack_d_instantiate(struct dentry *opt_dentry, struct inode *inode) struct super_block *sbp; struct superblock_smack *sbsp; struct inode_smack *isp; - char *csp = current->security; + char *csp = current->cred->security; char *fetched; char *final; struct dentry *dp; @@ -2009,7 +2011,7 @@ static int smack_getprocattr(struct task_struct *p, char *name, char **value) if (strcmp(name, "current") != 0) return -EINVAL; - cp = kstrdup(p->security, GFP_KERNEL); + cp = kstrdup(p->cred->security, GFP_KERNEL); if (cp == NULL) return -ENOMEM; @@ -2055,7 +2057,7 @@ static int smack_setprocattr(struct task_struct *p, char *name, if (newsmack == NULL) return -EINVAL; - p->security = newsmack; + p->cred->security = newsmack; return size; } @@ -2288,8 +2290,8 @@ static void smack_sock_graft(struct sock *sk, struct socket *parent) return; ssp = sk->sk_security; - ssp->smk_in = current->security; - ssp->smk_out = current->security; + ssp->smk_in = current->cred->security; + ssp->smk_out = current->cred->security; ssp->smk_packet[0] = '\0'; rc = smack_netlabel(sk); @@ -2362,7 +2364,7 @@ static int smack_inet_conn_request(struct sock *sk, struct sk_buff *skb, static int smack_key_alloc(struct key *key, struct task_struct *tsk, unsigned long flags) { - key->security = tsk->security; + key->security = tsk->cred->security; return 0; } @@ -2403,10 +2405,11 @@ static int smack_key_permission(key_ref_t key_ref, /* * This should not occur */ - if (context->security == NULL) + if (context->cred->security == NULL) return -EACCES; - return smk_access(context->security, keyp->security, MAY_READWRITE); + return 
smk_access(context->cred->security, keyp->security, + MAY_READWRITE); } #endif /* CONFIG_KEYS */ @@ -2726,7 +2729,7 @@ static __init int smack_init(void) /* * Set the security state for the initial task. */ - current->security = &smack_known_floor.smk_known; + current->cred->security = &smack_known_floor.smk_known; /* * Initialize locks diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c index c21d8c8bf0c7..c5ca279e0506 100644 --- a/security/smack/smackfs.c +++ b/security/smack/smackfs.c @@ -336,7 +336,7 @@ static void smk_cipso_doi(void) audit_info.loginuid = audit_get_loginuid(current); audit_info.sessionid = audit_get_sessionid(current); - audit_info.secid = smack_to_secid(current->security); + audit_info.secid = smack_to_secid(current->cred->security); rc = netlbl_cfg_map_del(NULL, &audit_info); if (rc != 0) @@ -371,7 +371,7 @@ static void smk_unlbl_ambient(char *oldambient) audit_info.loginuid = audit_get_loginuid(current); audit_info.sessionid = audit_get_sessionid(current); - audit_info.secid = smack_to_secid(current->security); + audit_info.secid = smack_to_secid(current->cred->security); if (oldambient != NULL) { rc = netlbl_cfg_map_del(oldambient, &audit_info); @@ -843,7 +843,7 @@ static ssize_t smk_write_onlycap(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { char in[SMK_LABELLEN]; - char *sp = current->security; + char *sp = current->cred->security; if (!capable(CAP_MAC_ADMIN)) return -EPERM; -- cgit v1.2.3 From f1752eec6145c97163dbce62d17cf5d928e28a27 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:17 +1100 Subject: CRED: Detach the credentials from task_struct Detach the credentials from task_struct, duplicating them in copy_process() and releasing them in __put_task_struct(). Signed-off-by: David Howells Acked-by: James Morris Acked-by: Serge Hallyn Signed-off-by: James Morris --- include/linux/cred.h | 29 ++++++++++++++ include/linux/init_task.h | 16 +------- include/linux/sched.h | 1 - include/linux/security.h | 26 ++++++------- kernel/Makefile | 2 +- kernel/cred.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/fork.c | 24 +++--------- security/capability.c | 8 ++-- security/security.c | 8 ++-- security/selinux/hooks.c | 32 ++++++++-------- security/smack/smack_lsm.c | 20 +++++----- 11 files changed, 179 insertions(+), 83 deletions(-) create mode 100644 kernel/cred.c (limited to 'kernel') diff --git a/include/linux/cred.h b/include/linux/cred.h index 3e65587a72e5..a7a686074cb0 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -158,4 +158,33 @@ do { \ *(_gid) = current->cred->fsgid; \ } while(0) +extern void __put_cred(struct cred *); +extern int copy_creds(struct task_struct *, unsigned long); + +/** + * get_cred - Get a reference on a set of credentials + * @cred: The credentials to reference + * + * Get a reference on the specified set of credentials. The caller must + * release the reference. + */ +static inline struct cred *get_cred(struct cred *cred) +{ + atomic_inc(&cred->usage); + return cred; +} + +/** + * put_cred - Release a reference to a set of credentials + * @cred: The credentials to release + * + * Release a reference to a set of credentials, deleting them when the last ref + * is released. 
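+ *
+ * A minimal usage sketch (illustrative; "tsk" and keeping it pinned
+ * are the caller's responsibility):
+ *
+ *	struct cred *c = get_cred(tsk->cred);
+ *	... use c ...
+ *	put_cred(c);
+ *
+ * where dropping the final reference ends up in __put_cred().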
+ */ +static inline void put_cred(struct cred *cred) +{ + if (atomic_dec_and_test(&(cred)->usage)) + __put_cred(cred); +} + #endif /* _LINUX_CRED_H */ diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 9de41ccd67b5..5e24c54b6dfd 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -115,19 +115,6 @@ extern struct group_info init_groups; extern struct cred init_cred; -#define INIT_CRED(p) \ -{ \ - .usage = ATOMIC_INIT(3), \ - .securebits = SECUREBITS_DEFAULT, \ - .cap_inheritable = CAP_INIT_INH_SET, \ - .cap_permitted = CAP_FULL_SET, \ - .cap_effective = CAP_INIT_EFF_SET, \ - .cap_bset = CAP_INIT_BSET, \ - .user = INIT_USER, \ - .group_info = &init_groups, \ - .lock = __SPIN_LOCK_UNLOCKED(p.lock), \ -} - /* * INIT_TASK is used to set up the first task table, touch at * your own risk!. Base=0, limit=0x1fffff (=2MB) @@ -162,8 +149,7 @@ extern struct cred init_cred; .children = LIST_HEAD_INIT(tsk.children), \ .sibling = LIST_HEAD_INIT(tsk.sibling), \ .group_leader = &tsk, \ - .__temp_cred = INIT_CRED(tsk.__temp_cred), \ - .cred = &tsk.__temp_cred, \ + .cred = &init_cred, \ .comm = "swapper", \ .thread = INIT_THREAD, \ .fs = &init_fs, \ diff --git a/include/linux/sched.h b/include/linux/sched.h index c8b92502354d..740cf946c8cc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1151,7 +1151,6 @@ struct task_struct { struct list_head cpu_timers[3]; /* process credentials */ - struct cred __temp_cred __deprecated; /* temporary credentials to be removed */ struct cred *cred; /* actual/objective task credentials */ char comm[TASK_COMM_LEN]; /* executable name excluding path diff --git a/include/linux/security.h b/include/linux/security.h index 9f305d4a31a7..9239cc11eb9c 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -593,15 +593,15 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * manual page for definitions of the @clone_flags. * @clone_flags contains the flags indicating what should be shared. * Return 0 if permission is granted. - * @task_alloc_security: - * @p contains the task_struct for child process. - * Allocate and attach a security structure to the p->security field. The - * security field is initialized to NULL when the task structure is + * @cred_alloc_security: + * @cred contains the cred struct for child process. + * Allocate and attach a security structure to the cred->security field. + * The security field is initialized to NULL when the task structure is * allocated. * Return 0 if operation was successful. - * @task_free_security: - * @p contains the task_struct for process. - * Deallocate and clear the p->security field. + * @cred_free: + * @cred points to the credentials. + * Deallocate and clear the cred->security field in a set of credentials. * @task_setuid: * Check permission before setting one or more of the user identity * attributes of the current process. 
The @flags parameter indicates @@ -1405,8 +1405,8 @@ struct security_operations { int (*dentry_open) (struct file *file); int (*task_create) (unsigned long clone_flags); - int (*task_alloc_security) (struct task_struct *p); - void (*task_free_security) (struct task_struct *p); + int (*cred_alloc_security) (struct cred *cred); + void (*cred_free) (struct cred *cred); int (*task_setuid) (uid_t id0, uid_t id1, uid_t id2, int flags); int (*task_post_setuid) (uid_t old_ruid /* or fsuid */ , uid_t old_euid, uid_t old_suid, int flags); @@ -1660,8 +1660,8 @@ int security_file_send_sigiotask(struct task_struct *tsk, int security_file_receive(struct file *file); int security_dentry_open(struct file *file); int security_task_create(unsigned long clone_flags); -int security_task_alloc(struct task_struct *p); -void security_task_free(struct task_struct *p); +int security_cred_alloc(struct cred *cred); +void security_cred_free(struct cred *cred); int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags); int security_task_post_setuid(uid_t old_ruid, uid_t old_euid, uid_t old_suid, int flags); @@ -2181,12 +2181,12 @@ static inline int security_task_create(unsigned long clone_flags) return 0; } -static inline int security_task_alloc(struct task_struct *p) +static inline int security_cred_alloc(struct cred *cred) { return 0; } -static inline void security_task_free(struct task_struct *p) +static inline void security_cred_free(struct cred *cred) { } static inline int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, diff --git a/kernel/Makefile b/kernel/Makefile index 9a3ec66a9d84..5a6a612c302d 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -9,7 +9,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ - notifier.o ksysfs.o pm_qos_params.o sched_clock.o + notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o CFLAGS_REMOVE_sched.o = -mno-spe diff --git a/kernel/cred.c b/kernel/cred.c new file mode 100644 index 000000000000..833244a7cb05 --- /dev/null +++ b/kernel/cred.c @@ -0,0 +1,96 @@ +/* Task credentials management + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. 
+ */ +#include +#include +#include +#include +#include +#include +#include + +/* + * The initial credentials for the initial task + */ +struct cred init_cred = { + .usage = ATOMIC_INIT(3), + .securebits = SECUREBITS_DEFAULT, + .cap_inheritable = CAP_INIT_INH_SET, + .cap_permitted = CAP_FULL_SET, + .cap_effective = CAP_INIT_EFF_SET, + .cap_bset = CAP_INIT_BSET, + .user = INIT_USER, + .group_info = &init_groups, +}; + +/* + * The RCU callback to actually dispose of a set of credentials + */ +static void put_cred_rcu(struct rcu_head *rcu) +{ + struct cred *cred = container_of(rcu, struct cred, rcu); + + BUG_ON(atomic_read(&cred->usage) != 0); + + key_put(cred->thread_keyring); + key_put(cred->request_key_auth); + put_group_info(cred->group_info); + free_uid(cred->user); + security_cred_free(cred); + kfree(cred); +} + +/** + * __put_cred - Destroy a set of credentials + * @sec: The record to release + * + * Destroy a set of credentials on which no references remain. + */ +void __put_cred(struct cred *cred) +{ + call_rcu(&cred->rcu, put_cred_rcu); +} +EXPORT_SYMBOL(__put_cred); + +/* + * Copy credentials for the new process created by fork() + */ +int copy_creds(struct task_struct *p, unsigned long clone_flags) +{ + struct cred *pcred; + int ret; + + pcred = kmemdup(p->cred, sizeof(*p->cred), GFP_KERNEL); + if (!pcred) + return -ENOMEM; + +#ifdef CONFIG_SECURITY + pcred->security = NULL; +#endif + + ret = security_cred_alloc(pcred); + if (ret < 0) { + kfree(pcred); + return ret; + } + + atomic_set(&pcred->usage, 1); + get_group_info(pcred->group_info); + get_uid(pcred->user); + key_get(pcred->thread_keyring); + key_get(pcred->request_key_auth); + + atomic_inc(&pcred->user->processes); + + /* RCU assignment is unneeded here as no-one can have accessed this + * pointer yet, barring us */ + p->cred = pcred; + return 0; +} diff --git a/kernel/fork.c b/kernel/fork.c index 81fdc7733908..c932e283ddfc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -146,9 +146,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); - security_task_free(tsk); - free_uid(tsk->__temp_cred.user); - put_group_info(tsk->__temp_cred.group_info); + put_cred(tsk->cred); delayacct_tsk_free(tsk); if (!profile_handoff_task(tsk)) @@ -969,7 +967,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif - p->cred = &p->__temp_cred; retval = -EAGAIN; if (atomic_read(&p->cred->user->processes) >= p->signal->rlim[RLIMIT_NPROC].rlim_cur) { @@ -978,9 +975,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_free; } - atomic_inc(&p->cred->user->__count); - atomic_inc(&p->cred->user->processes); - get_group_info(p->cred->group_info); + retval = copy_creds(p, clone_flags); + if (retval < 0) + goto bad_fork_free; /* * If multiple threads are within copy_process(), then this check @@ -1035,9 +1032,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, do_posix_clock_monotonic_gettime(&p->start_time); p->real_start_time = p->start_time; monotonic_to_bootbased(&p->real_start_time); -#ifdef CONFIG_SECURITY - p->cred->security = NULL; -#endif p->io_context = NULL; p->audit_context = NULL; cgroup_fork(p); @@ -1082,10 +1076,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, /* Perform scheduler related setup. Assign this task to a CPU. 
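	   (Cred lifecycle in this function, for orientation: copy_creds()
	   above duplicated the parent's credentials with usage == 1; on
	   the error paths below, bad_fork_cleanup_count drops that
	   reference with put_cred(), and a task that made it out of here
	   releases its reference later via __put_task_struct() ->
	   put_cred().)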
*/ sched_fork(p, clone_flags); - if ((retval = security_task_alloc(p))) - goto bad_fork_cleanup_policy; if ((retval = audit_alloc(p))) - goto bad_fork_cleanup_security; + goto bad_fork_cleanup_policy; /* copy all the process information */ if ((retval = copy_semundo(clone_flags, p))) goto bad_fork_cleanup_audit; @@ -1284,8 +1276,6 @@ bad_fork_cleanup_semundo: exit_sem(p); bad_fork_cleanup_audit: audit_free(p); -bad_fork_cleanup_security: - security_task_free(p); bad_fork_cleanup_policy: #ifdef CONFIG_NUMA mpol_put(p->mempolicy); @@ -1298,9 +1288,7 @@ bad_fork_cleanup_cgroup: bad_fork_cleanup_put_domain: module_put(task_thread_info(p)->exec_domain->module); bad_fork_cleanup_count: - put_group_info(p->cred->group_info); - atomic_dec(&p->cred->user->processes); - free_uid(p->cred->user); + put_cred(p->cred); bad_fork_free: free_task(p); fork_out: diff --git a/security/capability.c b/security/capability.c index 245874819036..6c4b5137ca7b 100644 --- a/security/capability.c +++ b/security/capability.c @@ -340,12 +340,12 @@ static int cap_task_create(unsigned long clone_flags) return 0; } -static int cap_task_alloc_security(struct task_struct *p) +static int cap_cred_alloc_security(struct cred *cred) { return 0; } -static void cap_task_free_security(struct task_struct *p) +static void cap_cred_free(struct cred *cred) { } @@ -890,8 +890,8 @@ void security_fixup_ops(struct security_operations *ops) set_to_cap_if_null(ops, file_receive); set_to_cap_if_null(ops, dentry_open); set_to_cap_if_null(ops, task_create); - set_to_cap_if_null(ops, task_alloc_security); - set_to_cap_if_null(ops, task_free_security); + set_to_cap_if_null(ops, cred_alloc_security); + set_to_cap_if_null(ops, cred_free); set_to_cap_if_null(ops, task_setuid); set_to_cap_if_null(ops, task_post_setuid); set_to_cap_if_null(ops, task_setgid); diff --git a/security/security.c b/security/security.c index 81c956a12300..d058f7d5b10a 100644 --- a/security/security.c +++ b/security/security.c @@ -616,14 +616,14 @@ int security_task_create(unsigned long clone_flags) return security_ops->task_create(clone_flags); } -int security_task_alloc(struct task_struct *p) +int security_cred_alloc(struct cred *cred) { - return security_ops->task_alloc_security(p); + return security_ops->cred_alloc_security(cred); } -void security_task_free(struct task_struct *p) +void security_cred_free(struct cred *cred) { - security_ops->task_free_security(p); + security_ops->cred_free(cred); } int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 328308f2882a..658435dce37c 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -158,7 +158,7 @@ static int selinux_secmark_enabled(void) /* Allocate and free functions for each kind of security blob. 
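   A hypothetical LSM's version of the new hook pair, shown only for
   shape (the foo_* names and struct foo_security are invented for
   illustration):

	static int foo_cred_alloc_security(struct cred *cred)
	{
		cred->security = kzalloc(sizeof(struct foo_security),
					 GFP_KERNEL);
		return cred->security ? 0 : -ENOMEM;
	}

	static void foo_cred_free(struct cred *cred)
	{
		kfree(cred->security);
		cred->security = NULL;
	}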
*/ -static int task_alloc_security(struct task_struct *task) +static int cred_alloc_security(struct cred *cred) { struct task_security_struct *tsec; @@ -167,18 +167,11 @@ static int task_alloc_security(struct task_struct *task) return -ENOMEM; tsec->osid = tsec->sid = SECINITSID_UNLABELED; - task->cred->security = tsec; + cred->security = tsec; return 0; } -static void task_free_security(struct task_struct *task) -{ - struct task_security_struct *tsec = task->cred->security; - task->cred->security = NULL; - kfree(tsec); -} - static int inode_alloc_security(struct inode *inode) { struct task_security_struct *tsec = current->cred->security; @@ -3184,17 +3177,17 @@ static int selinux_task_create(unsigned long clone_flags) return task_has_perm(current, current, PROCESS__FORK); } -static int selinux_task_alloc_security(struct task_struct *tsk) +static int selinux_cred_alloc_security(struct cred *cred) { struct task_security_struct *tsec1, *tsec2; int rc; tsec1 = current->cred->security; - rc = task_alloc_security(tsk); + rc = cred_alloc_security(cred); if (rc) return rc; - tsec2 = tsk->cred->security; + tsec2 = cred->security; tsec2->osid = tsec1->osid; tsec2->sid = tsec1->sid; @@ -3208,9 +3201,14 @@ static int selinux_task_alloc_security(struct task_struct *tsk) return 0; } -static void selinux_task_free_security(struct task_struct *tsk) +/* + * detach and free the LSM part of a set of credentials + */ +static void selinux_cred_free(struct cred *cred) { - task_free_security(tsk); + struct task_security_struct *tsec = cred->security; + cred->security = NULL; + kfree(tsec); } static int selinux_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags) @@ -5552,8 +5550,8 @@ static struct security_operations selinux_ops = { .dentry_open = selinux_dentry_open, .task_create = selinux_task_create, - .task_alloc_security = selinux_task_alloc_security, - .task_free_security = selinux_task_free_security, + .cred_alloc_security = selinux_cred_alloc_security, + .cred_free = selinux_cred_free, .task_setuid = selinux_task_setuid, .task_post_setuid = selinux_task_post_setuid, .task_setgid = selinux_task_setgid, @@ -5683,7 +5681,7 @@ static __init int selinux_init(void) printk(KERN_INFO "SELinux: Initializing.\n"); /* Set the security state for the initial task. */ - if (task_alloc_security(current)) + if (cred_alloc_security(current->cred)) panic("SELinux: Failed to initialize initial task.\n"); tsec = current->cred->security; tsec->osid = tsec->sid = SECINITSID_KERNEL; diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 791da238d049..cc837314fb0e 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -975,8 +975,8 @@ static int smack_file_receive(struct file *file) */ /** - * smack_task_alloc_security - "allocate" a task blob - * @tsk: the task in need of a blob + * smack_cred_alloc_security - "allocate" a task cred blob + * @cred: the task creds in need of a blob * * Smack isn't using copies of blobs. Everyone * points to an immutable list. No alloc required. 
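 *
 * (The labels pointed at are entries in the global smack_known list;
 * smack_init(), for instance, sets the initial task's label to
 * &smack_known_floor.smk_known. A cred_free must therefore never
 * kfree() the label.)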
@@ -984,24 +984,24 @@ static int smack_file_receive(struct file *file) * * Always returns 0 */ -static int smack_task_alloc_security(struct task_struct *tsk) +static int smack_cred_alloc_security(struct cred *cred) { - tsk->cred->security = current->cred->security; + cred->security = current->cred->security; return 0; } /** - * smack_task_free_security - "free" a task blob - * @task: the task with the blob + * smack_cred_free - "free" task-level security credentials + * @cred: the credentials in question * * Smack isn't using copies of blobs. Everyone * points to an immutable list. The blobs never go away. * There is no leak here. */ -static void smack_task_free_security(struct task_struct *task) +static void smack_cred_free(struct cred *cred) { - task->cred->security = NULL; + cred->security = NULL; } /** @@ -2630,8 +2630,8 @@ struct security_operations smack_ops = { .file_send_sigiotask = smack_file_send_sigiotask, .file_receive = smack_file_receive, - .task_alloc_security = smack_task_alloc_security, - .task_free_security = smack_task_free_security, + .cred_alloc_security = smack_cred_alloc_security, + .cred_free = smack_cred_free, .task_post_setuid = cap_task_post_setuid, .task_setpgid = smack_task_setpgid, .task_getpgid = smack_task_getpgid, -- cgit v1.2.3 From 86a264abe542cfececb4df129bc45a0338d8cdb9 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:18 +1100 Subject: CRED: Wrap current->cred and a few other accessors Wrap current->cred and a few other accessors to hide their actual implementation. Signed-off-by: David Howells Acked-by: James Morris Acked-by: Serge Hallyn Signed-off-by: James Morris --- arch/ia64/ia32/sys_ia32.c | 7 +- drivers/net/tun.c | 8 +- drivers/usb/core/devio.c | 10 ++- fs/binfmt_elf.c | 10 +-- fs/binfmt_elf_fdpic.c | 9 +- fs/exec.c | 5 +- fs/fcntl.c | 3 +- fs/file_table.c | 7 +- fs/hugetlbfs/inode.c | 5 +- fs/ioprio.c | 4 +- fs/smbfs/dir.c | 3 +- include/linux/cred.h | 187 ++++++++++++++++++++++++++++++++---------- include/linux/securebits.h | 2 +- ipc/mqueue.c | 2 +- ipc/shm.c | 4 +- kernel/sys.c | 59 +++++++------ kernel/uid16.c | 31 +++---- net/core/scm.c | 2 +- net/sunrpc/auth.c | 14 ++-- security/commoncap.c | 2 +- security/keys/process_keys.c | 2 +- security/keys/request_key.c | 11 +-- security/selinux/exports.c | 8 +- security/selinux/xfrm.c | 6 +- security/smack/smack_access.c | 2 +- security/smack/smack_lsm.c | 26 +++--- security/smack/smackfs.c | 4 +- 27 files changed, 271 insertions(+), 162 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/ia32/sys_ia32.c b/arch/ia64/ia32/sys_ia32.c index 2445a9d3488e..16ef61a91d95 100644 --- a/arch/ia64/ia32/sys_ia32.c +++ b/arch/ia64/ia32/sys_ia32.c @@ -1767,25 +1767,24 @@ groups16_from_user(struct group_info *group_info, short __user *grouplist) asmlinkage long sys32_getgroups16 (int gidsetsize, short __user *grouplist) { + const struct cred *cred = current_cred(); int i; if (gidsetsize < 0) return -EINVAL; - get_group_info(current->cred->group_info); - i = current->cred->group_info->ngroups; + i = cred->group_info->ngroups; if (gidsetsize) { if (i > gidsetsize) { i = -EINVAL; goto out; } - if (groups16_to_user(grouplist, current->cred->group_info)) { + if (groups16_to_user(grouplist, cred->group_info)) { i = -EFAULT; goto out; } } out: - put_group_info(current->cred->group_info); return i; } diff --git a/drivers/net/tun.c b/drivers/net/tun.c index b14e2025e221..55dc70c6b4db 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -702,6 +702,7 @@ static int tun_set_iff(struct net 
*net, struct file *file, struct ifreq *ifr) struct tun_net *tn; struct tun_struct *tun; struct net_device *dev; + const struct cred *cred = current_cred(); int err; tn = net_generic(net, tun_net_id); @@ -712,11 +713,12 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) /* Check permissions */ if (((tun->owner != -1 && - current_euid() != tun->owner) || + cred->euid != tun->owner) || (tun->group != -1 && - current_egid() != tun->group)) && - !capable(CAP_NET_ADMIN)) + cred->egid != tun->group)) && + !capable(CAP_NET_ADMIN)) { return -EPERM; + } } else if (__dev_get_by_name(net, ifr->ifr_name)) return -EINVAL; diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index 1aadb9387027..aa79280df15d 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -574,6 +574,7 @@ static int usbdev_open(struct inode *inode, struct file *file) { struct usb_device *dev = NULL; struct dev_state *ps; + const struct cred *cred = current_cred(); int ret; lock_kernel(); @@ -617,8 +618,8 @@ static int usbdev_open(struct inode *inode, struct file *file) init_waitqueue_head(&ps->wait); ps->discsignr = 0; ps->disc_pid = get_pid(task_pid(current)); - ps->disc_uid = current_uid(); - ps->disc_euid = current_euid(); + ps->disc_uid = cred->uid; + ps->disc_euid = cred->euid; ps->disccontext = NULL; ps->ifclaimed = 0; security_task_getsecid(current, &ps->secid); @@ -967,6 +968,7 @@ static int proc_do_submiturb(struct dev_state *ps, struct usbdevfs_urb *uurb, struct usb_host_endpoint *ep; struct async *as; struct usb_ctrlrequest *dr = NULL; + const struct cred *cred = current_cred(); unsigned int u, totlen, isofrmlen; int ret, ifnum = -1; int is_in; @@ -1174,8 +1176,8 @@ static int proc_do_submiturb(struct dev_state *ps, struct usbdevfs_urb *uurb, as->signr = uurb->signr; as->ifnum = ifnum; as->pid = get_pid(task_pid(current)); - as->uid = current_uid(); - as->euid = current_euid(); + as->uid = cred->uid; + as->euid = cred->euid; security_task_getsecid(current, &as->secid); if (!is_in) { if (copy_from_user(as->urb->transfer_buffer, uurb->buffer, diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 7a52477ce493..0e6655613169 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -157,7 +157,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, int items; elf_addr_t *elf_info; int ei_index = 0; - struct task_struct *tsk = current; + const struct cred *cred = current_cred(); struct vm_area_struct *vma; /* @@ -223,10 +223,10 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, NEW_AUX_ENT(AT_BASE, interp_load_addr); NEW_AUX_ENT(AT_FLAGS, 0); NEW_AUX_ENT(AT_ENTRY, exec->e_entry); - NEW_AUX_ENT(AT_UID, tsk->cred->uid); - NEW_AUX_ENT(AT_EUID, tsk->cred->euid); - NEW_AUX_ENT(AT_GID, tsk->cred->gid); - NEW_AUX_ENT(AT_EGID, tsk->cred->egid); + NEW_AUX_ENT(AT_UID, cred->uid); + NEW_AUX_ENT(AT_EUID, cred->euid); + NEW_AUX_ENT(AT_GID, cred->gid); + NEW_AUX_ENT(AT_EGID, cred->egid); NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); NEW_AUX_ENT(AT_EXECFN, bprm->exec); if (k_platform) { diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 9f67054c2c4e..1f6e8c023b4c 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -475,6 +475,7 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, struct elf_fdpic_params *exec_params, struct elf_fdpic_params *interp_params) { + const struct cred *cred = current_cred(); unsigned long sp, csp, nitems; elf_caddr_t __user *argv, *envp; size_t platform_len = 0, len; @@ -623,10 
+624,10 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, NEW_AUX_ENT(AT_BASE, interp_params->elfhdr_addr); NEW_AUX_ENT(AT_FLAGS, 0); NEW_AUX_ENT(AT_ENTRY, exec_params->entry_addr); - NEW_AUX_ENT(AT_UID, (elf_addr_t) current->cred->uid); - NEW_AUX_ENT(AT_EUID, (elf_addr_t) current->cred->euid); - NEW_AUX_ENT(AT_GID, (elf_addr_t) current->cred->gid); - NEW_AUX_ENT(AT_EGID, (elf_addr_t) current->cred->egid); + NEW_AUX_ENT(AT_UID, (elf_addr_t) cred->uid); + NEW_AUX_ENT(AT_EUID, (elf_addr_t) cred->euid); + NEW_AUX_ENT(AT_GID, (elf_addr_t) cred->gid); + NEW_AUX_ENT(AT_EGID, (elf_addr_t) cred->egid); NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); NEW_AUX_ENT(AT_EXECFN, bprm->exec); diff --git a/fs/exec.c b/fs/exec.c index 31149e430a89..a5330e1a2216 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1388,6 +1388,7 @@ EXPORT_SYMBOL(set_binfmt); */ static int format_corename(char *corename, long signr) { + const struct cred *cred = current_cred(); const char *pat_ptr = core_pattern; int ispipe = (*pat_ptr == '|'); char *out_ptr = corename; @@ -1424,7 +1425,7 @@ static int format_corename(char *corename, long signr) /* uid */ case 'u': rc = snprintf(out_ptr, out_end - out_ptr, - "%d", current_uid()); + "%d", cred->uid); if (rc > out_end - out_ptr) goto out; out_ptr += rc; @@ -1432,7 +1433,7 @@ static int format_corename(char *corename, long signr) /* gid */ case 'g': rc = snprintf(out_ptr, out_end - out_ptr, - "%d", current_gid()); + "%d", cred->gid); if (rc > out_end - out_ptr) goto out; out_ptr += rc; diff --git a/fs/fcntl.c b/fs/fcntl.c index 63964d863ad6..c594cc0e40fb 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -205,13 +205,14 @@ static void f_modown(struct file *filp, struct pid *pid, enum pid_type type, int __f_setown(struct file *filp, struct pid *pid, enum pid_type type, int force) { + const struct cred *cred = current_cred(); int err; err = security_file_set_fowner(filp); if (err) return err; - f_modown(filp, pid, type, current_uid(), current_euid(), force); + f_modown(filp, pid, type, cred->uid, cred->euid, force); return 0; } EXPORT_SYMBOL(__f_setown); diff --git a/fs/file_table.c b/fs/file_table.c index 3152b53cfab0..bc4563fe791d 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -94,7 +94,7 @@ int proc_nr_files(ctl_table *table, int write, struct file *filp, */ struct file *get_empty_filp(void) { - struct task_struct *tsk; + const struct cred *cred = current_cred(); static int old_max; struct file * f; @@ -118,12 +118,11 @@ struct file *get_empty_filp(void) if (security_file_alloc(f)) goto fail_sec; - tsk = current; INIT_LIST_HEAD(&f->f_u.fu_list); atomic_long_set(&f->f_count, 1); rwlock_init(&f->f_owner.lock); - f->f_uid = tsk->cred->fsuid; - f->f_gid = tsk->cred->fsgid; + f->f_uid = cred->fsuid; + f->f_gid = cred->fsgid; eventpoll_init_file(f); /* f->f_version: 0 */ return f; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 870a721b8bd2..7d479ce3aceb 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -951,6 +951,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size) struct inode *inode; struct dentry *dentry, *root; struct qstr quick_string; + struct user_struct *user = current_user(); if (!hugetlbfs_vfsmount) return ERR_PTR(-ENOENT); @@ -958,7 +959,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size) if (!can_do_hugetlb_shm()) return ERR_PTR(-EPERM); - if (!user_shm_lock(size, current->cred->user)) + if (!user_shm_lock(size, user)) return ERR_PTR(-ENOMEM); root = hugetlbfs_vfsmount->mnt_root; @@ -998,7 +999,7 
@@ out_inode: out_dentry: dput(dentry); out_shm_unlock: - user_shm_unlock(size, current->cred->user); + user_shm_unlock(size, user); return ERR_PTR(error); } diff --git a/fs/ioprio.c b/fs/ioprio.c index bb5210af77c2..5112554fd210 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -123,7 +123,7 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio) break; case IOPRIO_WHO_USER: if (!who) - user = current->cred->user; + user = current_user(); else user = find_user(who); @@ -216,7 +216,7 @@ asmlinkage long sys_ioprio_get(int which, int who) break; case IOPRIO_WHO_USER: if (!who) - user = current->cred->user; + user = current_user(); else user = find_user(who); diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c index 9e9bb0db4f6d..e7ddd0328ddc 100644 --- a/fs/smbfs/dir.c +++ b/fs/smbfs/dir.c @@ -667,8 +667,7 @@ smb_make_node(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) attr.ia_valid = ATTR_MODE | ATTR_UID | ATTR_GID; attr.ia_mode = mode; - attr.ia_uid = current_euid(); - attr.ia_gid = current_egid(); + current_euid_egid(&attr.ia_uid, &attr.ia_gid); if (!new_valid_dev(dev)) return -EINVAL; diff --git a/include/linux/cred.h b/include/linux/cred.h index a7a686074cb0..4221ec6000c1 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -37,15 +37,16 @@ struct group_info { * get_group_info - Get a reference to a group info structure * @group_info: The group info to reference * - * This must be called with the owning task locked (via task_lock()) when task - * != current. The reason being that the vast majority of callers are looking - * at current->group_info, which can not be changed except by the current task. - * Changing current->group_info requires the task lock, too. + * This gets a reference to a set of supplementary groups. + * + * If the caller is accessing a task's credentials, they must hold the RCU read + * lock when reading. 
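+ *
+ * A sketch of that pattern (illustrative; assumes the task cannot go
+ * away underneath us):
+ *
+ *	rcu_read_lock();
+ *	gi = get_group_info(__task_cred(task)->group_info);
+ *	rcu_read_unlock();
+ *	... use gi ...
+ *	put_group_info(gi);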
*/ -#define get_group_info(group_info) \ -do { \ - atomic_inc(&(group_info)->usage); \ -} while (0) +static inline struct group_info *get_group_info(struct group_info *gi) +{ + atomic_inc(&gi->usage); + return gi; +} /** * put_group_info - Release a reference to a group info structure @@ -61,7 +62,7 @@ extern struct group_info *groups_alloc(int); extern void groups_free(struct group_info *); extern int set_current_groups(struct group_info *); extern int set_groups(struct cred *, struct group_info *); -extern int groups_search(struct group_info *, gid_t); +extern int groups_search(const struct group_info *, gid_t); /* access the groups "array" with this macro */ #define GROUP_AT(gi, i) \ @@ -123,41 +124,6 @@ struct cred { spinlock_t lock; /* lock for pointer changes */ }; -#define get_current_user() (get_uid(current->cred->user)) - -#define task_uid(task) ((task)->cred->uid) -#define task_gid(task) ((task)->cred->gid) -#define task_euid(task) ((task)->cred->euid) -#define task_egid(task) ((task)->cred->egid) - -#define current_uid() (current->cred->uid) -#define current_gid() (current->cred->gid) -#define current_euid() (current->cred->euid) -#define current_egid() (current->cred->egid) -#define current_suid() (current->cred->suid) -#define current_sgid() (current->cred->sgid) -#define current_fsuid() (current->cred->fsuid) -#define current_fsgid() (current->cred->fsgid) -#define current_cap() (current->cred->cap_effective) - -#define current_uid_gid(_uid, _gid) \ -do { \ - *(_uid) = current->cred->uid; \ - *(_gid) = current->cred->gid; \ -} while(0) - -#define current_euid_egid(_uid, _gid) \ -do { \ - *(_uid) = current->cred->euid; \ - *(_gid) = current->cred->egid; \ -} while(0) - -#define current_fsuid_fsgid(_uid, _gid) \ -do { \ - *(_uid) = current->cred->fsuid; \ - *(_gid) = current->cred->fsgid; \ -} while(0) - extern void __put_cred(struct cred *); extern int copy_creds(struct task_struct *, unsigned long); @@ -187,4 +153,137 @@ static inline void put_cred(struct cred *cred) __put_cred(cred); } +/** + * current_cred - Access the current task's credentials + * + * Access the credentials of the current task. + */ +#define current_cred() \ + (current->cred) + +/** + * __task_cred - Access another task's credentials + * @task: The task to query + * + * Access the credentials of another task. The caller must hold the + * RCU readlock. + * + * The caller must make sure task doesn't go away, either by holding a ref on + * task or by holding tasklist_lock to prevent it from being unlinked. + */ +#define __task_cred(task) \ + ((const struct cred *)(rcu_dereference((task)->cred))) + +/** + * get_task_cred - Get another task's credentials + * @task: The task to query + * + * Get the credentials of a task, pinning them so that they can't go away. + * Accessing a task's credentials directly is not permitted. + * + * The caller must make sure task doesn't go away, either by holding a ref on + * task or by holding tasklist_lock to prevent it from being unlinked. + */ +#define get_task_cred(task) \ +({ \ + struct cred *__cred; \ + rcu_read_lock(); \ + __cred = (struct cred *) __task_cred((task)); \ + get_cred(__cred); \ + rcu_read_unlock(); \ + __cred; \ +}) + +/** + * get_current_cred - Get the current task's credentials + * + * Get the credentials of the current task, pinning them so that they can't go + * away. Accessing the current task's credentials directly is not permitted. 
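+ *
+ * For example (illustrative):
+ *
+ *	struct cred *cred = get_current_cred();
+ *	... cred->fsuid, cred->group_info, etc. remain stable ...
+ *	put_cred(cred);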
+ */ +#define get_current_cred() \ + (get_cred(current_cred())) + +/** + * get_current_user - Get the current task's user_struct + * + * Get the user record of the current task, pinning it so that it can't go + * away. + */ +#define get_current_user() \ +({ \ + struct user_struct *__u; \ + struct cred *__cred; \ + __cred = (struct cred *) current_cred(); \ + __u = get_uid(__cred->user); \ + __u; \ +}) + +/** + * get_current_groups - Get the current task's supplementary group list + * + * Get the supplementary group list of the current task, pinning it so that it + * can't go away. + */ +#define get_current_groups() \ +({ \ + struct group_info *__groups; \ + struct cred *__cred; \ + __cred = (struct cred *) current_cred(); \ + __groups = get_group_info(__cred->group_info); \ + __groups; \ +}) + +#define task_cred_xxx(task, xxx) \ +({ \ + __typeof__(task->cred->xxx) ___val; \ + rcu_read_lock(); \ + ___val = __task_cred((task))->xxx; \ + rcu_read_unlock(); \ + ___val; \ +}) + +#define task_uid(task) (task_cred_xxx((task), uid)) +#define task_euid(task) (task_cred_xxx((task), euid)) + +#define current_cred_xxx(xxx) \ +({ \ + current->cred->xxx; \ +}) + +#define current_uid() (current_cred_xxx(uid)) +#define current_gid() (current_cred_xxx(gid)) +#define current_euid() (current_cred_xxx(euid)) +#define current_egid() (current_cred_xxx(egid)) +#define current_suid() (current_cred_xxx(suid)) +#define current_sgid() (current_cred_xxx(sgid)) +#define current_fsuid() (current_cred_xxx(fsuid)) +#define current_fsgid() (current_cred_xxx(fsgid)) +#define current_cap() (current_cred_xxx(cap_effective)) +#define current_user() (current_cred_xxx(user)) +#define current_security() (current_cred_xxx(security)) + +#define current_uid_gid(_uid, _gid) \ +do { \ + const struct cred *__cred; \ + __cred = current_cred(); \ + *(_uid) = __cred->uid; \ + *(_gid) = __cred->gid; \ +} while(0) + +#define current_euid_egid(_euid, _egid) \ +do { \ + const struct cred *__cred; \ + __cred = current_cred(); \ + *(_euid) = __cred->euid; \ + *(_egid) = __cred->egid; \ +} while(0) + +#define current_fsuid_fsgid(_fsuid, _fsgid) \ +do { \ + const struct cred *__cred; \ + __cred = current_cred(); \ + *(_fsuid) = __cred->fsuid; \ + *(_fsgid) = __cred->fsgid; \ +} while(0) + #endif /* _LINUX_CRED_H */ diff --git a/include/linux/securebits.h b/include/linux/securebits.h index 6d389491bfa2..d2c5ed845bcc 100644 --- a/include/linux/securebits.h +++ b/include/linux/securebits.h @@ -32,7 +32,7 @@ setting is locked or not. A setting which is locked cannot be changed from user-level. 
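   (For example, issecure(SECURE_NOROOT) now expands to a read of
   current_cred_xxx(securebits), i.e. current->cred->securebits, rather
   than touching task_struct directly.)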
*/ #define issecure_mask(X) (1 << (X)) -#define issecure(X) (issecure_mask(X) & current->cred->securebits) +#define issecure(X) (issecure_mask(X) & current_cred_xxx(securebits)) #define SECURE_ALL_BITS (issecure_mask(SECURE_NOROOT) | \ issecure_mask(SECURE_NO_SETUID_FIXUP) | \ diff --git a/ipc/mqueue.c b/ipc/mqueue.c index e1885b494bac..1151881ccb9a 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -112,6 +112,7 @@ static inline struct mqueue_inode_info *MQUEUE_I(struct inode *inode) static struct inode *mqueue_get_inode(struct super_block *sb, int mode, struct mq_attr *attr) { + struct user_struct *u = current_user(); struct inode *inode; inode = new_inode(sb); @@ -126,7 +127,6 @@ static struct inode *mqueue_get_inode(struct super_block *sb, int mode, if (S_ISREG(mode)) { struct mqueue_inode_info *info; struct task_struct *p = current; - struct user_struct *u = p->cred->user; unsigned long mq_bytes, mq_msg_tblsz; inode->i_fop = &mqueue_file_operations; diff --git a/ipc/shm.c b/ipc/shm.c index 264a9d33c5dd..38a055758a9b 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -366,7 +366,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) if (shmflg & SHM_HUGETLB) { /* hugetlb_file_setup takes care of mlock user accounting */ file = hugetlb_file_setup(name, size); - shp->mlock_user = current->cred->user; + shp->mlock_user = current_user(); } else { int acctflag = VM_ACCOUNT; /* @@ -767,7 +767,7 @@ asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf) goto out_unlock; if(cmd==SHM_LOCK) { - struct user_struct *user = current->cred->user; + struct user_struct *user = current_user(); if (!is_file_hugepages(shp->shm_file)) { err = shmem_lock(shp->shm_file, 1, user); if (!err && !(shp->shm_perm.mode & SHM_LOCKED)){ diff --git a/kernel/sys.c b/kernel/sys.c index 5d81f07c0150..c4d6b59553e9 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -143,6 +143,7 @@ asmlinkage long sys_setpriority(int which, int who, int niceval) { struct task_struct *g, *p; struct user_struct *user; + const struct cred *cred = current_cred(); int error = -EINVAL; struct pid *pgrp; @@ -176,18 +177,18 @@ asmlinkage long sys_setpriority(int which, int who, int niceval) } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case PRIO_USER: - user = current->cred->user; + user = cred->user; if (!who) - who = current_uid(); - else - if (who != current_uid() && !(user = find_user(who))) - goto out_unlock; /* No processes for this user */ + who = cred->uid; + else if ((who != cred->uid) && + !(user = find_user(who))) + goto out_unlock; /* No processes for this user */ do_each_thread(g, p) - if (p->cred->uid == who) + if (__task_cred(p)->uid == who) error = set_one_prio(p, niceval, error); while_each_thread(g, p); - if (who != current_uid()) + if (who != cred->uid) free_uid(user); /* For find_user() */ break; } @@ -207,6 +208,7 @@ asmlinkage long sys_getpriority(int which, int who) { struct task_struct *g, *p; struct user_struct *user; + const struct cred *cred = current_cred(); long niceval, retval = -ESRCH; struct pid *pgrp; @@ -238,21 +240,21 @@ asmlinkage long sys_getpriority(int which, int who) } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case PRIO_USER: - user = current->cred->user; + user = (struct user_struct *) cred->user; if (!who) - who = current_uid(); - else - if (who != current_uid() && !(user = find_user(who))) - goto out_unlock; /* No processes for this user */ + who = cred->uid; + else if ((who != cred->uid) && + !(user = find_user(who))) + goto out_unlock; /* No processes for 
this user */ do_each_thread(g, p) - if (p->cred->uid == who) { + if (__task_cred(p)->uid == who) { niceval = 20 - task_nice(p); if (niceval > retval) retval = niceval; } while_each_thread(g, p); - if (who != current_uid()) + if (who != cred->uid) free_uid(user); /* for find_user() */ break; } @@ -743,11 +745,11 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid) { - struct cred *cred = current->cred; + const struct cred *cred = current_cred(); int retval; - if (!(retval = put_user(cred->uid, ruid)) && - !(retval = put_user(cred->euid, euid))) + if (!(retval = put_user(cred->uid, ruid)) && + !(retval = put_user(cred->euid, euid))) retval = put_user(cred->suid, suid); return retval; @@ -796,11 +798,11 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid) { - struct cred *cred = current->cred; + const struct cred *cred = current_cred(); int retval; - if (!(retval = put_user(cred->gid, rgid)) && - !(retval = put_user(cred->egid, egid))) + if (!(retval = put_user(cred->gid, rgid)) && + !(retval = put_user(cred->egid, egid))) retval = put_user(cred->sgid, sgid); return retval; @@ -1199,7 +1201,7 @@ static void groups_sort(struct group_info *group_info) } /* a simple bsearch */ -int groups_search(struct group_info *group_info, gid_t grp) +int groups_search(const struct group_info *group_info, gid_t grp) { unsigned int left, right; @@ -1268,13 +1270,8 @@ EXPORT_SYMBOL(set_current_groups); asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist) { - struct cred *cred = current->cred; - int i = 0; - - /* - * SMP: Nobody else can change our grouplist. Thus we are - * safe. 
- */ + const struct cred *cred = current_cred(); + int i; if (gidsetsize < 0) return -EINVAL; @@ -1330,8 +1327,9 @@ asmlinkage long sys_setgroups(int gidsetsize, gid_t __user *grouplist) */ int in_group_p(gid_t grp) { - struct cred *cred = current->cred; + const struct cred *cred = current_cred(); int retval = 1; + if (grp != cred->fsgid) retval = groups_search(cred->group_info, grp); return retval; @@ -1341,8 +1339,9 @@ EXPORT_SYMBOL(in_group_p); int in_egroup_p(gid_t grp) { - struct cred *cred = current->cred; + const struct cred *cred = current_cred(); int retval = 1; + if (grp != cred->egid) retval = groups_search(cred->group_info, grp); return retval; diff --git a/kernel/uid16.c b/kernel/uid16.c index 71f07fc39fea..2460c3199b5a 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -84,11 +84,12 @@ asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid) asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid, old_uid_t __user *suid) { + const struct cred *cred = current_cred(); int retval; - if (!(retval = put_user(high2lowuid(current->cred->uid), ruid)) && - !(retval = put_user(high2lowuid(current->cred->euid), euid))) - retval = put_user(high2lowuid(current->cred->suid), suid); + if (!(retval = put_user(high2lowuid(cred->uid), ruid)) && + !(retval = put_user(high2lowuid(cred->euid), euid))) + retval = put_user(high2lowuid(cred->suid), suid); return retval; } @@ -104,11 +105,12 @@ asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid) asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid, old_gid_t __user *sgid) { + const struct cred *cred = current_cred(); int retval; - if (!(retval = put_user(high2lowgid(current->cred->gid), rgid)) && - !(retval = put_user(high2lowgid(current->cred->egid), egid))) - retval = put_user(high2lowgid(current->cred->sgid), sgid); + if (!(retval = put_user(high2lowgid(cred->gid), rgid)) && + !(retval = put_user(high2lowgid(cred->egid), egid))) + retval = put_user(high2lowgid(cred->sgid), sgid); return retval; } @@ -161,25 +163,24 @@ static int groups16_from_user(struct group_info *group_info, asmlinkage long sys_getgroups16(int gidsetsize, old_gid_t __user *grouplist) { - int i = 0; + const struct cred *cred = current_cred(); + int i; if (gidsetsize < 0) return -EINVAL; - get_group_info(current->cred->group_info); - i = current->cred->group_info->ngroups; + i = cred->group_info->ngroups; if (gidsetsize) { if (i > gidsetsize) { i = -EINVAL; goto out; } - if (groups16_to_user(grouplist, current->cred->group_info)) { + if (groups16_to_user(grouplist, cred->group_info)) { i = -EFAULT; goto out; } } out: - put_group_info(current->cred->group_info); return i; } @@ -210,20 +211,20 @@ asmlinkage long sys_setgroups16(int gidsetsize, old_gid_t __user *grouplist) asmlinkage long sys_getuid16(void) { - return high2lowuid(current->cred->uid); + return high2lowuid(current_uid()); } asmlinkage long sys_geteuid16(void) { - return high2lowuid(current->cred->euid); + return high2lowuid(current_euid()); } asmlinkage long sys_getgid16(void) { - return high2lowgid(current->cred->gid); + return high2lowgid(current_gid()); } asmlinkage long sys_getegid16(void) { - return high2lowgid(current->cred->egid); + return high2lowgid(current_egid()); } diff --git a/net/core/scm.c b/net/core/scm.c index c28ca32a7d93..f73c44b17dda 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -44,7 +44,7 @@ static __inline__ int scm_check_creds(struct ucred *creds) { - struct cred *cred = current->cred; + 
const struct cred *cred = current_cred(); if ((creds->pid == task_tgid_vnr(current) || capable(CAP_SYS_ADMIN)) && ((creds->uid == cred->uid || creds->uid == cred->euid || diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index c79543212602..0443f8349458 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -350,16 +350,18 @@ EXPORT_SYMBOL_GPL(rpcauth_lookup_credcache); struct rpc_cred * rpcauth_lookupcred(struct rpc_auth *auth, int flags) { - struct auth_cred acred = { - .uid = current_fsuid(), - .gid = current_fsgid(), - .group_info = current->cred->group_info, - }; + struct auth_cred acred; struct rpc_cred *ret; + const struct cred *cred = current_cred(); dprintk("RPC: looking up %s cred\n", auth->au_ops->au_name); - get_group_info(acred.group_info); + + memset(&acred, 0, sizeof(acred)); + acred.uid = cred->fsuid; + acred.gid = cred->fsgid; + acred.group_info = get_group_info(((struct cred *)cred)->group_info); + ret = auth->au_ops->lookup_cred(auth, &acred, flags); put_group_info(acred.group_info); return ret; diff --git a/security/commoncap.c b/security/commoncap.c index fa61679f8c73..61307f590003 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -641,7 +641,7 @@ int cap_task_setnice (struct task_struct *p, int nice) int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5, long *rc_p) { - struct cred *cred = current->cred; + struct cred *cred = current_cred(); long error = 0; switch (option) { diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index b0904cdda2e7..ce8ac6073d57 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -582,7 +582,7 @@ key_ref_t lookup_user_key(key_serial_t id, int create, int partial, { struct request_key_auth *rka; struct task_struct *t = current; - struct cred *cred = t->cred; + struct cred *cred = current_cred(); struct key *key; key_ref_t key_ref, skey_ref; int ret; diff --git a/security/keys/request_key.c b/security/keys/request_key.c index 3e9b9eb1dd28..0488b0af5bd6 100644 --- a/security/keys/request_key.c +++ b/security/keys/request_key.c @@ -67,6 +67,7 @@ static int call_sbin_request_key(struct key_construction *cons, void *aux) { struct task_struct *tsk = current; + const struct cred *cred = current_cred(); key_serial_t prkey, sskey; struct key *key = cons->key, *authkey = cons->authkey, *keyring; char *argv[9], *envp[3], uid_str[12], gid_str[12]; @@ -96,16 +97,16 @@ static int call_sbin_request_key(struct key_construction *cons, goto error_link; /* record the UID and GID */ - sprintf(uid_str, "%d", current_fsuid()); - sprintf(gid_str, "%d", current_fsgid()); + sprintf(uid_str, "%d", cred->fsuid); + sprintf(gid_str, "%d", cred->fsgid); /* we say which key is under construction */ sprintf(key_str, "%d", key->serial); /* we specify the process's default keyrings */ sprintf(keyring_str[0], "%d", - tsk->cred->thread_keyring ? - tsk->cred->thread_keyring->serial : 0); + cred->thread_keyring ? 
+ cred->thread_keyring->serial : 0); prkey = 0; if (tsk->signal->process_keyring) @@ -118,7 +119,7 @@ static int call_sbin_request_key(struct key_construction *cons, sskey = rcu_dereference(tsk->signal->session_keyring)->serial; rcu_read_unlock(); } else { - sskey = tsk->cred->user->session_keyring->serial; + sskey = cred->user->session_keyring->serial; } sprintf(keyring_str[2], "%d", sskey); diff --git a/security/selinux/exports.c b/security/selinux/exports.c index cf02490cd1eb..c73aeaa008e8 100644 --- a/security/selinux/exports.c +++ b/security/selinux/exports.c @@ -39,9 +39,13 @@ EXPORT_SYMBOL_GPL(selinux_string_to_sid); int selinux_secmark_relabel_packet_permission(u32 sid) { if (selinux_enabled) { - struct task_security_struct *tsec = current->cred->security; + const struct task_security_struct *__tsec; + u32 tsid; - return avc_has_perm(tsec->sid, sid, SECCLASS_PACKET, + __tsec = current_security(); + tsid = __tsec->sid; + + return avc_has_perm(tsid, sid, SECCLASS_PACKET, PACKET__RELABELTO, NULL); } return 0; diff --git a/security/selinux/xfrm.c b/security/selinux/xfrm.c index d7db76617b0e..c0eb72013d67 100644 --- a/security/selinux/xfrm.c +++ b/security/selinux/xfrm.c @@ -197,7 +197,7 @@ static int selinux_xfrm_sec_ctx_alloc(struct xfrm_sec_ctx **ctxp, struct xfrm_user_sec_ctx *uctx, u32 sid) { int rc = 0; - struct task_security_struct *tsec = current->cred->security; + const struct task_security_struct *tsec = current_security(); struct xfrm_sec_ctx *ctx = NULL; char *ctx_str = NULL; u32 str_len; @@ -333,7 +333,7 @@ void selinux_xfrm_policy_free(struct xfrm_sec_ctx *ctx) */ int selinux_xfrm_policy_delete(struct xfrm_sec_ctx *ctx) { - struct task_security_struct *tsec = current->cred->security; + const struct task_security_struct *tsec = current_security(); int rc = 0; if (ctx) { @@ -378,7 +378,7 @@ void selinux_xfrm_state_free(struct xfrm_state *x) */ int selinux_xfrm_state_delete(struct xfrm_state *x) { - struct task_security_struct *tsec = current->cred->security; + const struct task_security_struct *tsec = current_security(); struct xfrm_sec_ctx *ctx = x->security; int rc = 0; diff --git a/security/smack/smack_access.c b/security/smack/smack_access.c index b6dd4fc0fb0b..247cec3b5a43 100644 --- a/security/smack/smack_access.c +++ b/security/smack/smack_access.c @@ -164,7 +164,7 @@ int smk_curacc(char *obj_label, u32 mode) { int rc; - rc = smk_access(current->cred->security, obj_label, mode); + rc = smk_access(current_security(), obj_label, mode); if (rc == 0) return 0; diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index cc837314fb0e..e8a4fcb1ad04 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -143,7 +143,7 @@ static int smack_ptrace_traceme(struct task_struct *ptp) static int smack_syslog(int type) { int rc; - char *sp = current->cred->security; + char *sp = current_security(); rc = cap_syslog(type); if (rc != 0) @@ -375,7 +375,7 @@ static int smack_sb_umount(struct vfsmount *mnt, int flags) */ static int smack_inode_alloc_security(struct inode *inode) { - inode->i_security = new_inode_smack(current->cred->security); + inode->i_security = new_inode_smack(current_security()); if (inode->i_security == NULL) return -ENOMEM; return 0; @@ -820,7 +820,7 @@ static int smack_file_permission(struct file *file, int mask) */ static int smack_file_alloc_security(struct file *file) { - file->f_security = current->cred->security; + file->f_security = current_security(); return 0; } @@ -918,7 +918,7 @@ static int smack_file_fcntl(struct file 
*file, unsigned int cmd, */ static int smack_file_set_fowner(struct file *file) { - file->f_security = current->cred->security; + file->f_security = current_security(); return 0; } @@ -986,8 +986,7 @@ static int smack_file_receive(struct file *file) */ static int smack_cred_alloc_security(struct cred *cred) { - cred->security = current->cred->security; - + cred->security = current_security(); return 0; } @@ -1225,7 +1224,7 @@ static void smack_task_to_inode(struct task_struct *p, struct inode *inode) */ static int smack_sk_alloc_security(struct sock *sk, int family, gfp_t gfp_flags) { - char *csp = current->cred->security; + char *csp = current_security(); struct socket_smack *ssp; ssp = kzalloc(sizeof(struct socket_smack), gfp_flags); @@ -1450,7 +1449,7 @@ static int smack_flags_to_may(int flags) */ static int smack_msg_msg_alloc_security(struct msg_msg *msg) { - msg->security = current->cred->security; + msg->security = current_security(); return 0; } @@ -1486,7 +1485,7 @@ static int smack_shm_alloc_security(struct shmid_kernel *shp) { struct kern_ipc_perm *isp = &shp->shm_perm; - isp->security = current->cred->security; + isp->security = current_security(); return 0; } @@ -1595,7 +1594,7 @@ static int smack_sem_alloc_security(struct sem_array *sma) { struct kern_ipc_perm *isp = &sma->sem_perm; - isp->security = current->cred->security; + isp->security = current_security(); return 0; } @@ -1699,7 +1698,7 @@ static int smack_msg_queue_alloc_security(struct msg_queue *msq) { struct kern_ipc_perm *kisp = &msq->q_perm; - kisp->security = current->cred->security; + kisp->security = current_security(); return 0; } @@ -1854,7 +1853,7 @@ static void smack_d_instantiate(struct dentry *opt_dentry, struct inode *inode) struct super_block *sbp; struct superblock_smack *sbsp; struct inode_smack *isp; - char *csp = current->cred->security; + char *csp = current_security(); char *fetched; char *final; struct dentry *dp; @@ -2290,8 +2289,7 @@ static void smack_sock_graft(struct sock *sk, struct socket *parent) return; ssp = sk->sk_security; - ssp->smk_in = current->cred->security; - ssp->smk_out = current->cred->security; + ssp->smk_in = ssp->smk_out = current_security(); ssp->smk_packet[0] = '\0'; rc = smack_netlabel(sk); diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c index c5ca279e0506..ca257dfdc75d 100644 --- a/security/smack/smackfs.c +++ b/security/smack/smackfs.c @@ -336,7 +336,7 @@ static void smk_cipso_doi(void) audit_info.loginuid = audit_get_loginuid(current); audit_info.sessionid = audit_get_sessionid(current); - audit_info.secid = smack_to_secid(current->cred->security); + audit_info.secid = smack_to_secid(current_security()); rc = netlbl_cfg_map_del(NULL, &audit_info); if (rc != 0) @@ -371,7 +371,7 @@ static void smk_unlbl_ambient(char *oldambient) audit_info.loginuid = audit_get_loginuid(current); audit_info.sessionid = audit_get_sessionid(current); - audit_info.secid = smack_to_secid(current->cred->security); + audit_info.secid = smack_to_secid(current_security()); if (oldambient != NULL) { rc = netlbl_cfg_map_del(oldambient, &audit_info); -- cgit v1.2.3 From c69e8d9c01db2adc503464993c358901c9af9de4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:19 +1100 Subject: CRED: Use RCU to access another task's creds and to release a task's own creds Use RCU to access another task's creds and to release a task's own creds. 
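Concretely, a reader of another task's credentials now does something like the following (a sketch of the calling convention only, not taken verbatim from any one hunk below; "task" stands for whichever struct task_struct pointer the call site holds):

	const struct cred *tcred;
	uid_t uid;

	rcu_read_lock();
	tcred = __task_cred(task);	/* RCU-safe dereference of task->cred */
	uid = tcred->uid;		/* copy out only the fields needed */
	rcu_read_unlock();		/* tcred must not be dereferenced after this */

A call site that needs the snapshot to outlive the read-side critical section instead pins it with get_task_cred(), which takes a reference that is dropped later with put_cred().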
This means that it will be possible for the credentials of a task to be replaced without another task (a) requiring a full lock to read them, and (b) seeing deallocated memory. Signed-off-by: David Howells Acked-by: James Morris Acked-by: Serge Hallyn Signed-off-by: James Morris --- arch/ia64/kernel/perfmon.c | 32 +++++++++++++--------- drivers/connector/cn_proc.c | 16 +++++++---- fs/binfmt_elf.c | 8 ++++-- fs/binfmt_elf_fdpic.c | 8 ++++-- fs/fcntl.c | 15 ++++++++--- fs/fuse/dir.c | 23 ++++++++++------ fs/ioprio.c | 14 +++++++--- fs/proc/array.c | 32 ++++++++++++++-------- fs/proc/base.c | 32 ++++++++++++++++------ include/linux/cred.h | 3 ++- kernel/auditsc.c | 33 +++++++++++++---------- kernel/cgroup.c | 16 +++++------ kernel/exit.c | 14 ++++++---- kernel/futex.c | 22 +++++++++------ kernel/futex_compat.c | 7 ++--- kernel/ptrace.c | 22 ++++++++------- kernel/sched.c | 31 ++++++++++++++------- kernel/signal.c | 49 ++++++++++++++++++++------------- kernel/sys.c | 11 +++++--- kernel/tsacct.c | 6 +++-- mm/mempolicy.c | 8 +++--- mm/migrate.c | 8 +++--- mm/oom_kill.c | 6 ++--- security/commoncap.c | 64 +++++++++++++++++++++++++++----------------- security/keys/permission.c | 10 ++++--- security/keys/process_keys.c | 24 ++++++++++------- security/selinux/selinuxfs.c | 13 ++++++--- security/smack/smack_lsm.c | 32 +++++++++++----------- 28 files changed, 355 insertions(+), 204 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index dd38db46a77a..0e499757309b 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -2399,25 +2399,33 @@ error_kmem: static int pfm_bad_permissions(struct task_struct *task) { + const struct cred *tcred; uid_t uid = current_uid(); gid_t gid = current_gid(); + int ret; + + rcu_read_lock(); + tcred = __task_cred(task); /* inspired by ptrace_attach() */ DPRINT(("cur: uid=%d gid=%d task: euid=%d suid=%d uid=%d egid=%d sgid=%d\n", uid, gid, - task->euid, - task->suid, - task->uid, - task->egid, - task->sgid)); - - return ((uid != task->euid) - || (uid != task->suid) - || (uid != task->uid) - || (gid != task->egid) - || (gid != task->sgid) - || (gid != task->gid)) && !capable(CAP_SYS_PTRACE); + tcred->euid, + tcred->suid, + tcred->uid, + tcred->egid, + tcred->sgid)); + + ret = ((uid != tcred->euid) + || (uid != tcred->suid) + || (uid != tcred->uid) + || (gid != tcred->egid) + || (gid != tcred->sgid) + || (gid != tcred->gid)) && !capable(CAP_SYS_PTRACE); + + rcu_read_unlock(); + return ret; } static int diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c index 354c1ff17159..c5afc98e2675 100644 --- a/drivers/connector/cn_proc.c +++ b/drivers/connector/cn_proc.c @@ -106,6 +106,7 @@ void proc_id_connector(struct task_struct *task, int which_id) struct proc_event *ev; __u8 buffer[CN_PROC_MSG_SIZE]; struct timespec ts; + const struct cred *cred; if (atomic_read(&proc_event_num_listeners) < 1) return; @@ -115,14 +116,19 @@ void proc_id_connector(struct task_struct *task, int which_id) ev->what = which_id; ev->event_data.id.process_pid = task->pid; ev->event_data.id.process_tgid = task->tgid; + rcu_read_lock(); + cred = __task_cred(task); if (which_id == PROC_EVENT_UID) { - ev->event_data.id.r.ruid = task->cred->uid; - ev->event_data.id.e.euid = task->cred->euid; + ev->event_data.id.r.ruid = cred->uid; + ev->event_data.id.e.euid = cred->euid; } else if (which_id == PROC_EVENT_GID) { - ev->event_data.id.r.rgid = task->cred->gid; - ev->event_data.id.e.egid = task->cred->egid; - } else + 
ev->event_data.id.r.rgid = cred->gid; + ev->event_data.id.e.egid = cred->egid; + } else { + rcu_read_unlock(); return; + } + rcu_read_unlock(); get_seq(&msg->seq, &ev->cpu); ktime_get_ts(&ts); /* get high res monotonic timestamp */ put_unaligned(timespec_to_ns(&ts), (__u64 *)&ev->timestamp_ns); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 0e6655613169..9142ff5dc8e6 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1361,6 +1361,7 @@ static void fill_prstatus(struct elf_prstatus *prstatus, static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, struct mm_struct *mm) { + const struct cred *cred; unsigned int i, len; /* first copy the parameters from user space */ @@ -1388,8 +1389,11 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, psinfo->pr_zomb = psinfo->pr_sname == 'Z'; psinfo->pr_nice = task_nice(p); psinfo->pr_flag = p->flags; - SET_UID(psinfo->pr_uid, p->cred->uid); - SET_GID(psinfo->pr_gid, p->cred->gid); + rcu_read_lock(); + cred = __task_cred(p); + SET_UID(psinfo->pr_uid, cred->uid); + SET_GID(psinfo->pr_gid, cred->gid); + rcu_read_unlock(); strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname)); return 0; diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 1f6e8c023b4c..45dabd59936f 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1414,6 +1414,7 @@ static void fill_prstatus(struct elf_prstatus *prstatus, static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, struct mm_struct *mm) { + const struct cred *cred; unsigned int i, len; /* first copy the parameters from user space */ @@ -1441,8 +1442,11 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, psinfo->pr_zomb = psinfo->pr_sname == 'Z'; psinfo->pr_nice = task_nice(p); psinfo->pr_flag = p->flags; - SET_UID(psinfo->pr_uid, p->cred->uid); - SET_GID(psinfo->pr_gid, p->cred->gid); + rcu_read_lock(); + cred = __task_cred(p); + SET_UID(psinfo->pr_uid, cred->uid); + SET_GID(psinfo->pr_gid, cred->gid); + rcu_read_unlock(); strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname)); return 0; diff --git a/fs/fcntl.c b/fs/fcntl.c index c594cc0e40fb..87c39f1f0817 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -401,10 +401,17 @@ static const long band_table[NSIGPOLL] = { static inline int sigio_perm(struct task_struct *p, struct fown_struct *fown, int sig) { - return (((fown->euid == 0) || - (fown->euid == p->cred->suid) || (fown->euid == p->cred->uid) || - (fown->uid == p->cred->suid) || (fown->uid == p->cred->uid)) && - !security_file_send_sigiotask(p, fown, sig)); + const struct cred *cred; + int ret; + + rcu_read_lock(); + cred = __task_cred(p); + ret = ((fown->euid == 0 || + fown->euid == cred->suid || fown->euid == cred->uid || + fown->uid == cred->suid || fown->uid == cred->uid) && + !security_file_send_sigiotask(p, fown, sig)); + rcu_read_unlock(); + return ret; } static void send_sigio_to_task(struct task_struct *p, diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index e97a98981862..95bc22bdd060 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -869,18 +869,25 @@ int fuse_update_attributes(struct inode *inode, struct kstat *stat, */ int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task) { + const struct cred *cred; + int ret; + if (fc->flags & FUSE_ALLOW_OTHER) return 1; - if (task->cred->euid == fc->user_id && - task->cred->suid == fc->user_id && - task->cred->uid == fc->user_id && - task->cred->egid == fc->group_id && - task->cred->sgid == fc->group_id && - task->cred->gid == 
fc->group_id) - return 1; + rcu_read_lock(); + ret = 0; + cred = __task_cred(task); + if (cred->euid == fc->user_id && + cred->suid == fc->user_id && + cred->uid == fc->user_id && + cred->egid == fc->group_id && + cred->sgid == fc->group_id && + cred->gid == fc->group_id) + ret = 1; + rcu_read_unlock(); - return 0; + return ret; } static int fuse_access(struct inode *inode, int mask) diff --git a/fs/ioprio.c b/fs/ioprio.c index 5112554fd210..3569e0ad86a2 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -31,10 +31,16 @@ static int set_task_ioprio(struct task_struct *task, int ioprio) { int err; struct io_context *ioc; + const struct cred *cred = current_cred(), *tcred; - if (task->cred->uid != current_euid() && - task->cred->uid != current_uid() && !capable(CAP_SYS_NICE)) + rcu_read_lock(); + tcred = __task_cred(task); + if (tcred->uid != cred->euid && + tcred->uid != cred->uid && !capable(CAP_SYS_NICE)) { + rcu_read_unlock(); return -EPERM; + } + rcu_read_unlock(); err = security_task_setioprio(task, ioprio); if (err) @@ -131,7 +137,7 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio) break; do_each_thread(g, p) { - if (p->cred->uid != who) + if (__task_cred(p)->uid != who) continue; ret = set_task_ioprio(p, ioprio); if (ret) @@ -224,7 +230,7 @@ asmlinkage long sys_ioprio_get(int which, int who) break; do_each_thread(g, p) { - if (p->cred->uid != user->uid) + if (__task_cred(p)->uid != user->uid) continue; tmpio = get_task_ioprio(p); if (tmpio < 0) diff --git a/fs/proc/array.c b/fs/proc/array.c index 62fe9b2009b6..7e4877d9dcb5 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -159,6 +159,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, struct group_info *group_info; int g; struct fdtable *fdt = NULL; + const struct cred *cred; pid_t ppid, tpid; rcu_read_lock(); @@ -170,6 +171,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, if (tracer) tpid = task_pid_nr_ns(tracer, ns); } + cred = get_cred((struct cred *) __task_cred(p)); seq_printf(m, "State:\t%s\n" "Tgid:\t%d\n" @@ -182,8 +184,8 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, task_tgid_nr_ns(p, ns), pid_nr_ns(pid, ns), ppid, tpid, - p->cred->uid, p->cred->euid, p->cred->suid, p->cred->fsuid, - p->cred->gid, p->cred->egid, p->cred->sgid, p->cred->fsgid); + cred->uid, cred->euid, cred->suid, cred->fsuid, + cred->gid, cred->egid, cred->sgid, cred->fsgid); task_lock(p); if (p->files) @@ -194,13 +196,12 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, fdt ? 
fdt->max_fds : 0); rcu_read_unlock(); - group_info = p->cred->group_info; - get_group_info(group_info); + group_info = cred->group_info; task_unlock(p); for (g = 0; g < min(group_info->ngroups, NGROUPS_SMALL); g++) seq_printf(m, "%d ", GROUP_AT(group_info, g)); - put_group_info(group_info); + put_cred(cred); seq_printf(m, "\n"); } @@ -262,7 +263,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p) blocked = p->blocked; collect_sigign_sigcatch(p, &ignored, &caught); num_threads = atomic_read(&p->signal->count); - qsize = atomic_read(&p->cred->user->sigpending); + qsize = atomic_read(&__task_cred(p)->user->sigpending); qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur; unlock_task_sighand(p, &flags); } @@ -293,12 +294,21 @@ static void render_cap_t(struct seq_file *m, const char *header, static inline void task_cap(struct seq_file *m, struct task_struct *p) { - struct cred *cred = p->cred; + const struct cred *cred; + kernel_cap_t cap_inheritable, cap_permitted, cap_effective, cap_bset; - render_cap_t(m, "CapInh:\t", &cred->cap_inheritable); - render_cap_t(m, "CapPrm:\t", &cred->cap_permitted); - render_cap_t(m, "CapEff:\t", &cred->cap_effective); - render_cap_t(m, "CapBnd:\t", &cred->cap_bset); + rcu_read_lock(); + cred = __task_cred(p); + cap_inheritable = cred->cap_inheritable; + cap_permitted = cred->cap_permitted; + cap_effective = cred->cap_effective; + cap_bset = cred->cap_bset; + rcu_read_unlock(); + + render_cap_t(m, "CapInh:\t", &cap_inheritable); + render_cap_t(m, "CapPrm:\t", &cap_permitted); + render_cap_t(m, "CapEff:\t", &cap_effective); + render_cap_t(m, "CapBnd:\t", &cap_bset); } static inline void task_context_switch_counts(struct seq_file *m, diff --git a/fs/proc/base.c b/fs/proc/base.c index 6862b360c36c..cf42c42cbfbb 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1406,6 +1406,7 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st { struct inode * inode; struct proc_inode *ei; + const struct cred *cred; /* We need a new inode */ @@ -1428,8 +1429,11 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st inode->i_uid = 0; inode->i_gid = 0; if (task_dumpable(task)) { - inode->i_uid = task->cred->euid; - inode->i_gid = task->cred->egid; + rcu_read_lock(); + cred = __task_cred(task); + inode->i_uid = cred->euid; + inode->i_gid = cred->egid; + rcu_read_unlock(); } security_task_to_inode(task, inode); @@ -1445,6 +1449,8 @@ static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat { struct inode *inode = dentry->d_inode; struct task_struct *task; + const struct cred *cred; + generic_fillattr(inode, stat); rcu_read_lock(); @@ -1454,8 +1460,9 @@ static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat if (task) { if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || task_dumpable(task)) { - stat->uid = task->cred->euid; - stat->gid = task->cred->egid; + cred = __task_cred(task); + stat->uid = cred->euid; + stat->gid = cred->egid; } } rcu_read_unlock(); @@ -1483,11 +1490,16 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; struct task_struct *task = get_proc_task(inode); + const struct cred *cred; + if (task) { if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || task_dumpable(task)) { - inode->i_uid = task->cred->euid; - inode->i_gid = task->cred->egid; + rcu_read_lock(); + cred = __task_cred(task); + inode->i_uid = cred->euid; + inode->i_gid = cred->egid; + rcu_read_unlock(); } else { 
inode->i_uid = 0; inode->i_gid = 0; @@ -1649,6 +1661,7 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) struct task_struct *task = get_proc_task(inode); int fd = proc_fd(inode); struct files_struct *files; + const struct cred *cred; if (task) { files = get_files_struct(task); @@ -1658,8 +1671,11 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) rcu_read_unlock(); put_files_struct(files); if (task_dumpable(task)) { - inode->i_uid = task->cred->euid; - inode->i_gid = task->cred->egid; + rcu_read_lock(); + cred = __task_cred(task); + inode->i_uid = cred->euid; + inode->i_gid = cred->egid; + rcu_read_unlock(); } else { inode->i_uid = 0; inode->i_gid = 0; diff --git a/include/linux/cred.h b/include/linux/cred.h index 4221ec6000c1..166ce4ddba64 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -147,8 +147,9 @@ static inline struct cred *get_cred(struct cred *cred) * Release a reference to a set of credentials, deleting them when the last ref * is released. */ -static inline void put_cred(struct cred *cred) +static inline void put_cred(const struct cred *_cred) { + struct cred *cred = (struct cred *) _cred; if (atomic_dec_and_test(&(cred)->usage)) __put_cred(cred); } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 2febf5165fad..ae8ef88ade3f 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -447,7 +447,7 @@ static int audit_filter_rules(struct task_struct *tsk, struct audit_names *name, enum audit_state *state) { - struct cred *cred = tsk->cred; + const struct cred *cred = get_task_cred(tsk); int i, j, need_sid = 1; u32 sid; @@ -642,8 +642,10 @@ static int audit_filter_rules(struct task_struct *tsk, break; } - if (!result) + if (!result) { + put_cred(cred); return 0; + } } if (rule->filterkey && ctx) ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC); @@ -651,6 +653,7 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_NEVER: *state = AUDIT_DISABLED; break; case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; } + put_cred(cred); return 1; } @@ -1229,7 +1232,7 @@ static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) { - struct cred *cred = tsk->cred; + const struct cred *cred; int i, call_panic = 0; struct audit_buffer *ab; struct audit_aux_data *aux; @@ -1239,13 +1242,14 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts context->pid = tsk->pid; if (!context->ppid) context->ppid = sys_getppid(); - context->uid = cred->uid; - context->gid = cred->gid; - context->euid = cred->euid; - context->suid = cred->suid; + cred = current_cred(); + context->uid = cred->uid; + context->gid = cred->gid; + context->euid = cred->euid; + context->suid = cred->suid; context->fsuid = cred->fsuid; - context->egid = cred->egid; - context->sgid = cred->sgid; + context->egid = cred->egid; + context->sgid = cred->sgid; context->fsgid = cred->fsgid; context->personality = tsk->personality; @@ -2088,7 +2092,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid) audit_log_format(ab, "login pid=%d uid=%u " "old auid=%u new auid=%u" " old ses=%u new ses=%u", - task->pid, task->cred->uid, + task->pid, task_uid(task), task->loginuid, loginuid, task->sessionid, sessionid); audit_log_end(ab); @@ -2471,7 +2475,7 @@ void __audit_ptrace(struct task_struct *t) context->target_pid = t->pid; context->target_auid = audit_get_loginuid(t); - context->target_uid = t->cred->uid; + 
context->target_uid = task_uid(t); context->target_sessionid = audit_get_sessionid(t); security_task_getsecid(t, &context->target_sid); memcpy(context->target_comm, t->comm, TASK_COMM_LEN); @@ -2490,6 +2494,7 @@ int __audit_signal_info(int sig, struct task_struct *t) struct audit_aux_data_pids *axp; struct task_struct *tsk = current; struct audit_context *ctx = tsk->audit_context; + uid_t uid = current_uid(), t_uid = task_uid(t); if (audit_pid && t->tgid == audit_pid) { if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { @@ -2497,7 +2502,7 @@ int __audit_signal_info(int sig, struct task_struct *t) if (tsk->loginuid != -1) audit_sig_uid = tsk->loginuid; else - audit_sig_uid = tsk->cred->uid; + audit_sig_uid = uid; security_task_getsecid(tsk, &audit_sig_sid); } if (!audit_signals || audit_dummy_context()) @@ -2509,7 +2514,7 @@ int __audit_signal_info(int sig, struct task_struct *t) if (!ctx->target_pid) { ctx->target_pid = t->tgid; ctx->target_auid = audit_get_loginuid(t); - ctx->target_uid = t->cred->uid; + ctx->target_uid = t_uid; ctx->target_sessionid = audit_get_sessionid(t); security_task_getsecid(t, &ctx->target_sid); memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN); @@ -2530,7 +2535,7 @@ int __audit_signal_info(int sig, struct task_struct *t) axp->target_pid[axp->pid_count] = t->tgid; axp->target_auid[axp->pid_count] = audit_get_loginuid(t); - axp->target_uid[axp->pid_count] = t->cred->uid; + axp->target_uid[axp->pid_count] = t_uid; axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t); security_task_getsecid(t, &axp->target_sid[axp->pid_count]); memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e210526e6401..a512a75a5560 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1279,7 +1279,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) static int attach_task_by_pid(struct cgroup *cgrp, u64 pid) { struct task_struct *tsk; - uid_t euid; + const struct cred *cred = current_cred(), *tcred; int ret; if (pid) { @@ -1289,16 +1289,16 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid) rcu_read_unlock(); return -ESRCH; } - get_task_struct(tsk); - rcu_read_unlock(); - euid = current_euid(); - if (euid && - euid != tsk->cred->uid && - euid != tsk->cred->suid) { - put_task_struct(tsk); + tcred = __task_cred(tsk); + if (cred->euid && + cred->euid != tcred->uid && + cred->euid != tcred->suid) { + rcu_read_unlock(); return -EACCES; } + get_task_struct(tsk); + rcu_read_unlock(); } else { tsk = current; get_task_struct(tsk); diff --git a/kernel/exit.c b/kernel/exit.c index e0f6e1892fb9..bbc22530f2c1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -160,7 +160,10 @@ void release_task(struct task_struct * p) int zap_leader; repeat: tracehook_prepare_release_task(p); - atomic_dec(&p->cred->user->processes); + /* don't need to get the RCU readlock here - the process is dead and + * can't be modifying its own credentials */ + atomic_dec(&__task_cred(p)->user->processes); + proc_flush_task(p); write_lock_irq(&tasklist_lock); tracehook_finish_release_task(p); @@ -1267,12 +1270,12 @@ static int wait_task_zombie(struct task_struct *p, int options, unsigned long state; int retval, status, traced; pid_t pid = task_pid_vnr(p); + uid_t uid = __task_cred(p)->uid; if (!likely(options & WEXITED)) return 0; if (unlikely(options & WNOWAIT)) { - uid_t uid = p->cred->uid; int exit_code = p->exit_code; int why, status; @@ -1393,7 +1396,7 @@ static int wait_task_zombie(struct 
task_struct *p, int options, if (!retval && infop) retval = put_user(pid, &infop->si_pid); if (!retval && infop) - retval = put_user(p->cred->uid, &infop->si_uid); + retval = put_user(uid, &infop->si_uid); if (!retval) retval = pid; @@ -1458,7 +1461,8 @@ static int wait_task_stopped(int ptrace, struct task_struct *p, if (!unlikely(options & WNOWAIT)) p->exit_code = 0; - uid = p->cred->uid; + /* don't need the RCU readlock here as we're holding a spinlock */ + uid = __task_cred(p)->uid; unlock_sig: spin_unlock_irq(&p->sighand->siglock); if (!exit_code) @@ -1532,10 +1536,10 @@ static int wait_task_continued(struct task_struct *p, int options, } if (!unlikely(options & WNOWAIT)) p->signal->flags &= ~SIGNAL_STOP_CONTINUED; + uid = __task_cred(p)->uid; spin_unlock_irq(&p->sighand->siglock); pid = task_pid_vnr(p); - uid = p->cred->uid; get_task_struct(p); read_unlock(&tasklist_lock); diff --git a/kernel/futex.c b/kernel/futex.c index 28421d8210b8..4fe790e89d0f 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -439,15 +439,20 @@ static void free_pi_state(struct futex_pi_state *pi_state) static struct task_struct * futex_find_get_task(pid_t pid) { struct task_struct *p; - uid_t euid = current_euid(); + const struct cred *cred = current_cred(), *pcred; rcu_read_lock(); p = find_task_by_vpid(pid); - if (!p || (euid != p->cred->euid && - euid != p->cred->uid)) + if (!p) { p = ERR_PTR(-ESRCH); - else - get_task_struct(p); + } else { + pcred = __task_cred(p); + if (cred->euid != pcred->euid && + cred->euid != pcred->uid) + p = ERR_PTR(-ESRCH); + else + get_task_struct(p); + } rcu_read_unlock(); @@ -1831,7 +1836,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr, { struct robust_list_head __user *head; unsigned long ret; - uid_t euid = current_euid(); + const struct cred *cred = current_cred(), *pcred; if (!futex_cmpxchg_enabled) return -ENOSYS; @@ -1847,8 +1852,9 @@ sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr, if (!p) goto err_unlock; ret = -EPERM; - if (euid != p->cred->euid && - euid != p->cred->uid && + pcred = __task_cred(p); + if (cred->euid != pcred->euid && + cred->euid != pcred->uid && !capable(CAP_SYS_PTRACE)) goto err_unlock; head = p->robust_list; diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 2c3fd5ed34f5..d607a5b9ee29 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -135,7 +135,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, { struct compat_robust_list_head __user *head; unsigned long ret; - uid_t euid = current_euid(); + const struct cred *cred = current_cred(), *pcred; if (!futex_cmpxchg_enabled) return -ENOSYS; @@ -151,8 +151,9 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, if (!p) goto err_unlock; ret = -EPERM; - if (euid != p->cred->euid && - euid != p->cred->uid && + pcred = __task_cred(p); + if (cred->euid != pcred->euid && + cred->euid != pcred->uid && !capable(CAP_SYS_PTRACE)) goto err_unlock; head = p->compat_robust_list; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 49849d12dd12..b9d5f4e4f6a4 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -115,7 +115,7 @@ int ptrace_check_attach(struct task_struct *child, int kill) int __ptrace_may_access(struct task_struct *task, unsigned int mode) { - struct cred *cred = current->cred, *tcred = task->cred; + const struct cred *cred = current_cred(), *tcred; /* May we inspect the given task? 
* This check is used both for attaching with ptrace @@ -125,19 +125,23 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode) * because setting up the necessary parent/child relationship * or halting the specified task is impossible. */ - uid_t uid = cred->uid; - gid_t gid = cred->gid; int dumpable = 0; /* Don't let security modules deny introspection */ if (task == current) return 0; - if ((uid != tcred->euid || - uid != tcred->suid || - uid != tcred->uid || - gid != tcred->egid || - gid != tcred->sgid || - gid != tcred->gid) && !capable(CAP_SYS_PTRACE)) + rcu_read_lock(); + tcred = __task_cred(task); + if ((cred->uid != tcred->euid || + cred->uid != tcred->suid || + cred->uid != tcred->uid || + cred->gid != tcred->egid || + cred->gid != tcred->sgid || + cred->gid != tcred->gid) && + !capable(CAP_SYS_PTRACE)) { + rcu_read_unlock(); return -EPERM; + } + rcu_read_unlock(); smp_rmb(); if (task->mm) dumpable = get_dumpable(task->mm); diff --git a/kernel/sched.c b/kernel/sched.c index 733c59e645aa..92992e287b10 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -345,7 +345,9 @@ static inline struct task_group *task_group(struct task_struct *p) struct task_group *tg; #ifdef CONFIG_USER_SCHED - tg = p->cred->user->tg; + rcu_read_lock(); + tg = __task_cred(p)->user->tg; + rcu_read_unlock(); #elif defined(CONFIG_CGROUP_SCHED) tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), struct task_group, css); @@ -5121,6 +5123,22 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) set_load_weight(p); } +/* + * check the target process has a UID that matches the current process's + */ +static bool check_same_owner(struct task_struct *p) +{ + const struct cred *cred = current_cred(), *pcred; + bool match; + + rcu_read_lock(); + pcred = __task_cred(p); + match = (cred->euid == pcred->euid || + cred->euid == pcred->uid); + rcu_read_unlock(); + return match; +} + static int __sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param, bool user) { @@ -5128,7 +5146,6 @@ static int __sched_setscheduler(struct task_struct *p, int policy, unsigned long flags; const struct sched_class *prev_class = p->sched_class; struct rq *rq; - uid_t euid; /* may grab non-irq protected spin_locks */ BUG_ON(in_interrupt()); @@ -5181,9 +5198,7 @@ recheck: return -EPERM; /* can't change other user's priorities */ - euid = current_euid(); - if (euid != p->cred->euid && - euid != p->cred->uid) + if (!check_same_owner(p)) return -EPERM; } @@ -5394,7 +5409,6 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) cpumask_t cpus_allowed; cpumask_t new_mask = *in_mask; struct task_struct *p; - uid_t euid; int retval; get_online_cpus(); @@ -5415,11 +5429,8 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) get_task_struct(p); read_unlock(&tasklist_lock); - euid = current_euid(); retval = -EPERM; - if (euid != p->cred->euid && - euid != p->cred->uid && - !capable(CAP_SYS_NICE)) + if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) goto out_unlock; retval = security_task_setscheduler(p, 0, NULL); diff --git a/kernel/signal.c b/kernel/signal.c index 80e8a6489f97..84989124bafb 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -177,6 +177,11 @@ int next_signal(struct sigpending *pending, sigset_t *mask) return sig; } +/* + * allocate a new signal queue record + * - this may be called without locks if and only if t == current, otherwise an + * appopriate lock must be held to protect t's user_struct + */ static struct sigqueue 
*__sigqueue_alloc(struct task_struct *t, gfp_t flags, int override_rlimit) { @@ -184,11 +189,12 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, struct user_struct *user; /* - * In order to avoid problems with "switch_user()", we want to make - * sure that the compiler doesn't re-load "t->user" + * We won't get problems with the target's UID changing under us + * because changing it requires RCU be used, and if t != current, the + * caller must be holding the RCU readlock (by way of a spinlock) and + * we use RCU protection here */ - user = t->cred->user; - barrier(); + user = __task_cred(t)->user; atomic_inc(&user->sigpending); if (override_rlimit || atomic_read(&user->sigpending) <= @@ -562,12 +568,13 @@ static int rm_from_queue(unsigned long mask, struct sigpending *s) /* * Bad permissions for sending the signal + * - the caller must hold at least the RCU read lock */ static int check_kill_permission(int sig, struct siginfo *info, struct task_struct *t) { + const struct cred *cred = current_cred(), *tcred; struct pid *sid; - uid_t uid, euid; int error; if (!valid_signal(sig)) @@ -580,10 +587,11 @@ static int check_kill_permission(int sig, struct siginfo *info, if (error) return error; - uid = current_uid(); - euid = current_euid(); - if ((euid ^ t->cred->suid) && (euid ^ t->cred->uid) && - (uid ^ t->cred->suid) && (uid ^ t->cred->uid) && + tcred = __task_cred(t); + if ((cred->euid ^ tcred->suid) && + (cred->euid ^ tcred->uid) && + (cred->uid ^ tcred->suid) && + (cred->uid ^ tcred->uid) && !capable(CAP_KILL)) { switch (sig) { case SIGCONT: @@ -1011,6 +1019,10 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long return sighand; } +/* + * send signal info to all the members of a group + * - the caller must hold the RCU read lock at least + */ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) { unsigned long flags; @@ -1032,8 +1044,8 @@ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) /* * __kill_pgrp_info() sends a signal to a process group: this is what the tty * control characters do (^C, ^Z etc) + * - the caller must hold at least a readlock on tasklist_lock */ - int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp) { struct task_struct *p = NULL; @@ -1089,6 +1101,7 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid, { int ret = -EINVAL; struct task_struct *p; + const struct cred *pcred; if (!valid_signal(sig)) return ret; @@ -1099,9 +1112,11 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid, ret = -ESRCH; goto out_unlock; } - if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) - && (euid != p->cred->suid) && (euid != p->cred->uid) - && (uid != p->cred->suid) && (uid != p->cred->uid)) { + pcred = __task_cred(p); + if ((info == SEND_SIG_NOINFO || + (!is_si_special(info) && SI_FROMUSER(info))) && + euid != pcred->suid && euid != pcred->uid && + uid != pcred->suid && uid != pcred->uid) { ret = -EPERM; goto out_unlock; } @@ -1372,10 +1387,9 @@ int do_notify_parent(struct task_struct *tsk, int sig) */ rcu_read_lock(); info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); + info.si_uid = __task_cred(tsk)->uid; rcu_read_unlock(); - info.si_uid = tsk->cred->uid; - thread_group_cputime(tsk, &cputime); info.si_utime = cputime_to_jiffies(cputime.utime); info.si_stime = cputime_to_jiffies(cputime.stime); @@ -1443,10 +1457,9 @@ static void do_notify_parent_cldstop(struct task_struct 
*tsk, int why) */ rcu_read_lock(); info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); + info.si_uid = __task_cred(tsk)->uid; rcu_read_unlock(); - info.si_uid = tsk->cred->uid; - info.si_utime = cputime_to_clock_t(tsk->utime); info.si_stime = cputime_to_clock_t(tsk->stime); @@ -1713,7 +1726,7 @@ static int ptrace_signal(int signr, siginfo_t *info, info->si_errno = 0; info->si_code = SI_USER; info->si_pid = task_pid_vnr(current->parent); - info->si_uid = current->parent->cred->uid; + info->si_uid = task_uid(current->parent); } /* If the (new) signal is now blocked, requeue it. */ diff --git a/kernel/sys.c b/kernel/sys.c index c4d6b59553e9..ccc9eb736d35 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -112,14 +112,17 @@ EXPORT_SYMBOL(cad_pid); void (*pm_power_off_prepare)(void); +/* + * set the priority of a task + * - the caller must hold the RCU read lock + */ static int set_one_prio(struct task_struct *p, int niceval, int error) { - uid_t euid = current_euid(); + const struct cred *cred = current_cred(), *pcred = __task_cred(p); int no_nice; - if (p->cred->uid != euid && - p->cred->euid != euid && - !capable(CAP_SYS_NICE)) { + if (pcred->uid != cred->euid && + pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) { error = -EPERM; goto out; } diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 6d1ed07bf312..2dc06ab35716 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -27,6 +27,7 @@ */ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) { + const struct cred *tcred; struct timespec uptime, ts; u64 ac_etime; @@ -53,10 +54,11 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) stats->ac_flag |= AXSIG; stats->ac_nice = task_nice(tsk); stats->ac_sched = tsk->policy; - stats->ac_uid = tsk->cred->uid; - stats->ac_gid = tsk->cred->gid; stats->ac_pid = tsk->pid; rcu_read_lock(); + tcred = __task_cred(tsk); + stats->ac_uid = tcred->uid; + stats->ac_gid = tcred->gid; stats->ac_ppid = pid_alive(tsk) ? rcu_dereference(tsk->real_parent)->tgid : 0; rcu_read_unlock(); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b23492ee3e50..7555219c535b 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1110,7 +1110,7 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, const unsigned long __user *old_nodes, const unsigned long __user *new_nodes) { - struct cred *cred, *tcred; + const struct cred *cred = current_cred(), *tcred; struct mm_struct *mm; struct task_struct *task; nodemask_t old; @@ -1145,14 +1145,16 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, * capabilities, superuser privileges or the same * userid as the target process. */ - cred = current->cred; - tcred = task->cred; + rcu_read_lock(); + tcred = __task_cred(task); if (cred->euid != tcred->suid && cred->euid != tcred->uid && cred->uid != tcred->suid && cred->uid != tcred->uid && !capable(CAP_SYS_NICE)) { + rcu_read_unlock(); err = -EPERM; goto out; } + rcu_read_unlock(); task_nodes = cpuset_mems_allowed(task); /* Is the user allowed to access the target nodes? 
*/ diff --git a/mm/migrate.c b/mm/migrate.c index 794443da1b4f..142284229ce2 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1045,7 +1045,7 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, const int __user *nodes, int __user *status, int flags) { - struct cred *cred, *tcred; + const struct cred *cred = current_cred(), *tcred; struct task_struct *task; struct mm_struct *mm; int err; @@ -1076,14 +1076,16 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, * capabilities, superuser privileges or the same * userid as the target process. */ - cred = current->cred; - tcred = task->cred; + rcu_read_lock(); + tcred = __task_cred(task); if (cred->euid != tcred->suid && cred->euid != tcred->uid && cred->uid != tcred->suid && cred->uid != tcred->uid && !capable(CAP_SYS_NICE)) { + rcu_read_unlock(); err = -EPERM; goto out; } + rcu_read_unlock(); err = security_task_movememory(task); if (err) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 3af787ba2077..0e0b282a2073 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -298,9 +298,9 @@ static void dump_tasks(const struct mem_cgroup *mem) task_lock(p); printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", - p->pid, p->cred->uid, p->tgid, p->mm->total_vm, - get_mm_rss(p->mm), (int)task_cpu(p), p->oomkilladj, - p->comm); + p->pid, __task_cred(p)->uid, p->tgid, + p->mm->total_vm, get_mm_rss(p->mm), (int)task_cpu(p), + p->oomkilladj, p->comm); task_unlock(p); } while_each_thread(g, p); } diff --git a/security/commoncap.c b/security/commoncap.c index 61307f590003..0384bf95db68 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -51,10 +51,13 @@ EXPORT_SYMBOL(cap_netlink_recv); */ int cap_capable(struct task_struct *tsk, int cap, int audit) { + __u32 cap_raised; + /* Derived from include/linux/sched.h:capable. */ - if (cap_raised(tsk->cred->cap_effective, cap)) - return 0; - return -EPERM; + rcu_read_lock(); + cap_raised = cap_raised(__task_cred(tsk)->cap_effective, cap); + rcu_read_unlock(); + return cap_raised ? 0 : -EPERM; } int cap_settime(struct timespec *ts, struct timezone *tz) @@ -66,34 +69,42 @@ int cap_settime(struct timespec *ts, struct timezone *tz) int cap_ptrace_may_access(struct task_struct *child, unsigned int mode) { - /* Derived from arch/i386/kernel/ptrace.c:sys_ptrace. */ - if (cap_issubset(child->cred->cap_permitted, - current->cred->cap_permitted)) - return 0; - if (capable(CAP_SYS_PTRACE)) - return 0; - return -EPERM; + int ret = 0; + + rcu_read_lock(); + if (!cap_issubset(child->cred->cap_permitted, + current->cred->cap_permitted) && + !capable(CAP_SYS_PTRACE)) + ret = -EPERM; + rcu_read_unlock(); + return ret; } int cap_ptrace_traceme(struct task_struct *parent) { - if (cap_issubset(current->cred->cap_permitted, - parent->cred->cap_permitted)) - return 0; - if (has_capability(parent, CAP_SYS_PTRACE)) - return 0; - return -EPERM; + int ret = 0; + + rcu_read_lock(); + if (!cap_issubset(current->cred->cap_permitted, + parent->cred->cap_permitted) && + !has_capability(parent, CAP_SYS_PTRACE)) + ret = -EPERM; + rcu_read_unlock(); + return ret; } int cap_capget (struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) { - struct cred *cred = target->cred; + const struct cred *cred; /* Derived from kernel/capability.c:sys_capget. 
*/ + rcu_read_lock(); + cred = __task_cred(target); *effective = cred->cap_effective; *inheritable = cred->cap_inheritable; *permitted = cred->cap_permitted; + rcu_read_unlock(); return 0; } @@ -433,7 +444,7 @@ void cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe) int cap_bprm_secureexec (struct linux_binprm *bprm) { - const struct cred *cred = current->cred; + const struct cred *cred = current_cred(); if (cred->uid != 0) { if (bprm->cap_effective) @@ -511,11 +522,11 @@ static inline void cap_emulate_setxuid (int old_ruid, int old_euid, if ((old_ruid == 0 || old_euid == 0 || old_suid == 0) && (cred->uid != 0 && cred->euid != 0 && cred->suid != 0) && !issecure(SECURE_KEEP_CAPS)) { - cap_clear (cred->cap_permitted); - cap_clear (cred->cap_effective); + cap_clear(cred->cap_permitted); + cap_clear(cred->cap_effective); } if (old_euid == 0 && cred->euid != 0) { - cap_clear (cred->cap_effective); + cap_clear(cred->cap_effective); } if (old_euid != 0 && cred->euid == 0) { cred->cap_effective = cred->cap_permitted; @@ -582,9 +593,14 @@ int cap_task_post_setuid (uid_t old_ruid, uid_t old_euid, uid_t old_suid, */ static int cap_safe_nice(struct task_struct *p) { - if (!cap_issubset(p->cred->cap_permitted, - current->cred->cap_permitted) && - !capable(CAP_SYS_NICE)) + int is_subset; + + rcu_read_lock(); + is_subset = cap_issubset(__task_cred(p)->cap_permitted, + current_cred()->cap_permitted); + rcu_read_unlock(); + + if (!is_subset && !capable(CAP_SYS_NICE)) return -EPERM; return 0; } diff --git a/security/keys/permission.c b/security/keys/permission.c index baf3d5f31e71..13c36164f284 100644 --- a/security/keys/permission.c +++ b/security/keys/permission.c @@ -22,13 +22,16 @@ int key_task_permission(const key_ref_t key_ref, struct task_struct *context, key_perm_t perm) { - struct cred *cred = context->cred; + const struct cred *cred; struct key *key; key_perm_t kperm; int ret; key = key_ref_to_ptr(key_ref); + rcu_read_lock(); + cred = __task_cred(context); + /* use the second 8-bits of permissions for keys the caller owns */ if (key->uid == cred->fsuid) { kperm = key->perm >> 16; @@ -43,10 +46,7 @@ int key_task_permission(const key_ref_t key_ref, goto use_these_perms; } - spin_lock(&cred->lock); ret = groups_search(cred->group_info, key->gid); - spin_unlock(&cred->lock); - if (ret) { kperm = key->perm >> 8; goto use_these_perms; @@ -57,6 +57,8 @@ int key_task_permission(const key_ref_t key_ref, kperm = key->perm; use_these_perms: + rcu_read_unlock(); + /* use the top 8-bits of permissions for keys the caller possesses * - possessor permissions are additive with other permissions */ diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index ce8ac6073d57..212601ebaa46 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -412,10 +412,13 @@ key_ref_t search_process_keyrings(struct key_type *type, struct task_struct *context) { struct request_key_auth *rka; + struct cred *cred; key_ref_t key_ref, ret, err; might_sleep(); + cred = get_task_cred(context); + /* we want to return -EAGAIN or -ENOKEY if any of the keyrings were * searchable, but we failed to find a key or we found a negative key; * otherwise we want to return a sample error (probably -EACCES) if @@ -428,9 +431,9 @@ key_ref_t search_process_keyrings(struct key_type *type, err = ERR_PTR(-EAGAIN); /* search the thread keyring first */ - if (context->cred->thread_keyring) { + if (cred->thread_keyring) { key_ref = keyring_search_aux( - make_key_ref(context->cred->thread_keyring, 1), + 
make_key_ref(cred->thread_keyring, 1), context, type, description, match); if (!IS_ERR(key_ref)) goto found; @@ -495,9 +498,9 @@ key_ref_t search_process_keyrings(struct key_type *type, } } /* or search the user-session keyring */ - else if (context->cred->user->session_keyring) { + else if (cred->user->session_keyring) { key_ref = keyring_search_aux( - make_key_ref(context->cred->user->session_keyring, 1), + make_key_ref(cred->user->session_keyring, 1), context, type, description, match); if (!IS_ERR(key_ref)) goto found; @@ -519,20 +522,20 @@ key_ref_t search_process_keyrings(struct key_type *type, * search the keyrings of the process mentioned there * - we don't permit access to request_key auth keys via this method */ - if (context->cred->request_key_auth && + if (cred->request_key_auth && context == current && type != &key_type_request_key_auth ) { /* defend against the auth key being revoked */ - down_read(&context->cred->request_key_auth->sem); + down_read(&cred->request_key_auth->sem); - if (key_validate(context->cred->request_key_auth) == 0) { - rka = context->cred->request_key_auth->payload.data; + if (key_validate(cred->request_key_auth) == 0) { + rka = cred->request_key_auth->payload.data; key_ref = search_process_keyrings(type, description, match, rka->context); - up_read(&context->cred->request_key_auth->sem); + up_read(&cred->request_key_auth->sem); if (!IS_ERR(key_ref)) goto found; @@ -549,7 +552,7 @@ key_ref_t search_process_keyrings(struct key_type *type, break; } } else { - up_read(&context->cred->request_key_auth->sem); + up_read(&cred->request_key_auth->sem); } } @@ -557,6 +560,7 @@ key_ref_t search_process_keyrings(struct key_type *type, key_ref = ret ? ret : err; found: + put_cred(cred); return key_ref; } /* end search_process_keyrings() */ diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 10715d1330b9..c86303638235 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -95,13 +95,18 @@ extern void selnl_notify_setenforce(int val); static int task_has_security(struct task_struct *tsk, u32 perms) { - struct task_security_struct *tsec; - - tsec = tsk->cred->security; + const struct task_security_struct *tsec; + u32 sid = 0; + + rcu_read_lock(); + tsec = __task_cred(tsk)->security; + if (tsec) + sid = tsec->sid; + rcu_read_unlock(); if (!tsec) return -EACCES; - return avc_has_perm(tsec->sid, SECINITSID_SECURITY, + return avc_has_perm(sid, SECINITSID_SECURITY, SECCLASS_SECURITY, perms, NULL); } diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index e8a4fcb1ad04..11167fd567b9 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -30,6 +30,8 @@ #include "smack.h" +#define task_security(task) (task_cred_xxx((task), security)) + /* * I hope these are the hokeyist lines of code in the module. Casey. 
*/ @@ -1012,7 +1014,7 @@ static void smack_cred_free(struct cred *cred) */ static int smack_task_setpgid(struct task_struct *p, pid_t pgid) { - return smk_curacc(p->cred->security, MAY_WRITE); + return smk_curacc(task_security(p), MAY_WRITE); } /** @@ -1023,7 +1025,7 @@ static int smack_task_setpgid(struct task_struct *p, pid_t pgid) */ static int smack_task_getpgid(struct task_struct *p) { - return smk_curacc(p->cred->security, MAY_READ); + return smk_curacc(task_security(p), MAY_READ); } /** @@ -1034,7 +1036,7 @@ static int smack_task_getpgid(struct task_struct *p) */ static int smack_task_getsid(struct task_struct *p) { - return smk_curacc(p->cred->security, MAY_READ); + return smk_curacc(task_security(p), MAY_READ); } /** @@ -1046,7 +1048,7 @@ static int smack_task_getsid(struct task_struct *p) */ static void smack_task_getsecid(struct task_struct *p, u32 *secid) { - *secid = smack_to_secid(p->cred->security); + *secid = smack_to_secid(task_security(p)); } /** @@ -1062,7 +1064,7 @@ static int smack_task_setnice(struct task_struct *p, int nice) rc = cap_task_setnice(p, nice); if (rc == 0) - rc = smk_curacc(p->cred->security, MAY_WRITE); + rc = smk_curacc(task_security(p), MAY_WRITE); return rc; } @@ -1079,7 +1081,7 @@ static int smack_task_setioprio(struct task_struct *p, int ioprio) rc = cap_task_setioprio(p, ioprio); if (rc == 0) - rc = smk_curacc(p->cred->security, MAY_WRITE); + rc = smk_curacc(task_security(p), MAY_WRITE); return rc; } @@ -1091,7 +1093,7 @@ static int smack_task_setioprio(struct task_struct *p, int ioprio) */ static int smack_task_getioprio(struct task_struct *p) { - return smk_curacc(p->cred->security, MAY_READ); + return smk_curacc(task_security(p), MAY_READ); } /** @@ -1109,7 +1111,7 @@ static int smack_task_setscheduler(struct task_struct *p, int policy, rc = cap_task_setscheduler(p, policy, lp); if (rc == 0) - rc = smk_curacc(p->cred->security, MAY_WRITE); + rc = smk_curacc(task_security(p), MAY_WRITE); return rc; } @@ -1121,7 +1123,7 @@ static int smack_task_setscheduler(struct task_struct *p, int policy, */ static int smack_task_getscheduler(struct task_struct *p) { - return smk_curacc(p->cred->security, MAY_READ); + return smk_curacc(task_security(p), MAY_READ); } /** @@ -1132,7 +1134,7 @@ static int smack_task_getscheduler(struct task_struct *p) */ static int smack_task_movememory(struct task_struct *p) { - return smk_curacc(p->cred->security, MAY_WRITE); + return smk_curacc(task_security(p), MAY_WRITE); } /** @@ -1155,13 +1157,13 @@ static int smack_task_kill(struct task_struct *p, struct siginfo *info, * can write the receiver. */ if (secid == 0) - return smk_curacc(p->cred->security, MAY_WRITE); + return smk_curacc(task_security(p), MAY_WRITE); /* * If the secid isn't 0 we're dealing with some USB IO * specific behavior. This is not clean. For one thing * we can't take privilege into account. 
*/ - return smk_access(smack_from_secid(secid), p->cred->security, MAY_WRITE); + return smk_access(smack_from_secid(secid), task_security(p), MAY_WRITE); } /** @@ -1174,7 +1176,7 @@ static int smack_task_wait(struct task_struct *p) { int rc; - rc = smk_access(current->cred->security, p->cred->security, MAY_WRITE); + rc = smk_access(current_security(), task_security(p), MAY_WRITE); if (rc == 0) return 0; @@ -1205,7 +1207,7 @@ static int smack_task_wait(struct task_struct *p) static void smack_task_to_inode(struct task_struct *p, struct inode *inode) { struct inode_smack *isp = inode->i_security; - isp->smk_inode = p->cred->security; + isp->smk_inode = task_security(p); } /* @@ -2010,7 +2012,7 @@ static int smack_getprocattr(struct task_struct *p, char *name, char **value) if (strcmp(name, "current") != 0) return -EINVAL; - cp = kstrdup(p->cred->security, GFP_KERNEL); + cp = kstrdup(task_security(p), GFP_KERNEL); if (cp == NULL) return -ENOMEM; -- cgit v1.2.3 From bb952bb98a7e479262c7eb25d5592545a3af147d Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:20 +1100 Subject: CRED: Separate per-task-group keyrings from signal_struct Separate per-task-group keyrings from signal_struct and dangle their anchor from the cred struct rather than the signal_struct. Signed-off-by: David Howells Reviewed-by: James Morris Signed-off-by: James Morris --- include/linux/cred.h | 16 +++++++ include/linux/key.h | 8 +--- include/linux/sched.h | 6 --- kernel/cred.c | 63 +++++++++++++++++++++++++++ kernel/fork.c | 7 --- security/keys/process_keys.c | 100 +++++++++++++++++-------------------------- security/keys/request_key.c | 34 ++++++--------- 7 files changed, 135 insertions(+), 99 deletions(-) (limited to 'kernel') diff --git a/include/linux/cred.h b/include/linux/cred.h index 166ce4ddba64..62b9e532422d 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -71,6 +71,21 @@ extern int groups_search(const struct group_info *, gid_t); extern int in_group_p(gid_t); extern int in_egroup_p(gid_t); +/* + * The common credentials for a thread group + * - shared by CLONE_THREAD + */ +#ifdef CONFIG_KEYS +struct thread_group_cred { + atomic_t usage; + pid_t tgid; /* thread group process ID */ + spinlock_t lock; + struct key *session_keyring; /* keyring inherited over fork */ + struct key *process_keyring; /* keyring private to this process */ + struct rcu_head rcu; /* RCU deletion hook */ +}; +#endif + /* * The security context of a task * @@ -114,6 +129,7 @@ struct cred { * keys to */ struct key *thread_keyring; /* keyring private to this thread */ struct key *request_key_auth; /* assumed request_key authority */ + struct thread_group_cred *tgcred; /* thread-group shared credentials */ #endif #ifdef CONFIG_SECURITY void *security; /* subjective LSM security */ diff --git a/include/linux/key.h b/include/linux/key.h index df709e1af3cd..0836cc838b0c 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -278,9 +278,7 @@ extern ctl_table key_sysctls[]; */ extern void switch_uid_keyring(struct user_struct *new_user); extern int copy_keys(unsigned long clone_flags, struct task_struct *tsk); -extern int copy_thread_group_keys(struct task_struct *tsk); extern void exit_keys(struct task_struct *tsk); -extern void exit_thread_group_keys(struct signal_struct *tg); extern int suid_keys(struct task_struct *tsk); extern int exec_keys(struct task_struct *tsk); extern void key_fsuid_changed(struct task_struct *tsk); @@ -289,8 +287,8 @@ extern void key_init(void); #define 
__install_session_keyring(keyring) \ ({ \ - struct key *old_session = current->signal->session_keyring; \ - current->signal->session_keyring = keyring; \ + struct key *old_session = current->cred->tgcred->session_keyring; \ + current->cred->tgcred->session_keyring = keyring; \ old_session; \ }) @@ -308,9 +306,7 @@ extern void key_init(void); #define switch_uid_keyring(u) do { } while(0) #define __install_session_keyring(k) ({ NULL; }) #define copy_keys(f,t) 0 -#define copy_thread_group_keys(t) 0 #define exit_keys(t) do { } while(0) -#define exit_thread_group_keys(tg) do { } while(0) #define suid_keys(t) do { } while(0) #define exec_keys(t) do { } while(0) #define key_fsuid_changed(t) do { } while(0) diff --git a/include/linux/sched.h b/include/linux/sched.h index 740cf946c8cc..2913252989b3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -571,12 +571,6 @@ struct signal_struct { */ struct rlimit rlim[RLIM_NLIMITS]; - /* keep the process-shared keyrings here so that they do the right - * thing in threads created with CLONE_THREAD */ -#ifdef CONFIG_KEYS - struct key *session_keyring; /* keyring inherited over fork */ - struct key *process_keyring; /* keyring private to this process */ -#endif #ifdef CONFIG_BSD_PROCESS_ACCT struct pacct_struct pacct; /* per-process accounting information */ #endif diff --git a/kernel/cred.c b/kernel/cred.c index 833244a7cb05..ac73e3617684 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -16,6 +16,17 @@ #include #include +/* + * The common credentials for the initial task's thread group + */ +#ifdef CONFIG_KEYS +static struct thread_group_cred init_tgcred = { + .usage = ATOMIC_INIT(2), + .tgid = 0, + .lock = SPIN_LOCK_UNLOCKED, +}; +#endif + /* * The initial credentials for the initial task */ @@ -28,8 +39,41 @@ struct cred init_cred = { .cap_bset = CAP_INIT_BSET, .user = INIT_USER, .group_info = &init_groups, +#ifdef CONFIG_KEYS + .tgcred = &init_tgcred, +#endif }; +/* + * Dispose of the shared task group credentials + */ +#ifdef CONFIG_KEYS +static void release_tgcred_rcu(struct rcu_head *rcu) +{ + struct thread_group_cred *tgcred = + container_of(rcu, struct thread_group_cred, rcu); + + BUG_ON(atomic_read(&tgcred->usage) != 0); + + key_put(tgcred->session_keyring); + key_put(tgcred->process_keyring); + kfree(tgcred); +} +#endif + +/* + * Release a set of thread group credentials. 
+ */ +static void release_tgcred(struct cred *cred) +{ +#ifdef CONFIG_KEYS + struct thread_group_cred *tgcred = cred->tgcred; + + if (atomic_dec_and_test(&tgcred->usage)) + call_rcu(&tgcred->rcu, release_tgcred_rcu); +#endif +} + /* * The RCU callback to actually dispose of a set of credentials */ @@ -41,6 +85,7 @@ static void put_cred_rcu(struct rcu_head *rcu) key_put(cred->thread_keyring); key_put(cred->request_key_auth); + release_tgcred(cred); put_group_info(cred->group_info); free_uid(cred->user); security_cred_free(cred); @@ -71,12 +116,30 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) if (!pcred) return -ENOMEM; +#ifdef CONFIG_KEYS + if (clone_flags & CLONE_THREAD) { + atomic_inc(&pcred->tgcred->usage); + } else { + pcred->tgcred = kmalloc(sizeof(struct thread_group_cred), GFP_KERNEL); + if (!pcred->tgcred) { + kfree(pcred); + return -ENOMEM; + } + atomic_set(&pcred->tgcred->usage, 1); + spin_lock_init(&pcred->tgcred->lock); + pcred->tgcred->process_keyring = NULL; + pcred->tgcred->session_keyring = + key_get(p->cred->tgcred->session_keyring); + } +#endif + #ifdef CONFIG_SECURITY pcred->security = NULL; #endif ret = security_cred_alloc(pcred); if (ret < 0) { + release_tgcred(pcred); kfree(pcred); return ret; } diff --git a/kernel/fork.c b/kernel/fork.c index c932e283ddfc..ded1972672a3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -802,12 +802,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) if (!sig) return -ENOMEM; - ret = copy_thread_group_keys(tsk); - if (ret < 0) { - kmem_cache_free(signal_cachep, sig); - return ret; - } - atomic_set(&sig->count, 1); atomic_set(&sig->live, 1); init_waitqueue_head(&sig->wait_chldexit); @@ -852,7 +846,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) void __cleanup_signal(struct signal_struct *sig) { thread_group_cputime_free(sig); - exit_thread_group_keys(sig); tty_kref_put(sig->tty); kmem_cache_free(signal_cachep, sig); } diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index 212601ebaa46..70ee93406f30 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -189,7 +189,7 @@ int install_process_keyring(void) might_sleep(); - if (!tsk->signal->process_keyring) { + if (!tsk->cred->tgcred->process_keyring) { sprintf(buf, "_pid.%u", tsk->tgid); keyring = keyring_alloc(buf, tsk->cred->uid, tsk->cred->gid, tsk, @@ -200,12 +200,12 @@ int install_process_keyring(void) } /* attach keyring */ - spin_lock_irq(&tsk->sighand->siglock); - if (!tsk->signal->process_keyring) { - tsk->signal->process_keyring = keyring; + spin_lock_irq(&tsk->cred->tgcred->lock); + if (!tsk->cred->tgcred->process_keyring) { + tsk->cred->tgcred->process_keyring = keyring; keyring = NULL; } - spin_unlock_irq(&tsk->sighand->siglock); + spin_unlock_irq(&tsk->cred->tgcred->lock); key_put(keyring); } @@ -235,11 +235,11 @@ static int install_session_keyring(struct key *keyring) sprintf(buf, "_ses.%u", tsk->tgid); flags = KEY_ALLOC_QUOTA_OVERRUN; - if (tsk->signal->session_keyring) + if (tsk->cred->tgcred->session_keyring) flags = KEY_ALLOC_IN_QUOTA; - keyring = keyring_alloc(buf, tsk->cred->uid, tsk->cred->gid, tsk, - flags, NULL); + keyring = keyring_alloc(buf, tsk->cred->uid, tsk->cred->gid, + tsk, flags, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); } @@ -248,10 +248,10 @@ static int install_session_keyring(struct key *keyring) /* install the keyring */ - spin_lock_irq(&tsk->sighand->siglock); - old = tsk->signal->session_keyring; -
rcu_assign_pointer(tsk->signal->session_keyring, keyring); - spin_unlock_irq(&tsk->sighand->siglock); + spin_lock_irq(&tsk->cred->tgcred->lock); + old = tsk->cred->tgcred->session_keyring; + rcu_assign_pointer(tsk->cred->tgcred->session_keyring, keyring); + spin_unlock_irq(&tsk->cred->tgcred->lock); /* we're using RCU on the pointer, but there's no point synchronising * on it if it didn't previously point to anything */ @@ -264,28 +264,6 @@ static int install_session_keyring(struct key *keyring) } /* end install_session_keyring() */ -/*****************************************************************************/ -/* - * copy the keys in a thread group for fork without CLONE_THREAD - */ -int copy_thread_group_keys(struct task_struct *tsk) -{ - key_check(current->thread_group->session_keyring); - key_check(current->thread_group->process_keyring); - - /* no process keyring yet */ - tsk->signal->process_keyring = NULL; - - /* same session keyring */ - rcu_read_lock(); - tsk->signal->session_keyring = - key_get(rcu_dereference(current->signal->session_keyring)); - rcu_read_unlock(); - - return 0; - -} /* end copy_thread_group_keys() */ - /*****************************************************************************/ /* * copy the keys for fork @@ -305,17 +283,6 @@ int copy_keys(unsigned long clone_flags, struct task_struct *tsk) } /* end copy_keys() */ -/*****************************************************************************/ -/* - * dispose of thread group keys upon thread group destruction - */ -void exit_thread_group_keys(struct signal_struct *tg) -{ - key_put(tg->session_keyring); - key_put(tg->process_keyring); - -} /* end exit_thread_group_keys() */ - /*****************************************************************************/ /* * dispose of per-thread keys upon thread exit @@ -344,10 +311,10 @@ int exec_keys(struct task_struct *tsk) key_put(old); /* discard the process keyring from a newly exec'd task */ - spin_lock_irq(&tsk->sighand->siglock); - old = tsk->signal->process_keyring; - tsk->signal->process_keyring = NULL; - spin_unlock_irq(&tsk->sighand->siglock); + spin_lock_irq(&tsk->cred->tgcred->lock); + old = tsk->cred->tgcred->process_keyring; + tsk->cred->tgcred->process_keyring = NULL; + spin_unlock_irq(&tsk->cred->tgcred->lock); key_put(old); @@ -452,9 +419,9 @@ key_ref_t search_process_keyrings(struct key_type *type, } /* search the process keyring second */ - if (context->signal->process_keyring) { + if (cred->tgcred->process_keyring) { key_ref = keyring_search_aux( - make_key_ref(context->signal->process_keyring, 1), + make_key_ref(cred->tgcred->process_keyring, 1), context, type, description, match); if (!IS_ERR(key_ref)) goto found; @@ -473,11 +440,11 @@ key_ref_t search_process_keyrings(struct key_type *type, } /* search the session keyring */ - if (context->signal->session_keyring) { + if (cred->tgcred->session_keyring) { rcu_read_lock(); key_ref = keyring_search_aux( make_key_ref(rcu_dereference( - context->signal->session_keyring), + cred->tgcred->session_keyring), 1), context, type, description, match); rcu_read_unlock(); @@ -586,11 +553,13 @@ key_ref_t lookup_user_key(key_serial_t id, int create, int partial, { struct request_key_auth *rka; struct task_struct *t = current; - struct cred *cred = current_cred(); + struct cred *cred; struct key *key; key_ref_t key_ref, skey_ref; int ret; +try_again: + cred = get_current_cred(); key_ref = ERR_PTR(-ENOKEY); switch (id) { @@ -604,6 +573,7 @@ key_ref_t lookup_user_key(key_serial_t id, int create, int partial, key = 
ERR_PTR(ret); goto error; } + goto reget_creds; } key = cred->thread_keyring; @@ -612,7 +582,7 @@ key_ref_t lookup_user_key(key_serial_t id, int create, int partial, break; case KEY_SPEC_PROCESS_KEYRING: - if (!t->signal->process_keyring) { + if (!cred->tgcred->process_keyring) { if (!create) goto error; @@ -621,15 +591,16 @@ key_ref_t lookup_user_key(key_serial_t id, int create, int partial, key = ERR_PTR(ret); goto error; } + goto reget_creds; } - key = t->signal->process_keyring; + key = cred->tgcred->process_keyring; atomic_inc(&key->usage); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_SESSION_KEYRING: - if (!t->signal->session_keyring) { + if (!cred->tgcred->session_keyring) { /* always install a session keyring upon access if one * doesn't exist yet */ ret = install_user_keyrings(); @@ -639,10 +610,11 @@ key_ref_t lookup_user_key(key_serial_t id, int create, int partial, cred->user->session_keyring); if (ret < 0) goto error; + goto reget_creds; } rcu_read_lock(); - key = rcu_dereference(t->signal->session_keyring); + key = rcu_dereference(cred->tgcred->session_keyring); atomic_inc(&key->usage); rcu_read_unlock(); key_ref = make_key_ref(key, 1); @@ -758,6 +730,7 @@ key_ref_t lookup_user_key(key_serial_t id, int create, int partial, goto invalid_key; error: + put_cred(cred); return key_ref; invalid_key: @@ -765,6 +738,12 @@ invalid_key: key_ref = ERR_PTR(ret); goto error; + /* if we attempted to install a keyring, then it may have caused new + * creds to be installed */ +reget_creds: + put_cred(cred); + goto try_again; + } /* end lookup_user_key() */ /*****************************************************************************/ @@ -777,6 +756,7 @@ invalid_key: long join_session_keyring(const char *name) { struct task_struct *tsk = current; + struct cred *cred = current->cred; struct key *keyring; long ret; @@ -787,7 +767,7 @@ long join_session_keyring(const char *name) goto error; rcu_read_lock(); - ret = rcu_dereference(tsk->signal->session_keyring)->serial; + ret = rcu_dereference(cred->tgcred->session_keyring)->serial; rcu_read_unlock(); goto error; } @@ -799,7 +779,7 @@ long join_session_keyring(const char *name) keyring = find_keyring_by_name(name, false); if (PTR_ERR(keyring) == -ENOKEY) { /* not found - try and create a new one */ - keyring = keyring_alloc(name, tsk->cred->uid, tsk->cred->gid, tsk, + keyring = keyring_alloc(name, cred->uid, cred->gid, tsk, KEY_ALLOC_IN_QUOTA, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); diff --git a/security/keys/request_key.c b/security/keys/request_key.c index 0488b0af5bd6..3d12558362df 100644 --- a/security/keys/request_key.c +++ b/security/keys/request_key.c @@ -66,7 +66,6 @@ static int call_sbin_request_key(struct key_construction *cons, const char *op, void *aux) { - struct task_struct *tsk = current; const struct cred *cred = current_cred(); key_serial_t prkey, sskey; struct key *key = cons->key, *authkey = cons->authkey, *keyring; @@ -109,18 +108,13 @@ static int call_sbin_request_key(struct key_construction *cons, cred->thread_keyring->serial : 0); prkey = 0; - if (tsk->signal->process_keyring) - prkey = tsk->signal->process_keyring->serial; + if (cred->tgcred->process_keyring) + prkey = cred->tgcred->process_keyring->serial; - sprintf(keyring_str[1], "%d", prkey); - - if (tsk->signal->session_keyring) { - rcu_read_lock(); - sskey = rcu_dereference(tsk->signal->session_keyring)->serial; - rcu_read_unlock(); - } else { + if (cred->tgcred->session_keyring) + sskey = rcu_dereference(cred->tgcred->session_keyring)->serial; 
+ else sskey = cred->user->session_keyring->serial; - } sprintf(keyring_str[2], "%d", sskey); @@ -222,7 +216,7 @@ static int construct_key(struct key *key, const void *callout_info, static void construct_get_dest_keyring(struct key **_dest_keyring) { struct request_key_auth *rka; - struct task_struct *tsk = current; + const struct cred *cred = current_cred(); struct key *dest_keyring = *_dest_keyring, *authkey; kenter("%p", dest_keyring); @@ -234,11 +228,11 @@ static void construct_get_dest_keyring(struct key **_dest_keyring) } else { /* use a default keyring; falling through the cases until we * find one that we actually have */ - switch (tsk->cred->jit_keyring) { + switch (cred->jit_keyring) { case KEY_REQKEY_DEFL_DEFAULT: case KEY_REQKEY_DEFL_REQUESTOR_KEYRING: - if (tsk->cred->request_key_auth) { - authkey = tsk->cred->request_key_auth; + if (cred->request_key_auth) { + authkey = cred->request_key_auth; down_read(&authkey->sem); rka = authkey->payload.data; if (!test_bit(KEY_FLAG_REVOKED, @@ -251,19 +245,19 @@ static void construct_get_dest_keyring(struct key **_dest_keyring) } case KEY_REQKEY_DEFL_THREAD_KEYRING: - dest_keyring = key_get(tsk->cred->thread_keyring); + dest_keyring = key_get(cred->thread_keyring); if (dest_keyring) break; case KEY_REQKEY_DEFL_PROCESS_KEYRING: - dest_keyring = key_get(tsk->signal->process_keyring); + dest_keyring = key_get(cred->tgcred->process_keyring); if (dest_keyring) break; case KEY_REQKEY_DEFL_SESSION_KEYRING: rcu_read_lock(); dest_keyring = key_get( - rcu_dereference(tsk->signal->session_keyring)); + rcu_dereference(cred->tgcred->session_keyring)); rcu_read_unlock(); if (dest_keyring) @@ -271,11 +265,11 @@ static void construct_get_dest_keyring(struct key **_dest_keyring) case KEY_REQKEY_DEFL_USER_SESSION_KEYRING: dest_keyring = - key_get(tsk->cred->user->session_keyring); + key_get(cred->user->session_keyring); break; case KEY_REQKEY_DEFL_USER_KEYRING: - dest_keyring = key_get(tsk->cred->user->uid_keyring); + dest_keyring = key_get(cred->user->uid_keyring); break; case KEY_REQKEY_DEFL_GROUP_KEYRING: -- cgit v1.2.3 From 6cc88bc45ce8043171089c9592da223dfab91823 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:21 +1100 Subject: CRED: Rename is_single_threaded() to is_wq_single_threaded() Rename is_single_threaded() to is_wq_single_threaded() so that a new is_single_threaded() can be created that refers to tasks rather than waitqueues. Signed-off-by: David Howells Reviewed-by: James Morris Signed-off-by: James Morris --- kernel/workqueue.c | 8 ++++---- lib/is_single_threaded.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 4 deletions(-) create mode 100644 lib/is_single_threaded.c (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f928f2a87b9b..f12ab5c4dec4 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -84,21 +84,21 @@ static cpumask_t cpu_singlethread_map __read_mostly; static cpumask_t cpu_populated_map __read_mostly; /* If it's single threaded, it isn't in the list of workqueues. */ -static inline int is_single_threaded(struct workqueue_struct *wq) +static inline int is_wq_single_threaded(struct workqueue_struct *wq) { return wq->singlethread; } static const cpumask_t *wq_cpu_map(struct workqueue_struct *wq) { - return is_single_threaded(wq) + return is_wq_single_threaded(wq) ? 
&cpu_singlethread_map : &cpu_populated_map; } static struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu) { - if (unlikely(is_single_threaded(wq))) + if (unlikely(is_wq_single_threaded(wq))) cpu = singlethread_cpu; return per_cpu_ptr(wq->cpu_wq, cpu); } @@ -769,7 +769,7 @@ static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) { struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; struct workqueue_struct *wq = cwq->wq; - const char *fmt = is_single_threaded(wq) ? "%s" : "%s/%d"; + const char *fmt = is_wq_single_threaded(wq) ? "%s" : "%s/%d"; struct task_struct *p; p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu); diff --git a/lib/is_single_threaded.c b/lib/is_single_threaded.c new file mode 100644 index 000000000000..f1ed2fe76c65 --- /dev/null +++ b/lib/is_single_threaded.c @@ -0,0 +1,45 @@ +/* Function to determine if a thread group is single threaded or not + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * - Derived from security/selinux/hooks.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include + +/** + * is_single_threaded - Determine if a thread group is single-threaded or not + * @p: A task in the thread group in question + * + * This returns true if the thread group to which a task belongs is single + * threaded, false if it is not. + */ +bool is_single_threaded(struct task_struct *p) +{ + struct task_struct *g, *t; + struct mm_struct *mm = p->mm; + + if (atomic_read(&p->signal->count) != 1) + goto no; + + if (atomic_read(&p->mm->mm_users) != 1) { + read_lock(&tasklist_lock); + do_each_thread(g, t) { + if (t->mm == mm && t != p) + goto no_unlock; + } while_each_thread(g, t); + read_unlock(&tasklist_lock); + } + + return true; + +no_unlock: + read_unlock(&tasklist_lock); +no: + return false; +} -- cgit v1.2.3 From d84f4f992cbd76e8f39c488cf0c5d123843923b1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:23 +1100 Subject: CRED: Inaugurate COW credentials Inaugurate copy-on-write credentials management. This uses RCU to manage the credentials pointer in the task_struct with respect to accesses by other tasks. A process may only modify its own credentials, and so does not need locking to access or modify its own credentials. A mutex (cred_replace_mutex) is added to the task_struct to control the effect of PTRACE_ATTACHED on credential calculations, particularly with respect to execve(). With this patch, the contents of an active credentials struct may not be changed directly; rather a new set of credentials must be prepared, modified and committed using something like the following sequence of events: struct cred *new = prepare_creds(); int ret = blah(new); if (ret < 0) { abort_creds(new); return ret; } return commit_creds(new); There are some exceptions to this rule: the keyrings pointed to by the active credentials may be instantiated - keyrings violate the COW rule as managing COW keyrings is tricky, given that it is possible for a task to directly alter the keys in a keyring in use by another task. To help enforce this, various pointers to sets of credentials, such as those in the task_struct, are declared const. 
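To illustrate what the const discipline means for a mere consumer of a published pointer, here is a minimal sketch (not code from this series; it uses only helpers declared in the include/linux/cred.h changes below, and example_current_fsuid() is a hypothetical function):

	static uid_t example_current_fsuid(void)
	{
		/* Take a reference on the published, read-only credentials. */
		const struct cred *cred = get_cred(current_cred());
		uid_t fsuid = cred->fsuid;	/* reading through the pointer is fine */
		/* cred->fsuid = 0;		   would not compile: *cred is const */
		put_cred(cred);			/* drop the reference again */
		return fsuid;
	}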
The purpose of declaring them const is compile-time discouragement of altering credentials through those pointers. Once a set of credentials has been made public through one of these pointers, it may not be modified, except under special circumstances: (1) Its reference count may be incremented and decremented. (2) The keyrings to which it points may be modified, but not replaced. The only safe way to modify anything else is to create a replacement and commit using the functions described in Documentation/credentials.txt (which will be added by a later patch). This patch and the preceding patches have been tested with the LTP SELinux testsuite. This patch makes several logical sets of alterations: (1) execve(). This now prepares and commits credentials in various places in the security code rather than altering the current creds directly. (2) Temporary credential overrides. do_coredump() and sys_faccessat() now prepare their own credentials and temporarily override the ones currently on the acting thread, whilst preventing interference from other threads by holding cred_replace_mutex on the thread being dumped. This will be replaced in a future patch by something that hands down the credentials directly to the functions being called, rather than altering the task's objective credentials. (A sketch of this override pattern appears after (5) below.) (3) LSM interface. A number of functions have been changed, added or removed: (*) security_capset_check(), ->capset_check() (*) security_capset_set(), ->capset_set() Removed in favour of security_capset(). (*) security_capset(), ->capset() New. This is passed a pointer to the new creds, a pointer to the old creds and the proposed capability sets. It should fill in the new creds or return an error. All pointers, barring the pointer to the new creds, are now const. (*) security_bprm_apply_creds(), ->bprm_apply_creds() Changed; it now returns a value which, if it is an error, will cause the process to be killed. (*) security_task_alloc(), ->task_alloc_security() Removed in favour of security_prepare_creds(). (*) security_cred_free(), ->cred_free() New. Free security data attached to cred->security. (*) security_prepare_creds(), ->cred_prepare() New. Duplicate any security data attached to cred->security. (*) security_commit_creds(), ->cred_commit() New. Apply any security effects for the upcoming installation of new security by commit_creds(). (*) security_task_post_setuid(), ->task_post_setuid() Removed in favour of security_task_fix_setuid(). (*) security_task_fix_setuid(), ->task_fix_setuid() Fix up the proposed new credentials for setuid(). This is used by cap_task_fix_setuid() to implicitly adjust capabilities in line with setuid() changes. Changes are made to the new credentials, rather than the task itself as in security_task_post_setuid(). (*) security_task_reparent_to_init(), ->task_reparent_to_init() Removed. Instead the task being reparented to init is referred directly to init's credentials. NOTE! This results in the loss of some state: SELinux's osid no longer records the sid of the thread that forked it. (*) security_key_alloc(), ->key_alloc() (*) security_key_permission(), ->key_permission() Changed. These now take cred pointers rather than task pointers to refer to the security context. (4) sys_capset(). This has been simplified and uses less locking. The LSM functions it calls have been merged. (5) reparent_to_kthreadd(). This gives the current thread the same credentials as init by simply using commit_creds() to point that way.
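The override pattern mentioned in (2) then looks roughly like this (a sketch distilled from the sys_faccessat() and do_coredump() changes below; the fsuid tweak merely stands in for whatever temporary change is wanted):

	const struct cred *old;
	struct cred *override = prepare_creds();
	if (!override)
		return -ENOMEM;
	override->fsuid = 0;			/* the temporary modification */
	old = override_creds(override);		/* install the new creds */
	/* ... do the work under the overridden identity ... */
	revert_creds(old);			/* reinstate the old creds */
	put_cred(override);			/* discard our copy */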
(6) __sigqueue_alloc() and switch_uid(). __sigqueue_alloc() can't stop the target task from changing its creds beneath it, so this function gets a reference to the currently applicable user_struct which it then passes into the sigqueue struct it returns if successful. switch_uid() is now called from commit_creds(), and possibly should be folded into that. commit_creds() should take care of protecting __sigqueue_alloc(). (7) [sg]et[ug]id() and co and [sg]et_current_groups. The set functions now all use prepare_creds(), commit_creds() and abort_creds() to build and check a new set of credentials before applying it. security_task_set[ug]id() is called inside the prepared section. This guarantees that nothing else will affect the creds until we've finished. The calling of set_dumpable() has been moved into commit_creds(). Much of the functionality of set_user() has been moved into commit_creds(). The get functions all simply access the data directly. (8) security_task_prctl() and cap_task_prctl(). security_task_prctl() has been modified to return -ENOSYS if it doesn't want to handle a function, or otherwise return the return value directly rather than through an argument. Additionally, cap_task_prctl() now prepares a new set of credentials, even if it doesn't end up using it. (9) Keyrings. A number of changes have been made to the keyrings code: (a) switch_uid_keyring(), copy_keys(), exit_keys() and suid_keys() have all been dropped and built into the credentials functions directly. They may want separating out again later. (b) key_alloc() and search_process_keyrings() now take a cred pointer rather than a task pointer to specify the security context. (c) copy_creds() gives a new thread within the same thread group a new thread keyring if its parent had one, otherwise it discards the thread keyring. (d) The authorisation key now points directly to the credentials to extend the search into rather than pointing to the task that carries them. (e) Installing thread, process or session keyrings causes a new set of credentials to be created, even though it's not strictly necessary for process or session keyrings (they're shared). (10) Usermode helper. The usermode helper code now carries a cred struct pointer in its subprocess_info struct instead of a new session keyring pointer. This set of credentials is derived from init_cred and installed on the new process after it has been cloned. call_usermodehelper_setup() allocates the new credentials and call_usermodehelper_freeinfo() discards them if they haven't been used. A special cred function (prepare_usermodehelper_creds()) is provided specifically for call_usermodehelper_setup() to call. call_usermodehelper_setkeys() adjusts the credentials to sport the supplied keyring as the new session keyring. (11) SELinux. SELinux has a number of changes, in addition to those to support the LSM interface changes mentioned above: (a) selinux_setprocattr() no longer does its check for whether the current ptracer can access processes with the new SID inside the lock that covers getting the ptracer's SID. Whilst this lock ensures that the check is done with the ptracer pinned, the result is only valid until the lock is released, so there's no point doing it inside the lock. (12) is_single_threaded(). This function has been extracted from selinux_setprocattr() and put into a file of its own in the lib/ directory as join_session_keyring() now wants to use it too.
The code in SELinux just checked to see whether a task shared mm_structs with other tasks (CLONE_VM), but that isn't good enough. We really want to know if they're part of the same thread group (CLONE_THREAD). (13) nfsd. The NFS server daemon now has to use the COW credentials to set the credentials it is going to use. It really needs to pass the credentials down to the functions it calls, but it can't do that until other patches in this series have been applied. Signed-off-by: David Howells Acked-by: James Morris Signed-off-by: James Morris --- fs/exec.c | 31 ++- fs/nfsd/auth.c | 92 ++++---- fs/nfsd/nfs4recover.c | 68 +++--- fs/nfsd/nfsfh.c | 11 +- fs/open.c | 31 ++- include/linux/audit.h | 22 +- include/linux/capability.h | 2 - include/linux/cred.h | 44 +++- include/linux/init_task.h | 2 + include/linux/key.h | 22 +- include/linux/sched.h | 6 +- include/linux/security.h | 178 +++++++--------- init/main.c | 1 + kernel/auditsc.c | 42 ++-- kernel/capability.c | 78 +++---- kernel/cred-internals.h | 21 ++ kernel/cred.c | 321 ++++++++++++++++++++++++---- kernel/exit.c | 9 +- kernel/fork.c | 7 +- kernel/kmod.c | 30 ++- kernel/ptrace.c | 9 + kernel/signal.c | 10 +- kernel/sys.c | 450 +++++++++++++++++++++------------------ kernel/user.c | 37 +--- kernel/user_namespace.c | 12 +- lib/Makefile | 2 +- net/rxrpc/ar-key.c | 6 +- security/capability.c | 21 +- security/commoncap.c | 265 +++++++++++------------ security/keys/internal.h | 17 +- security/keys/key.c | 25 +-- security/keys/keyctl.c | 95 ++++++--- security/keys/keyring.c | 14 +- security/keys/permission.c | 24 ++- security/keys/proc.c | 8 +- security/keys/process_keys.c | 333 ++++++++++++++--------------- security/keys/request_key.c | 29 ++- security/keys/request_key_auth.c | 41 ++-- security/security.c | 58 +++-- security/selinux/hooks.c | 286 ++++++++++++------------- security/smack/smack_lsm.c | 82 ++++--- 41 files changed, 1603 insertions(+), 1239 deletions(-) create mode 100644 kernel/cred-internals.h (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index a5330e1a2216..9bd3559ddece 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1007,13 +1007,12 @@ int flush_old_exec(struct linux_binprm * bprm) */ current->mm->task_size = TASK_SIZE; - if (bprm->e_uid != current_euid() || bprm->e_gid != current_egid()) { - suid_keys(current); + if (bprm->e_uid != current_euid() || + bprm->e_gid != current_egid()) { set_dumpable(current->mm, suid_dumpable); current->pdeath_signal = 0; } else if (file_permission(bprm->file, MAY_READ) || (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)) { - suid_keys(current); set_dumpable(current->mm, suid_dumpable); } @@ -1096,10 +1095,8 @@ void compute_creds(struct linux_binprm *bprm) { int unsafe; - if (bprm->e_uid != current_uid()) { - suid_keys(current); + if (bprm->e_uid != current_uid()) current->pdeath_signal = 0; - } exec_keys(current); task_lock(current); @@ -1709,8 +1706,9 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) struct linux_binfmt * binfmt; struct inode * inode; struct file * file; + const struct cred *old_cred; + struct cred *cred; int retval = 0; - int fsuid = current_fsuid(); int flag = 0; int ispipe = 0; unsigned long core_limit = current->signal->rlim[RLIMIT_CORE].rlim_cur; @@ -1723,12 +1721,20 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) binfmt = current->binfmt; if (!binfmt || !binfmt->core_dump) goto fail; + + cred = prepare_creds(); + if (!cred) { + retval = -ENOMEM; + goto fail; + } + down_write(&mm->mmap_sem); /* * If another thread got here 
first, or we are not dumpable, bail out. */ if (mm->core_state || !get_dumpable(mm)) { up_write(&mm->mmap_sem); + put_cred(cred); goto fail; } @@ -1739,12 +1745,16 @@ */ if (get_dumpable(mm) == 2) { /* Setuid core dump mode */ flag = O_EXCL; /* Stop rewrite attacks */ - current->cred->fsuid = 0; /* Dump root private */ + cred->fsuid = 0; /* Dump root private */ } retval = coredump_wait(exit_code, &core_state); - if (retval < 0) + if (retval < 0) { + put_cred(cred); goto fail; + } + + old_cred = override_creds(cred); /* * Clear any false indication of pending signals that might @@ -1835,7 +1845,8 @@ fail_unlock: if (helper_argv) argv_free(helper_argv); - current->cred->fsuid = fsuid; + revert_creds(old_cred); + put_cred(cred); coredump_finish(mm); fail: return retval; diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 808fc03a6fbd..836ffa1047d9 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -27,55 +27,67 @@ int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp) int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) { - struct cred *act_as = current->cred ; - struct svc_cred cred = rqstp->rq_cred; + struct group_info *rqgi; + struct group_info *gi; + struct cred *new; int i; int flags = nfsexp_flags(rqstp, exp); int ret; + new = prepare_creds(); + if (!new) + return -ENOMEM; + + new->fsuid = rqstp->rq_cred.cr_uid; + new->fsgid = rqstp->rq_cred.cr_gid; + + rqgi = rqstp->rq_cred.cr_group_info; + if (flags & NFSEXP_ALLSQUASH) { - cred.cr_uid = exp->ex_anon_uid; - cred.cr_gid = exp->ex_anon_gid; - cred.cr_group_info = groups_alloc(0); + new->fsuid = exp->ex_anon_uid; + new->fsgid = exp->ex_anon_gid; + gi = groups_alloc(0); } else if (flags & NFSEXP_ROOTSQUASH) { - struct group_info *gi; - if (!cred.cr_uid) - cred.cr_uid = exp->ex_anon_uid; - if (!cred.cr_gid) - cred.cr_gid = exp->ex_anon_gid; - gi = groups_alloc(cred.cr_group_info->ngroups); - if (gi) - for (i = 0; i < cred.cr_group_info->ngroups; i++) { - if (!GROUP_AT(cred.cr_group_info, i)) - GROUP_AT(gi, i) = exp->ex_anon_gid; - else - GROUP_AT(gi, i) = GROUP_AT(cred.cr_group_info, i); - } - cred.cr_group_info = gi; - } else - get_group_info(cred.cr_group_info); - - if (cred.cr_uid != (uid_t) -1) - act_as->fsuid = cred.cr_uid; - else - act_as->fsuid = exp->ex_anon_uid; - if (cred.cr_gid != (gid_t) -1) - act_as->fsgid = cred.cr_gid; - else - act_as->fsgid = exp->ex_anon_gid; + if (!new->fsuid) + new->fsuid = exp->ex_anon_uid; + if (!new->fsgid) + new->fsgid = exp->ex_anon_gid; - if (!cred.cr_group_info) - return -ENOMEM; - ret = set_groups(act_as, cred.cr_group_info); - put_group_info(cred.cr_group_info); - if ((cred.cr_uid)) { - act_as->cap_effective = - cap_drop_nfsd_set(act_as->cap_effective); + gi = groups_alloc(rqgi->ngroups); + if (!gi) + goto oom; + + for (i = 0; i < rqgi->ngroups; i++) { + if (!GROUP_AT(rqgi, i)) + GROUP_AT(gi, i) = exp->ex_anon_gid; + else + GROUP_AT(gi, i) = GROUP_AT(rqgi, i); + } } else { - act_as->cap_effective = - cap_raise_nfsd_set(act_as->cap_effective, - act_as->cap_permitted); + gi = get_group_info(rqgi); } + + if (new->fsuid == (uid_t) -1) + new->fsuid = exp->ex_anon_uid; + if (new->fsgid == (gid_t) -1) + new->fsgid = exp->ex_anon_gid; + + ret = set_groups(new, gi); + put_group_info(gi); + if (ret < 0) + goto error; + + if (new->uid) + new->cap_effective = cap_drop_nfsd_set(new->cap_effective); + else + new->cap_effective = cap_raise_nfsd_set(new->cap_effective, + new->cap_permitted); + return commit_creds(new); + +oom: +
ret = -ENOMEM; +error: + abort_creds(new); return ret; } diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 632a50b4b371..9371ea12d7fa 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -54,20 +54,26 @@ static struct path rec_dir; static int rec_dir_init = 0; -static void -nfs4_save_user(uid_t *saveuid, gid_t *savegid) +static int +nfs4_save_creds(const struct cred **original_creds) { - *saveuid = current->cred->fsuid; - *savegid = current->cred->fsgid; - current->cred->fsuid = 0; - current->cred->fsgid = 0; + struct cred *new; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + + new->fsuid = 0; + new->fsgid = 0; + *original_creds = override_creds(new); + put_cred(new); + return 0; } static void -nfs4_reset_user(uid_t saveuid, gid_t savegid) +nfs4_reset_creds(const struct cred *original) { - current->cred->fsuid = saveuid; - current->cred->fsgid = savegid; + revert_creds(original); } static void @@ -129,10 +135,9 @@ nfsd4_sync_rec_dir(void) int nfsd4_create_clid_dir(struct nfs4_client *clp) { + const struct cred *original_cred; char *dname = clp->cl_recdir; struct dentry *dentry; - uid_t uid; - gid_t gid; int status; dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname); @@ -140,7 +145,9 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) if (!rec_dir_init || clp->cl_firststate) return 0; - nfs4_save_user(&uid, &gid); + status = nfs4_save_creds(&original_cred); + if (status < 0) + return status; /* lock the parent */ mutex_lock(&rec_dir.dentry->d_inode->i_mutex); @@ -168,7 +175,7 @@ out_unlock: clp->cl_firststate = 1; nfsd4_sync_rec_dir(); } - nfs4_reset_user(uid, gid); + nfs4_reset_creds(original_cred); dprintk("NFSD: nfsd4_create_clid_dir returns %d\n", status); return status; } @@ -211,20 +218,21 @@ nfsd4_build_dentrylist(void *arg, const char *name, int namlen, static int nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f) { + const struct cred *original_cred; struct file *filp; struct dentry_list_arg dla = { .parent = dir, }; struct list_head *dentries = &dla.dentries; struct dentry_list *child; - uid_t uid; - gid_t gid; int status; if (!rec_dir_init) return 0; - nfs4_save_user(&uid, &gid); + status = nfs4_save_creds(&original_cred); + if (status < 0) + return status; filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY, current_cred()); @@ -250,7 +258,7 @@ out: dput(child->dentry); kfree(child); } - nfs4_reset_user(uid, gid); + nfs4_reset_creds(original_cred); return status; } @@ -312,8 +320,7 @@ out: void nfsd4_remove_clid_dir(struct nfs4_client *clp) { - uid_t uid; - gid_t gid; + const struct cred *original_cred; int status; if (!rec_dir_init || !clp->cl_firststate) @@ -323,9 +330,13 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp) if (status) goto out; clp->cl_firststate = 0; - nfs4_save_user(&uid, &gid); + + status = nfs4_save_creds(&original_cred); + if (status < 0) + goto out; + status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1); - nfs4_reset_user(uid, gid); + nfs4_reset_creds(original_cred); if (status == 0) nfsd4_sync_rec_dir(); mnt_drop_write(rec_dir.mnt); @@ -402,16 +413,21 @@ nfsd4_recdir_load(void) { void nfsd4_init_recdir(char *rec_dirname) { - uid_t uid = 0; - gid_t gid = 0; - int status; + const struct cred *original_cred; + int status; printk("NFSD: Using %s as the NFSv4 state recovery directory\n", rec_dirname); BUG_ON(rec_dir_init); - nfs4_save_user(&uid, &gid); + status = nfs4_save_creds(&original_cred); + if (status < 0) { + printk("NFSD: Unable to change credentials to find recovery" + " directory: 
error %d\n", + status); + return; + } status = kern_path(rec_dirname, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &rec_dir); @@ -421,7 +437,7 @@ nfsd4_init_recdir(char *rec_dirname) if (!status) rec_dir_init = 1; - nfs4_reset_user(uid, gid); + nfs4_reset_creds(original_cred); } void diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index e67cfaea0865..f0da7d9c3a92 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -186,9 +186,14 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) * access control settings being in effect, we cannot * fix that case easily. */ - current->cred->cap_effective = - cap_raise_nfsd_set(current->cred->cap_effective, - current->cred->cap_permitted); + struct cred *new = prepare_creds(); + if (!new) + return nfserrno(-ENOMEM); + new->cap_effective = + cap_raise_nfsd_set(new->cap_effective, + new->cap_permitted); + put_cred(override_creds(new)); + put_cred(new); } else { error = nfsd_setuser_and_check_port(rqstp, exp); if (error) diff --git a/fs/open.c b/fs/open.c index f96eaab280a3..c0a426d5766c 100644 --- a/fs/open.c +++ b/fs/open.c @@ -425,30 +425,33 @@ out: */ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode) { - struct cred *cred = current->cred; + const struct cred *old_cred; + struct cred *override_cred; struct path path; struct inode *inode; - int old_fsuid, old_fsgid; - kernel_cap_t uninitialized_var(old_cap); /* !SECURE_NO_SETUID_FIXUP */ int res; if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ return -EINVAL; - old_fsuid = cred->fsuid; - old_fsgid = cred->fsgid; + override_cred = prepare_creds(); + if (!override_cred) + return -ENOMEM; - cred->fsuid = cred->uid; - cred->fsgid = cred->gid; + override_cred->fsuid = override_cred->uid; + override_cred->fsgid = override_cred->gid; if (!issecure(SECURE_NO_SETUID_FIXUP)) { /* Clear the capabilities if we switch to a non-root user */ - if (current->cred->uid) - old_cap = cap_set_effective(__cap_empty_set); + if (override_cred->uid) + cap_clear(override_cred->cap_effective); else - old_cap = cap_set_effective(cred->cap_permitted); + override_cred->cap_effective = + override_cred->cap_permitted; } + old_cred = override_creds(override_cred); + res = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path); if (res) goto out; @@ -485,12 +488,8 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode) out_path_release: path_put(&path); out: - cred->fsuid = old_fsuid; - cred->fsgid = old_fsgid; - - if (!issecure(SECURE_NO_SETUID_FIXUP)) - cap_set_effective(old_cap); - + revert_creds(old_cred); + put_cred(override_cred); return res; } diff --git a/include/linux/audit.h b/include/linux/audit.h index 6fbebac7b1bf..0b2fcb698a63 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -454,8 +454,10 @@ extern int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_pr extern int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len, unsigned int __user *u_msg_prio, const struct timespec __user *u_abs_timeout); extern int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification); extern int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat); -extern void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_cap_t *pE); -extern int __audit_log_capset(pid_t pid, kernel_cap_t *eff, kernel_cap_t *inh, kernel_cap_t *perm); +extern int __audit_log_bprm_fcaps(struct linux_binprm *bprm, + const struct cred *new, + const struct cred *old); +extern int __audit_log_capset(pid_t pid, const struct 
cred *new, const struct cred *old); static inline int audit_ipc_obj(struct kern_ipc_perm *ipcp) { @@ -522,16 +524,20 @@ static inline int audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) * * -Eric */ -static inline void audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_cap_t *pE) +static inline int audit_log_bprm_fcaps(struct linux_binprm *bprm, + const struct cred *new, + const struct cred *old) { if (unlikely(!audit_dummy_context())) - __audit_log_bprm_fcaps(bprm, pP, pE); + return __audit_log_bprm_fcaps(bprm, new, old); + return 0; } -static inline int audit_log_capset(pid_t pid, kernel_cap_t *eff, kernel_cap_t *inh, kernel_cap_t *perm) +static inline int audit_log_capset(pid_t pid, const struct cred *new, + const struct cred *old) { if (unlikely(!audit_dummy_context())) - return __audit_log_capset(pid, eff, inh, perm); + return __audit_log_capset(pid, new, old); return 0; } @@ -566,8 +572,8 @@ extern int audit_signals; #define audit_mq_timedreceive(d,l,p,t) ({ 0; }) #define audit_mq_notify(d,n) ({ 0; }) #define audit_mq_getsetattr(d,s) ({ 0; }) -#define audit_log_bprm_fcaps(b, p, e) do { ; } while (0) -#define audit_log_capset(pid, e, i, p) ({ 0; }) +#define audit_log_bprm_fcaps(b, ncr, ocr) ({ 0; }) +#define audit_log_capset(pid, ncr, ocr) ({ 0; }) #define audit_ptrace(t) ((void)0) #define audit_n_rules 0 #define audit_signals 0 diff --git a/include/linux/capability.h b/include/linux/capability.h index 7f26580a5a4d..e22f48c2a46f 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -519,8 +519,6 @@ extern const kernel_cap_t __cap_empty_set; extern const kernel_cap_t __cap_full_set; extern const kernel_cap_t __cap_init_eff_set; -kernel_cap_t cap_set_effective(const kernel_cap_t pE_new); - /** * has_capability - Determine if a task has a superior capability available * @t: The task in question diff --git a/include/linux/cred.h b/include/linux/cred.h index 62b9e532422d..eaf6fa695a04 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -84,6 +84,8 @@ struct thread_group_cred { struct key *process_keyring; /* keyring private to this process */ struct rcu_head rcu; /* RCU deletion hook */ }; + +extern void release_tgcred(struct cred *cred); #endif /* @@ -137,11 +139,30 @@ struct cred { struct user_struct *user; /* real user ID subscription */ struct group_info *group_info; /* supplementary groups for euid/fsgid */ struct rcu_head rcu; /* RCU deletion hook */ - spinlock_t lock; /* lock for pointer changes */ }; extern void __put_cred(struct cred *); extern int copy_creds(struct task_struct *, unsigned long); +extern struct cred *prepare_creds(void); +extern struct cred *prepare_usermodehelper_creds(void); +extern int commit_creds(struct cred *); +extern void abort_creds(struct cred *); +extern const struct cred *override_creds(const struct cred *) __deprecated; +extern void revert_creds(const struct cred *) __deprecated; +extern void __init cred_init(void); + +/** + * get_new_cred - Get a reference on a new set of credentials + * @cred: The new credentials to reference + * + * Get a reference on the specified set of new credentials. The caller must + * release the reference. + */ +static inline struct cred *get_new_cred(struct cred *cred) +{ + atomic_inc(&cred->usage); + return cred; +} /** * get_cred - Get a reference on a set of credentials @@ -150,10 +171,9 @@ extern int copy_creds(struct task_struct *, unsigned long); * Get a reference on the specified set of credentials. The caller must * release the reference. 
*/ -static inline struct cred *get_cred(struct cred *cred) +static inline const struct cred *get_cred(const struct cred *cred) { - atomic_inc(&cred->usage); - return cred; + return get_new_cred((struct cred *) cred); } /** @@ -166,6 +186,8 @@ static inline struct cred *get_cred(struct cred *cred) static inline void put_cred(const struct cred *_cred) { struct cred *cred = (struct cred *) _cred; + + BUG_ON(atomic_read(&(cred)->usage) <= 0); if (atomic_dec_and_test(&(cred)->usage)) __put_cred(cred); } @@ -250,13 +272,13 @@ static inline void put_cred(const struct cred *_cred) __groups; \ }) -#define task_cred_xxx(task, xxx) \ -({ \ - __typeof__(task->cred->xxx) ___val; \ - rcu_read_lock(); \ - ___val = __task_cred((task))->xxx; \ - rcu_read_unlock(); \ - ___val; \ +#define task_cred_xxx(task, xxx) \ +({ \ + __typeof__(((struct cred *)NULL)->xxx) ___val; \ + rcu_read_lock(); \ + ___val = __task_cred((task))->xxx; \ + rcu_read_unlock(); \ + ___val; \ }) #define task_uid(task) (task_cred_xxx((task), uid)) diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 5e24c54b6dfd..08c3b24ad9a8 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -150,6 +150,8 @@ extern struct cred init_cred; .sibling = LIST_HEAD_INIT(tsk.sibling), \ .group_leader = &tsk, \ .cred = &init_cred, \ + .cred_exec_mutex = \ + __MUTEX_INITIALIZER(tsk.cred_exec_mutex), \ .comm = "swapper", \ .thread = INIT_THREAD, \ .fs = &init_fs, \ diff --git a/include/linux/key.h b/include/linux/key.h index 0836cc838b0c..69ecf0934b02 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -73,6 +73,7 @@ struct key; struct seq_file; struct user_struct; struct signal_struct; +struct cred; struct key_type; struct key_owner; @@ -181,7 +182,7 @@ struct key { extern struct key *key_alloc(struct key_type *type, const char *desc, uid_t uid, gid_t gid, - struct task_struct *ctx, + const struct cred *cred, key_perm_t perm, unsigned long flags); @@ -249,7 +250,7 @@ extern int key_unlink(struct key *keyring, struct key *key); extern struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid, - struct task_struct *ctx, + const struct cred *cred, unsigned long flags, struct key *dest); @@ -276,22 +277,12 @@ extern ctl_table key_sysctls[]; /* * the userspace interface */ -extern void switch_uid_keyring(struct user_struct *new_user); -extern int copy_keys(unsigned long clone_flags, struct task_struct *tsk); -extern void exit_keys(struct task_struct *tsk); -extern int suid_keys(struct task_struct *tsk); +extern int install_thread_keyring_to_cred(struct cred *cred); extern int exec_keys(struct task_struct *tsk); extern void key_fsuid_changed(struct task_struct *tsk); extern void key_fsgid_changed(struct task_struct *tsk); extern void key_init(void); -#define __install_session_keyring(keyring) \ -({ \ - struct key *old_session = current->cred->tgcred->session_keyring; \ - current->cred->tgcred->session_keyring = keyring; \ - old_session; \ -}) - #else /* CONFIG_KEYS */ #define key_validate(k) 0 @@ -303,11 +294,6 @@ extern void key_init(void); #define make_key_ref(k, p) NULL #define key_ref_to_ptr(k) NULL #define is_key_possessed(k) 0 -#define switch_uid_keyring(u) do { } while(0) -#define __install_session_keyring(k) ({ NULL; }) -#define copy_keys(f,t) 0 -#define exit_keys(t) do { } while(0) -#define suid_keys(t) do { } while(0) #define exec_keys(t) do { } while(0) #define key_fsuid_changed(t) do { } while(0) #define key_fsgid_changed(t) do { } while(0) diff --git a/include/linux/sched.h 
b/include/linux/sched.h index 2913252989b3..121d655e460d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1145,7 +1145,8 @@ struct task_struct { struct list_head cpu_timers[3]; /* process credentials */ - struct cred *cred; /* actual/objective task credentials */ + const struct cred *cred; /* actual/objective task credentials (COW) */ + struct mutex cred_exec_mutex; /* execve vs ptrace cred calculation mutex */ char comm[TASK_COMM_LEN]; /* executable name excluding path - access with [gs]et_task_comm (which lock @@ -1720,7 +1721,6 @@ static inline struct user_struct *get_uid(struct user_struct *u) return u; } extern void free_uid(struct user_struct *); -extern void switch_uid(struct user_struct *); extern void release_uids(struct user_namespace *ns); #include @@ -1870,6 +1870,8 @@ static inline unsigned long wait_task_inactive(struct task_struct *p, #define for_each_process(p) \ for (p = &init_task ; (p = next_task(p)) != &init_task ; ) +extern bool is_single_threaded(struct task_struct *); + /* * Careful: do_each_thread/while_each_thread is a double loop so * 'break' will not work as expected - use goto instead. diff --git a/include/linux/security.h b/include/linux/security.h index 7e9fe046a0d1..68be11251447 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -53,24 +53,21 @@ extern int cap_settime(struct timespec *ts, struct timezone *tz); extern int cap_ptrace_may_access(struct task_struct *child, unsigned int mode); extern int cap_ptrace_traceme(struct task_struct *parent); extern int cap_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); -extern int cap_capset_check(const kernel_cap_t *effective, - const kernel_cap_t *inheritable, - const kernel_cap_t *permitted); -extern void cap_capset_set(const kernel_cap_t *effective, - const kernel_cap_t *inheritable, - const kernel_cap_t *permitted); +extern int cap_capset(struct cred *new, const struct cred *old, + const kernel_cap_t *effective, + const kernel_cap_t *inheritable, + const kernel_cap_t *permitted); extern int cap_bprm_set_security(struct linux_binprm *bprm); -extern void cap_bprm_apply_creds(struct linux_binprm *bprm, int unsafe); +extern int cap_bprm_apply_creds(struct linux_binprm *bprm, int unsafe); extern int cap_bprm_secureexec(struct linux_binprm *bprm); extern int cap_inode_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); extern int cap_inode_removexattr(struct dentry *dentry, const char *name); extern int cap_inode_need_killpriv(struct dentry *dentry); extern int cap_inode_killpriv(struct dentry *dentry); -extern int cap_task_post_setuid(uid_t old_ruid, uid_t old_euid, uid_t old_suid, int flags); -extern void cap_task_reparent_to_init(struct task_struct *p); +extern int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags); extern int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, - unsigned long arg4, unsigned long arg5, long *rc_p); + unsigned long arg4, unsigned long arg5); extern int cap_task_setscheduler(struct task_struct *p, int policy, struct sched_param *lp); extern int cap_task_setioprio(struct task_struct *p, int ioprio); extern int cap_task_setnice(struct task_struct *p, int nice); @@ -170,8 +167,8 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * Compute and set the security attributes of a process being transformed * by an execve operation based on the old attributes (current->security) * and the 
information saved in @bprm->security by the set_security hook. - * Since this hook function (and its caller) are void, this hook can not - * return an error. However, it can leave the security attributes of the + * Since this function may return an error, in which case the process will + * be killed. However, it can leave the security attributes of the * process unchanged if an access failure occurs at this point. * bprm_apply_creds is called under task_lock. @unsafe indicates various * reasons why it may be unsafe to change security state. @@ -593,15 +590,18 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * manual page for definitions of the @clone_flags. * @clone_flags contains the flags indicating what should be shared. * Return 0 if permission is granted. - * @cred_alloc_security: - * @cred contains the cred struct for child process. - * Allocate and attach a security structure to the cred->security field. - * The security field is initialized to NULL when the task structure is - * allocated. - * Return 0 if operation was successful. * @cred_free: * @cred points to the credentials. * Deallocate and clear the cred->security field in a set of credentials. + * @cred_prepare: + * @new points to the new credentials. + * @old points to the original credentials. + * @gfp indicates the atomicity of any memory allocations. + * Prepare a new set of credentials by copying the data from the old set. + * @cred_commit: + * @new points to the new credentials. + * @old points to the original credentials. + * Install a new set of credentials. * @task_setuid: * Check permission before setting one or more of the user identity * attributes of the current process. The @flags parameter indicates @@ -614,15 +614,13 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @id2 contains a uid. * @flags contains one of the LSM_SETID_* values. * Return 0 if permission is granted. - * @task_post_setuid: + * @task_fix_setuid: * Update the module's state after setting one or more of the user * identity attributes of the current process. The @flags parameter * indicates which of the set*uid system calls invoked this hook. If - * @flags is LSM_SETID_FS, then @old_ruid is the old fs uid and the other - * parameters are not used. - * @old_ruid contains the old real uid (or fs uid if LSM_SETID_FS). - * @old_euid contains the old effective uid (or -1 if LSM_SETID_FS). - * @old_suid contains the old saved uid (or -1 if LSM_SETID_FS). + * @new is the set of credentials that will be installed. Modifications + * should be made to this rather than to @current->cred. + * @old is the set of credentials that are being replaces * @flags contains one of the LSM_SETID_* values. * Return 0 on success. * @task_setgid: @@ -725,13 +723,8 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @arg3 contains a argument. * @arg4 contains a argument. * @arg5 contains a argument. - * @rc_p contains a pointer to communicate back the forced return code - * Return 0 if permission is granted, and non-zero if the security module - * has taken responsibility (setting *rc_p) for the prctl call. - * @task_reparent_to_init: - * Set the security attributes in @p->security for a kernel thread that - * is being reparented to the init task. - * @p contains the task_struct for the kernel thread. + * Return -ENOSYS if no-one wanted to handle this op, any other value to + * cause prctl() to return immediately with that value. 
* @task_to_inode: * Set the security attributes for an inode based on an associated task's * security attributes, e.g. for /proc/pid inodes. @@ -1008,7 +1001,7 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * See whether a specific operational right is granted to a process on a * key. * @key_ref refers to the key (key pointer + possession attribute bit). - * @context points to the process to provide the context against which to + * @cred points to the credentials to provide the context against which to * evaluate the security data on the key. * @perm describes the combination of permissions required of this key. * Return 1 if permission granted, 0 if permission denied and -ve it the @@ -1170,6 +1163,7 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @child process. * Security modules may also want to perform a process tracing check * during an execve in the set_security or apply_creds hooks of + * tracing check during an execve in the bprm_set_creds hook of * binprm_security_ops if the process is being traced and its security * attributes would be changed by the execve. * @child contains the task_struct structure for the target process. @@ -1193,19 +1187,15 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @inheritable contains the inheritable capability set. * @permitted contains the permitted capability set. * Return 0 if the capability sets were successfully obtained. - * @capset_check: - * Check permission before setting the @effective, @inheritable, and - * @permitted capability sets for the current process. - * @effective contains the effective capability set. - * @inheritable contains the inheritable capability set. - * @permitted contains the permitted capability set. - * Return 0 if permission is granted. - * @capset_set: + * @capset: * Set the @effective, @inheritable, and @permitted capability sets for * the current process. + * @new contains the new credentials structure for target process. + * @old contains the current credentials structure for target process. * @effective contains the effective capability set. * @inheritable contains the inheritable capability set. * @permitted contains the permitted capability set. + * Return 0 and update @new if permission is granted. * @capable: * Check whether the @tsk process has the @cap capability. * @tsk contains the task_struct for the process. 
@@ -1297,12 +1287,11 @@ struct security_operations { int (*capget) (struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); - int (*capset_check) (const kernel_cap_t *effective, - const kernel_cap_t *inheritable, - const kernel_cap_t *permitted); - void (*capset_set) (const kernel_cap_t *effective, - const kernel_cap_t *inheritable, - const kernel_cap_t *permitted); + int (*capset) (struct cred *new, + const struct cred *old, + const kernel_cap_t *effective, + const kernel_cap_t *inheritable, + const kernel_cap_t *permitted); int (*capable) (struct task_struct *tsk, int cap, int audit); int (*acct) (struct file *file); int (*sysctl) (struct ctl_table *table, int op); @@ -1314,7 +1303,7 @@ struct security_operations { int (*bprm_alloc_security) (struct linux_binprm *bprm); void (*bprm_free_security) (struct linux_binprm *bprm); - void (*bprm_apply_creds) (struct linux_binprm *bprm, int unsafe); + int (*bprm_apply_creds) (struct linux_binprm *bprm, int unsafe); void (*bprm_post_apply_creds) (struct linux_binprm *bprm); int (*bprm_set_security) (struct linux_binprm *bprm); int (*bprm_check_security) (struct linux_binprm *bprm); @@ -1405,11 +1394,13 @@ struct security_operations { int (*dentry_open) (struct file *file, const struct cred *cred); int (*task_create) (unsigned long clone_flags); - int (*cred_alloc_security) (struct cred *cred); void (*cred_free) (struct cred *cred); + int (*cred_prepare)(struct cred *new, const struct cred *old, + gfp_t gfp); + void (*cred_commit)(struct cred *new, const struct cred *old); int (*task_setuid) (uid_t id0, uid_t id1, uid_t id2, int flags); - int (*task_post_setuid) (uid_t old_ruid /* or fsuid */ , - uid_t old_euid, uid_t old_suid, int flags); + int (*task_fix_setuid) (struct cred *new, const struct cred *old, + int flags); int (*task_setgid) (gid_t id0, gid_t id1, gid_t id2, int flags); int (*task_setpgid) (struct task_struct *p, pid_t pgid); int (*task_getpgid) (struct task_struct *p); @@ -1429,8 +1420,7 @@ struct security_operations { int (*task_wait) (struct task_struct *p); int (*task_prctl) (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, - unsigned long arg5, long *rc_p); - void (*task_reparent_to_init) (struct task_struct *p); + unsigned long arg5); void (*task_to_inode) (struct task_struct *p, struct inode *inode); int (*ipc_permission) (struct kern_ipc_perm *ipcp, short flag); @@ -1535,10 +1525,10 @@ struct security_operations { /* key management security hooks */ #ifdef CONFIG_KEYS - int (*key_alloc) (struct key *key, struct task_struct *tsk, unsigned long flags); + int (*key_alloc) (struct key *key, const struct cred *cred, unsigned long flags); void (*key_free) (struct key *key); int (*key_permission) (key_ref_t key_ref, - struct task_struct *context, + const struct cred *cred, key_perm_t perm); int (*key_getsecurity)(struct key *key, char **_buffer); #endif /* CONFIG_KEYS */ @@ -1564,12 +1554,10 @@ int security_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); -int security_capset_check(const kernel_cap_t *effective, - const kernel_cap_t *inheritable, - const kernel_cap_t *permitted); -void security_capset_set(const kernel_cap_t *effective, - const kernel_cap_t *inheritable, - const kernel_cap_t *permitted); +int security_capset(struct cred *new, const struct cred *old, + const kernel_cap_t *effective, + const kernel_cap_t *inheritable, + const kernel_cap_t *permitted); int security_capable(struct 
task_struct *tsk, int cap); int security_capable_noaudit(struct task_struct *tsk, int cap); int security_acct(struct file *file); @@ -1583,7 +1571,7 @@ int security_vm_enough_memory_mm(struct mm_struct *mm, long pages); int security_vm_enough_memory_kern(long pages); int security_bprm_alloc(struct linux_binprm *bprm); void security_bprm_free(struct linux_binprm *bprm); -void security_bprm_apply_creds(struct linux_binprm *bprm, int unsafe); +int security_bprm_apply_creds(struct linux_binprm *bprm, int unsafe); void security_bprm_post_apply_creds(struct linux_binprm *bprm); int security_bprm_set(struct linux_binprm *bprm); int security_bprm_check(struct linux_binprm *bprm); @@ -1660,11 +1648,12 @@ int security_file_send_sigiotask(struct task_struct *tsk, int security_file_receive(struct file *file); int security_dentry_open(struct file *file, const struct cred *cred); int security_task_create(unsigned long clone_flags); -int security_cred_alloc(struct cred *cred); void security_cred_free(struct cred *cred); +int security_prepare_creds(struct cred *new, const struct cred *old, gfp_t gfp); +void security_commit_creds(struct cred *new, const struct cred *old); int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags); -int security_task_post_setuid(uid_t old_ruid, uid_t old_euid, - uid_t old_suid, int flags); +int security_task_fix_setuid(struct cred *new, const struct cred *old, + int flags); int security_task_setgid(gid_t id0, gid_t id1, gid_t id2, int flags); int security_task_setpgid(struct task_struct *p, pid_t pgid); int security_task_getpgid(struct task_struct *p); @@ -1683,8 +1672,7 @@ int security_task_kill(struct task_struct *p, struct siginfo *info, int sig, u32 secid); int security_task_wait(struct task_struct *p); int security_task_prctl(int option, unsigned long arg2, unsigned long arg3, - unsigned long arg4, unsigned long arg5, long *rc_p); -void security_task_reparent_to_init(struct task_struct *p); + unsigned long arg4, unsigned long arg5); void security_task_to_inode(struct task_struct *p, struct inode *inode); int security_ipc_permission(struct kern_ipc_perm *ipcp, short flag); void security_ipc_getsecid(struct kern_ipc_perm *ipcp, u32 *secid); @@ -1759,18 +1747,13 @@ static inline int security_capget(struct task_struct *target, return cap_capget(target, effective, inheritable, permitted); } -static inline int security_capset_check(const kernel_cap_t *effective, - const kernel_cap_t *inheritable, - const kernel_cap_t *permitted) +static inline int security_capset(struct cred *new, + const struct cred *old, + const kernel_cap_t *effective, + const kernel_cap_t *inheritable, + const kernel_cap_t *permitted) { - return cap_capset_check(effective, inheritable, permitted); -} - -static inline void security_capset_set(const kernel_cap_t *effective, - const kernel_cap_t *inheritable, - const kernel_cap_t *permitted) -{ - cap_capset_set(effective, inheritable, permitted); + return cap_capset(new, old, effective, inheritable, permitted); } static inline int security_capable(struct task_struct *tsk, int cap) @@ -1837,9 +1820,9 @@ static inline int security_bprm_alloc(struct linux_binprm *bprm) static inline void security_bprm_free(struct linux_binprm *bprm) { } -static inline void security_bprm_apply_creds(struct linux_binprm *bprm, int unsafe) +static inline int security_bprm_apply_creds(struct linux_binprm *bprm, int unsafe) { - cap_bprm_apply_creds(bprm, unsafe); + return cap_bprm_apply_creds(bprm, unsafe); } static inline void security_bprm_post_apply_creds(struct 
linux_binprm *bprm) @@ -2182,13 +2165,20 @@ static inline int security_task_create(unsigned long clone_flags) return 0; } -static inline int security_cred_alloc(struct cred *cred) +static inline void security_cred_free(struct cred *cred) +{ } + +static inline int security_prepare_creds(struct cred *new, + const struct cred *old, + gfp_t gfp) { return 0; } -static inline void security_cred_free(struct cred *cred) -{ } +static inline void security_commit_creds(struct cred *new, + const struct cred *old) +{ +} static inline int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags) @@ -2196,10 +2186,11 @@ static inline int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, return 0; } -static inline int security_task_post_setuid(uid_t old_ruid, uid_t old_euid, - uid_t old_suid, int flags) +static inline int security_task_fix_setuid(struct cred *new, + const struct cred *old, + int flags) { - return cap_task_post_setuid(old_ruid, old_euid, old_suid, flags); + return cap_task_fix_setuid(new, old, flags); } static inline int security_task_setgid(gid_t id0, gid_t id1, gid_t id2, @@ -2286,14 +2277,9 @@ static inline int security_task_wait(struct task_struct *p) static inline int security_task_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, - unsigned long arg5, long *rc_p) -{ - return cap_task_prctl(option, arg2, arg3, arg3, arg5, rc_p); -} - -static inline void security_task_reparent_to_init(struct task_struct *p) + unsigned long arg5) { - cap_task_reparent_to_init(p); + return cap_task_prctl(option, arg2, arg3, arg4, arg5); } static inline void security_task_to_inode(struct task_struct *p, struct inode *inode) @@ -2719,16 +2705,16 @@ static inline void security_skb_classify_flow(struct sk_buff *skb, struct flowi #ifdef CONFIG_KEYS #ifdef CONFIG_SECURITY -int security_key_alloc(struct key *key, struct task_struct *tsk, unsigned long flags); +int security_key_alloc(struct key *key, const struct cred *cred, unsigned long flags); void security_key_free(struct key *key); int security_key_permission(key_ref_t key_ref, - struct task_struct *context, key_perm_t perm); + const struct cred *cred, key_perm_t perm); int security_key_getsecurity(struct key *key, char **_buffer); #else static inline int security_key_alloc(struct key *key, - struct task_struct *tsk, + const struct cred *cred, unsigned long flags) { return 0; @@ -2739,7 +2725,7 @@ static inline void security_key_free(struct key *key) } static inline int security_key_permission(key_ref_t key_ref, - struct task_struct *context, + const struct cred *cred, key_perm_t perm) { return 0; diff --git a/init/main.c b/init/main.c index 7e117a231af1..db843bff5732 100644 --- a/init/main.c +++ b/init/main.c @@ -669,6 +669,7 @@ asmlinkage void __init start_kernel(void) efi_enter_virtual_mode(); #endif thread_info_cache_init(); + cred_init(); fork_init(num_physpages); proc_caches_init(); buffer_init(); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index ae8ef88ade3f..bc1e2d854bf6 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2546,18 +2546,17 @@ int __audit_signal_info(int sig, struct task_struct *t) /** * __audit_log_bprm_fcaps - store information about a loading bprm and relevant fcaps - * @bprm pointer to the bprm being processed - * @caps the caps read from the disk + * @bprm: pointer to the bprm being processed + * @new: the proposed new credentials + * @old: the old credentials * * Simply check if the proc already has the caps given by the file and if not * store the priv escalation info for later 
auditing at the end of the syscall * - * this can fail and we don't care. See the note in audit.h for - * audit_log_bprm_fcaps() for my explaination.... - * * -Eric */ -void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_cap_t *pE) +int __audit_log_bprm_fcaps(struct linux_binprm *bprm, + const struct cred *new, const struct cred *old) { struct audit_aux_data_bprm_fcaps *ax; struct audit_context *context = current->audit_context; @@ -2566,7 +2565,7 @@ void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_ ax = kmalloc(sizeof(*ax), GFP_KERNEL); if (!ax) - return; + return -ENOMEM; ax->d.type = AUDIT_BPRM_FCAPS; ax->d.next = context->aux; @@ -2581,26 +2580,27 @@ void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_ ax->fcap.fE = !!(vcaps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE); ax->fcap_ver = (vcaps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT; - ax->old_pcap.permitted = *pP; - ax->old_pcap.inheritable = current->cred->cap_inheritable; - ax->old_pcap.effective = *pE; + ax->old_pcap.permitted = old->cap_permitted; + ax->old_pcap.inheritable = old->cap_inheritable; + ax->old_pcap.effective = old->cap_effective; - ax->new_pcap.permitted = current->cred->cap_permitted; - ax->new_pcap.inheritable = current->cred->cap_inheritable; - ax->new_pcap.effective = current->cred->cap_effective; + ax->new_pcap.permitted = new->cap_permitted; + ax->new_pcap.inheritable = new->cap_inheritable; + ax->new_pcap.effective = new->cap_effective; + return 0; } /** * __audit_log_capset - store information about the arguments to the capset syscall - * @pid target pid of the capset call - * @eff effective cap set - * @inh inheritible cap set - * @perm permited cap set + * @pid: target pid of the capset call + * @new: the new credentials + * @old: the old (current) credentials * * Record the aguments userspace sent to sys_capset for later printing by the * audit system if applicable */ -int __audit_log_capset(pid_t pid, kernel_cap_t *eff, kernel_cap_t *inh, kernel_cap_t *perm) +int __audit_log_capset(pid_t pid, + const struct cred *new, const struct cred *old) { struct audit_aux_data_capset *ax; struct audit_context *context = current->audit_context; @@ -2617,9 +2617,9 @@ int __audit_log_capset(pid_t pid, kernel_cap_t *eff, kernel_cap_t *inh, kernel_c context->aux = (void *)ax; ax->pid = pid; - ax->cap.effective = *eff; - ax->cap.inheritable = *eff; - ax->cap.permitted = *perm; + ax->cap.effective = new->cap_effective; + ax->cap.inheritable = new->cap_inheritable; + ax->cap.permitted = new->cap_permitted; return 0; } diff --git a/kernel/capability.c b/kernel/capability.c index a404b980b1bd..36b4b4daebec 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -15,12 +15,7 @@ #include #include #include - -/* - * This lock protects task->cap_* for all tasks including current. - * Locking rule: acquire this prior to tasklist_lock. - */ -static DEFINE_SPINLOCK(task_capability_lock); +#include "cred-internals.h" /* * Leveraged for setting/resetting capabilities @@ -128,12 +123,11 @@ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy) } /* - * If we have configured with filesystem capability support, then the - * only thing that can change the capabilities of the current process - * is the current process. As such, we can't be in this code at the - * same time as we are in the process of setting capabilities in this - * process. The net result is that we can limit our use of locks to - * when we are reading the caps of another process. + * The only thing that can change the capabilities of the current + * process is the current process. As such, we can't be in this code + * at the same time as we are in the process of setting capabilities + * in this process. The net result is that we can limit our use of + * locks to when we are reading the caps of another process. */ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, kernel_cap_t *pIp, kernel_cap_t *pPp) @@ -143,7 +137,6 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, if (pid && (pid != task_pid_vnr(current))) { struct task_struct *target; - spin_lock(&task_capability_lock); read_lock(&tasklist_lock); target = find_task_by_vpid(pid); @@ -153,34 +146,12 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, ret = security_capget(target, pEp, pIp, pPp); read_unlock(&tasklist_lock); - spin_unlock(&task_capability_lock); } else ret = security_capget(current, pEp, pIp, pPp); return ret; } -/* - * Atomically modify the effective capabilities returning the original - * value. No permission check is performed here - it is assumed that the - * caller is permitted to set the desired effective capabilities. - */ -kernel_cap_t cap_set_effective(const kernel_cap_t pE_new) -{ - kernel_cap_t pE_old; - - spin_lock(&task_capability_lock); - - pE_old = current->cred->cap_effective; - current->cred->cap_effective = pE_new; - - spin_unlock(&task_capability_lock); - - return pE_old; -} - -EXPORT_SYMBOL(cap_set_effective); - /** * sys_capget - get the capabilities of a given process. * @header: pointer to struct that contains capability version and @@ -208,7 +179,6 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr) return -EINVAL; ret = cap_get_target_pid(pid, &pE, &pI, &pP); - if (!ret) { struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; unsigned i; @@ -270,6 +240,7 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; unsigned i, tocopy; kernel_cap_t inheritable, permitted, effective; + struct cred *new; int ret; pid_t pid; @@ -284,8 +255,8 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) if (pid != 0 && pid != task_pid_vnr(current)) return -EPERM; - if (copy_from_user(&kdata, data, tocopy - * sizeof(struct __user_cap_data_struct))) + if (copy_from_user(&kdata, data, + tocopy * sizeof(struct __user_cap_data_struct))) return -EFAULT; for (i = 0; i < tocopy; i++) { @@ -300,24 +271,23 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) i++; } - ret = audit_log_capset(pid, &effective, &inheritable, &permitted); - if (ret) - return ret; + new = prepare_creds(); + if (!new) + return -ENOMEM; + + ret = security_capset(new, current_cred(), + &effective, &inheritable, &permitted); + if (ret < 0) + goto error; + + ret = audit_log_capset(pid, new, current_cred()); + if (ret < 0) + goto error; - /* This lock is required even when filesystem capability support is - * configured - it protects the sys_capget() call from returning - * incorrect data in the case that the targeted process is not the - * current one. - */ - spin_lock(&task_capability_lock); - - ret = security_capset_check(&effective, &inheritable, &permitted); - /* Having verified that the proposed changes are legal, we now put them - * into effect. 
- */ - if (!ret) - security_capset_set(&effective, &inheritable, &permitted); - spin_unlock(&task_capability_lock); + return commit_creds(new); + +error: + abort_creds(new); return ret; } diff --git a/kernel/cred-internals.h b/kernel/cred-internals.h new file mode 100644 index 000000000000..2dc4fc2d0bf1 --- /dev/null +++ b/kernel/cred-internals.h @@ -0,0 +1,21 @@ +/* Internal credentials stuff + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +/* + * user.c + */ +static inline void sched_switch_user(struct task_struct *p) +{ +#ifdef CONFIG_USER_SCHED + sched_move_task(p); +#endif /* CONFIG_USER_SCHED */ +} + diff --git a/kernel/cred.c b/kernel/cred.c index ac73e3617684..cb6b5eda978d 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -15,6 +15,10 @@ #include #include #include +#include +#include "cred-internals.h" + +static struct kmem_cache *cred_jar; /* * The common credentials for the initial task's thread group @@ -64,7 +68,7 @@ static void release_tgcred_rcu(struct rcu_head *rcu) /* * Release a set of thread group credentials. */ -static void release_tgcred(struct cred *cred) +void release_tgcred(struct cred *cred) { #ifdef CONFIG_KEYS struct thread_group_cred *tgcred = cred->tgcred; @@ -81,79 +85,322 @@ static void put_cred_rcu(struct rcu_head *rcu) { struct cred *cred = container_of(rcu, struct cred, rcu); - BUG_ON(atomic_read(&cred->usage) != 0); + if (atomic_read(&cred->usage) != 0) + panic("CRED: put_cred_rcu() sees %p with usage %d\n", + cred, atomic_read(&cred->usage)); + security_cred_free(cred); key_put(cred->thread_keyring); key_put(cred->request_key_auth); release_tgcred(cred); put_group_info(cred->group_info); free_uid(cred->user); - security_cred_free(cred); - kfree(cred); + kmem_cache_free(cred_jar, cred); } /** * __put_cred - Destroy a set of credentials - * @sec: The record to release + * @cred: The record to release * * Destroy a set of credentials on which no references remain. */ void __put_cred(struct cred *cred) { + BUG_ON(atomic_read(&cred->usage) != 0); + call_rcu(&cred->rcu, put_cred_rcu); } EXPORT_SYMBOL(__put_cred); +/** + * prepare_creds - Prepare a new set of credentials for modification + * + * Prepare a new set of task credentials for modification. A task's creds + * shouldn't generally be modified directly, therefore this function is used to + * prepare a new copy, which the caller then modifies and then commits by + * calling commit_creds(). + * + * Returns a pointer to the new creds-to-be if successful, NULL otherwise. + * + * Call commit_creds() or abort_creds() to clean up. 
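+ * + * A typical sequence, modelled on the sys_setuid() rework later in this + * patch (error handling abbreviated): new = prepare_creds(); if (!new) + * return -ENOMEM; then modify the copy (e.g. new->fsuid = new->euid = uid) + * and finish with return commit_creds(new), or abort_creds(new) on failure.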
+ */ +struct cred *prepare_creds(void) +{ + struct task_struct *task = current; + const struct cred *old; + struct cred *new; + + BUG_ON(atomic_read(&task->cred->usage) < 1); + + new = kmem_cache_alloc(cred_jar, GFP_KERNEL); + if (!new) + return NULL; + + old = task->cred; + memcpy(new, old, sizeof(struct cred)); + + atomic_set(&new->usage, 1); + get_group_info(new->group_info); + get_uid(new->user); + +#ifdef CONFIG_KEYS + key_get(new->thread_keyring); + key_get(new->request_key_auth); + atomic_inc(&new->tgcred->usage); +#endif + +#ifdef CONFIG_SECURITY + new->security = NULL; +#endif + + if (security_prepare_creds(new, old, GFP_KERNEL) < 0) + goto error; + return new; + +error: + abort_creds(new); + return NULL; +} +EXPORT_SYMBOL(prepare_creds); + +/* + * prepare new credentials for the usermode helper dispatcher + */ +struct cred *prepare_usermodehelper_creds(void) +{ +#ifdef CONFIG_KEYS + struct thread_group_cred *tgcred = NULL; +#endif + struct cred *new; + +#ifdef CONFIG_KEYS + tgcred = kzalloc(sizeof(*new->tgcred), GFP_ATOMIC); + if (!tgcred) + return NULL; +#endif + + new = kmem_cache_alloc(cred_jar, GFP_ATOMIC); + if (!new) { +#ifdef CONFIG_KEYS + kfree(tgcred); +#endif + return NULL; + } + + memcpy(new, &init_cred, sizeof(struct cred)); + + atomic_set(&new->usage, 1); + get_group_info(new->group_info); + get_uid(new->user); + +#ifdef CONFIG_KEYS + new->thread_keyring = NULL; + new->request_key_auth = NULL; + new->jit_keyring = KEY_REQKEY_DEFL_DEFAULT; + + atomic_set(&tgcred->usage, 1); + spin_lock_init(&tgcred->lock); + new->tgcred = tgcred; +#endif + +#ifdef CONFIG_SECURITY + new->security = NULL; +#endif + if (security_prepare_creds(new, &init_cred, GFP_ATOMIC) < 0) + goto error; + + BUG_ON(atomic_read(&new->usage) != 1); + return new; + +error: + put_cred(new); + return NULL; +} + /* * Copy credentials for the new process created by fork() + * + * We share if we can, but under some circumstances we have to generate a new + * set. 
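+ * + * A new copy is only made when it is unavoidable: a CLONE_THREAD child that + * does not need its own thread keyring just takes an extra reference on the + * parent's credentials; anything else gets a private copy via prepare_creds().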
*/ int copy_creds(struct task_struct *p, unsigned long clone_flags) { - struct cred *pcred; - int ret; +#ifdef CONFIG_KEYS + struct thread_group_cred *tgcred; +#endif + struct cred *new; + + mutex_init(&p->cred_exec_mutex); - pcred = kmemdup(p->cred, sizeof(*p->cred), GFP_KERNEL); - if (!pcred) + if ( +#ifdef CONFIG_KEYS + !p->cred->thread_keyring && +#endif + clone_flags & CLONE_THREAD + ) { + get_cred(p->cred); + atomic_inc(&p->cred->user->processes); + return 0; + } + + new = prepare_creds(); + if (!new) return -ENOMEM; #ifdef CONFIG_KEYS - if (clone_flags & CLONE_THREAD) { - atomic_inc(&pcred->tgcred->usage); - } else { - pcred->tgcred = kmalloc(sizeof(struct cred), GFP_KERNEL); - if (!pcred->tgcred) { - kfree(pcred); + /* new threads get their own thread keyrings if their parent already + * had one */ + if (new->thread_keyring) { + key_put(new->thread_keyring); + new->thread_keyring = NULL; + if (clone_flags & CLONE_THREAD) + install_thread_keyring_to_cred(new); + } + + /* we share the process and session keyrings between all the threads in + * a process - this is slightly icky as we violate COW credentials a + * bit */ + if (!(clone_flags & CLONE_THREAD)) { + tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); + if (!tgcred) { + put_cred(new); return -ENOMEM; } - atomic_set(&pcred->tgcred->usage, 1); - spin_lock_init(&pcred->tgcred->lock); - pcred->tgcred->process_keyring = NULL; - pcred->tgcred->session_keyring = - key_get(p->cred->tgcred->session_keyring); + atomic_set(&tgcred->usage, 1); + spin_lock_init(&tgcred->lock); + tgcred->process_keyring = NULL; + tgcred->session_keyring = key_get(new->tgcred->session_keyring); + + release_tgcred(new); + new->tgcred = tgcred; } #endif -#ifdef CONFIG_SECURITY - pcred->security = NULL; -#endif + atomic_inc(&new->user->processes); + p->cred = new; + return 0; +} - ret = security_cred_alloc(pcred); - if (ret < 0) { - release_tgcred(pcred); - kfree(pcred); - return ret; +/** + * commit_creds - Install new credentials upon the current task + * @new: The credentials to be assigned + * + * Install a new set of credentials to the current task, using RCU to replace + * the old set. + * + * This function eats the caller's reference to the new credentials. + * + * Always returns 0 thus allowing this function to be tail-called at the end + * of, say, sys_setgid(). + */ +int commit_creds(struct cred *new) +{ + struct task_struct *task = current; + const struct cred *old; + + BUG_ON(atomic_read(&new->usage) < 1); + BUG_ON(atomic_read(&task->cred->usage) < 1); + + old = task->cred; + security_commit_creds(new, old); + + /* dumpability changes */ + if (old->euid != new->euid || + old->egid != new->egid || + old->fsuid != new->fsuid || + old->fsgid != new->fsgid || + !cap_issubset(new->cap_permitted, old->cap_permitted)) { + set_dumpable(task->mm, suid_dumpable); + task->pdeath_signal = 0; + smp_wmb(); } - atomic_set(&pcred->usage, 1); - get_group_info(pcred->group_info); - get_uid(pcred->user); - key_get(pcred->thread_keyring); - key_get(pcred->request_key_auth); + /* alter the thread keyring */ + if (new->fsuid != old->fsuid) + key_fsuid_changed(task); + if (new->fsgid != old->fsgid) + key_fsgid_changed(task); + + /* do it + * - What if a process setreuid()'s and this brings the + * new uid over his NPROC rlimit? We can check this now + * cheaply with the new uid cache, so if it matters + * we should be checking for it. 
-DaveM + */ + if (new->user != old->user) + atomic_inc(&new->user->processes); + rcu_assign_pointer(task->cred, new); + if (new->user != old->user) + atomic_dec(&old->user->processes); + + sched_switch_user(task); + + /* send notifications */ + if (new->uid != old->uid || + new->euid != old->euid || + new->suid != old->suid || + new->fsuid != old->fsuid) + proc_id_connector(task, PROC_EVENT_UID); - atomic_inc(&pcred->user->processes); + if (new->gid != old->gid || + new->egid != old->egid || + new->sgid != old->sgid || + new->fsgid != old->fsgid) + proc_id_connector(task, PROC_EVENT_GID); - /* RCU assignment is unneeded here as no-one can have accessed this - * pointer yet, barring us */ - p->cred = pcred; + put_cred(old); return 0; } +EXPORT_SYMBOL(commit_creds); + +/** + * abort_creds - Discard a set of credentials + * @new: The credentials that were going to be applied + * + * Discard a set of credentials that were under construction, releasing the + * caller's reference to them. + */ +void abort_creds(struct cred *new) +{ + BUG_ON(atomic_read(&new->usage) < 1); + put_cred(new); +} +EXPORT_SYMBOL(abort_creds); + +/** + * override_creds - Temporarily override the current process's credentials + * @new: The credentials to be assigned + * + * Install a set of temporary override credentials on the current process, + * returning the old set for later reversion. + */ +const struct cred *override_creds(const struct cred *new) +{ + const struct cred *old = current->cred; + + rcu_assign_pointer(current->cred, get_cred(new)); + return old; +} +EXPORT_SYMBOL(override_creds); + +/** + * revert_creds - Revert a temporary credentials override + * @old: The credentials to be restored + * + * Revert a temporary set of override credentials to an old set, discarding the + * override set. + */ +void revert_creds(const struct cred *old) +{ + const struct cred *override = current->cred; + + rcu_assign_pointer(current->cred, old); + put_cred(override); +} +EXPORT_SYMBOL(revert_creds); + +/* + * initialise the credentials stuff + */ +void __init cred_init(void) +{ + /* allocate a slab in which we can store credentials */ + cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred), + 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); +} diff --git a/kernel/exit.c b/kernel/exit.c index bbc22530f2c1..c0711da15486 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -47,12 +47,14 @@ #include #include #include +#include #include #include #include #include #include +#include "cred-internals.h" static void exit_mm(struct task_struct * tsk); @@ -338,12 +340,12 @@ static void reparent_to_kthreadd(void) /* cpus_allowed? */ /* rt_priority? */ /* signals? 
*/ - security_task_reparent_to_init(current); memcpy(current->signal->rlim, init_task.signal->rlim, sizeof(current->signal->rlim)); - atomic_inc(&(INIT_USER->__count)); + + atomic_inc(&init_cred.usage); + commit_creds(&init_cred); write_unlock_irq(&tasklist_lock); - switch_uid(INIT_USER); } void __set_special_pids(struct pid *pid) @@ -1085,7 +1087,6 @@ NORET_TYPE void do_exit(long code) check_stack_usage(); exit_thread(); cgroup_exit(tsk, 1); - exit_keys(tsk); if (group_dead && tsk->signal->leader) disassociate_ctty(1); diff --git a/kernel/fork.c b/kernel/fork.c index ded1972672a3..82a7948a664e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1084,10 +1084,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_sighand; if ((retval = copy_mm(clone_flags, p))) goto bad_fork_cleanup_signal; - if ((retval = copy_keys(clone_flags, p))) - goto bad_fork_cleanup_mm; if ((retval = copy_namespaces(clone_flags, p))) - goto bad_fork_cleanup_keys; + goto bad_fork_cleanup_mm; if ((retval = copy_io(clone_flags, p))) goto bad_fork_cleanup_namespaces; retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); @@ -1252,8 +1250,6 @@ bad_fork_cleanup_io: put_io_context(p->io_context); bad_fork_cleanup_namespaces: exit_task_namespaces(p); -bad_fork_cleanup_keys: - exit_keys(p); bad_fork_cleanup_mm: if (p->mm) mmput(p->mm); @@ -1281,6 +1277,7 @@ bad_fork_cleanup_cgroup: bad_fork_cleanup_put_domain: module_put(task_thread_info(p)->exec_domain->module); bad_fork_cleanup_count: + atomic_dec(&p->cred->user->processes); put_cred(p->cred); bad_fork_free: free_task(p); diff --git a/kernel/kmod.c b/kernel/kmod.c index f044f8f57703..b46dbb908669 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -118,10 +118,10 @@ EXPORT_SYMBOL(request_module); struct subprocess_info { struct work_struct work; struct completion *complete; + struct cred *cred; char *path; char **argv; char **envp; - struct key *ring; enum umh_wait wait; int retval; struct file *stdin; @@ -134,19 +134,20 @@ struct subprocess_info { static int ____call_usermodehelper(void *data) { struct subprocess_info *sub_info = data; - struct key *new_session, *old_session; int retval; - /* Unblock all signals and set the session keyring. */ - new_session = key_get(sub_info->ring); + BUG_ON(atomic_read(&sub_info->cred->usage) != 1); + + /* Unblock all signals */ spin_lock_irq(¤t->sighand->siglock); - old_session = __install_session_keyring(new_session); flush_signal_handlers(current, 1); sigemptyset(¤t->blocked); recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); - key_put(old_session); + /* Install the credentials */ + commit_creds(sub_info->cred); + sub_info->cred = NULL; /* Install input pipe when needed */ if (sub_info->stdin) { @@ -185,6 +186,8 @@ void call_usermodehelper_freeinfo(struct subprocess_info *info) { if (info->cleanup) (*info->cleanup)(info->argv, info->envp); + if (info->cred) + put_cred(info->cred); kfree(info); } EXPORT_SYMBOL(call_usermodehelper_freeinfo); @@ -240,6 +243,8 @@ static void __call_usermodehelper(struct work_struct *work) pid_t pid; enum umh_wait wait = sub_info->wait; + BUG_ON(atomic_read(&sub_info->cred->usage) != 1); + /* CLONE_VFORK: wait until the usermode helper has execve'd * successfully We need the data structures to stay around * until that is done. 
*/ @@ -362,6 +367,11 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, sub_info->path = path; sub_info->argv = argv; sub_info->envp = envp; + sub_info->cred = prepare_usermodehelper_creds(); + if (!sub_info->cred) { + kfree(sub_info); + return NULL; + } out: return sub_info; @@ -376,7 +384,13 @@ EXPORT_SYMBOL(call_usermodehelper_setup); void call_usermodehelper_setkeys(struct subprocess_info *info, struct key *session_keyring) { - info->ring = session_keyring; +#ifdef CONFIG_KEYS + struct thread_group_cred *tgcred = info->cred->tgcred; + key_put(tgcred->session_keyring); + tgcred->session_keyring = key_get(session_keyring); +#else + BUG(); +#endif } EXPORT_SYMBOL(call_usermodehelper_setkeys); @@ -444,6 +458,8 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, DECLARE_COMPLETION_ONSTACK(done); int retval = 0; + BUG_ON(atomic_read(&sub_info->cred->usage) != 1); + helper_lock(); if (sub_info->path[0] == '\0') goto out; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index b9d5f4e4f6a4..f764b8806955 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -171,6 +171,14 @@ int ptrace_attach(struct task_struct *task) if (same_thread_group(task, current)) goto out; + /* Protect exec's credential calculations against our interference; + * SUID, SGID and LSM creds get determined differently under ptrace. + */ + retval = mutex_lock_interruptible(&current->cred_exec_mutex); + if (retval < 0) + goto out; + + retval = -EPERM; repeat: /* * Nasty, nasty. @@ -210,6 +218,7 @@ repeat: bad: write_unlock_irqrestore(&tasklist_lock, flags); task_unlock(task); + mutex_unlock(&current->cred_exec_mutex); out: return retval; } diff --git a/kernel/signal.c b/kernel/signal.c index 84989124bafb..2a64304ed54b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -180,7 +180,7 @@ int next_signal(struct sigpending *pending, sigset_t *mask) /* * allocate a new signal queue record * - this may be called without locks if and only if t == current, otherwise an - * appopriate lock must be held to protect t's user_struct + * appropriate lock must be held to stop the target task from exiting */ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, int override_rlimit) @@ -194,7 +194,7 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, * caller must be holding the RCU readlock (by way of a spinlock) and * we use RCU protection here */ - user = __task_cred(t)->user; + user = get_uid(__task_cred(t)->user); atomic_inc(&user->sigpending); if (override_rlimit || atomic_read(&user->sigpending) <= @@ -202,12 +202,14 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, q = kmem_cache_alloc(sigqueue_cachep, flags); if (unlikely(q == NULL)) { atomic_dec(&user->sigpending); + free_uid(user); } else { INIT_LIST_HEAD(&q->list); q->flags = 0; - q->user = get_uid(user); + q->user = user; } - return(q); + + return q; } static void __sigqueue_free(struct sigqueue *q) diff --git a/kernel/sys.c b/kernel/sys.c index ccc9eb736d35..ab735040468a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -180,7 +180,7 @@ asmlinkage long sys_setpriority(int which, int who, int niceval) } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case PRIO_USER: - user = cred->user; + user = (struct user_struct *) cred->user; if (!who) who = cred->uid; else if ((who != cred->uid) && @@ -479,47 +479,48 @@ void ctrl_alt_del(void) */ asmlinkage long sys_setregid(gid_t rgid, gid_t egid) { - struct cred *cred = current->cred; - int old_rgid = cred->gid; - int old_egid = cred->egid; - int new_rgid 
= old_rgid; - int new_egid = old_egid; + const struct cred *old; + struct cred *new; int retval; + new = prepare_creds(); + if (!new) + return -ENOMEM; + old = current_cred(); + retval = security_task_setgid(rgid, egid, (gid_t)-1, LSM_SETID_RE); if (retval) - return retval; + goto error; + retval = -EPERM; if (rgid != (gid_t) -1) { - if ((old_rgid == rgid) || - (cred->egid == rgid) || + if (old->gid == rgid || + old->egid == rgid || capable(CAP_SETGID)) - new_rgid = rgid; + new->gid = rgid; else - return -EPERM; + goto error; } if (egid != (gid_t) -1) { - if ((old_rgid == egid) || - (cred->egid == egid) || - (cred->sgid == egid) || + if (old->gid == egid || + old->egid == egid || + old->sgid == egid || capable(CAP_SETGID)) - new_egid = egid; + new->egid = egid; else - return -EPERM; - } - if (new_egid != old_egid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); + goto error; } + if (rgid != (gid_t) -1 || - (egid != (gid_t) -1 && egid != old_rgid)) - cred->sgid = new_egid; - cred->fsgid = new_egid; - cred->egid = new_egid; - cred->gid = new_rgid; - key_fsgid_changed(current); - proc_id_connector(current, PROC_EVENT_GID); - return 0; + (egid != (gid_t) -1 && egid != old->gid)) + new->sgid = new->egid; + new->fsgid = new->egid; + + return commit_creds(new); + +error: + abort_creds(new); + return retval; } /* @@ -529,40 +530,42 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid) */ asmlinkage long sys_setgid(gid_t gid) { - struct cred *cred = current->cred; - int old_egid = cred->egid; + const struct cred *old; + struct cred *new; int retval; + new = prepare_creds(); + if (!new) + return -ENOMEM; + old = current_cred(); + retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID); if (retval) - return retval; + goto error; - if (capable(CAP_SETGID)) { - if (old_egid != gid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); - } - cred->gid = cred->egid = cred->sgid = cred->fsgid = gid; - } else if ((gid == cred->gid) || (gid == cred->sgid)) { - if (old_egid != gid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); - } - cred->egid = cred->fsgid = gid; - } + retval = -EPERM; + if (capable(CAP_SETGID)) + new->gid = new->egid = new->sgid = new->fsgid = gid; + else if (gid == old->gid || gid == old->sgid) + new->egid = new->fsgid = gid; else - return -EPERM; + goto error; - key_fsgid_changed(current); - proc_id_connector(current, PROC_EVENT_GID); - return 0; + return commit_creds(new); + +error: + abort_creds(new); + return retval; } -static int set_user(uid_t new_ruid, int dumpclear) +/* + * change the user struct in a credentials set to match the new UID + */ +static int set_user(struct cred *new) { struct user_struct *new_user; - new_user = alloc_uid(current->nsproxy->user_ns, new_ruid); + new_user = alloc_uid(current->nsproxy->user_ns, new->uid); if (!new_user) return -EAGAIN; @@ -573,13 +576,8 @@ static int set_user(uid_t new_ruid, int dumpclear) return -EAGAIN; } - switch_uid(new_user); - - if (dumpclear) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); - } - current->cred->uid = new_ruid; + free_uid(new->user); + new->user = new_user; return 0; } @@ -600,55 +598,56 @@ static int set_user(uid_t new_ruid, int dumpclear) */ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) { - struct cred *cred = current->cred; - int old_ruid, old_euid, old_suid, new_ruid, new_euid; + const struct cred *old; + struct cred *new; int retval; + new = prepare_creds(); + if (!new) + return -ENOMEM; + old = current_cred(); + retval = security_task_setuid(ruid, 
euid, (uid_t)-1, LSM_SETID_RE); if (retval) - return retval; - - new_ruid = old_ruid = cred->uid; - new_euid = old_euid = cred->euid; - old_suid = cred->suid; + goto error; + retval = -EPERM; if (ruid != (uid_t) -1) { - new_ruid = ruid; - if ((old_ruid != ruid) && - (cred->euid != ruid) && + new->uid = ruid; + if (old->uid != ruid && + old->euid != ruid && !capable(CAP_SETUID)) - return -EPERM; + goto error; } if (euid != (uid_t) -1) { - new_euid = euid; - if ((old_ruid != euid) && - (cred->euid != euid) && - (cred->suid != euid) && + new->euid = euid; + if (old->uid != euid && + old->euid != euid && + old->suid != euid && !capable(CAP_SETUID)) - return -EPERM; + goto error; } - if (new_ruid != old_ruid && set_user(new_ruid, new_euid != old_euid) < 0) - return -EAGAIN; + retval = -EAGAIN; + if (new->uid != old->uid && set_user(new) < 0) + goto error; - if (new_euid != old_euid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); - } - cred->fsuid = cred->euid = new_euid; if (ruid != (uid_t) -1 || - (euid != (uid_t) -1 && euid != old_ruid)) - cred->suid = cred->euid; - cred->fsuid = cred->euid; - - key_fsuid_changed(current); - proc_id_connector(current, PROC_EVENT_UID); + (euid != (uid_t) -1 && euid != old->uid)) + new->suid = new->euid; + new->fsuid = new->euid; - return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RE); -} + retval = security_task_fix_setuid(new, old, LSM_SETID_RE); + if (retval < 0) + goto error; + return commit_creds(new); +error: + abort_creds(new); + return retval; +} /* * setuid() is implemented like SysV with SAVED_IDS @@ -663,37 +662,41 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) */ asmlinkage long sys_setuid(uid_t uid) { - struct cred *cred = current->cred; - int old_euid = cred->euid; - int old_ruid, old_suid, new_suid; + const struct cred *old; + struct cred *new; int retval; + new = prepare_creds(); + if (!new) + return -ENOMEM; + old = current_cred(); + retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID); if (retval) - return retval; + goto error; - old_ruid = cred->uid; - old_suid = cred->suid; - new_suid = old_suid; - + retval = -EPERM; if (capable(CAP_SETUID)) { - if (uid != old_ruid && set_user(uid, old_euid != uid) < 0) - return -EAGAIN; - new_suid = uid; - } else if ((uid != cred->uid) && (uid != new_suid)) - return -EPERM; - - if (old_euid != uid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); + new->suid = new->uid = uid; + if (uid != old->uid && set_user(new) < 0) { + retval = -EAGAIN; + goto error; + } + } else if (uid != old->uid && uid != new->suid) { + goto error; } - cred->fsuid = cred->euid = uid; - cred->suid = new_suid; - key_fsuid_changed(current); - proc_id_connector(current, PROC_EVENT_UID); + new->fsuid = new->euid = uid; + + retval = security_task_fix_setuid(new, old, LSM_SETID_ID); + if (retval < 0) + goto error; + + return commit_creds(new); - return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_ID); +error: + abort_creds(new); + return retval; } @@ -703,47 +706,53 @@ asmlinkage long sys_setuid(uid_t uid) */ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) { - struct cred *cred = current->cred; - int old_ruid = cred->uid; - int old_euid = cred->euid; - int old_suid = cred->suid; + const struct cred *old; + struct cred *new; int retval; + new = prepare_creds(); + if (!new) + return -ENOMEM; + retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES); if (retval) - return retval; + goto error; + old = current_cred(); + retval = 
-EPERM; if (!capable(CAP_SETUID)) { - if ((ruid != (uid_t) -1) && (ruid != cred->uid) && - (ruid != cred->euid) && (ruid != cred->suid)) - return -EPERM; - if ((euid != (uid_t) -1) && (euid != cred->uid) && - (euid != cred->euid) && (euid != cred->suid)) - return -EPERM; - if ((suid != (uid_t) -1) && (suid != cred->uid) && - (suid != cred->euid) && (suid != cred->suid)) - return -EPERM; + if (ruid != (uid_t) -1 && ruid != old->uid && + ruid != old->euid && ruid != old->suid) + goto error; + if (euid != (uid_t) -1 && euid != old->uid && + euid != old->euid && euid != old->suid) + goto error; + if (suid != (uid_t) -1 && suid != old->uid && + suid != old->euid && suid != old->suid) + goto error; } + + retval = -EAGAIN; if (ruid != (uid_t) -1) { - if (ruid != cred->uid && - set_user(ruid, euid != cred->euid) < 0) - return -EAGAIN; + new->uid = ruid; + if (ruid != old->uid && set_user(new) < 0) + goto error; } - if (euid != (uid_t) -1) { - if (euid != cred->euid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); - } - cred->euid = euid; - } - cred->fsuid = cred->euid; + if (euid != (uid_t) -1) + new->euid = euid; if (suid != (uid_t) -1) - cred->suid = suid; + new->suid = suid; + new->fsuid = new->euid; - key_fsuid_changed(current); - proc_id_connector(current, PROC_EVENT_UID); + retval = security_task_fix_setuid(new, old, LSM_SETID_RES); + if (retval < 0) + goto error; - return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RES); + return commit_creds(new); + +error: + abort_creds(new); + return retval; } asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid) @@ -763,40 +772,45 @@ asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __us */ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) { - struct cred *cred = current->cred; + const struct cred *old; + struct cred *new; int retval; + new = prepare_creds(); + if (!new) + return -ENOMEM; + old = current_cred(); + retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES); if (retval) - return retval; + goto error; + retval = -EPERM; if (!capable(CAP_SETGID)) { - if ((rgid != (gid_t) -1) && (rgid != cred->gid) && - (rgid != cred->egid) && (rgid != cred->sgid)) - return -EPERM; - if ((egid != (gid_t) -1) && (egid != cred->gid) && - (egid != cred->egid) && (egid != cred->sgid)) - return -EPERM; - if ((sgid != (gid_t) -1) && (sgid != cred->gid) && - (sgid != cred->egid) && (sgid != cred->sgid)) - return -EPERM; + if (rgid != (gid_t) -1 && rgid != old->gid && + rgid != old->egid && rgid != old->sgid) + goto error; + if (egid != (gid_t) -1 && egid != old->gid && + egid != old->egid && egid != old->sgid) + goto error; + if (sgid != (gid_t) -1 && sgid != old->gid && + sgid != old->egid && sgid != old->sgid) + goto error; } - if (egid != (gid_t) -1) { - if (egid != cred->egid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); - } - cred->egid = egid; - } - cred->fsgid = cred->egid; + if (rgid != (gid_t) -1) - cred->gid = rgid; + new->gid = rgid; + if (egid != (gid_t) -1) + new->egid = egid; if (sgid != (gid_t) -1) - cred->sgid = sgid; + new->sgid = sgid; + new->fsgid = new->egid; - key_fsgid_changed(current); - proc_id_connector(current, PROC_EVENT_GID); - return 0; + return commit_creds(new); + +error: + abort_creds(new); + return retval; } asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid) @@ -820,28 +834,35 @@ asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __us */ 
asmlinkage long sys_setfsuid(uid_t uid) { - struct cred *cred = current->cred; - int old_fsuid; + const struct cred *old; + struct cred *new; + uid_t old_fsuid; + + new = prepare_creds(); + if (!new) + return current_fsuid(); + old = current_cred(); + old_fsuid = old->fsuid; - old_fsuid = cred->fsuid; - if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS)) - return old_fsuid; + if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS) < 0) + goto error; - if (uid == cred->uid || uid == cred->euid || - uid == cred->suid || uid == cred->fsuid || + if (uid == old->uid || uid == old->euid || + uid == old->suid || uid == old->fsuid || capable(CAP_SETUID)) { if (uid != old_fsuid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); + new->fsuid = uid; + if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) + goto change_okay; } - cred->fsuid = uid; } - key_fsuid_changed(current); - proc_id_connector(current, PROC_EVENT_UID); - - security_task_post_setuid(old_fsuid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS); +error: + abort_creds(new); + return old_fsuid; +change_okay: + commit_creds(new); return old_fsuid; } @@ -850,24 +871,34 @@ asmlinkage long sys_setfsuid(uid_t uid) */ asmlinkage long sys_setfsgid(gid_t gid) { - struct cred *cred = current->cred; - int old_fsgid; + const struct cred *old; + struct cred *new; + gid_t old_fsgid; + + new = prepare_creds(); + if (!new) + return current_fsgid(); + old = current_cred(); + old_fsgid = old->fsgid; - old_fsgid = cred->fsgid; if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS)) - return old_fsgid; + goto error; - if (gid == cred->gid || gid == cred->egid || - gid == cred->sgid || gid == cred->fsgid || + if (gid == old->gid || gid == old->egid || + gid == old->sgid || gid == old->fsgid || capable(CAP_SETGID)) { if (gid != old_fsgid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); + new->fsgid = gid; + goto change_okay; } - cred->fsgid = gid; - key_fsgid_changed(current); - proc_id_connector(current, PROC_EVENT_GID); } + +error: + abort_creds(new); + return old_fsgid; + +change_okay: + commit_creds(new); return old_fsgid; } @@ -1136,7 +1167,7 @@ EXPORT_SYMBOL(groups_free); /* export the group_info to a user-space array */ static int groups_to_user(gid_t __user *grouplist, - struct group_info *group_info) + const struct group_info *group_info) { int i; unsigned int count = group_info->ngroups; @@ -1227,31 +1258,25 @@ int groups_search(const struct group_info *group_info, gid_t grp) } /** - * set_groups - Change a group subscription in a security record - * @sec: The security record to alter - * @group_info: The group list to impose + * set_groups - Change a group subscription in a set of credentials + * @new: The newly prepared set of credentials to alter + * @group_info: The group list to install * - * Validate a group subscription and, if valid, impose it upon a task security - * record. + * Validate a group subscription and, if valid, insert it into a set + * of credentials. 
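+ * + * @new is assumed to be an as-yet-unpublished set obtained from + * prepare_creds(); see set_current_groups() below for the usual + * prepare/commit calling sequence around this function.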
*/ -int set_groups(struct cred *cred, struct group_info *group_info) +int set_groups(struct cred *new, struct group_info *group_info) { int retval; - struct group_info *old_info; retval = security_task_setgroups(group_info); if (retval) return retval; + put_group_info(new->group_info); groups_sort(group_info); get_group_info(group_info); - - spin_lock(&cred->lock); - old_info = cred->group_info; - cred->group_info = group_info; - spin_unlock(&cred->lock); - - put_group_info(old_info); + new->group_info = group_info; return 0; } @@ -1266,7 +1291,20 @@ EXPORT_SYMBOL(set_groups); */ int set_current_groups(struct group_info *group_info) { - return set_groups(current->cred, group_info); + struct cred *new; + int ret; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + + ret = set_groups(new, group_info); + if (ret < 0) { + abort_creds(new); + return ret; + } + + return commit_creds(new); } EXPORT_SYMBOL(set_current_groups); @@ -1666,9 +1704,11 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned char comm[sizeof(me->comm)]; long error; - if (security_task_prctl(option, arg2, arg3, arg4, arg5, &error)) + error = security_task_prctl(option, arg2, arg3, arg4, arg5); + if (error != -ENOSYS) return error; + error = 0; switch (option) { case PR_SET_PDEATHSIG: if (!valid_signal(arg2)) { diff --git a/kernel/user.c b/kernel/user.c index 104d22ac84d5..d476307dd4b0 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -16,6 +16,7 @@ #include #include #include +#include "cred-internals.h" struct user_namespace init_user_ns = { .kref = { @@ -104,16 +105,10 @@ static int sched_create_user(struct user_struct *up) return rc; } -static void sched_switch_user(struct task_struct *p) -{ - sched_move_task(p); -} - #else /* CONFIG_USER_SCHED */ static void sched_destroy_user(struct user_struct *up) { } static int sched_create_user(struct user_struct *up) { return 0; } -static void sched_switch_user(struct task_struct *p) { } #endif /* CONFIG_USER_SCHED */ @@ -448,36 +443,6 @@ out_unlock: return NULL; } -void switch_uid(struct user_struct *new_user) -{ - struct user_struct *old_user; - - /* What if a process setreuid()'s and this brings the - * new uid over his NPROC rlimit? We can check this now - * cheaply with the new uid cache, so if it matters - * we should be checking for it. -DaveM - */ - old_user = current->cred->user; - atomic_inc(&new_user->processes); - atomic_dec(&old_user->processes); - switch_uid_keyring(new_user); - current->cred->user = new_user; - sched_switch_user(current); - - /* - * We need to synchronize with __sigqueue_alloc() - * doing a get_uid(p->user).. If that saw the old - * user value, we need to wait until it has exited - * its critical region before we can free the old - * structure. 
- */ - smp_mb(); - spin_unlock_wait(&current->sighand->siglock); - - free_uid(old_user); - suid_keys(current); -} - #ifdef CONFIG_USER_NS void release_uids(struct user_namespace *ns) { diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index f82730adea00..0d9c51d67333 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -19,6 +19,7 @@ static struct user_namespace *clone_user_ns(struct user_namespace *old_ns) { struct user_namespace *ns; struct user_struct *new_user; + struct cred *new; int n; ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL); @@ -45,7 +46,17 @@ static struct user_namespace *clone_user_ns(struct user_namespace *old_ns) return ERR_PTR(-ENOMEM); } - switch_uid(new_user); + /* Install the new user */ + new = prepare_creds(); + if (!new) { + free_uid(new_user); + free_uid(ns->root_user); + kfree(ns); + return ERR_PTR(-ENOMEM); + } + free_uid(new->user); + new->user = new_user; + commit_creds(new); return ns; } diff --git a/lib/Makefile b/lib/Makefile index 7cb65d85aeb0..80fe8a3ec12a 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -11,7 +11,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ rbtree.o radix-tree.o dump_stack.o \ idr.o int_sqrt.o extable.o prio_tree.o \ sha1.o irq_regs.o reciprocal_div.o argv_split.o \ - proportions.o prio_heap.o ratelimit.o show_mem.o + proportions.o prio_heap.o ratelimit.o show_mem.o is_single_threaded.o lib-$(CONFIG_MMU) += ioremap.o lib-$(CONFIG_SMP) += cpumask.o diff --git a/net/rxrpc/ar-key.c b/net/rxrpc/ar-key.c index 9a8ff684da79..ad8c7a782da1 100644 --- a/net/rxrpc/ar-key.c +++ b/net/rxrpc/ar-key.c @@ -287,6 +287,7 @@ int rxrpc_get_server_data_key(struct rxrpc_connection *conn, time_t expiry, u32 kvno) { + const struct cred *cred = current_cred(); struct key *key; int ret; @@ -297,7 +298,7 @@ int rxrpc_get_server_data_key(struct rxrpc_connection *conn, _enter(""); - key = key_alloc(&key_type_rxrpc, "x", 0, 0, current, 0, + key = key_alloc(&key_type_rxrpc, "x", 0, 0, cred, 0, KEY_ALLOC_NOT_IN_QUOTA); if (IS_ERR(key)) { _leave(" = -ENOMEM [alloc %ld]", PTR_ERR(key)); @@ -340,10 +341,11 @@ EXPORT_SYMBOL(rxrpc_get_server_data_key); */ struct key *rxrpc_get_null_key(const char *keyname) { + const struct cred *cred = current_cred(); struct key *key; int ret; - key = key_alloc(&key_type_rxrpc, keyname, 0, 0, current, + key = key_alloc(&key_type_rxrpc, keyname, 0, 0, cred, KEY_POS_SEARCH, KEY_ALLOC_NOT_IN_QUOTA); if (IS_ERR(key)) return key; diff --git a/security/capability.c b/security/capability.c index fac2f61b69a9..efeb6d9e0e6a 100644 --- a/security/capability.c +++ b/security/capability.c @@ -340,12 +340,16 @@ static int cap_task_create(unsigned long clone_flags) return 0; } -static int cap_cred_alloc_security(struct cred *cred) +static void cap_cred_free(struct cred *cred) +{ +} + +static int cap_cred_prepare(struct cred *new, const struct cred *old, gfp_t gfp) { return 0; } -static void cap_cred_free(struct cred *cred) +static void cap_cred_commit(struct cred *new, const struct cred *old) { } @@ -750,7 +754,7 @@ static void cap_release_secctx(char *secdata, u32 seclen) } #ifdef CONFIG_KEYS -static int cap_key_alloc(struct key *key, struct task_struct *ctx, +static int cap_key_alloc(struct key *key, const struct cred *cred, unsigned long flags) { return 0; @@ -760,7 +764,7 @@ static void cap_key_free(struct key *key) { } -static int cap_key_permission(key_ref_t key_ref, struct task_struct *context, +static int cap_key_permission(key_ref_t key_ref, const struct cred *cred, key_perm_t perm) { return 0; @@ -814,8 +818,7 @@ void 
security_fixup_ops(struct security_operations *ops) set_to_cap_if_null(ops, ptrace_may_access); set_to_cap_if_null(ops, ptrace_traceme); set_to_cap_if_null(ops, capget); - set_to_cap_if_null(ops, capset_check); - set_to_cap_if_null(ops, capset_set); + set_to_cap_if_null(ops, capset); set_to_cap_if_null(ops, acct); set_to_cap_if_null(ops, capable); set_to_cap_if_null(ops, quotactl); @@ -890,10 +893,11 @@ void security_fixup_ops(struct security_operations *ops) set_to_cap_if_null(ops, file_receive); set_to_cap_if_null(ops, dentry_open); set_to_cap_if_null(ops, task_create); - set_to_cap_if_null(ops, cred_alloc_security); set_to_cap_if_null(ops, cred_free); + set_to_cap_if_null(ops, cred_prepare); + set_to_cap_if_null(ops, cred_commit); set_to_cap_if_null(ops, task_setuid); - set_to_cap_if_null(ops, task_post_setuid); + set_to_cap_if_null(ops, task_fix_setuid); set_to_cap_if_null(ops, task_setgid); set_to_cap_if_null(ops, task_setpgid); set_to_cap_if_null(ops, task_getpgid); @@ -910,7 +914,6 @@ void security_fixup_ops(struct security_operations *ops) set_to_cap_if_null(ops, task_wait); set_to_cap_if_null(ops, task_kill); set_to_cap_if_null(ops, task_prctl); - set_to_cap_if_null(ops, task_reparent_to_init); set_to_cap_if_null(ops, task_to_inode); set_to_cap_if_null(ops, ipc_permission); set_to_cap_if_null(ops, ipc_getsecid); diff --git a/security/commoncap.c b/security/commoncap.c index 0384bf95db68..b5419273f92d 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -72,8 +72,8 @@ int cap_ptrace_may_access(struct task_struct *child, unsigned int mode) int ret = 0; rcu_read_lock(); - if (!cap_issubset(child->cred->cap_permitted, - current->cred->cap_permitted) && + if (!cap_issubset(__task_cred(child)->cap_permitted, + current_cred()->cap_permitted) && !capable(CAP_SYS_PTRACE)) ret = -EPERM; rcu_read_unlock(); @@ -85,8 +85,8 @@ int cap_ptrace_traceme(struct task_struct *parent) int ret = 0; rcu_read_lock(); - if (!cap_issubset(current->cred->cap_permitted, - parent->cred->cap_permitted) && + if (!cap_issubset(current_cred()->cap_permitted, + __task_cred(parent)->cap_permitted) && !has_capability(parent, CAP_SYS_PTRACE)) ret = -EPERM; rcu_read_unlock(); @@ -117,7 +117,7 @@ static inline int cap_inh_is_capped(void) * to the old permitted set. That is, if the current task * does *not* possess the CAP_SETPCAP capability. 
*/ - return (cap_capable(current, CAP_SETPCAP, SECURITY_CAP_AUDIT) != 0); + return cap_capable(current, CAP_SETPCAP, SECURITY_CAP_AUDIT) != 0; } static inline int cap_limit_ptraced_target(void) { return 1; } @@ -132,52 +132,39 @@ static inline int cap_limit_ptraced_target(void) #endif /* def CONFIG_SECURITY_FILE_CAPABILITIES */ -int cap_capset_check(const kernel_cap_t *effective, - const kernel_cap_t *inheritable, - const kernel_cap_t *permitted) +int cap_capset(struct cred *new, + const struct cred *old, + const kernel_cap_t *effective, + const kernel_cap_t *inheritable, + const kernel_cap_t *permitted) { - const struct cred *cred = current->cred; - - if (cap_inh_is_capped() - && !cap_issubset(*inheritable, - cap_combine(cred->cap_inheritable, - cred->cap_permitted))) { + if (cap_inh_is_capped() && + !cap_issubset(*inheritable, + cap_combine(old->cap_inheritable, + old->cap_permitted))) /* incapable of using this inheritable set */ return -EPERM; - } + if (!cap_issubset(*inheritable, - cap_combine(cred->cap_inheritable, - cred->cap_bset))) { + cap_combine(old->cap_inheritable, + old->cap_bset))) /* no new pI capabilities outside bounding set */ return -EPERM; - } /* verify restrictions on target's new Permitted set */ - if (!cap_issubset (*permitted, - cap_combine (cred->cap_permitted, - cred->cap_permitted))) { + if (!cap_issubset(*permitted, old->cap_permitted)) return -EPERM; - } /* verify the _new_Effective_ is a subset of the _new_Permitted_ */ - if (!cap_issubset (*effective, *permitted)) { + if (!cap_issubset(*effective, *permitted)) return -EPERM; - } + new->cap_effective = *effective; + new->cap_inheritable = *inheritable; + new->cap_permitted = *permitted; return 0; } -void cap_capset_set(const kernel_cap_t *effective, - const kernel_cap_t *inheritable, - const kernel_cap_t *permitted) -{ - struct cred *cred = current->cred; - - cred->cap_effective = *effective; - cred->cap_inheritable = *inheritable; - cred->cap_permitted = *permitted; -} - static inline void bprm_clear_caps(struct linux_binprm *bprm) { cap_clear(bprm->cap_post_exec_permitted); @@ -382,41 +369,46 @@ int cap_bprm_set_security (struct linux_binprm *bprm) return ret; } -void cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe) +int cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe) { - struct cred *cred = current->cred; + const struct cred *old = current_cred(); + struct cred *new; + + new = prepare_creds(); + if (!new) + return -ENOMEM; - if (bprm->e_uid != cred->uid || bprm->e_gid != cred->gid || + if (bprm->e_uid != old->uid || bprm->e_gid != old->gid || !cap_issubset(bprm->cap_post_exec_permitted, - cred->cap_permitted)) { + old->cap_permitted)) { set_dumpable(current->mm, suid_dumpable); current->pdeath_signal = 0; if (unsafe & ~LSM_UNSAFE_PTRACE_CAP) { if (!capable(CAP_SETUID)) { - bprm->e_uid = cred->uid; - bprm->e_gid = cred->gid; + bprm->e_uid = old->uid; + bprm->e_gid = old->gid; } if (cap_limit_ptraced_target()) { bprm->cap_post_exec_permitted = cap_intersect( bprm->cap_post_exec_permitted, - cred->cap_permitted); + new->cap_permitted); } } } - cred->suid = cred->euid = cred->fsuid = bprm->e_uid; - cred->sgid = cred->egid = cred->fsgid = bprm->e_gid; + new->suid = new->euid = new->fsuid = bprm->e_uid; + new->sgid = new->egid = new->fsgid = bprm->e_gid; /* For init, we want to retain the capabilities set * in the init_task struct. 
Thus we skip the usual * capability rules */ if (!is_global_init(current)) { - cred->cap_permitted = bprm->cap_post_exec_permitted; + new->cap_permitted = bprm->cap_post_exec_permitted; if (bprm->cap_effective) - cred->cap_effective = bprm->cap_post_exec_permitted; + new->cap_effective = bprm->cap_post_exec_permitted; else - cap_clear(cred->cap_effective); + cap_clear(new->cap_effective); } /* @@ -431,15 +423,15 @@ void cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe) * Number 1 above might fail if you don't have a full bset, but I think * that is interesting information to audit. */ - if (!cap_isclear(cred->cap_effective)) { - if (!cap_issubset(CAP_FULL_SET, cred->cap_effective) || - (bprm->e_uid != 0) || (cred->uid != 0) || + if (!cap_isclear(new->cap_effective)) { + if (!cap_issubset(CAP_FULL_SET, new->cap_effective) || + bprm->e_uid != 0 || new->uid != 0 || issecure(SECURE_NOROOT)) - audit_log_bprm_fcaps(bprm, &cred->cap_permitted, - &cred->cap_effective); + audit_log_bprm_fcaps(bprm, new, old); } - cred->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); + new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); + return commit_creds(new); } int cap_bprm_secureexec (struct linux_binprm *bprm) @@ -514,65 +506,49 @@ int cap_inode_removexattr(struct dentry *dentry, const char *name) * files.. * Thanks to Olaf Kirch and Peter Benie for spotting this. */ -static inline void cap_emulate_setxuid (int old_ruid, int old_euid, - int old_suid) +static inline void cap_emulate_setxuid(struct cred *new, const struct cred *old) { - struct cred *cred = current->cred; - - if ((old_ruid == 0 || old_euid == 0 || old_suid == 0) && - (cred->uid != 0 && cred->euid != 0 && cred->suid != 0) && + if ((old->uid == 0 || old->euid == 0 || old->suid == 0) && + (new->uid != 0 && new->euid != 0 && new->suid != 0) && !issecure(SECURE_KEEP_CAPS)) { - cap_clear(cred->cap_permitted); - cap_clear(cred->cap_effective); - } - if (old_euid == 0 && cred->euid != 0) { - cap_clear(cred->cap_effective); - } - if (old_euid != 0 && cred->euid == 0) { - cred->cap_effective = cred->cap_permitted; + cap_clear(new->cap_permitted); + cap_clear(new->cap_effective); } + if (old->euid == 0 && new->euid != 0) + cap_clear(new->cap_effective); + if (old->euid != 0 && new->euid == 0) + new->cap_effective = new->cap_permitted; } -int cap_task_post_setuid (uid_t old_ruid, uid_t old_euid, uid_t old_suid, - int flags) +int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags) { - struct cred *cred = current->cred; - switch (flags) { case LSM_SETID_RE: case LSM_SETID_ID: case LSM_SETID_RES: /* Copied from kernel/sys.c:setreuid/setuid/setresuid. */ - if (!issecure (SECURE_NO_SETUID_FIXUP)) { - cap_emulate_setxuid (old_ruid, old_euid, old_suid); - } + if (!issecure(SECURE_NO_SETUID_FIXUP)) + cap_emulate_setxuid(new, old); break; case LSM_SETID_FS: - { - uid_t old_fsuid = old_ruid; - - /* Copied from kernel/sys.c:setfsuid. */ + /* Copied from kernel/sys.c:setfsuid. */ - /* - * FIXME - is fsuser used for all CAP_FS_MASK capabilities? - * if not, we might be a bit too harsh here. - */ - - if (!issecure (SECURE_NO_SETUID_FIXUP)) { - if (old_fsuid == 0 && cred->fsuid != 0) { - cred->cap_effective = - cap_drop_fs_set( - cred->cap_effective); - } - if (old_fsuid != 0 && cred->fsuid == 0) { - cred->cap_effective = - cap_raise_fs_set( - cred->cap_effective, - cred->cap_permitted); - } + /* + * FIXME - is fsuser used for all CAP_FS_MASK capabilities? + * if not, we might be a bit too harsh here. 
+ */ + if (!issecure(SECURE_NO_SETUID_FIXUP)) { + if (old->fsuid == 0 && new->fsuid != 0) { + new->cap_effective = + cap_drop_fs_set(new->cap_effective); + } + if (old->fsuid != 0 && new->fsuid == 0) { + new->cap_effective = + cap_raise_fs_set(new->cap_effective, + new->cap_permitted); } - break; } + break; default: return -EINVAL; } @@ -628,13 +604,14 @@ int cap_task_setnice (struct task_struct *p, int nice) * this task could get inconsistent info. There can be no * racing writer bc a task can only change its own caps. */ -static long cap_prctl_drop(unsigned long cap) +static long cap_prctl_drop(struct cred *new, unsigned long cap) { if (!capable(CAP_SETPCAP)) return -EPERM; if (!cap_valid(cap)) return -EINVAL; - cap_lower(current->cred->cap_bset, cap); + + cap_lower(new->cap_bset, cap); return 0; } @@ -655,22 +632,29 @@ int cap_task_setnice (struct task_struct *p, int nice) #endif int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, - unsigned long arg4, unsigned long arg5, long *rc_p) + unsigned long arg4, unsigned long arg5) { - struct cred *cred = current_cred(); + struct cred *new; long error = 0; + new = prepare_creds(); + if (!new) + return -ENOMEM; + switch (option) { case PR_CAPBSET_READ: + error = -EINVAL; if (!cap_valid(arg2)) - error = -EINVAL; - else - error = !!cap_raised(cred->cap_bset, arg2); - break; + goto error; + error = !!cap_raised(new->cap_bset, arg2); + goto no_change; + #ifdef CONFIG_SECURITY_FILE_CAPABILITIES case PR_CAPBSET_DROP: - error = cap_prctl_drop(arg2); - break; + error = cap_prctl_drop(new, arg2); + if (error < 0) + goto error; + goto changed; /* * The next four prctl's remain to assist with transitioning a @@ -692,12 +676,12 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, * capability-based-privilege environment. 
*/ case PR_SET_SECUREBITS: - if ((((cred->securebits & SECURE_ALL_LOCKS) >> 1) - & (cred->securebits ^ arg2)) /*[1]*/ - || ((cred->securebits & SECURE_ALL_LOCKS - & ~arg2)) /*[2]*/ - || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS)) /*[3]*/ - || (cap_capable(current, CAP_SETPCAP, SECURITY_CAP_AUDIT) != 0)) { /*[4]*/ + error = -EPERM; + if ((((new->securebits & SECURE_ALL_LOCKS) >> 1) + & (new->securebits ^ arg2)) /*[1]*/ + || ((new->securebits & SECURE_ALL_LOCKS & ~arg2)) /*[2]*/ + || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS)) /*[3]*/ + || (cap_capable(current, CAP_SETPCAP, SECURITY_CAP_AUDIT) != 0) /*[4]*/ /* * [1] no changing of bits that are locked * [2] no unlocking of locks * [3] no setting of unsupported bits @@ -705,50 +689,50 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, * [4] doing anything requires privilege (go read about * the "sendmail capabilities bug") */ - error = -EPERM; /* cannot change a locked bit */ - } else { - cred->securebits = arg2; - } - break; + ) + /* cannot change a locked bit */ + goto error; + new->securebits = arg2; + goto changed; + case PR_GET_SECUREBITS: - error = cred->securebits; - break; + error = new->securebits; + goto no_change; #endif /* def CONFIG_SECURITY_FILE_CAPABILITIES */ case PR_GET_KEEPCAPS: if (issecure(SECURE_KEEP_CAPS)) error = 1; - break; + goto no_change; + case PR_SET_KEEPCAPS: + error = -EINVAL; if (arg2 > 1) /* Note, we rely on arg2 being unsigned here */ - error = -EINVAL; - else if (issecure(SECURE_KEEP_CAPS_LOCKED)) - error = -EPERM; - else if (arg2) - cred->securebits |= issecure_mask(SECURE_KEEP_CAPS); + goto error; + error = -EPERM; + if (issecure(SECURE_KEEP_CAPS_LOCKED)) + goto error; + if (arg2) + new->securebits |= issecure_mask(SECURE_KEEP_CAPS); else - cred->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); - break; + new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); + goto changed; default: /* No functionality available - continue with default */ - return 0; + error = -ENOSYS; + goto error; } /* Functionality provided */ - *rc_p = error; - return 1; -} - -void cap_task_reparent_to_init (struct task_struct *p) -{ - struct cred *cred = p->cred; - - cap_set_init_eff(cred->cap_effective); - cap_clear(cred->cap_inheritable); - cap_set_full(cred->cap_permitted); - p->cred->securebits = SECUREBITS_DEFAULT; +changed: + return commit_creds(new); + +no_change: +error: + abort_creds(new); + return error; } int cap_syslog (int type) diff --git a/security/keys/internal.h b/security/keys/internal.h index d1586c629788..81932abefe7b 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -12,6 +12,7 @@ #ifndef _INTERNAL_H #define _INTERNAL_H +#include <linux/sched.h> #include <linux/key-type.h> static inline __attribute__((format(printf, 1, 2))) void no_printk(const char *fmt, ...) @@ -25,7 +26,7 @@ void no_printk(const char *fmt, ...) #define kleave(FMT, ...) \ printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__) #define kdebug(FMT, ...) \ - printk(KERN_DEBUG "xxx" FMT"yyy\n", ##__VA_ARGS__) + printk(KERN_DEBUG " "FMT"\n", ##__VA_ARGS__) #else #define kenter(FMT, ...)
\ no_printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) @@ -97,7 +98,7 @@ extern struct key *keyring_search_instkey(struct key *keyring, typedef int (*key_match_func_t)(const struct key *, const void *); extern key_ref_t keyring_search_aux(key_ref_t keyring_ref, - struct task_struct *tsk, + const struct cred *cred, struct key_type *type, const void *description, key_match_func_t match); @@ -105,13 +106,13 @@ extern key_ref_t keyring_search_aux(key_ref_t keyring_ref, extern key_ref_t search_process_keyrings(struct key_type *type, const void *description, key_match_func_t match, - struct task_struct *tsk); + const struct cred *cred); extern struct key *find_keyring_by_name(const char *name, bool skip_perm_check); extern int install_user_keyrings(void); -extern int install_thread_keyring(void); -extern int install_process_keyring(void); +extern int install_thread_keyring_to_cred(struct cred *); +extern int install_process_keyring_to_cred(struct cred *); extern struct key *request_key_and_link(struct key_type *type, const char *description, @@ -130,12 +131,12 @@ extern long join_session_keyring(const char *name); * check to see whether permission is granted to use a key in the desired way */ extern int key_task_permission(const key_ref_t key_ref, - struct task_struct *context, + const struct cred *cred, key_perm_t perm); static inline int key_permission(const key_ref_t key_ref, key_perm_t perm) { - return key_task_permission(key_ref, current, perm); + return key_task_permission(key_ref, current_cred(), perm); } /* required permissions */ @@ -153,7 +154,7 @@ static inline int key_permission(const key_ref_t key_ref, key_perm_t perm) struct request_key_auth { struct key *target_key; struct key *dest_keyring; - struct task_struct *context; + const struct cred *cred; void *callout_info; size_t callout_len; pid_t pid; diff --git a/security/keys/key.c b/security/keys/key.c index a6ca39ed3b0e..f76c8a546fd3 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -218,7 +218,7 @@ serial_exists: * instantiate the key or discard it before returning */ struct key *key_alloc(struct key_type *type, const char *desc, - uid_t uid, gid_t gid, struct task_struct *ctx, + uid_t uid, gid_t gid, const struct cred *cred, key_perm_t perm, unsigned long flags) { struct key_user *user = NULL; @@ -294,7 +294,7 @@ struct key *key_alloc(struct key_type *type, const char *desc, #endif /* let the security module know about the key */ - ret = security_key_alloc(key, ctx, flags); + ret = security_key_alloc(key, cred, flags); if (ret < 0) goto security_error; @@ -391,7 +391,7 @@ static int __key_instantiate_and_link(struct key *key, const void *data, size_t datalen, struct key *keyring, - struct key *instkey) + struct key *authkey) { int ret, awaken; @@ -421,8 +421,8 @@ static int __key_instantiate_and_link(struct key *key, ret = __key_link(keyring, key); /* disable the authorisation key */ - if (instkey) - key_revoke(instkey); + if (authkey) + key_revoke(authkey); } } @@ -444,14 +444,14 @@ int key_instantiate_and_link(struct key *key, const void *data, size_t datalen, struct key *keyring, - struct key *instkey) + struct key *authkey) { int ret; if (keyring) down_write(&keyring->sem); - ret = __key_instantiate_and_link(key, data, datalen, keyring, instkey); + ret = __key_instantiate_and_link(key, data, datalen, keyring, authkey); if (keyring) up_write(&keyring->sem); @@ -469,7 +469,7 @@ EXPORT_SYMBOL(key_instantiate_and_link); int key_negate_and_link(struct key *key, unsigned timeout, struct key *keyring, - 
struct key *instkey) + struct key *authkey) { struct timespec now; int ret, awaken; @@ -504,8 +504,8 @@ int key_negate_and_link(struct key *key, ret = __key_link(keyring, key); /* disable the authorisation key */ - if (instkey) - key_revoke(instkey); + if (authkey) + key_revoke(authkey); } mutex_unlock(&key_construction_mutex); @@ -743,6 +743,7 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, key_perm_t perm, unsigned long flags) { + const struct cred *cred = current_cred(); struct key_type *ktype; struct key *keyring, *key = NULL; key_ref_t key_ref; @@ -802,8 +803,8 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, } /* allocate a new key */ - key = key_alloc(ktype, description, current_fsuid(), current_fsgid(), - current, perm, flags); + key = key_alloc(ktype, description, cred->fsuid, cred->fsgid, cred, + perm, flags); if (IS_ERR(key)) { key_ref = ERR_CAST(key); goto error_3; diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 8833b447adef..7c72baa02f2e 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -866,6 +866,23 @@ static long get_instantiation_keyring(key_serial_t ringid, return -ENOKEY; } +/* + * change the request_key authorisation key on the current process + */ +static int keyctl_change_reqkey_auth(struct key *key) +{ + struct cred *new; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + + key_put(new->request_key_auth); + new->request_key_auth = key_get(key); + + return commit_creds(new); +} + /*****************************************************************************/ /* * instantiate the key with the specified payload, and, if one is given, link @@ -876,12 +893,15 @@ long keyctl_instantiate_key(key_serial_t id, size_t plen, key_serial_t ringid) { + const struct cred *cred = current_cred(); struct request_key_auth *rka; struct key *instkey, *dest_keyring; void *payload; long ret; bool vm = false; + kenter("%d,,%zu,%d", id, plen, ringid); + ret = -EINVAL; if (plen > 1024 * 1024 - 1) goto error; @@ -889,7 +909,7 @@ long keyctl_instantiate_key(key_serial_t id, /* the appropriate instantiation authorisation key must have been * assumed before calling this */ ret = -EPERM; - instkey = current->cred->request_key_auth; + instkey = cred->request_key_auth; if (!instkey) goto error; @@ -931,10 +951,8 @@ long keyctl_instantiate_key(key_serial_t id, /* discard the assumed authority if it's just been disabled by * instantiation of the key */ - if (ret == 0) { - key_put(current->cred->request_key_auth); - current->cred->request_key_auth = NULL; - } + if (ret == 0) + keyctl_change_reqkey_auth(NULL); error2: if (!vm) @@ -953,14 +971,17 @@ error: */ long keyctl_negate_key(key_serial_t id, unsigned timeout, key_serial_t ringid) { + const struct cred *cred = current_cred(); struct request_key_auth *rka; struct key *instkey, *dest_keyring; long ret; + kenter("%d,%u,%d", id, timeout, ringid); + /* the appropriate instantiation authorisation key must have been * assumed before calling this */ ret = -EPERM; - instkey = current->cred->request_key_auth; + instkey = cred->request_key_auth; if (!instkey) goto error; @@ -982,10 +1003,8 @@ long keyctl_negate_key(key_serial_t id, unsigned timeout, key_serial_t ringid) /* discard the assumed authority if it's just been disabled by * instantiation of the key */ - if (ret == 0) { - key_put(current->cred->request_key_auth); - current->cred->request_key_auth = NULL; - } + if (ret == 0) + keyctl_change_reqkey_auth(NULL); error: return ret; @@ -999,36 +1018,56 @@ error: */ long 
keyctl_set_reqkey_keyring(int reqkey_defl) { - struct cred *cred = current->cred; - int ret; + struct cred *new; + int ret, old_setting; + + old_setting = current_cred_xxx(jit_keyring); + + if (reqkey_defl == KEY_REQKEY_DEFL_NO_CHANGE) + return old_setting; + + new = prepare_creds(); + if (!new) + return -ENOMEM; switch (reqkey_defl) { case KEY_REQKEY_DEFL_THREAD_KEYRING: - ret = install_thread_keyring(); + ret = install_thread_keyring_to_cred(new); if (ret < 0) - return ret; + goto error; goto set; case KEY_REQKEY_DEFL_PROCESS_KEYRING: - ret = install_process_keyring(); - if (ret < 0) { - return ret; + ret = install_process_keyring_to_cred(new); + if (ret < 0) { + if (ret != -EEXIST) + goto error; + ret = 0; + } + goto set; case KEY_REQKEY_DEFL_DEFAULT: case KEY_REQKEY_DEFL_SESSION_KEYRING: case KEY_REQKEY_DEFL_USER_KEYRING: case KEY_REQKEY_DEFL_USER_SESSION_KEYRING: - set: - cred->jit_keyring = reqkey_defl; + case KEY_REQKEY_DEFL_REQUESTOR_KEYRING: + goto set; case KEY_REQKEY_DEFL_NO_CHANGE: - return cred->jit_keyring; - case KEY_REQKEY_DEFL_GROUP_KEYRING: default: - return -EINVAL; + ret = -EINVAL; + goto error; } +set: + new->jit_keyring = reqkey_defl; + commit_creds(new); + return old_setting; +error: + abort_creds(new); + return ret; + } /* end keyctl_set_reqkey_keyring() */ /*****************************************************************************/ @@ -1087,9 +1126,7 @@ long keyctl_assume_authority(key_serial_t id) /* we divest ourselves of authority if given an ID of 0 */ if (id == 0) { - key_put(current->cred->request_key_auth); - current->cred->request_key_auth = NULL; - ret = 0; + ret = keyctl_change_reqkey_auth(NULL); goto error; } @@ -1104,10 +1141,12 @@ long keyctl_assume_authority(key_serial_t id) goto error; } - key_put(current->cred->request_key_auth); - current->cred->request_key_auth = authkey; - ret = authkey->serial; + ret = keyctl_change_reqkey_auth(authkey); + if (ret < 0) + goto error; + key_put(authkey); + ret = authkey->serial; error: return ret; diff --git a/security/keys/keyring.c b/security/keys/keyring.c index fdf75f901991..ed851574d073 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -245,14 +245,14 @@ static long keyring_read(const struct key *keyring, * allocate a keyring and link into the destination keyring */ struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid, - struct task_struct *ctx, unsigned long flags, + const struct cred *cred, unsigned long flags, struct key *dest) { struct key *keyring; int ret; keyring = key_alloc(&key_type_keyring, description, - uid, gid, ctx, + uid, gid, cred, (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_ALL, flags); @@ -281,7 +281,7 @@ struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid, * - we propagate the possession attribute from the keyring ref to the key ref */ key_ref_t keyring_search_aux(key_ref_t keyring_ref, - struct task_struct *context, + const struct cred *cred, struct key_type *type, const void *description, key_match_func_t match) @@ -304,7 +304,7 @@ key_ref_t keyring_search_aux(key_ref_t keyring_ref, key_check(keyring); /* top keyring must have search permission to begin the search */ - err = key_task_permission(keyring_ref, context, KEY_SEARCH); + err = key_task_permission(keyring_ref, cred, KEY_SEARCH); if (err < 0) { key_ref = ERR_PTR(err); goto error; @@ -377,7 +377,7 @@ descend: /* key must have search permissions */ if (key_task_permission(make_key_ref(key, possessed), - context, KEY_SEARCH) < 0) + cred, KEY_SEARCH) < 0) continue; /* we
set a different error code if we pass a negative key */ @@ -404,7 +404,7 @@ ascend: continue; if (key_task_permission(make_key_ref(key, possessed), - context, KEY_SEARCH) < 0) + cred, KEY_SEARCH) < 0) continue; /* stack the current position */ @@ -459,7 +459,7 @@ key_ref_t keyring_search(key_ref_t keyring, if (!type->match) return ERR_PTR(-ENOKEY); - return keyring_search_aux(keyring, current, + return keyring_search_aux(keyring, current->cred, type, description, type->match); } /* end keyring_search() */ diff --git a/security/keys/permission.c b/security/keys/permission.c index 13c36164f284..5d9fc7b93f2e 100644 --- a/security/keys/permission.c +++ b/security/keys/permission.c @@ -14,24 +14,27 @@ #include "internal.h" /*****************************************************************************/ -/* - * check to see whether permission is granted to use a key in the desired way, - * but permit the security modules to override +/** + * key_task_permission - Check a key can be used + * @key_ref: The key to check + * @cred: The credentials to use + * @perm: The permissions to check for + * + * Check to see whether permission is granted to use a key in the desired way, + * but permit the security modules to override. + * + * The caller must hold either a ref on cred or must hold the RCU readlock or a + * spinlock. */ -int key_task_permission(const key_ref_t key_ref, - struct task_struct *context, +int key_task_permission(const key_ref_t key_ref, const struct cred *cred, key_perm_t perm) { - const struct cred *cred; struct key *key; key_perm_t kperm; int ret; key = key_ref_to_ptr(key_ref); - rcu_read_lock(); - cred = __task_cred(context); - /* use the second 8-bits of permissions for keys the caller owns */ if (key->uid == cred->fsuid) { kperm = key->perm >> 16; @@ -57,7 +60,6 @@ int key_task_permission(const key_ref_t key_ref, kperm = key->perm; use_these_perms: - rcu_read_lock(); /* use the top 8-bits of permissions for keys the caller possesses * - possessor permissions are additive with other permissions @@ -71,7 +73,7 @@ use_these_perms: return -EACCES; /* let LSM be the final arbiter */ - return security_key_permission(key_ref, context, perm); + return security_key_permission(key_ref, cred, perm); } /* end key_task_permission() */ diff --git a/security/keys/proc.c b/security/keys/proc.c index f619170da760..7f508def50e3 100644 --- a/security/keys/proc.c +++ b/security/keys/proc.c @@ -136,8 +136,12 @@ static int proc_keys_show(struct seq_file *m, void *v) int rc; /* check whether the current task is allowed to view the key (assuming - * non-possession) */ - rc = key_task_permission(make_key_ref(key, 0), current, KEY_VIEW); + * non-possession) + * - the caller holds a spinlock, and thus the RCU read lock, making our + * access to __current_cred() safe + */ + rc = key_task_permission(make_key_ref(key, 0), current_cred(), + KEY_VIEW); if (rc < 0) return 0; diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index 70ee93406f30..df329f684a65 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -42,11 +42,15 @@ struct key_user root_key_user = { */ int install_user_keyrings(void) { - struct user_struct *user = current->cred->user; + struct user_struct *user; + const struct cred *cred; struct key *uid_keyring, *session_keyring; char buf[20]; int ret; + cred = current_cred(); + user = cred->user; + kenter("%p{%u}", user, user->uid); if (user->uid_keyring) { @@ -67,7 +71,7 @@ int install_user_keyrings(void) uid_keyring = find_keyring_by_name(buf, true); 
if (IS_ERR(uid_keyring)) { uid_keyring = keyring_alloc(buf, user->uid, (gid_t) -1, - current, KEY_ALLOC_IN_QUOTA, + cred, KEY_ALLOC_IN_QUOTA, NULL); if (IS_ERR(uid_keyring)) { ret = PTR_ERR(uid_keyring); @@ -83,8 +87,7 @@ if (IS_ERR(session_keyring)) { session_keyring = keyring_alloc(buf, user->uid, (gid_t) -1, - current, KEY_ALLOC_IN_QUOTA, - NULL); + cred, KEY_ALLOC_IN_QUOTA, NULL); if (IS_ERR(session_keyring)) { ret = PTR_ERR(session_keyring); goto error_release; @@ -116,142 +119,128 @@ error: return ret; } -/*****************************************************************************/ /* - * deal with the UID changing + * install a fresh thread keyring directly to new credentials */ -void switch_uid_keyring(struct user_struct *new_user) +int install_thread_keyring_to_cred(struct cred *new) { -#if 0 /* do nothing for now */ - struct key *old; - - /* switch to the new user's session keyring if we were running under - * root's default session keyring */ - if (new_user->uid != 0 && - current->session_keyring == &root_session_keyring - ) { - atomic_inc(&new_user->session_keyring->usage); - - task_lock(current); - old = current->session_keyring; - current->session_keyring = new_user->session_keyring; - task_unlock(current); + struct key *keyring; - key_put(old); - } -#endif + keyring = keyring_alloc("_tid", new->uid, new->gid, new, + KEY_ALLOC_QUOTA_OVERRUN, NULL); + if (IS_ERR(keyring)) + return PTR_ERR(keyring); -} /* end switch_uid_keyring() */ + new->thread_keyring = keyring; + return 0; +} -/*****************************************************************************/ /* * install a fresh thread keyring, discarding the old one */ -int install_thread_keyring(void) +static int install_thread_keyring(void) { - struct task_struct *tsk = current; - struct key *keyring, *old; - char buf[20]; + struct cred *new; int ret; - sprintf(buf, "_tid.%u", tsk->pid); + new = prepare_creds(); + if (!new) + return -ENOMEM; - keyring = keyring_alloc(buf, tsk->cred->uid, tsk->cred->gid, tsk, - KEY_ALLOC_QUOTA_OVERRUN, NULL); - if (IS_ERR(keyring)) { - ret = PTR_ERR(keyring); - goto error; + BUG_ON(new->thread_keyring); + + ret = install_thread_keyring_to_cred(new); + if (ret < 0) { + abort_creds(new); + return ret; } - task_lock(tsk); - old = tsk->cred->thread_keyring; - tsk->cred->thread_keyring = keyring; - task_unlock(tsk); + return commit_creds(new); +} - ret = 0; +/* + * install a process keyring directly to a credentials struct + * - returns -EEXIST if there was already a process keyring, 0 if one installed, + * and other -ve on any other error + */ +int install_process_keyring_to_cred(struct cred *new) +{ + struct key *keyring; + int ret; - key_put(old); -error: + if (new->tgcred->process_keyring) + return -EEXIST; + + keyring = keyring_alloc("_pid", new->uid, new->gid, + new, KEY_ALLOC_QUOTA_OVERRUN, NULL); + if (IS_ERR(keyring)) + return PTR_ERR(keyring); + + spin_lock_irq(&new->tgcred->lock); + if (!new->tgcred->process_keyring) { + new->tgcred->process_keyring = keyring; + keyring = NULL; + ret = 0; + } else { + ret = -EEXIST; + } + spin_unlock_irq(&new->tgcred->lock); + key_put(keyring); return ret; +} -} /* end install_thread_keyring() */ - -/*****************************************************************************/ /* * make sure a process keyring is installed + * - we don't replace an existing process keyring */ -int install_process_keyring(void) +static int install_process_keyring(void) { - struct task_struct *tsk = current; - struct key *keyring; - char buf[20]; + struct cred *new; int ret; -
might_sleep(); - - if (!tsk->cred->tgcred->process_keyring) { - sprintf(buf, "_pid.%u", tsk->tgid); - - keyring = keyring_alloc(buf, tsk->cred->uid, tsk->cred->gid, tsk, - KEY_ALLOC_QUOTA_OVERRUN, NULL); - if (IS_ERR(keyring)) { - ret = PTR_ERR(keyring); - goto error; - } - - /* attach keyring */ - spin_lock_irq(&tsk->cred->tgcred->lock); - if (!tsk->cred->tgcred->process_keyring) { - tsk->cred->tgcred->process_keyring = keyring; - keyring = NULL; - } - spin_unlock_irq(&tsk->cred->tgcred->lock); + new = prepare_creds(); + if (!new) + return -ENOMEM; - key_put(keyring); + ret = install_process_keyring_to_cred(new); + if (ret < 0) { + abort_creds(new); + return ret != -EEXIST ? ret : 0; } - ret = 0; -error: - return ret; - -} /* end install_process_keyring() */ + return commit_creds(new); +} -/*****************************************************************************/ /* - * install a session keyring, discarding the old one - * - if a keyring is not supplied, an empty one is invented + * install a session keyring directly to a credentials struct */ -static int install_session_keyring(struct key *keyring) +static int install_session_keyring_to_cred(struct cred *cred, + struct key *keyring) { - struct task_struct *tsk = current; unsigned long flags; struct key *old; - char buf[20]; might_sleep(); /* create an empty session keyring */ if (!keyring) { - sprintf(buf, "_ses.%u", tsk->tgid); - flags = KEY_ALLOC_QUOTA_OVERRUN; - if (tsk->cred->tgcred->session_keyring) + if (cred->tgcred->session_keyring) flags = KEY_ALLOC_IN_QUOTA; - keyring = keyring_alloc(buf, tsk->cred->uid, tsk->cred->gid, - tsk, flags, NULL); + keyring = keyring_alloc("_ses", cred->uid, cred->gid, + cred, flags, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); - } - else { + } else { atomic_inc(&keyring->usage); } /* install the keyring */ - spin_lock_irq(&tsk->cred->tgcred->lock); - old = tsk->cred->tgcred->session_keyring; - rcu_assign_pointer(tsk->cred->tgcred->session_keyring, keyring); - spin_unlock_irq(&tsk->cred->tgcred->lock); + spin_lock_irq(&cred->tgcred->lock); + old = cred->tgcred->session_keyring; + rcu_assign_pointer(cred->tgcred->session_keyring, keyring); + spin_unlock_irq(&cred->tgcred->lock); /* we're using RCU on the pointer, but there's no point synchronising * on it if it didn't previously point to anything */ @@ -261,38 +250,29 @@ static int install_session_keyring(struct key *keyring) } return 0; +} -} /* end install_session_keyring() */ - -/*****************************************************************************/ /* - * copy the keys for fork + * install a session keyring, discarding the old one + * - if a keyring is not supplied, an empty one is invented */ -int copy_keys(unsigned long clone_flags, struct task_struct *tsk) +static int install_session_keyring(struct key *keyring) { - key_check(tsk->cred->thread_keyring); - key_check(tsk->cred->request_key_auth); - - /* no thread keyring yet */ - tsk->cred->thread_keyring = NULL; - - /* copy the request_key() authorisation for this thread */ - key_get(tsk->cred->request_key_auth); - - return 0; + struct cred *new; + int ret; -} /* end copy_keys() */ + new = prepare_creds(); + if (!new) + return -ENOMEM; -/*****************************************************************************/ -/* - * dispose of per-thread keys upon thread exit - */ -void exit_keys(struct task_struct *tsk) -{ - key_put(tsk->cred->thread_keyring); - key_put(tsk->cred->request_key_auth); + ret = install_session_keyring_to_cred(new, NULL); + if (ret < 0) { + abort_creds(new); +
return ret; + } -} /* end exit_keys() */ + return commit_creds(new); +} /*****************************************************************************/ /* @@ -300,38 +280,41 @@ */ int exec_keys(struct task_struct *tsk) { - struct key *old; + struct thread_group_cred *tgcred = NULL; + struct cred *new; - /* newly exec'd tasks don't get a thread keyring */ - task_lock(tsk); - old = tsk->cred->thread_keyring; - tsk->cred->thread_keyring = NULL; - task_unlock(tsk); +#ifdef CONFIG_KEYS + tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); + if (!tgcred) + return -ENOMEM; +#endif - key_put(old); + new = prepare_creds(); + if (!new) + return -ENOMEM; - /* discard the process keyring from a newly exec'd task */ - spin_lock_irq(&tsk->cred->tgcred->lock); - old = tsk->cred->tgcred->process_keyring; - tsk->cred->tgcred->process_keyring = NULL; - spin_unlock_irq(&tsk->cred->tgcred->lock); + /* newly exec'd tasks don't get a thread keyring */ + key_put(new->thread_keyring); + new->thread_keyring = NULL; - key_put(old); + /* create a new per-thread-group creds for all this set of threads to + * share */ + memcpy(tgcred, new->tgcred, sizeof(struct thread_group_cred)); - return 0; + atomic_set(&tgcred->usage, 1); + spin_lock_init(&tgcred->lock); -} /* end exec_keys() */ + /* inherit the session keyring; new process keyring */ + key_get(tgcred->session_keyring); + tgcred->process_keyring = NULL; -/*****************************************************************************/ -/* - * deal with SUID programs - * - we might want to make this invent a new session keyring - */ -int suid_keys(struct task_struct *tsk) -{ + release_tgcred(new); + new->tgcred = tgcred; + + commit_creds(new); return 0; -} /* end suid_keys() */ +} /* end exec_keys() */ /*****************************************************************************/ /* @@ -376,16 +359,13 @@ void key_fsgid_changed(struct task_struct *tsk) key_ref_t search_process_keyrings(struct key_type *type, const void *description, key_match_func_t match, - struct task_struct *context) + const struct cred *cred) { struct request_key_auth *rka; - struct cred *cred; key_ref_t key_ref, ret, err; might_sleep(); - cred = get_task_cred(context); - /* we want to return -EAGAIN or -ENOKEY if any of the keyrings were * searchable, but we failed to find a key or we found a negative key; * otherwise we want to return a sample error (probably -EACCES) if @@ -401,7 +381,7 @@ key_ref_t search_process_keyrings(struct key_type *type, if (cred->thread_keyring) { key_ref = keyring_search_aux( make_key_ref(cred->thread_keyring, 1), - context, type, description, match); + cred, type, description, match); if (!IS_ERR(key_ref)) goto found; @@ -422,7 +402,7 @@ key_ref_t search_process_keyrings(struct key_type *type, if (cred->tgcred->process_keyring) { key_ref = keyring_search_aux( make_key_ref(cred->tgcred->process_keyring, 1), - context, type, description, match); + cred, type, description, match); if (!IS_ERR(key_ref)) goto found; @@ -446,7 +426,7 @@ key_ref_t search_process_keyrings(struct key_type *type, make_key_ref(rcu_dereference( cred->tgcred->session_keyring), 1), - context, type, description, match); + cred, type, description, match); rcu_read_unlock(); if (!IS_ERR(key_ref)) @@ -468,7 +448,7 @@ key_ref_t search_process_keyrings(struct key_type *type, else if (cred->user->session_keyring) { key_ref = keyring_search_aux( make_key_ref(cred->user->session_keyring, 1), - context, type, description, match); + cred, type, description, match); if
(!IS_ERR(key_ref)) goto found; @@ -490,7 +470,7 @@ key_ref_t search_process_keyrings(struct key_type *type, * - we don't permit access to request_key auth keys via this method */ if (cred->request_key_auth && - context == current && + cred == current_cred() && type != &key_type_request_key_auth ) { /* defend against the auth key being revoked */ @@ -500,7 +480,7 @@ key_ref_t search_process_keyrings(struct key_type *type, rka = cred->request_key_auth->payload.data; key_ref = search_process_keyrings(type, description, - match, rka->context); + match, rka->cred); up_read(&cred->request_key_auth->sem); @@ -527,7 +507,6 @@ key_ref_t search_process_keyrings(struct key_type *type, key_ref = ret ? ret : err; found: - put_cred(cred); return key_ref; } /* end search_process_keyrings() */ @@ -552,8 +531,7 @@ key_ref_t lookup_user_key(key_serial_t id, int create, int partial, key_perm_t perm) { struct request_key_auth *rka; - struct task_struct *t = current; - struct cred *cred; + const struct cred *cred; struct key *key; key_ref_t key_ref, skey_ref; int ret; @@ -608,6 +586,7 @@ try_again: goto error; ret = install_session_keyring( cred->user->session_keyring); + if (ret < 0) goto error; goto reget_creds; @@ -693,7 +672,7 @@ try_again: /* check to see if we possess the key */ skey_ref = search_process_keyrings(key->type, key, lookup_user_key_possessed, - current); + cred); if (!IS_ERR(skey_ref)) { key_put(key); @@ -725,7 +704,7 @@ try_again: goto invalid_key; /* check the permissions */ - ret = key_task_permission(key_ref, t, perm); + ret = key_task_permission(key_ref, cred, perm); if (ret < 0) goto invalid_key; @@ -755,21 +734,33 @@ reget_creds: */ long join_session_keyring(const char *name) { - struct task_struct *tsk = current; - struct cred *cred = current->cred; + const struct cred *old; + struct cred *new; struct key *keyring; - long ret; + long ret, serial; + + /* only permit this if there's a single thread in the thread group - + * this avoids us having to adjust the creds on all threads and risking + * ENOMEM */ + if (!is_single_threaded(current)) + return -EMLINK; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + old = current_cred(); /* if no name is provided, install an anonymous keyring */ if (!name) { - ret = install_session_keyring(NULL); + ret = install_session_keyring_to_cred(new, NULL); if (ret < 0) goto error; - rcu_read_lock(); - ret = rcu_dereference(cred->tgcred->session_keyring)->serial; - rcu_read_unlock(); - goto error; + serial = new->tgcred->session_keyring->serial; + ret = commit_creds(new); + if (ret == 0) + ret = serial; + goto okay; } /* allow the user to join or create a named keyring */ @@ -779,29 +770,33 @@ long join_session_keyring(const char *name) keyring = find_keyring_by_name(name, false); if (PTR_ERR(keyring) == -ENOKEY) { /* not found - try and create a new one */ - keyring = keyring_alloc(name, cred->uid, cred->gid, tsk, + keyring = keyring_alloc(name, old->uid, old->gid, old, KEY_ALLOC_IN_QUOTA, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error2; } - } - else if (IS_ERR(keyring)) { + } else if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error2; } /* we've got a keyring - now to install it */ - ret = install_session_keyring(keyring); + ret = install_session_keyring_to_cred(new, keyring); if (ret < 0) goto error2; + commit_creds(new); + mutex_unlock(&key_session_mutex); + ret = keyring->serial; key_put(keyring); +okay: + return ret; error2: mutex_unlock(&key_session_mutex); error: + abort_creds(new); return ret; - -} /* end 
join_session_keyring() */ +} diff --git a/security/keys/request_key.c b/security/keys/request_key.c index 3d12558362df..0e04f72ef2d4 100644 --- a/security/keys/request_key.c +++ b/security/keys/request_key.c @@ -83,8 +83,10 @@ static int call_sbin_request_key(struct key_construction *cons, /* allocate a new session keyring */ sprintf(desc, "_req.%u", key->serial); - keyring = keyring_alloc(desc, current_fsuid(), current_fsgid(), current, + cred = get_current_cred(); + keyring = keyring_alloc(desc, cred->fsuid, cred->fsgid, cred, KEY_ALLOC_QUOTA_OVERRUN, NULL); + put_cred(cred); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error_alloc; @@ -104,8 +106,7 @@ static int call_sbin_request_key(struct key_construction *cons, /* we specify the process's default keyrings */ sprintf(keyring_str[0], "%d", - cred->thread_keyring ? - cred->thread_keyring->serial : 0); + cred->thread_keyring ? cred->thread_keyring->serial : 0); prkey = 0; if (cred->tgcred->process_keyring) @@ -155,8 +156,8 @@ error_link: key_put(keyring); error_alloc: - kleave(" = %d", ret); complete_request_key(cons, ret); + kleave(" = %d", ret); return ret; } @@ -295,6 +296,7 @@ static int construct_alloc_key(struct key_type *type, struct key_user *user, struct key **_key) { + const struct cred *cred = current_cred(); struct key *key; key_ref_t key_ref; @@ -302,9 +304,8 @@ static int construct_alloc_key(struct key_type *type, mutex_lock(&user->cons_lock); - key = key_alloc(type, description, - current_fsuid(), current_fsgid(), current, KEY_POS_ALL, - flags); + key = key_alloc(type, description, cred->fsuid, cred->fsgid, cred, + KEY_POS_ALL, flags); if (IS_ERR(key)) goto alloc_failed; @@ -317,8 +318,7 @@ static int construct_alloc_key(struct key_type *type, * waited for locks */ mutex_lock(&key_construction_mutex); - key_ref = search_process_keyrings(type, description, type->match, - current); + key_ref = search_process_keyrings(type, description, type->match, cred); if (!IS_ERR(key_ref)) goto key_already_present; @@ -363,6 +363,8 @@ static struct key *construct_key_and_link(struct key_type *type, struct key *key; int ret; + kenter(""); + user = key_user_lookup(current_fsuid()); if (!user) return ERR_PTR(-ENOMEM); @@ -376,17 +378,21 @@ static struct key *construct_key_and_link(struct key_type *type, if (ret == 0) { ret = construct_key(key, callout_info, callout_len, aux, dest_keyring); - if (ret < 0) + if (ret < 0) { + kdebug("cons failed"); goto construction_failed; + } } key_put(dest_keyring); + kleave(" = key %d", key_serial(key)); return key; construction_failed: key_negate_and_link(key, key_negative_timeout, NULL, NULL); key_put(key); key_put(dest_keyring); + kleave(" = %d", ret); return ERR_PTR(ret); } @@ -405,6 +411,7 @@ struct key *request_key_and_link(struct key_type *type, struct key *dest_keyring, unsigned long flags) { + const struct cred *cred = current_cred(); struct key *key; key_ref_t key_ref; @@ -414,7 +421,7 @@ struct key *request_key_and_link(struct key_type *type, /* search all the process keyrings for a key */ key_ref = search_process_keyrings(type, description, type->match, - current); + cred); if (!IS_ERR(key_ref)) { key = key_ref_to_ptr(key_ref); diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c index 2125579d5d73..86747151ee5b 100644 --- a/security/keys/request_key_auth.c +++ b/security/keys/request_key_auth.c @@ -105,9 +105,9 @@ static void request_key_auth_revoke(struct key *key) kenter("{%d}", key->serial); - if (rka->context) { - put_task_struct(rka->context); - 
rka->context = NULL; + if (rka->cred) { + put_cred(rka->cred); + rka->cred = NULL; } } /* end request_key_auth_revoke() */ @@ -122,9 +122,9 @@ static void request_key_auth_destroy(struct key *key) kenter("{%d}", key->serial); - if (rka->context) { - put_task_struct(rka->context); - rka->context = NULL; + if (rka->cred) { + put_cred(rka->cred); + rka->cred = NULL; } key_put(rka->target_key); @@ -143,6 +143,7 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info, size_t callout_len, struct key *dest_keyring) { struct request_key_auth *rka, *irka; + const struct cred *cred = current->cred; struct key *authkey = NULL; char desc[20]; int ret; @@ -164,28 +165,25 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info, /* see if the calling process is already servicing the key request of * another process */ - if (current->cred->request_key_auth) { + if (cred->request_key_auth) { /* it is - use that instantiation context here too */ - down_read(&current->cred->request_key_auth->sem); + down_read(&cred->request_key_auth->sem); /* if the auth key has been revoked, then the key we're * servicing is already instantiated */ - if (test_bit(KEY_FLAG_REVOKED, - &current->cred->request_key_auth->flags)) + if (test_bit(KEY_FLAG_REVOKED, &cred->request_key_auth->flags)) goto auth_key_revoked; - irka = current->cred->request_key_auth->payload.data; - rka->context = irka->context; + irka = cred->request_key_auth->payload.data; + rka->cred = get_cred(irka->cred); rka->pid = irka->pid; - get_task_struct(rka->context); - up_read(&current->cred->request_key_auth->sem); + up_read(&cred->request_key_auth->sem); } else { /* it isn't - use this process as the context */ - rka->context = current; + rka->cred = get_cred(cred); rka->pid = current->pid; - get_task_struct(rka->context); } rka->target_key = key_get(target); @@ -197,7 +195,7 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info, sprintf(desc, "%x", target->serial); authkey = key_alloc(&key_type_request_key_auth, desc, - current_fsuid(), current_fsgid(), current, + cred->fsuid, cred->fsgid, cred, KEY_POS_VIEW | KEY_POS_READ | KEY_POS_SEARCH | KEY_USR_VIEW, KEY_ALLOC_NOT_IN_QUOTA); if (IS_ERR(authkey)) { @@ -205,16 +203,16 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info, goto error_alloc; } - /* construct and attach to the keyring */ + /* construct the auth key */ ret = key_instantiate_and_link(authkey, rka, 0, NULL, NULL); if (ret < 0) goto error_inst; - kleave(" = {%d}", authkey->serial); + kleave(" = {%d,%d}", authkey->serial, atomic_read(&authkey->usage)); return authkey; auth_key_revoked: - up_read(&current->cred->request_key_auth->sem); + up_read(&cred->request_key_auth->sem); kfree(rka->callout_info); kfree(rka); kleave("= -EKEYREVOKED"); @@ -257,6 +255,7 @@ static int key_get_instantiation_authkey_match(const struct key *key, */ struct key *key_get_instantiation_authkey(key_serial_t target_id) { + const struct cred *cred = current_cred(); struct key *authkey; key_ref_t authkey_ref; @@ -264,7 +263,7 @@ struct key *key_get_instantiation_authkey(key_serial_t target_id) &key_type_request_key_auth, (void *) (unsigned long) target_id, key_get_instantiation_authkey_match, - current); + cred); if (IS_ERR(authkey_ref)) { authkey = ERR_CAST(authkey_ref); diff --git a/security/security.c b/security/security.c index f40a0a04c3c2..a55d739c6864 100644 --- a/security/security.c +++ b/security/security.c @@ -145,18 +145,13 @@ int security_capget(struct task_struct *target, return
security_ops->capget(target, effective, inheritable, permitted); } -int security_capset_check(const kernel_cap_t *effective, - const kernel_cap_t *inheritable, - const kernel_cap_t *permitted) +int security_capset(struct cred *new, const struct cred *old, + const kernel_cap_t *effective, + const kernel_cap_t *inheritable, + const kernel_cap_t *permitted) { - return security_ops->capset_check(effective, inheritable, permitted); -} - -void security_capset_set(const kernel_cap_t *effective, - const kernel_cap_t *inheritable, - const kernel_cap_t *permitted) -{ - security_ops->capset_set(effective, inheritable, permitted); + return security_ops->capset(new, old, + effective, inheritable, permitted); } int security_capable(struct task_struct *tsk, int cap) @@ -228,9 +223,9 @@ void security_bprm_free(struct linux_binprm *bprm) security_ops->bprm_free_security(bprm); } -void security_bprm_apply_creds(struct linux_binprm *bprm, int unsafe) +int security_bprm_apply_creds(struct linux_binprm *bprm, int unsafe) { - security_ops->bprm_apply_creds(bprm, unsafe); + return security_ops->bprm_apply_creds(bprm, unsafe); } void security_bprm_post_apply_creds(struct linux_binprm *bprm) @@ -616,14 +611,19 @@ int security_task_create(unsigned long clone_flags) return security_ops->task_create(clone_flags); } -int security_cred_alloc(struct cred *cred) +void security_cred_free(struct cred *cred) { - return security_ops->cred_alloc_security(cred); + security_ops->cred_free(cred); } -void security_cred_free(struct cred *cred) +int security_prepare_creds(struct cred *new, const struct cred *old, gfp_t gfp) { - security_ops->cred_free(cred); + return security_ops->cred_prepare(new, old, gfp); +} + +void security_commit_creds(struct cred *new, const struct cred *old) +{ + return security_ops->cred_commit(new, old); } int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags) @@ -631,10 +631,10 @@ int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags) return security_ops->task_setuid(id0, id1, id2, flags); } -int security_task_post_setuid(uid_t old_ruid, uid_t old_euid, - uid_t old_suid, int flags) +int security_task_fix_setuid(struct cred *new, const struct cred *old, + int flags) { - return security_ops->task_post_setuid(old_ruid, old_euid, old_suid, flags); + return security_ops->task_fix_setuid(new, old, flags); } int security_task_setgid(gid_t id0, gid_t id1, gid_t id2, int flags) @@ -716,14 +716,9 @@ int security_task_wait(struct task_struct *p) } int security_task_prctl(int option, unsigned long arg2, unsigned long arg3, - unsigned long arg4, unsigned long arg5, long *rc_p) -{ - return security_ops->task_prctl(option, arg2, arg3, arg4, arg5, rc_p); -} - -void security_task_reparent_to_init(struct task_struct *p) + unsigned long arg4, unsigned long arg5) { - security_ops->task_reparent_to_init(p); + return security_ops->task_prctl(option, arg2, arg3, arg4, arg5); } void security_task_to_inode(struct task_struct *p, struct inode *inode) @@ -1123,9 +1118,10 @@ EXPORT_SYMBOL(security_skb_classify_flow); #ifdef CONFIG_KEYS -int security_key_alloc(struct key *key, struct task_struct *tsk, unsigned long flags) +int security_key_alloc(struct key *key, const struct cred *cred, + unsigned long flags) { - return security_ops->key_alloc(key, tsk, flags); + return security_ops->key_alloc(key, cred, flags); } void security_key_free(struct key *key) @@ -1134,9 +1130,9 @@ void security_key_free(struct key *key) } int security_key_permission(key_ref_t key_ref, - struct task_struct *context, key_perm_t 
perm) + const struct cred *cred, key_perm_t perm) { - return security_ops->key_permission(key_ref, context, perm); + return security_ops->key_permission(key_ref, cred, perm); } int security_key_getsecurity(struct key *key, char **_buffer) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index f20cbd681ba6..c71bba78872f 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -156,20 +156,20 @@ static int selinux_secmark_enabled(void) return (atomic_read(&selinux_secmark_refcount) > 0); } -/* Allocate and free functions for each kind of security blob. */ - -static int cred_alloc_security(struct cred *cred) +/* + * initialise the security for the init task + */ +static void cred_init_security(void) { + struct cred *cred = (struct cred *) current->cred; struct task_security_struct *tsec; tsec = kzalloc(sizeof(struct task_security_struct), GFP_KERNEL); if (!tsec) - return -ENOMEM; + panic("SELinux: Failed to initialize initial task.\n"); - tsec->osid = tsec->sid = SECINITSID_UNLABELED; + tsec->osid = tsec->sid = SECINITSID_KERNEL; cred->security = tsec; - - return 0; } /* @@ -1378,6 +1378,19 @@ static inline u32 signal_to_av(int sig) return perm; } +/* + * Check permission between a pair of credentials + * fork check, ptrace check, etc. + */ +static int cred_has_perm(const struct cred *actor, + const struct cred *target, + u32 perms) +{ + u32 asid = cred_sid(actor), tsid = cred_sid(target); + + return avc_has_perm(asid, tsid, SECCLASS_PROCESS, perms, NULL); +} + /* * Check permission between a pair of tasks, e.g. signal checks, * fork check, ptrace check, etc. @@ -1820,24 +1833,19 @@ static int selinux_capget(struct task_struct *target, kernel_cap_t *effective, return secondary_ops->capget(target, effective, inheritable, permitted); } -static int selinux_capset_check(const kernel_cap_t *effective, - const kernel_cap_t *inheritable, - const kernel_cap_t *permitted) +static int selinux_capset(struct cred *new, const struct cred *old, + const kernel_cap_t *effective, + const kernel_cap_t *inheritable, + const kernel_cap_t *permitted) { int error; - error = secondary_ops->capset_check(effective, inheritable, permitted); + error = secondary_ops->capset(new, old, + effective, inheritable, permitted); if (error) return error; - return task_has_perm(current, current, PROCESS__SETCAP); -} - -static void selinux_capset_set(const kernel_cap_t *effective, - const kernel_cap_t *inheritable, - const kernel_cap_t *permitted) -{ - secondary_ops->capset_set(effective, inheritable, permitted); + return cred_has_perm(old, new, PROCESS__SETCAP); } static int selinux_capable(struct task_struct *tsk, int cap, int audit) @@ -2244,16 +2252,23 @@ static inline void flush_unauthorized_files(const struct cred *cred, spin_unlock(&files->file_lock); } -static void selinux_bprm_apply_creds(struct linux_binprm *bprm, int unsafe) +static int selinux_bprm_apply_creds(struct linux_binprm *bprm, int unsafe) { struct task_security_struct *tsec; struct bprm_security_struct *bsec; + struct cred *new; u32 sid; int rc; - secondary_ops->bprm_apply_creds(bprm, unsafe); + rc = secondary_ops->bprm_apply_creds(bprm, unsafe); + if (rc < 0) + return rc; - tsec = current_security(); + new = prepare_creds(); + if (!new) + return -ENOMEM; + + tsec = new->security; bsec = bprm->security; sid = bsec->sid; @@ -2268,7 +2283,7 @@ static void selinux_bprm_apply_creds(struct linux_binprm *bprm, int unsafe) PROCESS__SHARE, NULL); if (rc) { bsec->unsafe = 1; - return; + goto out; } } @@ -2292,12 +2307,16 @@ static void 
selinux_bprm_apply_creds(struct linux_binprm *bprm, int unsafe) PROCESS__PTRACE, NULL); if (rc) { bsec->unsafe = 1; - return; + goto out; } } } tsec->sid = sid; } + +out: + commit_creds(new); + return 0; } /* @@ -3021,6 +3040,7 @@ static int selinux_file_ioctl(struct file *file, unsigned int cmd, static int file_map_prot_check(struct file *file, unsigned long prot, int shared) { const struct cred *cred = current_cred(); + int rc = 0; #ifndef CONFIG_PPC32 if ((prot & PROT_EXEC) && (!file || (!shared && (prot & PROT_WRITE)))) { @@ -3029,9 +3049,9 @@ static int file_map_prot_check(struct file *file, unsigned long prot, int shared * private file mapping that will also be writable. * This has an additional check. */ - int rc = task_has_perm(current, current, PROCESS__EXECMEM); + rc = cred_has_perm(cred, cred, PROCESS__EXECMEM); if (rc) - return rc; + goto error; } #endif @@ -3048,7 +3068,9 @@ static int file_map_prot_check(struct file *file, unsigned long prot, int shared return file_has_perm(cred, file, av); } - return 0; + +error: + return rc; } static int selinux_file_mmap(struct file *file, unsigned long reqprot, @@ -3090,8 +3112,7 @@ static int selinux_file_mprotect(struct vm_area_struct *vma, rc = 0; if (vma->vm_start >= vma->vm_mm->start_brk && vma->vm_end <= vma->vm_mm->brk) { - rc = task_has_perm(current, current, - PROCESS__EXECHEAP); + rc = cred_has_perm(cred, cred, PROCESS__EXECHEAP); } else if (!vma->vm_file && vma->vm_start <= vma->vm_mm->start_stack && vma->vm_end >= vma->vm_mm->start_stack) { @@ -3104,8 +3125,7 @@ static int selinux_file_mprotect(struct vm_area_struct *vma, * modified content. This typically should only * occur for text relocations. */ - rc = file_has_perm(cred, vma->vm_file, - FILE__EXECMOD); + rc = file_has_perm(cred, vma->vm_file, FILE__EXECMOD); } if (rc) return rc; @@ -3211,6 +3231,7 @@ static int selinux_dentry_open(struct file *file, const struct cred *cred) struct file_security_struct *fsec; struct inode *inode; struct inode_security_struct *isec; + inode = file->f_path.dentry->d_inode; fsec = file->f_security; isec = inode->i_security; @@ -3247,38 +3268,41 @@ static int selinux_task_create(unsigned long clone_flags) return task_has_perm(current, current, PROCESS__FORK); } -static int selinux_cred_alloc_security(struct cred *cred) +/* + * detach and free the LSM part of a set of credentials + */ +static void selinux_cred_free(struct cred *cred) { - struct task_security_struct *tsec1, *tsec2; - int rc; - - tsec1 = current_security(); + struct task_security_struct *tsec = cred->security; + cred->security = NULL; + kfree(tsec); +} - rc = cred_alloc_security(cred); - if (rc) - return rc; - tsec2 = cred->security; +/* + * prepare a new set of credentials for modification + */ +static int selinux_cred_prepare(struct cred *new, const struct cred *old, + gfp_t gfp) +{ + const struct task_security_struct *old_tsec; + struct task_security_struct *tsec; - tsec2->osid = tsec1->osid; - tsec2->sid = tsec1->sid; + old_tsec = old->security; - /* Retain the exec, fs, key, and sock SIDs across fork */ - tsec2->exec_sid = tsec1->exec_sid; - tsec2->create_sid = tsec1->create_sid; - tsec2->keycreate_sid = tsec1->keycreate_sid; - tsec2->sockcreate_sid = tsec1->sockcreate_sid; + tsec = kmemdup(old_tsec, sizeof(struct task_security_struct), gfp); + if (!tsec) + return -ENOMEM; + new->security = tsec; return 0; } /* - * detach and free the LSM part of a set of credentials + * commit new credentials */ -static void selinux_cred_free(struct cred *cred) +static void 
selinux_cred_commit(struct cred *new, const struct cred *old) { - struct task_security_struct *tsec = cred->security; - cred->security = NULL; - kfree(tsec); + secondary_ops->cred_commit(new, old); } static int selinux_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags) @@ -3292,9 +3316,10 @@ static int selinux_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags) return 0; } -static int selinux_task_post_setuid(uid_t id0, uid_t id1, uid_t id2, int flags) +static int selinux_task_fix_setuid(struct cred *new, const struct cred *old, + int flags) { - return secondary_ops->task_post_setuid(id0, id1, id2, flags); + return secondary_ops->task_fix_setuid(new, old, flags); } static int selinux_task_setgid(gid_t id0, gid_t id1, gid_t id2, int flags) @@ -3368,7 +3393,7 @@ static int selinux_task_setrlimit(unsigned int resource, struct rlimit *new_rlim /* Control the ability to change the hard limit (whether lowering or raising it), so that the hard limit can later be used as a safe reset point for the soft limit - upon context transitions. See selinux_bprm_apply_creds. */ + upon context transitions. See selinux_bprm_committing_creds. */ if (old_rlim->rlim_max != new_rlim->rlim_max) return task_has_perm(current, current, PROCESS__SETRLIMIT); @@ -3422,13 +3447,12 @@ static int selinux_task_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, - unsigned long arg5, - long *rc_p) + unsigned long arg5) { /* The current prctl operations do not appear to require any SELinux controls since they merely observe or modify the state of the current process. */ - return secondary_ops->task_prctl(option, arg2, arg3, arg4, arg5, rc_p); + return secondary_ops->task_prctl(option, arg2, arg3, arg4, arg5); } static int selinux_task_wait(struct task_struct *p) @@ -3436,18 +3460,6 @@ static int selinux_task_wait(struct task_struct *p) return task_has_perm(p, current, PROCESS__SIGCHLD); } -static void selinux_task_reparent_to_init(struct task_struct *p) -{ - struct task_security_struct *tsec; - - secondary_ops->task_reparent_to_init(p); - - tsec = p->cred->security; - tsec->osid = tsec->sid; - tsec->sid = SECINITSID_KERNEL; - return; -} - static void selinux_task_to_inode(struct task_struct *p, struct inode *inode) { @@ -5325,7 +5337,8 @@ static int selinux_setprocattr(struct task_struct *p, { struct task_security_struct *tsec; struct task_struct *tracer; - u32 sid = 0; + struct cred *new; + u32 sid = 0, ptsid; int error; char *str = value; @@ -5372,86 +5385,75 @@ static int selinux_setprocattr(struct task_struct *p, return error; } + new = prepare_creds(); + if (!new) + return -ENOMEM; + /* Permission checking based on the specified context is performed during the actual operation (execve, open/mkdir/...), when we know the full context of the - operation. See selinux_bprm_set_security for the execve + operation. See selinux_bprm_set_creds for the execve checks and may_create for the file creation checks. The operation will then fail if the context is not permitted. 
*/ - tsec = p->cred->security; - if (!strcmp(name, "exec")) + tsec = new->security; + if (!strcmp(name, "exec")) { tsec->exec_sid = sid; - else if (!strcmp(name, "fscreate")) + } else if (!strcmp(name, "fscreate")) { tsec->create_sid = sid; - else if (!strcmp(name, "keycreate")) { + } else if (!strcmp(name, "keycreate")) { error = may_create_key(sid, p); if (error) - return error; + goto abort_change; tsec->keycreate_sid = sid; - } else if (!strcmp(name, "sockcreate")) + } else if (!strcmp(name, "sockcreate")) { tsec->sockcreate_sid = sid; - else if (!strcmp(name, "current")) { - struct av_decision avd; - + } else if (!strcmp(name, "current")) { + error = -EINVAL; if (sid == 0) - return -EINVAL; - /* - * SELinux allows to change context in the following case only. - * - Single threaded processes. - * - Multi threaded processes intend to change its context into - * more restricted domain (defined by TYPEBOUNDS statement). - */ - if (atomic_read(&p->mm->mm_users) != 1) { - struct task_struct *g, *t; - struct mm_struct *mm = p->mm; - read_lock(&tasklist_lock); - do_each_thread(g, t) { - if (t->mm == mm && t != p) { - read_unlock(&tasklist_lock); - error = security_bounded_transition(tsec->sid, sid); - if (!error) - goto boundary_ok; - - return error; - } - } while_each_thread(g, t); - read_unlock(&tasklist_lock); + goto abort_change; + + /* Only allow single threaded processes to change context */ + error = -EPERM; + if (!is_single_threaded(p)) { + error = security_bounded_transition(tsec->sid, sid); + if (error) + goto abort_change; } -boundary_ok: /* Check permissions for the transition. */ error = avc_has_perm(tsec->sid, sid, SECCLASS_PROCESS, PROCESS__DYNTRANSITION, NULL); if (error) - return error; + goto abort_change; /* Check for ptracing, and update the task SID if ok. Otherwise, leave SID unchanged and fail. 
*/ + ptsid = 0; task_lock(p); - rcu_read_lock(); tracer = tracehook_tracer_task(p); - if (tracer != NULL) { - u32 ptsid = task_sid(tracer); - rcu_read_unlock(); - error = avc_has_perm_noaudit(ptsid, sid, - SECCLASS_PROCESS, - PROCESS__PTRACE, 0, &avd); - if (!error) - tsec->sid = sid; - task_unlock(p); - avc_audit(ptsid, sid, SECCLASS_PROCESS, - PROCESS__PTRACE, &avd, error, NULL); + if (tracer) + ptsid = task_sid(tracer); + task_unlock(p); + + if (tracer) { + error = avc_has_perm(ptsid, sid, SECCLASS_PROCESS, + PROCESS__PTRACE, NULL); if (error) - return error; - } else { - rcu_read_unlock(); - tsec->sid = sid; - task_unlock(p); + goto abort_change; } - } else - return -EINVAL; + tsec->sid = sid; + } else { + error = -EINVAL; + goto abort_change; + } + + commit_creds(new); return size; + +abort_change: + abort_creds(new); + return error; } static int selinux_secid_to_secctx(u32 secid, char **secdata, u32 *seclen) @@ -5471,23 +5473,21 @@ static void selinux_release_secctx(char *secdata, u32 seclen) #ifdef CONFIG_KEYS -static int selinux_key_alloc(struct key *k, struct task_struct *tsk, +static int selinux_key_alloc(struct key *k, const struct cred *cred, unsigned long flags) { - const struct task_security_struct *__tsec; + const struct task_security_struct *tsec; struct key_security_struct *ksec; ksec = kzalloc(sizeof(struct key_security_struct), GFP_KERNEL); if (!ksec) return -ENOMEM; - rcu_read_lock(); - __tsec = __task_cred(tsk)->security; - if (__tsec->keycreate_sid) - ksec->sid = __tsec->keycreate_sid; + tsec = cred->security; + if (tsec->keycreate_sid) + ksec->sid = tsec->keycreate_sid; else - ksec->sid = __tsec->sid; - rcu_read_unlock(); + ksec->sid = tsec->sid; k->security = ksec; return 0; @@ -5502,8 +5502,8 @@ static void selinux_key_free(struct key *k) } static int selinux_key_permission(key_ref_t key_ref, - struct task_struct *ctx, - key_perm_t perm) + const struct cred *cred, + key_perm_t perm) { struct key *key; struct key_security_struct *ksec; @@ -5515,7 +5515,7 @@ static int selinux_key_permission(key_ref_t key_ref, if (perm == 0) return 0; - sid = task_sid(ctx); + sid = cred_sid(cred); key = key_ref_to_ptr(key_ref); ksec = key->security; @@ -5545,8 +5545,7 @@ static struct security_operations selinux_ops = { .ptrace_may_access = selinux_ptrace_may_access, .ptrace_traceme = selinux_ptrace_traceme, .capget = selinux_capget, - .capset_check = selinux_capset_check, - .capset_set = selinux_capset_set, + .capset = selinux_capset, .sysctl = selinux_sysctl, .capable = selinux_capable, .quotactl = selinux_quotactl, @@ -5621,10 +5620,11 @@ static struct security_operations selinux_ops = { .dentry_open = selinux_dentry_open, .task_create = selinux_task_create, - .cred_alloc_security = selinux_cred_alloc_security, .cred_free = selinux_cred_free, + .cred_prepare = selinux_cred_prepare, + .cred_commit = selinux_cred_commit, .task_setuid = selinux_task_setuid, - .task_post_setuid = selinux_task_post_setuid, + .task_fix_setuid = selinux_task_fix_setuid, .task_setgid = selinux_task_setgid, .task_setpgid = selinux_task_setpgid, .task_getpgid = selinux_task_getpgid, @@ -5641,7 +5641,6 @@ static struct security_operations selinux_ops = { .task_kill = selinux_task_kill, .task_wait = selinux_task_wait, .task_prctl = selinux_task_prctl, - .task_reparent_to_init = selinux_task_reparent_to_init, .task_to_inode = selinux_task_to_inode, .ipc_permission = selinux_ipc_permission, @@ -5737,8 +5736,6 @@ static struct security_operations selinux_ops = { static __init int selinux_init(void) { - struct 
task_security_struct *tsec; - if (!security_module_enable(&selinux_ops)) { selinux_enabled = 0; return 0; @@ -5752,10 +5749,7 @@ static __init int selinux_init(void) printk(KERN_INFO "SELinux: Initializing.\n"); /* Set the security state for the initial task. */ - if (cred_alloc_security(current->cred)) - panic("SELinux: Failed to initialize initial task.\n"); - tsec = current->cred->security; - tsec->osid = tsec->sid = SECINITSID_KERNEL; + cred_init_security(); sel_inode_cache = kmem_cache_create("selinux_inode_security", sizeof(struct inode_security_struct), diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 11167fd567b9..e952b397153d 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -104,8 +104,7 @@ static int smack_ptrace_may_access(struct task_struct *ctp, unsigned int mode) if (rc != 0) return rc; - rc = smk_access(current->cred->security, ctp->cred->security, - MAY_READWRITE); + rc = smk_access(current_security(), task_security(ctp), MAY_READWRITE); if (rc != 0 && capable(CAP_MAC_OVERRIDE)) return 0; return rc; @@ -127,8 +126,7 @@ static int smack_ptrace_traceme(struct task_struct *ptp) if (rc != 0) return rc; - rc = smk_access(ptp->cred->security, current->cred->security, - MAY_READWRITE); + rc = smk_access(task_security(ptp), current_security(), MAY_READWRITE); if (rc != 0 && has_capability(ptp, CAP_MAC_OVERRIDE)) return 0; return rc; @@ -976,22 +974,6 @@ static int smack_file_receive(struct file *file) * Task hooks */ -/** - * smack_cred_alloc_security - "allocate" a task cred blob - * @cred: the task creds in need of a blob - * - * Smack isn't using copies of blobs. Everyone - * points to an immutable list. No alloc required. - * No data copy required. - * - * Always returns 0 - */ -static int smack_cred_alloc_security(struct cred *cred) -{ - cred->security = current_security(); - return 0; -} - /** * smack_cred_free - "free" task-level security credentials * @cred: the credentials in question @@ -1005,6 +987,30 @@ static void smack_cred_free(struct cred *cred) cred->security = NULL; } +/** + * smack_cred_prepare - prepare new set of credentials for modification + * @new: the new credentials + * @old: the original credentials + * @gfp: the atomicity of any memory allocations + * + * Prepare a new set of credentials for modification. 
+ */ +static int smack_cred_prepare(struct cred *new, const struct cred *old, + gfp_t gfp) +{ + new->security = old->security; + return 0; +} + +/* + * commit new credentials + * @new: the new credentials + * @old: the original credentials + */ +static void smack_cred_commit(struct cred *new, const struct cred *old) +{ +} + /** * smack_task_setpgid - Smack check on setting pgid * @p: the task object @@ -2036,6 +2042,7 @@ static int smack_getprocattr(struct task_struct *p, char *name, char **value) static int smack_setprocattr(struct task_struct *p, char *name, void *value, size_t size) { + struct cred *new; char *newsmack; /* @@ -2058,7 +2065,11 @@ static int smack_setprocattr(struct task_struct *p, char *name, if (newsmack == NULL) return -EINVAL; - p->cred->security = newsmack; + new = prepare_creds(); + if (!new) + return -ENOMEM; + new->security = newsmack; + commit_creds(new); return size; } @@ -2354,17 +2365,17 @@ static int smack_inet_conn_request(struct sock *sk, struct sk_buff *skb, /** * smack_key_alloc - Set the key security blob * @key: object - * @tsk: the task associated with the key + * @cred: the credentials to use * @flags: unused * * No allocation required * * Returns 0 */ -static int smack_key_alloc(struct key *key, struct task_struct *tsk, +static int smack_key_alloc(struct key *key, const struct cred *cred, unsigned long flags) { - key->security = tsk->cred->security; + key->security = cred->security; return 0; } @@ -2382,14 +2393,14 @@ static void smack_key_free(struct key *key) /* * smack_key_permission - Smack access on a key * @key_ref: gets to the object - * @context: task involved + * @cred: the credentials to use * @perm: unused * * Return 0 if the task has read and write to the object, * an error code otherwise */ static int smack_key_permission(key_ref_t key_ref, - struct task_struct *context, key_perm_t perm) + const struct cred *cred, key_perm_t perm) { struct key *keyp; @@ -2405,11 +2416,10 @@ static int smack_key_permission(key_ref_t key_ref, /* * This should not occur */ - if (context->cred->security == NULL) + if (cred->security == NULL) return -EACCES; - return smk_access(context->cred->security, keyp->security, - MAY_READWRITE); + return smk_access(cred->security, keyp->security, MAY_READWRITE); } #endif /* CONFIG_KEYS */ @@ -2580,8 +2590,7 @@ struct security_operations smack_ops = { .ptrace_may_access = smack_ptrace_may_access, .ptrace_traceme = smack_ptrace_traceme, .capget = cap_capget, - .capset_check = cap_capset_check, - .capset_set = cap_capset_set, + .capset = cap_capset, .capable = cap_capable, .syslog = smack_syslog, .settime = cap_settime, @@ -2630,9 +2639,10 @@ struct security_operations smack_ops = { .file_send_sigiotask = smack_file_send_sigiotask, .file_receive = smack_file_receive, - .cred_alloc_security = smack_cred_alloc_security, .cred_free = smack_cred_free, - .task_post_setuid = cap_task_post_setuid, + .cred_prepare = smack_cred_prepare, + .cred_commit = smack_cred_commit, + .task_fix_setuid = cap_task_fix_setuid, .task_setpgid = smack_task_setpgid, .task_getpgid = smack_task_getpgid, .task_getsid = smack_task_getsid, @@ -2645,7 +2655,6 @@ struct security_operations smack_ops = { .task_movememory = smack_task_movememory, .task_kill = smack_task_kill, .task_wait = smack_task_wait, - .task_reparent_to_init = cap_task_reparent_to_init, .task_to_inode = smack_task_to_inode, .task_prctl = cap_task_prctl, @@ -2721,6 +2730,8 @@ struct security_operations smack_ops = { */ static __init int smack_init(void) { + struct cred *cred; + if 
(!security_module_enable(&smack_ops)) return 0; @@ -2729,7 +2740,8 @@ static __init int smack_init(void) /* * Set the security state for the initial task. */ - current->cred->security = &smack_known_floor.smk_known; + cred = (struct cred *) current->cred; + cred->security = &smack_known_floor.smk_known; /* * Initialize locks -- cgit v1.2.3 From a6f76f23d297f70e2a6b3ec607f7aeeea9e37e8d Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:24 +1100 Subject: CRED: Make execve() take advantage of copy-on-write credentials Make execve() take advantage of copy-on-write credentials, allowing it to set up the credentials in advance, and then commit the whole lot after the point of no return. This patch and the preceding patches have been tested with the LTP SELinux testsuite. This patch makes several logical sets of alterations: (1) execve(). The credential bits from struct linux_binprm are, for the most part, replaced with a single credentials pointer (bprm->cred). This means that all the creds can be calculated in advance and then applied at the point of no return with no possibility of failure. I would like to replace bprm->cap_effective with: cap_isclear(bprm->cap_effective) but this seems impossible due to special behaviour for processes of pid 1 (they always retain their parent's capability masks where normally they'd be changed - see cap_bprm_set_creds()). The following sequence of events now happens: (a) At the start of do_execve, the current task's cred_exec_mutex is locked to prevent PTRACE_ATTACH from obsoleting the calculation of creds that we make. (b) prepare_exec_creds() is then called to make a copy of the current task's credentials and prepare it. This copy is then assigned to bprm->cred. This renders security_bprm_alloc() and security_bprm_free() unnecessary, and so they've been removed. (c) The determination of unsafe execution is now performed immediately after (b) rather than later on in the code. The result is stored in bprm->unsafe for future reference. (d) prepare_binprm() is called, possibly multiple times. (i) This applies the result of set[ug]id binaries to the new creds attached to bprm->cred. Personality bit clearance is recorded, but now deferred on the basis that the exec procedure may yet fail. (ii) This then calls the new security_bprm_set_creds(). This should calculate the new LSM and capability credentials into *bprm->cred. This folds together security_bprm_set() and parts of security_bprm_apply_creds() (these two have been removed). Anything that might fail must be done at this point. (iii) bprm->cred_prepared is set to 1. bprm->cred_prepared is 0 on the first pass of the security calculations, and 1 on all subsequent passes. This allows SELinux in (ii) to base its calculations only on the initial script and not on the interpreter. (e) flush_old_exec() is called to commit the task to execution. This performs the following steps with regard to credentials: (i) Clear pdeath_signal and set dumpable in certain circumstances that may not be covered by commit_creds(). (ii) Clear any bits in current->personality that were deferred from (d.i). (f) install_exec_creds() [compute_creds() as was] is called to install the new credentials. This performs the following steps with regard to credentials: (i) Calls security_bprm_committing_creds() to apply any security requirements, such as flushing unauthorised files in SELinux, that must be done before the credentials are changed.
This is made up of bits of security_bprm_apply_creds() and security_bprm_post_apply_creds(), both of which have been removed. This function is not allowed to fail; anything that might fail must have been done in (d.ii). (ii) Calls commit_creds() to apply the new credentials in a single assignment (more or less). Possibly pdeath_signal and dumpable should be part of struct creds. (iii) Unlocks the task's cred_exec_mutex, thus allowing PTRACE_ATTACH to take place. (iv) Clears the bprm->cred pointer as the credentials it was holding are now immutable. (v) Calls security_bprm_committed_creds() to apply any security alterations that must be done after the creds have been changed. SELinux uses this to flush signals and signal handlers. (g) If an error occurs before (e.i), bprm_free() will call abort_creds() to destroy the proposed new credentials and will then unlock cred_exec_mutex. No changes to the credentials will have been made. (A condensed sketch of this sequence is given after section (3) below.) (2) LSM interface. A number of functions have been changed, added or removed: (*) security_bprm_alloc(), ->bprm_alloc_security() (*) security_bprm_free(), ->bprm_free_security() Removed in favour of preparing new credentials and modifying those. (*) security_bprm_apply_creds(), ->bprm_apply_creds() (*) security_bprm_post_apply_creds(), ->bprm_post_apply_creds() Removed; split between security_bprm_set_creds(), security_bprm_committing_creds() and security_bprm_committed_creds(). (*) security_bprm_set(), ->bprm_set_security() Removed; folded into security_bprm_set_creds(). (*) security_bprm_set_creds(), ->bprm_set_creds() New. The new credentials in bprm->cred should be checked and set up as appropriate. bprm->cred_prepared is 0 on the first call, 1 on the second and subsequent calls. (*) security_bprm_committing_creds(), ->bprm_committing_creds() (*) security_bprm_committed_creds(), ->bprm_committed_creds() New. Apply the security effects of the new credentials. This includes closing unauthorised files in SELinux. This function may not fail. When the former is called, the creds haven't yet been applied to the process; when the latter is called, they have. The former may access bprm->cred, the latter may not. (3) SELinux. SELinux has a number of changes, in addition to those to support the LSM interface changes mentioned above: (a) The bprm_security_struct struct has been removed in favour of using the credentials-under-construction approach. (b) flush_unauthorized_files() now takes a cred pointer and passes it on to inode_has_perm(), file_has_perm() and dentry_open().
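To make the ordering above concrete, here is a condensed sketch of the new credential flow, distilled from the fs/exec.c hunks in this patch. Error handling for the mm and file, argument copying and other unrelated setup are omitted, and alloc_bprm() is a hypothetical stand-in for the open-coded allocation, so treat this as an illustration of the sequence rather than the literal kernel code:

	/* Illustrative sketch only; condensed from this patch's do_execve().
	 * alloc_bprm() is a hypothetical stand-in for the real allocation. */
	int do_execve_sketch(char *filename, struct pt_regs *regs)
	{
		struct linux_binprm *bprm = alloc_bprm();
		int retval;

		/* (a) lock out PTRACE_ATTACH while the creds are computed */
		retval = mutex_lock_interruptible(&current->cred_exec_mutex);
		if (retval < 0)
			goto out_free;

		/* (b) take a private, modifiable copy of current's creds */
		retval = -ENOMEM;
		bprm->cred = prepare_exec_creds();
		if (!bprm->cred)
			goto out_unlock;

		/* (c) record the LSM_UNSAFE_* flags in bprm->unsafe */
		check_unsafe_exec(bprm);

		/* (d) apply set[ug]id bits and call security_bprm_set_creds();
		 * this may run once more per interpreter in a binary chain */
		retval = prepare_binprm(bprm);
		if (retval < 0)
			goto out_unlock;

		/* (e)+(f) the binfmt handler calls flush_old_exec() and then
		 * install_exec_creds(), which commits bprm->cred for good */
		retval = search_binary_handler(bprm, regs);
		if (retval < 0)
			goto out_unlock;

		mutex_unlock(&current->cred_exec_mutex);
		free_bprm(bprm);	/* bprm->cred is already NULL here */
		return retval;

	out_unlock:
		mutex_unlock(&current->cred_exec_mutex);
	out_free:
		free_bprm(bprm);	/* (g) calls abort_creds(bprm->cred) */
		return retval;
	}

The commit step itself, install_exec_creds(), is short enough to quote in full from the patch:

	void install_exec_creds(struct linux_binprm *bprm)
	{
		security_bprm_committing_creds(bprm);

		commit_creds(bprm->cred);	/* the point of no return */
		bprm->cred = NULL;		/* the creds are now immutable */

		security_bprm_committed_creds(bprm);
	}

The design point is that everything that can fail does so while bprm->cred is still a private copy; once commit_creds() runs, installing the new credentials is (more or less) a single assignment that cannot fail partway through.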
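On the LSM side, bprm->cred_prepared is what lets a module base its decisions on the initial program rather than on each interpreter. A minimal sketch of the pattern, modelled on the selinux_bprm_set_creds() hunk later in this patch (the secondary_ops call reflects SELinux's stacking of the capability hooks; a standalone module would not need it, and the module-specific work is left as comments):

	static int example_bprm_set_creds(struct linux_binprm *bprm)
	{
		int rc;

		/* let the stacked capability code fill in its part first */
		rc = secondary_ops->bprm_set_creds(bprm);
		if (rc)
			return rc;

		/* later passes are interpreters in a binary chain; keep the
		 * security state computed from the initial program */
		if (bprm->cred_prepared)
			return 0;

		/* compute this module's part of bprm->cred->security from
		 * bprm->file and bprm->unsafe; anything that can fail must
		 * fail here, before the credentials are committed */
		return 0;
	}

By contrast, bprm_committing_creds() and bprm_committed_creds() may not fail, so a module with nothing to flush can leave them as the empty stubs that security/capability.c provides.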
Signed-off-by: David Howells Acked-by: James Morris Acked-by: Serge Hallyn Signed-off-by: James Morris --- arch/x86/ia32/ia32_aout.c | 2 +- fs/binfmt_aout.c | 2 +- fs/binfmt_elf.c | 2 +- fs/binfmt_elf_fdpic.c | 2 +- fs/binfmt_flat.c | 2 +- fs/binfmt_som.c | 2 +- fs/compat.c | 42 +++--- fs/exec.c | 149 +++++++++++--------- fs/internal.h | 6 + include/linux/audit.h | 16 --- include/linux/binfmts.h | 16 ++- include/linux/cred.h | 3 +- include/linux/key.h | 2 - include/linux/security.h | 103 +++++--------- kernel/cred.c | 46 ++++++- security/capability.c | 19 +-- security/commoncap.c | 152 ++++++++++---------- security/keys/process_keys.c | 42 ------ security/root_plug.c | 13 +- security/security.c | 26 ++-- security/selinux/hooks.c | 283 ++++++++++++++++---------------------- security/selinux/include/objsec.h | 11 -- security/smack/smack_lsm.c | 3 +- 23 files changed, 429 insertions(+), 515 deletions(-) (limited to 'kernel') diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 127ec3f07214..2a4d073d2cf1 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -327,7 +327,7 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs) current->mm->cached_hole_size = 0; current->mm->mmap = NULL; - compute_creds(bprm); + install_exec_creds(bprm); current->flags &= ~PF_FORKNOEXEC; if (N_MAGIC(ex) == OMAGIC) { diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index 204cfd1d7676..f1f3f4192a60 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -320,7 +320,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) current->mm->free_area_cache = current->mm->mmap_base; current->mm->cached_hole_size = 0; - compute_creds(bprm); + install_exec_creds(bprm); current->flags &= ~PF_FORKNOEXEC; #ifdef __sparc__ if (N_MAGIC(ex) == NMAGIC) { diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 9142ff5dc8e6..f458c1217c5e 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -956,7 +956,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) } #endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */ - compute_creds(bprm); + install_exec_creds(bprm); current->flags &= ~PF_FORKNOEXEC; retval = create_elf_tables(bprm, &loc->elf_ex, load_addr, interp_load_addr); diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 45dabd59936f..aa5b43205e37 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -404,7 +404,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, current->mm->start_stack = current->mm->start_brk + stack_size; #endif - compute_creds(bprm); + install_exec_creds(bprm); current->flags &= ~PF_FORKNOEXEC; if (create_elf_fdpic_tables(bprm, current->mm, &exec_params, &interp_params) < 0) diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index ccb781a6a804..7bbd5c6b3725 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -880,7 +880,7 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs) (libinfo.lib_list[j].loaded)? 
libinfo.lib_list[j].start_data:UNLOADED_LIB; - compute_creds(bprm); + install_exec_creds(bprm); current->flags &= ~PF_FORKNOEXEC; set_binfmt(&flat_format); diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c index 74e587a52796..08644a61616e 100644 --- a/fs/binfmt_som.c +++ b/fs/binfmt_som.c @@ -255,7 +255,7 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs) kfree(hpuxhdr); set_binfmt(&som_format); - compute_creds(bprm); + install_exec_creds(bprm); setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); create_som_tables(bprm); diff --git a/fs/compat.c b/fs/compat.c index e5f49f538502..d1ece79b6411 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1393,10 +1393,20 @@ int compat_do_execve(char * filename, if (!bprm) goto out_ret; + retval = mutex_lock_interruptible(¤t->cred_exec_mutex); + if (retval < 0) + goto out_free; + + retval = -ENOMEM; + bprm->cred = prepare_exec_creds(); + if (!bprm->cred) + goto out_unlock; + check_unsafe_exec(bprm); + file = open_exec(filename); retval = PTR_ERR(file); if (IS_ERR(file)) - goto out_kfree; + goto out_unlock; sched_exec(); @@ -1410,14 +1420,10 @@ int compat_do_execve(char * filename, bprm->argc = compat_count(argv, MAX_ARG_STRINGS); if ((retval = bprm->argc) < 0) - goto out_mm; + goto out; bprm->envc = compat_count(envp, MAX_ARG_STRINGS); if ((retval = bprm->envc) < 0) - goto out_mm; - - retval = security_bprm_alloc(bprm); - if (retval) goto out; retval = prepare_binprm(bprm); @@ -1438,19 +1444,16 @@ int compat_do_execve(char * filename, goto out; retval = search_binary_handler(bprm, regs); - if (retval >= 0) { - /* execve success */ - security_bprm_free(bprm); - acct_update_integrals(current); - free_bprm(bprm); - return retval; - } + if (retval < 0) + goto out; -out: - if (bprm->security) - security_bprm_free(bprm); + /* execve succeeded */ + mutex_unlock(¤t->cred_exec_mutex); + acct_update_integrals(current); + free_bprm(bprm); + return retval; -out_mm: +out: if (bprm->mm) mmput(bprm->mm); @@ -1460,7 +1463,10 @@ out_file: fput(bprm->file); } -out_kfree: +out_unlock: + mutex_unlock(¤t->cred_exec_mutex); + +out_free: free_bprm(bprm); out_ret: diff --git a/fs/exec.c b/fs/exec.c index 9bd3559ddece..32f13e299417 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -55,6 +55,7 @@ #include #include #include +#include "internal.h" #ifdef __alpha__ /* for /sbin/loader handling in search_binary_handler() */ @@ -1007,15 +1008,17 @@ int flush_old_exec(struct linux_binprm * bprm) */ current->mm->task_size = TASK_SIZE; - if (bprm->e_uid != current_euid() || - bprm->e_gid != current_egid()) { - set_dumpable(current->mm, suid_dumpable); + /* install the new credentials */ + if (bprm->cred->uid != current_euid() || + bprm->cred->gid != current_egid()) { current->pdeath_signal = 0; } else if (file_permission(bprm->file, MAY_READ) || - (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)) { + bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP) { set_dumpable(current->mm, suid_dumpable); } + current->personality &= ~bprm->per_clear; + /* An exec changes our domain. 
We are no longer part of the thread group */ @@ -1032,13 +1035,50 @@ out: EXPORT_SYMBOL(flush_old_exec); +/* + * install the new credentials for this executable + */ +void install_exec_creds(struct linux_binprm *bprm) +{ + security_bprm_committing_creds(bprm); + + commit_creds(bprm->cred); + bprm->cred = NULL; + + /* cred_exec_mutex must be held at least to this point to prevent + * ptrace_attach() from altering our determination of the task's + * credentials; any time after this it may be unlocked */ + + security_bprm_committed_creds(bprm); +} +EXPORT_SYMBOL(install_exec_creds); + +/* + * determine how safe it is to execute the proposed program + * - the caller must hold current->cred_exec_mutex to protect against + * PTRACE_ATTACH + */ +void check_unsafe_exec(struct linux_binprm *bprm) +{ + struct task_struct *p = current; + + bprm->unsafe = tracehook_unsafe_exec(p); + + if (atomic_read(&p->fs->count) > 1 || + atomic_read(&p->files->count) > 1 || + atomic_read(&p->sighand->count) > 1) + bprm->unsafe |= LSM_UNSAFE_SHARE; +} + /* * Fill the binprm structure from the inode. * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes + * + * This may be called multiple times for binary chains (scripts for example). */ int prepare_binprm(struct linux_binprm *bprm) { - int mode; + umode_t mode; struct inode * inode = bprm->file->f_path.dentry->d_inode; int retval; @@ -1046,14 +1086,15 @@ int prepare_binprm(struct linux_binprm *bprm) if (bprm->file->f_op == NULL) return -EACCES; - bprm->e_uid = current_euid(); - bprm->e_gid = current_egid(); + /* clear any previous set[ug]id data from a previous binary */ + bprm->cred->euid = current_euid(); + bprm->cred->egid = current_egid(); - if(!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)) { + if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)) { /* Set-uid? */ if (mode & S_ISUID) { - current->personality &= ~PER_CLEAR_ON_SETID; - bprm->e_uid = inode->i_uid; + bprm->per_clear |= PER_CLEAR_ON_SETID; + bprm->cred->euid = inode->i_uid; } /* Set-gid? */ @@ -1063,50 +1104,23 @@ int prepare_binprm(struct linux_binprm *bprm) * executable. 
*/ if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { - current->personality &= ~PER_CLEAR_ON_SETID; - bprm->e_gid = inode->i_gid; + bprm->per_clear |= PER_CLEAR_ON_SETID; + bprm->cred->egid = inode->i_gid; } } /* fill in binprm security blob */ - retval = security_bprm_set(bprm); + retval = security_bprm_set_creds(bprm); if (retval) return retval; + bprm->cred_prepared = 1; - memset(bprm->buf,0,BINPRM_BUF_SIZE); - return kernel_read(bprm->file,0,bprm->buf,BINPRM_BUF_SIZE); + memset(bprm->buf, 0, BINPRM_BUF_SIZE); + return kernel_read(bprm->file, 0, bprm->buf, BINPRM_BUF_SIZE); } EXPORT_SYMBOL(prepare_binprm); -static int unsafe_exec(struct task_struct *p) -{ - int unsafe = tracehook_unsafe_exec(p); - - if (atomic_read(&p->fs->count) > 1 || - atomic_read(&p->files->count) > 1 || - atomic_read(&p->sighand->count) > 1) - unsafe |= LSM_UNSAFE_SHARE; - - return unsafe; -} - -void compute_creds(struct linux_binprm *bprm) -{ - int unsafe; - - if (bprm->e_uid != current_uid()) - current->pdeath_signal = 0; - exec_keys(current); - - task_lock(current); - unsafe = unsafe_exec(current); - security_bprm_apply_creds(bprm, unsafe); - task_unlock(current); - security_bprm_post_apply_creds(bprm); -} -EXPORT_SYMBOL(compute_creds); - /* * Arguments are '\0' separated strings found at the location bprm->p * points to; chop off the first by relocating brpm->p to right after @@ -1259,6 +1273,8 @@ EXPORT_SYMBOL(search_binary_handler); void free_bprm(struct linux_binprm *bprm) { free_arg_pages(bprm); + if (bprm->cred) + abort_creds(bprm->cred); kfree(bprm); } @@ -1284,10 +1300,20 @@ int do_execve(char * filename, if (!bprm) goto out_files; + retval = mutex_lock_interruptible(¤t->cred_exec_mutex); + if (retval < 0) + goto out_free; + + retval = -ENOMEM; + bprm->cred = prepare_exec_creds(); + if (!bprm->cred) + goto out_unlock; + check_unsafe_exec(bprm); + file = open_exec(filename); retval = PTR_ERR(file); if (IS_ERR(file)) - goto out_kfree; + goto out_unlock; sched_exec(); @@ -1301,14 +1327,10 @@ int do_execve(char * filename, bprm->argc = count(argv, MAX_ARG_STRINGS); if ((retval = bprm->argc) < 0) - goto out_mm; + goto out; bprm->envc = count(envp, MAX_ARG_STRINGS); if ((retval = bprm->envc) < 0) - goto out_mm; - - retval = security_bprm_alloc(bprm); - if (retval) goto out; retval = prepare_binprm(bprm); @@ -1330,21 +1352,18 @@ int do_execve(char * filename, current->flags &= ~PF_KTHREAD; retval = search_binary_handler(bprm,regs); - if (retval >= 0) { - /* execve success */ - security_bprm_free(bprm); - acct_update_integrals(current); - free_bprm(bprm); - if (displaced) - put_files_struct(displaced); - return retval; - } + if (retval < 0) + goto out; -out: - if (bprm->security) - security_bprm_free(bprm); + /* execve succeeded */ + mutex_unlock(¤t->cred_exec_mutex); + acct_update_integrals(current); + free_bprm(bprm); + if (displaced) + put_files_struct(displaced); + return retval; -out_mm: +out: if (bprm->mm) mmput (bprm->mm); @@ -1353,7 +1372,11 @@ out_file: allow_write_access(bprm->file); fput(bprm->file); } -out_kfree: + +out_unlock: + mutex_unlock(¤t->cred_exec_mutex); + +out_free: free_bprm(bprm); out_files: diff --git a/fs/internal.h b/fs/internal.h index 80aa9a023372..53af885f1732 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -10,6 +10,7 @@ */ struct super_block; +struct linux_binprm; /* * block_dev.c @@ -39,6 +40,11 @@ static inline int sb_is_blkdev_sb(struct super_block *sb) */ extern void __init chrdev_init(void); +/* + * exec.c + */ +extern void check_unsafe_exec(struct linux_binprm 
*); + /* * namespace.c */ diff --git a/include/linux/audit.h b/include/linux/audit.h index 0b2fcb698a63..e8ce2c4c7ac7 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -508,22 +508,6 @@ static inline int audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) return 0; } -/* - * ieieeeeee, an audit function without a return code! - * - * This function might fail! I decided that it didn't matter. We are too late - * to fail the syscall and the information isn't REQUIRED for any purpose. It's - * just nice to have. We should be able to look at past audit logs to figure - * out this process's current cap set along with the fcaps from the PATH record - * and use that to come up with the final set. Yeah, its ugly, but all the info - * is still in the audit log. So I'm not going to bother mentioning we failed - * if we couldn't allocate memory. - * - * If someone changes their mind they could create the aux record earlier and - * then search here and use that earlier allocation. But I don't wanna. - * - * -Eric - */ static inline int audit_log_bprm_fcaps(struct linux_binprm *bprm, const struct cred *new, const struct cred *old) diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 7394b5b349ff..6cbfbe297180 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -35,16 +35,20 @@ struct linux_binprm{ struct mm_struct *mm; unsigned long p; /* current top of mem */ unsigned int sh_bang:1, - misc_bang:1; + misc_bang:1, + cred_prepared:1,/* true if creds already prepared (multiple + * preps happen for interpreters) */ + cap_effective:1;/* true if has elevated effective capabilities, + * false if not; except for init which inherits + * its parent's caps anyway */ #ifdef __alpha__ unsigned int taso:1; #endif unsigned int recursion_depth; struct file * file; - int e_uid, e_gid; - kernel_cap_t cap_post_exec_permitted; - bool cap_effective; - void *security; + struct cred *cred; /* new credentials */ + int unsafe; /* how unsafe this exec is (mask of LSM_UNSAFE_*) */ + unsigned int per_clear; /* bits to clear in current->personality */ int argc, envc; char * filename; /* Name of binary as seen by procps */ char * interp; /* Name of the binary really executed. 
Most @@ -101,7 +105,7 @@ extern int setup_arg_pages(struct linux_binprm * bprm, int executable_stack); extern int bprm_mm_init(struct linux_binprm *bprm); extern int copy_strings_kernel(int argc,char ** argv,struct linux_binprm *bprm); -extern void compute_creds(struct linux_binprm *binprm); +extern void install_exec_creds(struct linux_binprm *bprm); extern int do_coredump(long signr, int exit_code, struct pt_regs * regs); extern int set_binfmt(struct linux_binfmt *new); extern void free_bprm(struct linux_binprm *); diff --git a/include/linux/cred.h b/include/linux/cred.h index eaf6fa695a04..8edb4d1d5427 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -84,8 +84,6 @@ struct thread_group_cred { struct key *process_keyring; /* keyring private to this process */ struct rcu_head rcu; /* RCU deletion hook */ }; - -extern void release_tgcred(struct cred *cred); #endif /* @@ -144,6 +142,7 @@ struct cred { extern void __put_cred(struct cred *); extern int copy_creds(struct task_struct *, unsigned long); extern struct cred *prepare_creds(void); +extern struct cred *prepare_exec_creds(void); extern struct cred *prepare_usermodehelper_creds(void); extern int commit_creds(struct cred *); extern void abort_creds(struct cred *); diff --git a/include/linux/key.h b/include/linux/key.h index 69ecf0934b02..21d32a142c00 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -278,7 +278,6 @@ extern ctl_table key_sysctls[]; * the userspace interface */ extern int install_thread_keyring_to_cred(struct cred *cred); -extern int exec_keys(struct task_struct *tsk); extern void key_fsuid_changed(struct task_struct *tsk); extern void key_fsgid_changed(struct task_struct *tsk); extern void key_init(void); @@ -294,7 +293,6 @@ extern void key_init(void); #define make_key_ref(k, p) NULL #define key_ref_to_ptr(k) NULL #define is_key_possessed(k) 0 -#define exec_keys(t) do { } while(0) #define key_fsuid_changed(t) do { } while(0) #define key_fsgid_changed(t) do { } while(0) #define key_init() do { } while(0) diff --git a/include/linux/security.h b/include/linux/security.h index 68be11251447..56a0eed65673 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -57,8 +57,7 @@ extern int cap_capset(struct cred *new, const struct cred *old, const kernel_cap_t *effective, const kernel_cap_t *inheritable, const kernel_cap_t *permitted); -extern int cap_bprm_set_security(struct linux_binprm *bprm); -extern int cap_bprm_apply_creds(struct linux_binprm *bprm, int unsafe); +extern int cap_bprm_set_creds(struct linux_binprm *bprm); extern int cap_bprm_secureexec(struct linux_binprm *bprm); extern int cap_inode_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); @@ -110,7 +109,7 @@ extern unsigned long mmap_min_addr; struct sched_param; struct request_sock; -/* bprm_apply_creds unsafe reasons */ +/* bprm->unsafe reasons */ #define LSM_UNSAFE_SHARE 1 #define LSM_UNSAFE_PTRACE 2 #define LSM_UNSAFE_PTRACE_CAP 4 @@ -154,36 +153,7 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * * Security hooks for program execution operations. * - * @bprm_alloc_security: - * Allocate and attach a security structure to the @bprm->security field. - * The security field is initialized to NULL when the bprm structure is - * allocated. - * @bprm contains the linux_binprm structure to be modified. - * Return 0 if operation was successful. - * @bprm_free_security: - * @bprm contains the linux_binprm structure to be modified. 
- * Deallocate and clear the @bprm->security field. - * @bprm_apply_creds: - * Compute and set the security attributes of a process being transformed - * by an execve operation based on the old attributes (current->security) - * and the information saved in @bprm->security by the set_security hook. - * Since this function may return an error, in which case the process will - * be killed. However, it can leave the security attributes of the - * process unchanged if an access failure occurs at this point. - * bprm_apply_creds is called under task_lock. @unsafe indicates various - * reasons why it may be unsafe to change security state. - * @bprm contains the linux_binprm structure. - * @bprm_post_apply_creds: - * Runs after bprm_apply_creds with the task_lock dropped, so that - * functions which cannot be called safely under the task_lock can - * be used. This hook is a good place to perform state changes on - * the process such as closing open file descriptors to which access - * is no longer granted if the attributes were changed. - * Note that a security module might need to save state between - * bprm_apply_creds and bprm_post_apply_creds to store the decision - * on whether the process may proceed. - * @bprm contains the linux_binprm structure. - * @bprm_set_security: + * @bprm_set_creds: * Save security information in the bprm->security field, typically based * on information about the bprm->file, for later use by the apply_creds * hook. This hook may also optionally check permissions (e.g. for @@ -196,15 +166,30 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @bprm contains the linux_binprm structure. * Return 0 if the hook is successful and permission is granted. * @bprm_check_security: - * This hook mediates the point when a search for a binary handler will - * begin. It allows a check the @bprm->security value which is set in - * the preceding set_security call. The primary difference from - * set_security is that the argv list and envp list are reliably - * available in @bprm. This hook may be called multiple times - * during a single execve; and in each pass set_security is called - * first. + * This hook mediates the point when a search for a binary handler will + * begin. It allows a check the @bprm->security value which is set in the + * preceding set_creds call. The primary difference from set_creds is + * that the argv list and envp list are reliably available in @bprm. This + * hook may be called multiple times during a single execve; and in each + * pass set_creds is called first. * @bprm contains the linux_binprm structure. * Return 0 if the hook is successful and permission is granted. + * @bprm_committing_creds: + * Prepare to install the new security attributes of a process being + * transformed by an execve operation, based on the old credentials + * pointed to by @current->cred and the information set in @bprm->cred by + * the bprm_set_creds hook. @bprm points to the linux_binprm structure. + * This hook is a good place to perform state changes on the process such + * as closing open file descriptors to which access will no longer be + * granted when the attributes are changed. This is called immediately + * before commit_creds(). + * @bprm_committed_creds: + * Tidy up after the installation of the new security attributes of a + * process being transformed by an execve operation. The new credentials + * have, by this point, been set to @current->cred. @bprm points to the + * linux_binprm structure. 
This hook is a good place to perform state + * changes on the process such as clearing out non-inheritable signal + * state. This is called immediately after commit_creds(). * @bprm_secureexec: * Return a boolean value (0 or 1) indicating whether a "secure exec" * is required. The flag is passed in the auxiliary table @@ -1301,13 +1286,11 @@ struct security_operations { int (*settime) (struct timespec *ts, struct timezone *tz); int (*vm_enough_memory) (struct mm_struct *mm, long pages); - int (*bprm_alloc_security) (struct linux_binprm *bprm); - void (*bprm_free_security) (struct linux_binprm *bprm); - int (*bprm_apply_creds) (struct linux_binprm *bprm, int unsafe); - void (*bprm_post_apply_creds) (struct linux_binprm *bprm); - int (*bprm_set_security) (struct linux_binprm *bprm); + int (*bprm_set_creds) (struct linux_binprm *bprm); int (*bprm_check_security) (struct linux_binprm *bprm); int (*bprm_secureexec) (struct linux_binprm *bprm); + void (*bprm_committing_creds) (struct linux_binprm *bprm); + void (*bprm_committed_creds) (struct linux_binprm *bprm); int (*sb_alloc_security) (struct super_block *sb); void (*sb_free_security) (struct super_block *sb); @@ -1569,12 +1552,10 @@ int security_settime(struct timespec *ts, struct timezone *tz); int security_vm_enough_memory(long pages); int security_vm_enough_memory_mm(struct mm_struct *mm, long pages); int security_vm_enough_memory_kern(long pages); -int security_bprm_alloc(struct linux_binprm *bprm); -void security_bprm_free(struct linux_binprm *bprm); -int security_bprm_apply_creds(struct linux_binprm *bprm, int unsafe); -void security_bprm_post_apply_creds(struct linux_binprm *bprm); -int security_bprm_set(struct linux_binprm *bprm); +int security_bprm_set_creds(struct linux_binprm *bprm); int security_bprm_check(struct linux_binprm *bprm); +void security_bprm_committing_creds(struct linux_binprm *bprm); +void security_bprm_committed_creds(struct linux_binprm *bprm); int security_bprm_secureexec(struct linux_binprm *bprm); int security_sb_alloc(struct super_block *sb); void security_sb_free(struct super_block *sb); @@ -1812,32 +1793,22 @@ static inline int security_vm_enough_memory_mm(struct mm_struct *mm, long pages) return cap_vm_enough_memory(mm, pages); } -static inline int security_bprm_alloc(struct linux_binprm *bprm) -{ - return 0; -} - -static inline void security_bprm_free(struct linux_binprm *bprm) -{ } - -static inline int security_bprm_apply_creds(struct linux_binprm *bprm, int unsafe) +static inline int security_bprm_set_creds(struct linux_binprm *bprm) { - return cap_bprm_apply_creds(bprm, unsafe); + return cap_bprm_set_creds(bprm); } -static inline void security_bprm_post_apply_creds(struct linux_binprm *bprm) +static inline int security_bprm_check(struct linux_binprm *bprm) { - return; + return 0; } -static inline int security_bprm_set(struct linux_binprm *bprm) +static inline void security_bprm_committing_creds(struct linux_binprm *bprm) { - return cap_bprm_set_security(bprm); } -static inline int security_bprm_check(struct linux_binprm *bprm) +static inline void security_bprm_committed_creds(struct linux_binprm *bprm) { - return 0; } static inline int security_bprm_secureexec(struct linux_binprm *bprm) diff --git a/kernel/cred.c b/kernel/cred.c index cb6b5eda978d..e6fcdd67b2ec 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -68,7 +68,7 @@ static void release_tgcred_rcu(struct rcu_head *rcu) /* * Release a set of thread group credentials. 
*/ -void release_tgcred(struct cred *cred) +static void release_tgcred(struct cred *cred) { #ifdef CONFIG_KEYS struct thread_group_cred *tgcred = cred->tgcred; @@ -163,6 +163,50 @@ error: } EXPORT_SYMBOL(prepare_creds); +/* + * Prepare credentials for current to perform an execve() + * - The caller must hold current->cred_exec_mutex + */ +struct cred *prepare_exec_creds(void) +{ + struct thread_group_cred *tgcred = NULL; + struct cred *new; + +#ifdef CONFIG_KEYS + tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); + if (!tgcred) + return NULL; +#endif + + new = prepare_creds(); + if (!new) { + kfree(tgcred); + return new; + } + +#ifdef CONFIG_KEYS + /* newly exec'd tasks don't get a thread keyring */ + key_put(new->thread_keyring); + new->thread_keyring = NULL; + + /* create a new per-thread-group creds for all this set of threads to + * share */ + memcpy(tgcred, new->tgcred, sizeof(struct thread_group_cred)); + + atomic_set(&tgcred->usage, 1); + spin_lock_init(&tgcred->lock); + + /* inherit the session keyring; new process keyring */ + key_get(tgcred->session_keyring); + tgcred->process_keyring = NULL; + + release_tgcred(new); + new->tgcred = tgcred; +#endif + + return new; +} + /* * prepare new credentials for the usermode helper dispatcher */ diff --git a/security/capability.c b/security/capability.c index efeb6d9e0e6a..185804f99ad1 100644 --- a/security/capability.c +++ b/security/capability.c @@ -32,24 +32,19 @@ static int cap_quota_on(struct dentry *dentry) return 0; } -static int cap_bprm_alloc_security(struct linux_binprm *bprm) +static int cap_bprm_check_security (struct linux_binprm *bprm) { return 0; } -static void cap_bprm_free_security(struct linux_binprm *bprm) +static void cap_bprm_committing_creds(struct linux_binprm *bprm) { } -static void cap_bprm_post_apply_creds(struct linux_binprm *bprm) +static void cap_bprm_committed_creds(struct linux_binprm *bprm) { } -static int cap_bprm_check_security(struct linux_binprm *bprm) -{ - return 0; -} - static int cap_sb_alloc_security(struct super_block *sb) { return 0; @@ -827,11 +822,9 @@ void security_fixup_ops(struct security_operations *ops) set_to_cap_if_null(ops, syslog); set_to_cap_if_null(ops, settime); set_to_cap_if_null(ops, vm_enough_memory); - set_to_cap_if_null(ops, bprm_alloc_security); - set_to_cap_if_null(ops, bprm_free_security); - set_to_cap_if_null(ops, bprm_apply_creds); - set_to_cap_if_null(ops, bprm_post_apply_creds); - set_to_cap_if_null(ops, bprm_set_security); + set_to_cap_if_null(ops, bprm_set_creds); + set_to_cap_if_null(ops, bprm_committing_creds); + set_to_cap_if_null(ops, bprm_committed_creds); set_to_cap_if_null(ops, bprm_check_security); set_to_cap_if_null(ops, bprm_secureexec); set_to_cap_if_null(ops, sb_alloc_security); diff --git a/security/commoncap.c b/security/commoncap.c index b5419273f92d..51dfa11e8e56 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -167,7 +167,7 @@ int cap_capset(struct cred *new, static inline void bprm_clear_caps(struct linux_binprm *bprm) { - cap_clear(bprm->cap_post_exec_permitted); + cap_clear(bprm->cred->cap_permitted); bprm->cap_effective = false; } @@ -198,15 +198,15 @@ int cap_inode_killpriv(struct dentry *dentry) } static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps, - struct linux_binprm *bprm) + struct linux_binprm *bprm, + bool *effective) { + struct cred *new = bprm->cred; unsigned i; int ret = 0; if (caps->magic_etc & VFS_CAP_FLAGS_EFFECTIVE) - bprm->cap_effective = true; - else - bprm->cap_effective = false; + *effective = true; 
CAP_FOR_EACH_U32(i) { __u32 permitted = caps->permitted.cap[i]; @@ -215,16 +215,13 @@ static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps, /* * pP' = (X & fP) | (pI & fI) */ - bprm->cap_post_exec_permitted.cap[i] = - (current->cred->cap_bset.cap[i] & permitted) | - (current->cred->cap_inheritable.cap[i] & inheritable); + new->cap_permitted.cap[i] = + (new->cap_bset.cap[i] & permitted) | + (new->cap_inheritable.cap[i] & inheritable); - if (permitted & ~bprm->cap_post_exec_permitted.cap[i]) { - /* - * insufficient to execute correctly - */ + if (permitted & ~new->cap_permitted.cap[i]) + /* insufficient to execute correctly */ ret = -EPERM; - } } /* @@ -232,7 +229,7 @@ static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps, * do not have enough capabilities, we return an error if they are * missing some "forced" (aka file-permitted) capabilities. */ - return bprm->cap_effective ? ret : 0; + return *effective ? ret : 0; } int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps) @@ -250,10 +247,9 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data size = inode->i_op->getxattr((struct dentry *)dentry, XATTR_NAME_CAPS, &caps, XATTR_CAPS_SZ); - if (size == -ENODATA || size == -EOPNOTSUPP) { + if (size == -ENODATA || size == -EOPNOTSUPP) /* no data, that's ok */ return -ENODATA; - } if (size < 0) return size; @@ -262,7 +258,7 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps.magic_etc); - switch ((magic_etc & VFS_CAP_REVISION_MASK)) { + switch (magic_etc & VFS_CAP_REVISION_MASK) { case VFS_CAP_REVISION_1: if (size != XATTR_CAPS_SZ_1) return -EINVAL; @@ -283,11 +279,12 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data cpu_caps->permitted.cap[i] = le32_to_cpu(caps.data[i].permitted); cpu_caps->inheritable.cap[i] = le32_to_cpu(caps.data[i].inheritable); } + return 0; } /* Locate any VFS capabilities: */ -static int get_file_caps(struct linux_binprm *bprm) +static int get_file_caps(struct linux_binprm *bprm, bool *effective) { struct dentry *dentry; int rc = 0; @@ -313,7 +310,10 @@ static int get_file_caps(struct linux_binprm *bprm) goto out; } - rc = bprm_caps_from_vfs_caps(&vcaps, bprm); + rc = bprm_caps_from_vfs_caps(&vcaps, bprm, effective); + if (rc == -EINVAL) + printk(KERN_NOTICE "%s: cap_from_disk returned %d for %s\n", + __func__, rc, bprm->filename); out: dput(dentry); @@ -334,18 +334,27 @@ int cap_inode_killpriv(struct dentry *dentry) return 0; } -static inline int get_file_caps(struct linux_binprm *bprm) +static inline int get_file_caps(struct linux_binprm *bprm, bool *effective) { bprm_clear_caps(bprm); return 0; } #endif -int cap_bprm_set_security (struct linux_binprm *bprm) +/* + * set up the new credentials for an exec'd task + */ +int cap_bprm_set_creds(struct linux_binprm *bprm) { + const struct cred *old = current_cred(); + struct cred *new = bprm->cred; + bool effective; int ret; - ret = get_file_caps(bprm); + effective = false; + ret = get_file_caps(bprm, &effective); + if (ret < 0) + return ret; if (!issecure(SECURE_NOROOT)) { /* @@ -353,63 +362,47 @@ int cap_bprm_set_security (struct linux_binprm *bprm) * executables under compatibility mode, we override the * capability sets for the file. * - * If only the real uid is 0, we do not set the effective - * bit. + * If only the real uid is 0, we do not set the effective bit. 
*/ - if (bprm->e_uid == 0 || current_uid() == 0) { + if (new->euid == 0 || new->uid == 0) { /* pP' = (cap_bset & ~0) | (pI & ~0) */ - bprm->cap_post_exec_permitted = cap_combine( - current->cred->cap_bset, - current->cred->cap_inheritable); - bprm->cap_effective = (bprm->e_uid == 0); - ret = 0; + new->cap_permitted = cap_combine(old->cap_bset, + old->cap_inheritable); } + if (new->euid == 0) + effective = true; } - return ret; -} - -int cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe) -{ - const struct cred *old = current_cred(); - struct cred *new; - - new = prepare_creds(); - if (!new) - return -ENOMEM; - - if (bprm->e_uid != old->uid || bprm->e_gid != old->gid || - !cap_issubset(bprm->cap_post_exec_permitted, - old->cap_permitted)) { - set_dumpable(current->mm, suid_dumpable); - current->pdeath_signal = 0; - - if (unsafe & ~LSM_UNSAFE_PTRACE_CAP) { - if (!capable(CAP_SETUID)) { - bprm->e_uid = old->uid; - bprm->e_gid = old->gid; - } - if (cap_limit_ptraced_target()) { - bprm->cap_post_exec_permitted = cap_intersect( - bprm->cap_post_exec_permitted, - new->cap_permitted); - } + /* Don't let someone trace a set[ug]id/setpcap binary with the revised + * credentials unless they have the appropriate permit + */ + if ((new->euid != old->uid || + new->egid != old->gid || + !cap_issubset(new->cap_permitted, old->cap_permitted)) && + bprm->unsafe & ~LSM_UNSAFE_PTRACE_CAP) { + /* downgrade; they get no more than they had, and maybe less */ + if (!capable(CAP_SETUID)) { + new->euid = new->uid; + new->egid = new->gid; } + if (cap_limit_ptraced_target()) + new->cap_permitted = cap_intersect(new->cap_permitted, + old->cap_permitted); } - new->suid = new->euid = new->fsuid = bprm->e_uid; - new->sgid = new->egid = new->fsgid = bprm->e_gid; + new->suid = new->fsuid = new->euid; + new->sgid = new->fsgid = new->egid; - /* For init, we want to retain the capabilities set - * in the init_task struct. Thus we skip the usual - * capability rules */ + /* For init, we want to retain the capabilities set in the initial + * task. 
Thus we skip the usual capability rules + */ if (!is_global_init(current)) { - new->cap_permitted = bprm->cap_post_exec_permitted; - if (bprm->cap_effective) - new->cap_effective = bprm->cap_post_exec_permitted; + if (effective) + new->cap_effective = new->cap_permitted; else cap_clear(new->cap_effective); } + bprm->cap_effective = effective; /* * Audit candidate if current->cap_effective is set @@ -425,23 +418,31 @@ int cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe) */ if (!cap_isclear(new->cap_effective)) { if (!cap_issubset(CAP_FULL_SET, new->cap_effective) || - bprm->e_uid != 0 || new->uid != 0 || - issecure(SECURE_NOROOT)) - audit_log_bprm_fcaps(bprm, new, old); + new->euid != 0 || new->uid != 0 || + issecure(SECURE_NOROOT)) { + ret = audit_log_bprm_fcaps(bprm, new, old); + if (ret < 0) + return ret; + } } new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); - return commit_creds(new); + return 0; } -int cap_bprm_secureexec (struct linux_binprm *bprm) +/* + * determine whether a secure execution is required + * - the creds have been committed at this point, and are no longer available + * through bprm + */ +int cap_bprm_secureexec(struct linux_binprm *bprm) { const struct cred *cred = current_cred(); if (cred->uid != 0) { if (bprm->cap_effective) return 1; - if (!cap_isclear(bprm->cap_post_exec_permitted)) + if (!cap_isclear(cred->cap_permitted)) return 1; } @@ -477,7 +478,7 @@ int cap_inode_removexattr(struct dentry *dentry, const char *name) } /* moved from kernel/sys.c. */ -/* +/* * cap_emulate_setxuid() fixes the effective / permitted capabilities of * a process after a call to setuid, setreuid, or setresuid. * @@ -491,10 +492,10 @@ int cap_inode_removexattr(struct dentry *dentry, const char *name) * 3) When set*uiding _from_ euid != 0 _to_ euid == 0, the effective * capabilities are set to the permitted capabilities. * - * fsuid is handled elsewhere. fsuid == 0 and {r,e,s}uid!= 0 should + * fsuid is handled elsewhere. fsuid == 0 and {r,e,s}uid!= 0 should * never happen. 
* - * -astor + * -astor * * cevans - New behaviour, Oct '99 * A process may, via prctl(), elect to keep its capabilities when it @@ -751,4 +752,3 @@ int cap_vm_enough_memory(struct mm_struct *mm, long pages) cap_sys_admin = 1; return __vm_enough_memory(mm, pages, cap_sys_admin); } - diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index df329f684a65..2f5d89e92b85 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -274,48 +274,6 @@ static int install_session_keyring(struct key *keyring) return commit_creds(new); } -/*****************************************************************************/ -/* - * deal with execve() - */ -int exec_keys(struct task_struct *tsk) -{ - struct thread_group_cred *tgcred = NULL; - struct cred *new; - -#ifdef CONFIG_KEYS - tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); - if (!tgcred) - return -ENOMEM; -#endif - - new = prepare_creds(); - if (new < 0) - return -ENOMEM; - - /* newly exec'd tasks don't get a thread keyring */ - key_put(new->thread_keyring); - new->thread_keyring = NULL; - - /* create a new per-thread-group creds for all this set of threads to - * share */ - memcpy(tgcred, new->tgcred, sizeof(struct thread_group_cred)); - - atomic_set(&tgcred->usage, 1); - spin_lock_init(&tgcred->lock); - - /* inherit the session keyring; new process keyring */ - key_get(tgcred->session_keyring); - tgcred->process_keyring = NULL; - - release_tgcred(new); - new->tgcred = tgcred; - - commit_creds(new); - return 0; - -} /* end exec_keys() */ - /*****************************************************************************/ /* * the filesystem user ID changed diff --git a/security/root_plug.c b/security/root_plug.c index c3f68b5b372d..40fb4f15e27b 100644 --- a/security/root_plug.c +++ b/security/root_plug.c @@ -55,9 +55,9 @@ static int rootplug_bprm_check_security (struct linux_binprm *bprm) struct usb_device *dev; root_dbg("file %s, e_uid = %d, e_gid = %d\n", - bprm->filename, bprm->e_uid, bprm->e_gid); + bprm->filename, bprm->cred->euid, bprm->cred->egid); - if (bprm->e_gid == 0) { + if (bprm->cred->egid == 0) { dev = usb_find_device(vendor_id, product_id); if (!dev) { root_dbg("e_gid = 0, and device not found, " @@ -75,15 +75,12 @@ static struct security_operations rootplug_security_ops = { .ptrace_may_access = cap_ptrace_may_access, .ptrace_traceme = cap_ptrace_traceme, .capget = cap_capget, - .capset_check = cap_capset_check, - .capset_set = cap_capset_set, + .capset = cap_capset, .capable = cap_capable, - .bprm_apply_creds = cap_bprm_apply_creds, - .bprm_set_security = cap_bprm_set_security, + .bprm_set_creds = cap_bprm_set_creds, - .task_post_setuid = cap_task_post_setuid, - .task_reparent_to_init = cap_task_reparent_to_init, + .task_fix_setuid = cap_task_fix_setuid, .task_prctl = cap_task_prctl, .bprm_check_security = rootplug_bprm_check_security, diff --git a/security/security.c b/security/security.c index a55d739c6864..dc5babb2d6d8 100644 --- a/security/security.c +++ b/security/security.c @@ -213,34 +213,24 @@ int security_vm_enough_memory_kern(long pages) return security_ops->vm_enough_memory(current->mm, pages); } -int security_bprm_alloc(struct linux_binprm *bprm) +int security_bprm_set_creds(struct linux_binprm *bprm) { - return security_ops->bprm_alloc_security(bprm); + return security_ops->bprm_set_creds(bprm); } -void security_bprm_free(struct linux_binprm *bprm) -{ - security_ops->bprm_free_security(bprm); -} - -int security_bprm_apply_creds(struct linux_binprm *bprm, int unsafe) -{ - return 
security_ops->bprm_apply_creds(bprm, unsafe); -} - -void security_bprm_post_apply_creds(struct linux_binprm *bprm) +int security_bprm_check(struct linux_binprm *bprm) { - security_ops->bprm_post_apply_creds(bprm); + return security_ops->bprm_check_security(bprm); } -int security_bprm_set(struct linux_binprm *bprm) +void security_bprm_committing_creds(struct linux_binprm *bprm) { - return security_ops->bprm_set_security(bprm); + return security_ops->bprm_committing_creds(bprm); } -int security_bprm_check(struct linux_binprm *bprm) +void security_bprm_committed_creds(struct linux_binprm *bprm) { - return security_ops->bprm_check_security(bprm); + return security_ops->bprm_committed_creds(bprm); } int security_bprm_secureexec(struct linux_binprm *bprm) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index c71bba78872f..21a592184633 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2029,59 +2029,45 @@ static int selinux_vm_enough_memory(struct mm_struct *mm, long pages) /* binprm security operations */ -static int selinux_bprm_alloc_security(struct linux_binprm *bprm) +static int selinux_bprm_set_creds(struct linux_binprm *bprm) { - struct bprm_security_struct *bsec; - - bsec = kzalloc(sizeof(struct bprm_security_struct), GFP_KERNEL); - if (!bsec) - return -ENOMEM; - - bsec->sid = SECINITSID_UNLABELED; - bsec->set = 0; - - bprm->security = bsec; - return 0; -} - -static int selinux_bprm_set_security(struct linux_binprm *bprm) -{ - struct task_security_struct *tsec; - struct inode *inode = bprm->file->f_path.dentry->d_inode; + const struct task_security_struct *old_tsec; + struct task_security_struct *new_tsec; struct inode_security_struct *isec; - struct bprm_security_struct *bsec; - u32 newsid; struct avc_audit_data ad; + struct inode *inode = bprm->file->f_path.dentry->d_inode; int rc; - rc = secondary_ops->bprm_set_security(bprm); + rc = secondary_ops->bprm_set_creds(bprm); if (rc) return rc; - bsec = bprm->security; - - if (bsec->set) + /* SELinux context only depends on initial program or script and not + * the script interpreter */ + if (bprm->cred_prepared) return 0; - tsec = current_security(); + old_tsec = current_security(); + new_tsec = bprm->cred->security; isec = inode->i_security; /* Default to the current task SID. */ - bsec->sid = tsec->sid; + new_tsec->sid = old_tsec->sid; + new_tsec->osid = old_tsec->sid; /* Reset fs, key, and sock SIDs on execve. */ - tsec->create_sid = 0; - tsec->keycreate_sid = 0; - tsec->sockcreate_sid = 0; + new_tsec->create_sid = 0; + new_tsec->keycreate_sid = 0; + new_tsec->sockcreate_sid = 0; - if (tsec->exec_sid) { - newsid = tsec->exec_sid; + if (old_tsec->exec_sid) { + new_tsec->sid = old_tsec->exec_sid; /* Reset exec SID on execve. */ - tsec->exec_sid = 0; + new_tsec->exec_sid = 0; } else { /* Check for a default transition on this program. 
*/ - rc = security_transition_sid(tsec->sid, isec->sid, - SECCLASS_PROCESS, &newsid); + rc = security_transition_sid(old_tsec->sid, isec->sid, + SECCLASS_PROCESS, &new_tsec->sid); if (rc) return rc; } @@ -2090,33 +2076,63 @@ static int selinux_bprm_set_security(struct linux_binprm *bprm) ad.u.fs.path = bprm->file->f_path; if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) - newsid = tsec->sid; + new_tsec->sid = old_tsec->sid; - if (tsec->sid == newsid) { - rc = avc_has_perm(tsec->sid, isec->sid, + if (new_tsec->sid == old_tsec->sid) { + rc = avc_has_perm(old_tsec->sid, isec->sid, SECCLASS_FILE, FILE__EXECUTE_NO_TRANS, &ad); if (rc) return rc; } else { /* Check permissions for the transition. */ - rc = avc_has_perm(tsec->sid, newsid, + rc = avc_has_perm(old_tsec->sid, new_tsec->sid, SECCLASS_PROCESS, PROCESS__TRANSITION, &ad); if (rc) return rc; - rc = avc_has_perm(newsid, isec->sid, + rc = avc_has_perm(new_tsec->sid, isec->sid, SECCLASS_FILE, FILE__ENTRYPOINT, &ad); if (rc) return rc; - /* Clear any possibly unsafe personality bits on exec: */ - current->personality &= ~PER_CLEAR_ON_SETID; + /* Check for shared state */ + if (bprm->unsafe & LSM_UNSAFE_SHARE) { + rc = avc_has_perm(old_tsec->sid, new_tsec->sid, + SECCLASS_PROCESS, PROCESS__SHARE, + NULL); + if (rc) + return -EPERM; + } + + /* Make sure that anyone attempting to ptrace over a task that + * changes its SID has the appropriate permit */ + if (bprm->unsafe & + (LSM_UNSAFE_PTRACE | LSM_UNSAFE_PTRACE_CAP)) { + struct task_struct *tracer; + struct task_security_struct *sec; + u32 ptsid = 0; + + rcu_read_lock(); + tracer = tracehook_tracer_task(current); + if (likely(tracer != NULL)) { + sec = __task_cred(tracer)->security; + ptsid = sec->sid; + } + rcu_read_unlock(); + + if (ptsid != 0) { + rc = avc_has_perm(ptsid, new_tsec->sid, + SECCLASS_PROCESS, + PROCESS__PTRACE, NULL); + if (rc) + return -EPERM; + } + } - /* Set the security field to the new SID. */ - bsec->sid = newsid; + /* Clear any possibly unsafe personality bits on exec: */ + bprm->per_clear |= PER_CLEAR_ON_SETID; } - bsec->set = 1; return 0; } @@ -2125,7 +2141,6 @@ static int selinux_bprm_check_security(struct linux_binprm *bprm) return secondary_ops->bprm_check_security(bprm); } - static int selinux_bprm_secureexec(struct linux_binprm *bprm) { const struct cred *cred = current_cred(); @@ -2141,19 +2156,13 @@ static int selinux_bprm_secureexec(struct linux_binprm *bprm) the noatsecure permission is granted between the two SIDs, i.e. ahp returns 0. 
*/ atsecure = avc_has_perm(osid, sid, - SECCLASS_PROCESS, - PROCESS__NOATSECURE, NULL); + SECCLASS_PROCESS, + PROCESS__NOATSECURE, NULL); } return (atsecure || secondary_ops->bprm_secureexec(bprm)); } -static void selinux_bprm_free_security(struct linux_binprm *bprm) -{ - kfree(bprm->security); - bprm->security = NULL; -} - extern struct vfsmount *selinuxfs_mount; extern struct dentry *selinux_null; @@ -2252,108 +2261,78 @@ static inline void flush_unauthorized_files(const struct cred *cred, spin_unlock(&files->file_lock); } -static int selinux_bprm_apply_creds(struct linux_binprm *bprm, int unsafe) +/* + * Prepare a process for imminent new credential changes due to exec + */ +static void selinux_bprm_committing_creds(struct linux_binprm *bprm) { - struct task_security_struct *tsec; - struct bprm_security_struct *bsec; - struct cred *new; - u32 sid; - int rc; - - rc = secondary_ops->bprm_apply_creds(bprm, unsafe); - if (rc < 0) - return rc; - - new = prepare_creds(); - if (!new) - return -ENOMEM; + struct task_security_struct *new_tsec; + struct rlimit *rlim, *initrlim; + int rc, i; - tsec = new->security; + secondary_ops->bprm_committing_creds(bprm); - bsec = bprm->security; - sid = bsec->sid; - - tsec->osid = tsec->sid; - bsec->unsafe = 0; - if (tsec->sid != sid) { - /* Check for shared state. If not ok, leave SID - unchanged and kill. */ - if (unsafe & LSM_UNSAFE_SHARE) { - rc = avc_has_perm(tsec->sid, sid, SECCLASS_PROCESS, - PROCESS__SHARE, NULL); - if (rc) { - bsec->unsafe = 1; - goto out; - } - } + new_tsec = bprm->cred->security; + if (new_tsec->sid == new_tsec->osid) + return; - /* Check for ptracing, and update the task SID if ok. - Otherwise, leave SID unchanged and kill. */ - if (unsafe & (LSM_UNSAFE_PTRACE | LSM_UNSAFE_PTRACE_CAP)) { - struct task_struct *tracer; - struct task_security_struct *sec; - u32 ptsid = 0; + /* Close files for which the new task SID is not authorized. */ + flush_unauthorized_files(bprm->cred, current->files); - rcu_read_lock(); - tracer = tracehook_tracer_task(current); - if (likely(tracer != NULL)) { - sec = __task_cred(tracer)->security; - ptsid = sec->sid; - } - rcu_read_unlock(); + /* Always clear parent death signal on SID transitions. */ + current->pdeath_signal = 0; - if (ptsid != 0) { - rc = avc_has_perm(ptsid, sid, SECCLASS_PROCESS, - PROCESS__PTRACE, NULL); - if (rc) { - bsec->unsafe = 1; - goto out; - } - } + /* Check whether the new SID can inherit resource limits from the old + * SID. If not, reset all soft limits to the lower of the current + * task's hard limit and the init task's soft limit. + * + * Note that the setting of hard limits (even to lower them) can be + * controlled by the setrlimit check. The inclusion of the init task's + * soft limit into the computation is to avoid resetting soft limits + * higher than the default soft limit for cases where the default is + * lower than the hard limit, e.g. RLIMIT_CORE or RLIMIT_STACK. 
+ */ + rc = avc_has_perm(new_tsec->osid, new_tsec->sid, SECCLASS_PROCESS, + PROCESS__RLIMITINH, NULL); + if (rc) { + for (i = 0; i < RLIM_NLIMITS; i++) { + rlim = current->signal->rlim + i; + initrlim = init_task.signal->rlim + i; + rlim->rlim_cur = min(rlim->rlim_max, initrlim->rlim_cur); } - tsec->sid = sid; + update_rlimit_cpu(rlim->rlim_cur); } - -out: - commit_creds(new); - return 0; } /* - * called after apply_creds without the task lock held + * Clean up the process immediately after the installation of new credentials + * due to exec */ -static void selinux_bprm_post_apply_creds(struct linux_binprm *bprm) +static void selinux_bprm_committed_creds(struct linux_binprm *bprm) { - const struct cred *cred = current_cred(); - struct task_security_struct *tsec; - struct rlimit *rlim, *initrlim; + const struct task_security_struct *tsec = current_security(); struct itimerval itimer; - struct bprm_security_struct *bsec; struct sighand_struct *psig; + u32 osid, sid; int rc, i; unsigned long flags; - tsec = current_security(); - bsec = bprm->security; + secondary_ops->bprm_committed_creds(bprm); - if (bsec->unsafe) { - force_sig_specific(SIGKILL, current); - return; - } - if (tsec->osid == tsec->sid) + osid = tsec->osid; + sid = tsec->sid; + + if (sid == osid) return; - /* Close files for which the new task SID is not authorized. */ - flush_unauthorized_files(cred, current->files); - - /* Check whether the new SID can inherit signal state - from the old SID. If not, clear itimers to avoid - subsequent signal generation and flush and unblock - signals. This must occur _after_ the task SID has - been updated so that any kill done after the flush - will be checked against the new SID. */ - rc = avc_has_perm(tsec->osid, tsec->sid, SECCLASS_PROCESS, - PROCESS__SIGINH, NULL); + /* Check whether the new SID can inherit signal state from the old SID. + * If not, clear itimers to avoid subsequent signal generation and + * flush and unblock signals. + * + * This must occur _after_ the task SID has been updated so that any + * kill done after the flush will be checked against the new SID. + */ + rc = avc_has_perm(osid, sid, SECCLASS_PROCESS, PROCESS__SIGINH, NULL); if (rc) { memset(&itimer, 0, sizeof itimer); for (i = 0; i < 3; i++) @@ -2366,32 +2345,8 @@ static void selinux_bprm_post_apply_creds(struct linux_binprm *bprm) spin_unlock_irq(¤t->sighand->siglock); } - /* Always clear parent death signal on SID transitions. */ - current->pdeath_signal = 0; - - /* Check whether the new SID can inherit resource limits - from the old SID. If not, reset all soft limits to - the lower of the current task's hard limit and the init - task's soft limit. Note that the setting of hard limits - (even to lower them) can be controlled by the setrlimit - check. The inclusion of the init task's soft limit into - the computation is to avoid resetting soft limits higher - than the default soft limit for cases where the default - is lower than the hard limit, e.g. RLIMIT_CORE or - RLIMIT_STACK.*/ - rc = avc_has_perm(tsec->osid, tsec->sid, SECCLASS_PROCESS, - PROCESS__RLIMITINH, NULL); - if (rc) { - for (i = 0; i < RLIM_NLIMITS; i++) { - rlim = current->signal->rlim + i; - initrlim = init_task.signal->rlim+i; - rlim->rlim_cur = min(rlim->rlim_max, initrlim->rlim_cur); - } - update_rlimit_cpu(rlim->rlim_cur); - } - - /* Wake up the parent if it is waiting so that it can - recheck wait permission to the new task SID. */ + /* Wake up the parent if it is waiting so that it can recheck + * wait permission to the new task SID. 
*/ read_lock_irq(&tasklist_lock); psig = current->parent->sighand; spin_lock_irqsave(&psig->siglock, flags); @@ -5556,12 +5511,10 @@ static struct security_operations selinux_ops = { .netlink_send = selinux_netlink_send, .netlink_recv = selinux_netlink_recv, - .bprm_alloc_security = selinux_bprm_alloc_security, - .bprm_free_security = selinux_bprm_free_security, - .bprm_apply_creds = selinux_bprm_apply_creds, - .bprm_post_apply_creds = selinux_bprm_post_apply_creds, - .bprm_set_security = selinux_bprm_set_security, + .bprm_set_creds = selinux_bprm_set_creds, .bprm_check_security = selinux_bprm_check_security, + .bprm_committing_creds = selinux_bprm_committing_creds, + .bprm_committed_creds = selinux_bprm_committed_creds, .bprm_secureexec = selinux_bprm_secureexec, .sb_alloc_security = selinux_sb_alloc_security, diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h index f8be8d7fa26d..3cc45168f674 100644 --- a/security/selinux/include/objsec.h +++ b/security/selinux/include/objsec.h @@ -77,17 +77,6 @@ struct ipc_security_struct { u32 sid; /* SID of IPC resource */ }; -struct bprm_security_struct { - u32 sid; /* SID for transformed process */ - unsigned char set; - - /* - * unsafe is used to share failure information from bprm_apply_creds() - * to bprm_post_apply_creds(). - */ - char unsafe; -}; - struct netif_security_struct { int ifindex; /* device index */ u32 sid; /* SID for this interface */ diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index e952b397153d..de396742abf4 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -2596,8 +2596,7 @@ struct security_operations smack_ops = { .settime = cap_settime, .vm_enough_memory = cap_vm_enough_memory, - .bprm_apply_creds = cap_bprm_apply_creds, - .bprm_set_security = cap_bprm_set_security, + .bprm_set_creds = cap_bprm_set_creds, .bprm_secureexec = cap_bprm_secureexec, .sb_alloc_security = smack_sb_alloc_security, -- cgit v1.2.3 From 98870ab0a5a3f1822aee681d2997017e1c87d026 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:26 +1100 Subject: CRED: Documentation Document credentials and the new credentials API. Signed-off-by: David Howells Signed-off-by: James Morris --- Documentation/credentials.txt | 582 ++++++++++++++++++++++++++++++++++++++++++ include/linux/cred.h | 12 +- kernel/cred.c | 2 +- 3 files changed, 594 insertions(+), 2 deletions(-) create mode 100644 Documentation/credentials.txt (limited to 'kernel') diff --git a/Documentation/credentials.txt b/Documentation/credentials.txt new file mode 100644 index 000000000000..df03169782ea --- /dev/null +++ b/Documentation/credentials.txt @@ -0,0 +1,582 @@ + ==================== + CREDENTIALS IN LINUX + ==================== + +By: David Howells + +Contents: + + (*) Overview. + + (*) Types of credentials. + + (*) File markings. + + (*) Task credentials. + + - Immutable credentials. + - Accessing task credentials. + - Accessing another task's credentials. + - Altering credentials. + - Managing credentials. + + (*) Open file credentials. + + (*) Overriding the VFS's use of credentials. + + +======== +OVERVIEW +======== + +There are several parts to the security check performed by Linux when one +object acts upon another: + + (1) Objects. + + Objects are things in the system that may be acted upon directly by + userspace programs. 
Linux has a variety of actionable objects, including: + + - Tasks + - Files/inodes + - Sockets + - Message queues + - Shared memory segments + - Semaphores + - Keys + + As a part of the description of all these objects there is a set of + credentials. What's in the set depends on the type of object. + + (2) Object ownership. + + Amongst the credentials of most objects, there will be a subset that + indicates the ownership of that object. This is used for resource + accounting and limitation (disk quotas and task rlimits for example). + + In a standard UNIX filesystem, for instance, this will be defined by the + UID marked on the inode. + + (3) The objective context. + + Also amongst the credentials of those objects, there will be a subset that + indicates the 'objective context' of that object. This may or may not be + the same set as in (2) - in standard UNIX files, for instance, this is + defined by the UID and the GID marked on the inode. + + The objective context is used as part of the security calculation that is + carried out when an object is acted upon. + + (4) Subjects. + + A subject is an object that is acting upon another object. + + Most of the objects in the system are inactive: they don't act on other + objects within the system. Processes/tasks are the obvious exception: + they do stuff; they access and manipulate things. + + Objects other than tasks may under some circumstances also be subjects. + For instance an open file may send SIGIO to a task using the UID and EUID + given to it by a task that called fcntl(F_SETOWN) upon it. In this case, + the file struct will have a subjective context too. + + (5) The subjective context. + + A subject has an additional interpretation of its credentials. A subset + of its credentials forms the 'subjective context'. The subjective context + is used as part of the security calculation that is carried out when a + subject acts. + + A Linux task, for example, has the FSUID, FSGID and the supplementary + group list for when it is acting upon a file - which are quite separate + from the real UID and GID that normally form the objective context of the + task. + + (6) Actions. + + Linux has a number of actions available that a subject may perform upon an + object. The set of actions available depends on the nature of the subject + and the object. + + Actions include reading, writing, creating and deleting files; forking or + signalling and tracing tasks. + + (7) Rules, access control lists and security calculations. + + When a subject acts upon an object, a security calculation is made. This + involves taking the subjective context, the objective context and the + action, and searching one or more sets of rules to see whether the subject + is granted or denied permission to act in the desired manner on the + object, given those contexts. + + There are two main sources of rules: + + (a) Discretionary access control (DAC): + + Sometimes the object will include sets of rules as part of its + description. This is an 'Access Control List' or 'ACL'. A Linux + file may supply more than one ACL. + + A traditional UNIX file, for example, includes a permissions mask that + is an abbreviated ACL with three fixed classes of subject ('user', + 'group' and 'other'), each of which may be granted certain privileges + ('read', 'write' and 'execute' - whatever those map to for the object + in question). UNIX file permissions do not allow the arbitrary + specification of subjects, however, and so are of limited use.
+ + A Linux file might also sport a POSIX ACL. This is a list of rules + that grants various permissions to arbitrary subjects. + + (b) Mandatory access control (MAC): + + The system as a whole may have one or more sets of rules that get + applied to all subjects and objects, regardless of their source. + SELinux and Smack are examples of this. + + In the case of SELinux and Smack, each object is given a label as part + of its credentials. When an action is requested, they take the + subject label, the object label and the action and look for a rule + that says that this action is either granted or denied. + + +==================== +TYPES OF CREDENTIALS +==================== + +The Linux kernel supports the following types of credentials: + + (1) Traditional UNIX credentials. + + Real User ID + Real Group ID + + The UID and GID are carried by most, if not all, Linux objects, even if in + some cases it has to be invented (FAT or CIFS files for example, which are + derived from Windows). These (mostly) define the objective context of + that object, with tasks being slightly different in some cases. + + Effective, Saved and FS User ID + Effective, Saved and FS Group ID + Supplementary groups + + These are additional credentials used by tasks only. Usually, an + EUID/EGID/GROUPS will be used as the subjective context, and real UID/GID + will be used as the objective. For tasks, it should be noted that this is + not always true. + + (2) Capabilities. + + Set of permitted capabilities + Set of inheritable capabilities + Set of effective capabilities + Capability bounding set + + These are only carried by tasks. They indicate superior capabilities + granted piecemeal to a task that an ordinary task wouldn't otherwise have. + These are manipulated implicitly by changes to the traditional UNIX + credentials, but can also be manipulated directly by the capset() system + call. + + The permitted capabilities are those caps that the process might grant + itself to its effective or permitted sets through capset(). This + inheritable set might also be so constrained. + + The effective capabilities are the ones that a task is actually allowed to + make use of itself. + + The inheritable capabilities are the ones that may get passed across + execve(). + + The bounding set limits the capabilities that may be inherited across + execve(), especially when a binary is executed that will execute as UID 0. + + (3) Secure management flags (securebits). + + These are only carried by tasks. These govern the way the above + credentials are manipulated and inherited over certain operations such as + execve(). They aren't used directly as objective or subjective + credentials. + + (4) Keys and keyrings. + + These are only carried by tasks. They carry and cache security tokens + that don't fit into the other standard UNIX credentials. They are for + making such things as network filesystem keys available to the file + accesses performed by processes, without the necessity of ordinary + programs having to know about security details involved. + + Keyrings are a special type of key. They carry sets of other keys and can + be searched for the desired key. Each process may subscribe to a number + of keyrings: + + Per-thread keyring + Per-process keyring + Per-session keyring + + When a process accesses a key, if not already present, it will normally be + cached on one of these keyrings for future accesses to find. + + For more information on using keys, see Documentation/keys.txt.
+ + (5) LSM + + The Linux Security Module allows extra controls to be placed over the + operations that a task may do. Currently Linux supports two main + alternate LSM options: SELinux and Smack. + + Both work by labelling the objects in a system and then applying sets of + rules (policies) that say what operations a task with one label may do to + an object with another label. + + (6) AF_KEY + + This is a socket-based approach to credential management for networking + stacks [RFC 2367]. It isn't discussed by this document as it doesn't + interact directly with task and file credentials; rather it keeps system + level credentials. + + +When a file is opened, part of the opening task's subjective context is +recorded in the file struct created. This allows operations using that file +struct to use those credentials instead of the subjective context of the task +that issued the operation. An example of this would be a file opened on a +network filesystem where the credentials of the opened file should be presented +to the server, regardless of who is actually doing a read or a write upon it. + + +============= +FILE MARKINGS +============= + +Files on disk or obtained over the network may have annotations that form the +objective security context of that file. Depending on the type of filesystem, +this may include one or more of the following: + + (*) UNIX UID, GID, mode; + + (*) Windows user ID; + + (*) Access control list; + + (*) LSM security label; + + (*) UNIX exec privilege escalation bits (SUID/SGID); + + (*) File capabilities exec privilege escalation bits. + +These are compared to the task's subjective security context, and certain +operations allowed or disallowed as a result. In the case of execve(), the +privilege escalation bits come into play, and may allow the resulting process +extra privileges, based on the annotations on the executable file. + + +================ +TASK CREDENTIALS +================ + +In Linux, all of a task's credentials are held in (uid, gid) or through +(groups, keys, LSM security) a refcounted structure of type 'struct cred'. +Each task points to its credentials by a pointer called 'cred' in its +task_struct. + +Once a set of credentials has been prepared and committed, it may not be +changed, barring the following exceptions: + + (1) its reference count may be changed; + + (2) the reference count on the group_info struct it points to may be changed; + + (3) the reference count on the security data it points to may be changed; + + (4) the reference count on any keyrings it points to may be changed; + + (5) any keyrings it points to may be revoked, expired or have their security + attributes changed; and + + (6) the contents of any keyrings to which it points may be changed (the whole + point of keyrings being a shared set of credentials, modifiable by anyone + with appropriate access). + +To alter anything in the cred struct, the copy-and-replace principle must be +adhered to. First take a copy, then alter the copy and then use RCU to change +the task pointer to make it point to the new copy. There are wrappers to aid +with this (see below). + +A task may only alter its _own_ credentials; it is no longer permitted for a +task to alter another's credentials. This means the capset() system call is no +longer permitted to take any PID other than the one of the current process. 
+Also keyctl_instantiate() and keyctl_negate() functions no longer permit +attachment to process-specific keyrings in the requesting process as the +instantiating process may need to create them. + + +IMMUTABLE CREDENTIALS +--------------------- + +Once a set of credentials has been made public (by calling commit_creds() for +example), it must be considered immutable, barring two exceptions: + + (1) The reference count may be altered. + + (2) Whilst the keyring subscriptions of a set of credentials may not be + changed, the keyrings subscribed to may have their contents altered. + +To catch accidental credential alteration at compile time, struct task_struct +has _const_ pointers to its credential sets, as does struct file. Furthermore, +certain functions such as get_cred() and put_cred() operate on const pointers, +thus rendering casts unnecessary, but need to temporarily ditch the const +qualification to be able to alter the reference count. + + +ACCESSING TASK CREDENTIALS +-------------------------- + +A task being able to alter only its own credentials permits the current process +to read or replace its own credentials without the need for any form of locking +- which simplifies things greatly. It can just call: + + const struct cred *current_cred() + +to get a pointer to its credentials structure, and it doesn't have to release +it afterwards. + +There are convenience wrappers for retrieving specific aspects of a task's +credentials (the value is simply returned in each case): + + uid_t current_uid(void) Current's real UID + gid_t current_gid(void) Current's real GID + uid_t current_euid(void) Current's effective UID + gid_t current_egid(void) Current's effective GID + uid_t current_fsuid(void) Current's file access UID + gid_t current_fsgid(void) Current's file access GID + kernel_cap_t current_cap(void) Current's effective capabilities + void *current_security(void) Current's LSM security pointer + struct user_struct *current_user(void) Current's user account + +There are also convenience wrappers for retrieving specific associated pairs of +a task's credentials: + + void current_uid_gid(uid_t *, gid_t *); + void current_euid_egid(uid_t *, gid_t *); + void current_fsuid_fsgid(uid_t *, gid_t *); + +which return these pairs of values through their arguments after retrieving +them from the current task's credentials. + + +In addition, there is a function for obtaining a reference on the current +process's current set of credentials: + + const struct cred *get_current_cred(void); + +and functions for getting references to one of the credentials that don't +actually live in struct cred: + + struct user_struct *get_current_user(void); + struct group_info *get_current_groups(void); + +which get references to the current process's user accounting structure and +supplementary groups list respectively. + +Once a reference has been obtained, it must be released with put_cred(), +free_uid() or put_group_info() as appropriate. + + +ACCESSING ANOTHER TASK'S CREDENTIALS +------------------------------------ + +Whilst a task may access its own credentials without the need for locking, the +same is not true of a task wanting to access another task's credentials. It +must use the RCU read lock and rcu_dereference(). + +The rcu_dereference() is wrapped by: + + const struct cred *__task_cred(struct task_struct *task); + +This should be used inside the RCU read lock, as in the following example: + + void foo(struct task_struct *t, struct foo_data *f) + { + const struct cred *tcred; + ...
+ rcu_read_lock(); + tcred = __task_cred(t); + f->uid = tcred->uid; + f->gid = tcred->gid; + f->groups = get_group_info(tcred->groups); + rcu_read_unlock(); + ... + } + +A function need not get RCU read lock to use __task_cred() if it is holding a +spinlock at the time as this implicitly holds the RCU read lock. + +Should it be necessary to hold another task's credentials for a long period of +time, and possibly to sleep whilst doing so, then the caller should get a +reference on them using: + + const struct cred *get_task_cred(struct task_struct *task); + +This does all the RCU magic inside of it. The caller must call put_cred() on +the credentials so obtained when they're finished with. + +There are a couple of convenience functions to access bits of another task's +credentials, hiding the RCU magic from the caller: + + uid_t task_uid(task) Task's real UID + uid_t task_euid(task) Task's effective UID + +If the caller is holding a spinlock or the RCU read lock at the time anyway, +then: + + __task_cred(task)->uid + __task_cred(task)->euid + +should be used instead. Similarly, if multiple aspects of a task's credentials +need to be accessed, RCU read lock or a spinlock should be used, __task_cred() +called, the result stored in a temporary pointer and then the credential +aspects called from that before dropping the lock. This prevents the +potentially expensive RCU magic from being invoked multiple times. + +Should some other single aspect of another task's credentials need to be +accessed, then this can be used: + + task_cred_xxx(task, member) + +where 'member' is a non-pointer member of the cred struct. For instance: + + uid_t task_cred_xxx(task, suid); + +will retrieve 'struct cred::suid' from the task, doing the appropriate RCU +magic. This may not be used for pointer members as what they point to may +disappear the moment the RCU read lock is dropped. + + +ALTERING CREDENTIALS +-------------------- + +As previously mentioned, a task may only alter its own credentials, and may not +alter those of another task. This means that it doesn't need to use any +locking to alter its own credentials. + +To alter the current process's credentials, a function should first prepare a +new set of credentials by calling: + + struct cred *prepare_creds(void); + +this locks current->cred_replace_mutex and then allocates and constructs a +duplicate of the current process's credentials, returning with the mutex still +held if successful. It returns NULL if not successful (out of memory). + +The mutex prevents ptrace() from altering the ptrace state of a process whilst +security checks on credentials construction and changing is taking place as +the ptrace state may alter the outcome, particularly in the case of execve(). + +The new credentials set should be altered appropriately, and any security +checks and hooks done. Both the current and the proposed sets of credentials +are available for this purpose as current_cred() will return the current set +still at this point. + + +When the credential set is ready, it should be committed to the current process +by calling: + + int commit_creds(struct cred *new); + +This will alter various aspects of the credentials and the process, giving the +LSM a chance to do likewise, then it will use rcu_assign_pointer() to actually +commit the new credentials to current->cred, it will release +current->cred_replace_mutex to allow ptrace() to take place, and it will notify +the scheduler and others of the changes. 
+ +This function is guaranteed to return 0, so that it can be tail-called at the +end of such functions as sys_setresuid(). + +Note that this function consumes the caller's reference to the new credentials. +The caller should _not_ call put_cred() on the new credentials afterwards. + +Furthermore, once this function has been called on a new set of credentials, +those credentials may _not_ be changed further. + + +Should the security checks fail or some other error occur after prepare_creds() +has been called, then the following function should be invoked: + + void abort_creds(struct cred *new); + +This releases the lock on current->cred_replace_mutex that prepare_creds() got +and then releases the new credentials. + + +A typical credentials alteration function would look something like this: + + int alter_suid(uid_t suid) + { + struct cred *new; + int ret; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + + new->suid = suid; + ret = security_alter_suid(new); + if (ret < 0) { + abort_creds(new); + return ret; + } + + return commit_creds(new); + } + + +MANAGING CREDENTIALS +-------------------- + +There are some functions to help manage credentials: + + (*) void put_cred(const struct cred *cred); + + This releases a reference to the given set of credentials. If the + reference count reaches zero, the credentials will be scheduled for + destruction by the RCU system. + + (*) const struct cred *get_cred(const struct cred *cred); + + This gets a reference on a live set of credentials, returning a pointer to + that set of credentials. + + (*) struct cred *get_new_cred(struct cred *cred); + + This gets a reference on a set of credentials that is under construction + and is thus still mutable, returning a pointer to that set of credentials. + + +===================== +OPEN FILE CREDENTIALS +===================== + +When a new file is opened, a reference is obtained on the opening task's +credentials and this is attached to the file struct as 'f_cred' in place of +'f_uid' and 'f_gid'. Code that used to access file->f_uid and file->f_gid +should now access file->f_cred->fsuid and file->f_cred->fsgid. + +It is safe to access f_cred without the use of RCU or locking because the +pointer will not change over the lifetime of the file struct, and nor will the +contents of the cred struct pointed to, barring the exceptions listed above +(see the Task Credentials section). + + +======================================= +OVERRIDING THE VFS'S USE OF CREDENTIALS +======================================= + +Under some circumstances it is desirable to override the credentials used by +the VFS, and that can be done by calling into such as vfs_mkdir() with a +different set of credentials. This is done in the following places: + + (*) sys_faccessat(). + + (*) do_coredump(). + + (*) nfs4recover.c. diff --git a/include/linux/cred.h b/include/linux/cred.h index 8edb4d1d5427..794aab5c66e5 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -1,4 +1,4 @@ -/* Credentials management +/* Credentials management - see Documentation/credentials.txt * * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) @@ -169,6 +169,12 @@ static inline struct cred *get_new_cred(struct cred *cred) * * Get a reference on the specified set of credentials. The caller must * release the reference. + * + * This is used to deal with a committed set of credentials. Although the + * pointer is const, this will temporarily discard the const and increment the + * usage count. 
The purpose of this is to attempt to catch at compile time the + * accidental alteration of a set of credentials that should be considered + * immutable. */ static inline const struct cred *get_cred(const struct cred *cred) { @@ -181,6 +187,10 @@ static inline const struct cred *get_cred(const struct cred *cred) * * Release a reference to a set of credentials, deleting them when the last ref * is released. + * + * This takes a const pointer to a set of credentials because the credentials + * on task_struct are attached by const pointers to prevent accidental + * alteration of otherwise immutable credential sets. */ static inline void put_cred(const struct cred *_cred) { diff --git a/kernel/cred.c b/kernel/cred.c index e6fcdd67b2ec..b8bd2f99d8ce 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -1,4 +1,4 @@ -/* Task credentials management +/* Task credentials management - see Documentation/credentials.txt * * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) -- cgit v1.2.3 From 3b11a1decef07c19443d24ae926982bc8ec9f4c0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:26 +1100 Subject: CRED: Differentiate objective and effective subjective credentials on a task Differentiate the objective and real subjective credentials from the effective subjective credentials on a task by introducing a second credentials pointer into the task_struct. task_struct::real_cred then refers to the objective and apparent real subjective credentials of a task, as perceived by the other tasks in the system. task_struct::cred then refers to the effective subjective credentials of a task, as used by that task when it's actually running. These are not visible to the other tasks in the system. __task_cred(task) then refers to the objective/real credentials of the task in question. current_cred() refers to the effective subjective credentials of the current task. prepare_creds() uses the objective creds as a base and commit_creds() changes both pointers in the task_struct (indeed commit_creds() requires them to be the same). override_creds() and revert_creds() change the subjective creds pointer only, and the former returns the old subjective creds. These are used by NFSD, faccessat() and do_coredump(), and will by used by CacheFiles. In SELinux, current_has_perm() is provided as an alternative to task_has_perm(). This uses the effective subjective context of current, whereas task_has_perm() uses the objective/real context of the subject. 
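To illustrate the intended calling convention, a kernel service that needs
to act with a modified subjective context would do roughly the following
(a minimal sketch, not part of this patch; do_some_work() and the fsuid
tweak are placeholders, and the reference handling assumes that
override_creds() takes its own reference, as the nfsd conversion below
relies on):

	static int do_work_with_override(void)
	{
		const struct cred *old;
		struct cred *new;
		int ret;

		new = prepare_creds();		/* copy of the task's creds */
		if (!new)
			return -ENOMEM;

		new->fsuid = 0;			/* illustrative modification */

		old = override_creds(new);	/* swaps task_struct::cred only */
		ret = do_some_work();		/* hypothetical callee */
		revert_creds(old);		/* restore real subjective creds */

		put_cred(new);			/* drop the ref prepare_creds() gave us */
		return ret;
	}

Compare the nfsd_setuser() changes below, which drop any previous override
with revert_creds() before preparing and installing a new set.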
Signed-off-by: David Howells Signed-off-by: James Morris --- fs/nfsd/auth.c | 5 +++- include/linux/cred.h | 29 +++++++++++---------- include/linux/init_task.h | 1 + include/linux/sched.h | 5 +++- kernel/cred.c | 38 ++++++++++++++++++--------- kernel/fork.c | 6 +++-- security/selinux/hooks.c | 65 ++++++++++++++++++++++++++++++----------------- 7 files changed, 95 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 836ffa1047d9..0184fe9b514c 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -34,6 +34,8 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) int flags = nfsexp_flags(rqstp, exp); int ret; + /* discard any old override before preparing the new set */ + revert_creds(get_cred(current->real_cred)); new = prepare_creds(); if (!new) return -ENOMEM; @@ -82,7 +84,8 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) else new->cap_effective = cap_raise_nfsd_set(new->cap_effective, new->cap_permitted); - return commit_creds(new); + put_cred(override_creds(new)); + return 0; oom: ret = -ENOMEM; diff --git a/include/linux/cred.h b/include/linux/cred.h index 794aab5c66e5..55a9c995d694 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -146,8 +146,8 @@ extern struct cred *prepare_exec_creds(void); extern struct cred *prepare_usermodehelper_creds(void); extern int commit_creds(struct cred *); extern void abort_creds(struct cred *); -extern const struct cred *override_creds(const struct cred *) __deprecated; -extern void revert_creds(const struct cred *) __deprecated; +extern const struct cred *override_creds(const struct cred *); +extern void revert_creds(const struct cred *); extern void __init cred_init(void); /** @@ -202,32 +202,32 @@ static inline void put_cred(const struct cred *_cred) } /** - * current_cred - Access the current task's credentials + * current_cred - Access the current task's subjective credentials * - * Access the credentials of the current task. + * Access the subjective credentials of the current task. */ #define current_cred() \ (current->cred) /** - * __task_cred - Access another task's credentials + * __task_cred - Access a task's objective credentials * @task: The task to query * - * Access the credentials of another task. The caller must hold the - * RCU readlock. + * Access the objective credentials of a task. The caller must hold the RCU + * readlock. * * The caller must make sure task doesn't go away, either by holding a ref on * task or by holding tasklist_lock to prevent it from being unlinked. */ #define __task_cred(task) \ - ((const struct cred *)(rcu_dereference((task)->cred))) + ((const struct cred *)(rcu_dereference((task)->real_cred))) /** - * get_task_cred - Get another task's credentials + * get_task_cred - Get another task's objective credentials * @task: The task to query * - * Get the credentials of a task, pinning them so that they can't go away. - * Accessing a task's credentials directly is not permitted. + * Get the objective credentials of a task, pinning them so that they can't go + * away. Accessing a task's credentials directly is not permitted. * * The caller must make sure task doesn't go away, either by holding a ref on * task or by holding tasklist_lock to prevent it from being unlinked. 
@@ -243,10 +243,11 @@ static inline void put_cred(const struct cred *_cred) }) /** - * get_current_cred - Get the current task's credentials + * get_current_cred - Get the current task's subjective credentials * - * Get the credentials of the current task, pinning them so that they can't go - * away. Accessing the current task's credentials directly is not permitted. + * Get the subjective credentials of the current task, pinning them so that + * they can't go away. Accessing the current task's credentials directly is + * not permitted. */ #define get_current_cred() \ (get_cred(current_cred())) diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 08c3b24ad9a8..2597858035cd 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -149,6 +149,7 @@ extern struct cred init_cred; .children = LIST_HEAD_INIT(tsk.children), \ .sibling = LIST_HEAD_INIT(tsk.sibling), \ .group_leader = &tsk, \ + .real_cred = &init_cred, \ .cred = &init_cred, \ .cred_exec_mutex = \ __MUTEX_INITIALIZER(tsk.cred_exec_mutex), \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 121d655e460d..3443123b0709 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1145,7 +1145,10 @@ struct task_struct { struct list_head cpu_timers[3]; /* process credentials */ - const struct cred *cred; /* actual/objective task credentials (COW) */ + const struct cred *real_cred; /* objective and real subjective task + * credentials (COW) */ + const struct cred *cred; /* effective (overridable) subjective task + * credentials (COW) */ struct mutex cred_exec_mutex; /* execve vs ptrace cred calculation mutex */ char comm[TASK_COMM_LEN]; /* executable name excluding path diff --git a/kernel/cred.c b/kernel/cred.c index b8bd2f99d8ce..f3ca10660617 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -35,7 +35,7 @@ static struct thread_group_cred init_tgcred = { * The initial credentials for the initial task */ struct cred init_cred = { - .usage = ATOMIC_INIT(3), + .usage = ATOMIC_INIT(4), .securebits = SECUREBITS_DEFAULT, .cap_inheritable = CAP_INIT_INH_SET, .cap_permitted = CAP_FULL_SET, @@ -120,6 +120,8 @@ EXPORT_SYMBOL(__put_cred); * prepare a new copy, which the caller then modifies and then commits by * calling commit_creds(). * + * Preparation involves making a copy of the objective creds for modification. + * * Returns a pointer to the new creds-to-be if successful, NULL otherwise. * * Call commit_creds() or abort_creds() to clean up. @@ -130,7 +132,7 @@ struct cred *prepare_creds(void) const struct cred *old; struct cred *new; - BUG_ON(atomic_read(&task->cred->usage) < 1); + BUG_ON(atomic_read(&task->real_cred->usage) < 1); new = kmem_cache_alloc(cred_jar, GFP_KERNEL); if (!new) @@ -262,6 +264,9 @@ error: * * We share if we can, but under some circumstances we have to generate a new * set. 
+ * + * The new process gets the current process's subjective credentials as its + * objective and subjective credentials */ int copy_creds(struct task_struct *p, unsigned long clone_flags) { @@ -278,6 +283,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) #endif clone_flags & CLONE_THREAD ) { + p->real_cred = get_cred(p->cred); get_cred(p->cred); atomic_inc(&p->cred->user->processes); return 0; @@ -317,7 +323,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) #endif atomic_inc(&new->user->processes); - p->cred = new; + p->cred = p->real_cred = get_cred(new); return 0; } @@ -326,7 +332,9 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) * @new: The credentials to be assigned * * Install a new set of credentials to the current task, using RCU to replace - * the old set. + * the old set. Both the objective and the subjective credentials pointers are + * updated. This function may not be called if the subjective credentials are + * in an overridden state. * * This function eats the caller's reference to the new credentials. * @@ -338,12 +346,15 @@ int commit_creds(struct cred *new) struct task_struct *task = current; const struct cred *old; + BUG_ON(task->cred != task->real_cred); + BUG_ON(atomic_read(&task->real_cred->usage) < 2); BUG_ON(atomic_read(&new->usage) < 1); - BUG_ON(atomic_read(&task->cred->usage) < 1); - old = task->cred; + old = task->real_cred; security_commit_creds(new, old); + get_cred(new); /* we will require a ref for the subj creds too */ + /* dumpability changes */ if (old->euid != new->euid || old->egid != new->egid || @@ -369,6 +380,7 @@ int commit_creds(struct cred *new) */ if (new->user != old->user) atomic_inc(&new->user->processes); + rcu_assign_pointer(task->real_cred, new); rcu_assign_pointer(task->cred, new); if (new->user != old->user) atomic_dec(&old->user->processes); @@ -388,6 +400,8 @@ int commit_creds(struct cred *new) new->fsgid != old->fsgid) proc_id_connector(task, PROC_EVENT_GID); + /* release the old obj and subj refs both */ + put_cred(old); put_cred(old); return 0; } @@ -408,11 +422,11 @@ void abort_creds(struct cred *new) EXPORT_SYMBOL(abort_creds); /** - * override_creds - Temporarily override the current process's credentials + * override_creds - Override the current process's subjective credentials * @new: The credentials to be assigned * - * Install a set of temporary override credentials on the current process, - * returning the old set for later reversion. + * Install a set of temporary override subjective credentials on the current + * process, returning the old set for later reversion. */ const struct cred *override_creds(const struct cred *new) { @@ -424,11 +438,11 @@ const struct cred *override_creds(const struct cred *new) EXPORT_SYMBOL(override_creds); /** - * revert_creds - Revert a temporary credentials override + * revert_creds - Revert a temporary subjective credentials override * @old: The credentials to be restored * - * Revert a temporary set of override credentials to an old set, discarding the - * override set. + * Revert a temporary set of override subjective credentials to an old set, + * discarding the override set. 
*/ void revert_creds(const struct cred *old) { diff --git a/kernel/fork.c b/kernel/fork.c index 82a7948a664e..af0d0f04585c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -146,6 +146,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); + put_cred(tsk->real_cred); put_cred(tsk->cred); delayacct_tsk_free(tsk); @@ -961,10 +962,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif retval = -EAGAIN; - if (atomic_read(&p->cred->user->processes) >= + if (atomic_read(&p->real_cred->user->processes) >= p->signal->rlim[RLIMIT_NPROC].rlim_cur) { if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && - p->cred->user != current->nsproxy->user_ns->root_user) + p->real_cred->user != current->nsproxy->user_ns->root_user) goto bad_fork_free; } @@ -1278,6 +1279,7 @@ bad_fork_cleanup_put_domain: module_put(task_thread_info(p)->exec_domain->module); bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); + put_cred(p->real_cred); put_cred(p->cred); bad_fork_free: free_task(p); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 21a592184633..91b06f2aa963 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -161,7 +161,7 @@ static int selinux_secmark_enabled(void) */ static void cred_init_security(void) { - struct cred *cred = (struct cred *) current->cred; + struct cred *cred = (struct cred *) current->real_cred; struct task_security_struct *tsec; tsec = kzalloc(sizeof(struct task_security_struct), GFP_KERNEL); @@ -184,7 +184,7 @@ static inline u32 cred_sid(const struct cred *cred) } /* - * get the security ID of a task + * get the objective security ID of a task */ static inline u32 task_sid(const struct task_struct *task) { @@ -197,7 +197,7 @@ static inline u32 task_sid(const struct task_struct *task) } /* - * get the security ID of the current task + * get the subjective security ID of the current task */ static inline u32 current_sid(void) { @@ -1395,6 +1395,7 @@ static int cred_has_perm(const struct cred *actor, * Check permission between a pair of tasks, e.g. signal checks, * fork check, ptrace check, etc. * tsk1 is the actor and tsk2 is the target + * - this uses the default subjective creds of tsk1 */ static int task_has_perm(const struct task_struct *tsk1, const struct task_struct *tsk2, @@ -1410,6 +1411,22 @@ static int task_has_perm(const struct task_struct *tsk1, return avc_has_perm(sid1, sid2, SECCLASS_PROCESS, perms, NULL); } +/* + * Check permission between current and another task, e.g. signal checks, + * fork check, ptrace check, etc. + * current is the actor and tsk2 is the target + * - this uses current's subjective creds + */ +static int current_has_perm(const struct task_struct *tsk, + u32 perms) +{ + u32 sid, tsid; + + sid = current_sid(); + tsid = task_sid(tsk); + return avc_has_perm(sid, tsid, SECCLASS_PROCESS, perms, NULL); +} + #if CAP_LAST_CAP > 63 #error Fix SELinux to handle capabilities > 63. 
#endif @@ -1807,7 +1824,7 @@ static int selinux_ptrace_may_access(struct task_struct *child, return avc_has_perm(sid, csid, SECCLASS_FILE, FILE__READ, NULL); } - return task_has_perm(current, child, PROCESS__PTRACE); + return current_has_perm(child, PROCESS__PTRACE); } static int selinux_ptrace_traceme(struct task_struct *parent) @@ -1826,7 +1843,7 @@ static int selinux_capget(struct task_struct *target, kernel_cap_t *effective, { int error; - error = task_has_perm(current, target, PROCESS__GETCAP); + error = current_has_perm(target, PROCESS__GETCAP); if (error) return error; @@ -3071,7 +3088,7 @@ static int selinux_file_mprotect(struct vm_area_struct *vma, } else if (!vma->vm_file && vma->vm_start <= vma->vm_mm->start_stack && vma->vm_end >= vma->vm_mm->start_stack) { - rc = task_has_perm(current, current, PROCESS__EXECSTACK); + rc = current_has_perm(current, PROCESS__EXECSTACK); } else if (vma->vm_file && vma->anon_vma) { /* * We are making executable a file mapping that has @@ -3220,7 +3237,7 @@ static int selinux_task_create(unsigned long clone_flags) if (rc) return rc; - return task_has_perm(current, current, PROCESS__FORK); + return current_has_perm(current, PROCESS__FORK); } /* @@ -3285,17 +3302,17 @@ static int selinux_task_setgid(gid_t id0, gid_t id1, gid_t id2, int flags) static int selinux_task_setpgid(struct task_struct *p, pid_t pgid) { - return task_has_perm(current, p, PROCESS__SETPGID); + return current_has_perm(p, PROCESS__SETPGID); } static int selinux_task_getpgid(struct task_struct *p) { - return task_has_perm(current, p, PROCESS__GETPGID); + return current_has_perm(p, PROCESS__GETPGID); } static int selinux_task_getsid(struct task_struct *p) { - return task_has_perm(current, p, PROCESS__GETSESSION); + return current_has_perm(p, PROCESS__GETSESSION); } static void selinux_task_getsecid(struct task_struct *p, u32 *secid) @@ -3317,7 +3334,7 @@ static int selinux_task_setnice(struct task_struct *p, int nice) if (rc) return rc; - return task_has_perm(current, p, PROCESS__SETSCHED); + return current_has_perm(p, PROCESS__SETSCHED); } static int selinux_task_setioprio(struct task_struct *p, int ioprio) @@ -3328,12 +3345,12 @@ static int selinux_task_setioprio(struct task_struct *p, int ioprio) if (rc) return rc; - return task_has_perm(current, p, PROCESS__SETSCHED); + return current_has_perm(p, PROCESS__SETSCHED); } static int selinux_task_getioprio(struct task_struct *p) { - return task_has_perm(current, p, PROCESS__GETSCHED); + return current_has_perm(p, PROCESS__GETSCHED); } static int selinux_task_setrlimit(unsigned int resource, struct rlimit *new_rlim) @@ -3350,7 +3367,7 @@ static int selinux_task_setrlimit(unsigned int resource, struct rlimit *new_rlim later be used as a safe reset point for the soft limit upon context transitions. See selinux_bprm_committing_creds. 
*/ if (old_rlim->rlim_max != new_rlim->rlim_max) - return task_has_perm(current, current, PROCESS__SETRLIMIT); + return current_has_perm(current, PROCESS__SETRLIMIT); return 0; } @@ -3363,17 +3380,17 @@ static int selinux_task_setscheduler(struct task_struct *p, int policy, struct s if (rc) return rc; - return task_has_perm(current, p, PROCESS__SETSCHED); + return current_has_perm(p, PROCESS__SETSCHED); } static int selinux_task_getscheduler(struct task_struct *p) { - return task_has_perm(current, p, PROCESS__GETSCHED); + return current_has_perm(p, PROCESS__GETSCHED); } static int selinux_task_movememory(struct task_struct *p) { - return task_has_perm(current, p, PROCESS__SETSCHED); + return current_has_perm(p, PROCESS__SETSCHED); } static int selinux_task_kill(struct task_struct *p, struct siginfo *info, @@ -3394,7 +3411,7 @@ static int selinux_task_kill(struct task_struct *p, struct siginfo *info, rc = avc_has_perm(secid, task_sid(p), SECCLASS_PROCESS, perm, NULL); else - rc = task_has_perm(current, p, perm); + rc = current_has_perm(p, perm); return rc; } @@ -5250,7 +5267,7 @@ static int selinux_getprocattr(struct task_struct *p, unsigned len; if (current != p) { - error = task_has_perm(current, p, PROCESS__GETATTR); + error = current_has_perm(p, PROCESS__GETATTR); if (error) return error; } @@ -5309,15 +5326,15 @@ static int selinux_setprocattr(struct task_struct *p, * above restriction is ever removed. */ if (!strcmp(name, "exec")) - error = task_has_perm(current, p, PROCESS__SETEXEC); + error = current_has_perm(p, PROCESS__SETEXEC); else if (!strcmp(name, "fscreate")) - error = task_has_perm(current, p, PROCESS__SETFSCREATE); + error = current_has_perm(p, PROCESS__SETFSCREATE); else if (!strcmp(name, "keycreate")) - error = task_has_perm(current, p, PROCESS__SETKEYCREATE); + error = current_has_perm(p, PROCESS__SETKEYCREATE); else if (!strcmp(name, "sockcreate")) - error = task_has_perm(current, p, PROCESS__SETSOCKCREATE); + error = current_has_perm(p, PROCESS__SETSOCKCREATE); else if (!strcmp(name, "current")) - error = task_has_perm(current, p, PROCESS__SETCURRENT); + error = current_has_perm(p, PROCESS__SETCURRENT); else error = -EINVAL; if (error) -- cgit v1.2.3 From 3a3b7ce9336952ea7b9564d976d068a238976c9d Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:28 +1100 Subject: CRED: Allow kernel services to override LSM settings for task actions Allow kernel services to override LSM settings appropriate to the actions performed by a task by duplicating a set of credentials, modifying it and then using task_struct::cred to point to it when performing operations on behalf of a task. This is used, for example, by CacheFiles which has to transparently access the cache on behalf of a process that thinks it is doing, say, NFS accesses with a potentially inappropriate (with respect to accessing the cache) set of credentials. This patch provides two LSM hooks for modifying a task security record: (*) security_kernel_act_as() which allows modification of the security datum with which a task acts on other objects (most notably files). (*) security_kernel_create_files_as() which allows modification of the security datum that is used to initialise the security data on a file that a task creates. The patch also provides four new credentials handling functions, which wrap the LSM functions: (1) prepare_kernel_cred() Prepare a set of credentials for a kernel service to use, based either on a daemon's credentials or on init_cred. All the keyrings are cleared. 
(2) set_security_override() Set the LSM security ID in a set of credentials to a specific security context, assuming permission from the LSM policy. (3) set_security_override_from_ctx() As (2), but takes the security context as a string. (4) set_create_files_as() Set the file creation LSM security ID in a set of credentials to be the same as that on a particular inode.
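As an illustration of how these helpers are meant to be combined, here is a minimal sketch of a kernel service acting with overridden credentials; the cache_daemon pointer and the security context string are invented for the example, and override_creds()/revert_creds() are the existing credential-switching helpers:

	const struct cred *saved;
	struct cred *cred;
	int ret;

	cred = prepare_kernel_cred(cache_daemon);	/* or NULL to base on init_cred */
	if (!cred)
		return -ENOMEM;

	/* assumes the LSM policy grants the caller this override */
	ret = set_security_override_from_ctx(cred, "system_u:system_r:kernel_service_t:s0");
	if (ret < 0) {
		put_cred(cred);
		return ret;
	}

	saved = override_creds(cred);	/* act with the new subjective context */
	/* ... access files on behalf of the original task ... */
	revert_creds(saved);
	put_cred(cred);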
Signed-off-by: Casey Schaufler [Smack changes] Signed-off-by: David Howells Signed-off-by: James Morris --- include/linux/cred.h | 6 +++ include/linux/security.h | 28 +++++++++++ kernel/cred.c | 113 +++++++++++++++++++++++++++++++++++++++++++++ security/capability.c | 12 +++++ security/security.c | 10 ++++ security/selinux/hooks.c | 46 ++++++++++++++++++ security/smack/smack_lsm.c | 37 +++++++++++++++ 7 files changed, 252 insertions(+) (limited to 'kernel')
diff --git a/include/linux/cred.h b/include/linux/cred.h index 55a9c995d694..26c1ab179946 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -18,6 +18,7 @@ struct user_struct; struct cred; +struct inode; /* * COW Supplementary groups list @@ -148,6 +149,11 @@ extern int commit_creds(struct cred *); extern void abort_creds(struct cred *); extern const struct cred *override_creds(const struct cred *); extern void revert_creds(const struct cred *); +extern struct cred *prepare_kernel_cred(struct task_struct *); +extern int change_create_files_as(struct cred *, struct inode *); +extern int set_security_override(struct cred *, u32); +extern int set_security_override_from_ctx(struct cred *, const char *); +extern int set_create_files_as(struct cred *, struct inode *); extern void __init cred_init(void); /**
diff --git a/include/linux/security.h b/include/linux/security.h index 56a0eed65673..59a11e19b617 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -587,6 +587,19 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @new points to the new credentials. * @old points to the original credentials. * Install a new set of credentials. + * @kernel_act_as: + * Set the credentials for a kernel service to act as (subjective context). + * @new points to the credentials to be modified. + * @secid specifies the security ID to be set + * The current task must be the one that nominated @secid. + * Return 0 if successful. + * @kernel_create_files_as: + * Set the file creation context in a set of credentials to be the same as + * the objective context of the specified inode. + * @new points to the credentials to be modified. + * @inode points to the inode to use as a reference. + * The current task must be the one that nominated @inode. + * Return 0 if successful. * @task_setuid: * Check permission before setting one or more of the user identity * attributes of the current process. The @flags parameter indicates @@ -1381,6 +1394,8 @@ struct security_operations { int (*cred_prepare)(struct cred *new, const struct cred *old, gfp_t gfp); void (*cred_commit)(struct cred *new, const struct cred *old); + int (*kernel_act_as)(struct cred *new, u32 secid); + int (*kernel_create_files_as)(struct cred *new, struct inode *inode); int (*task_setuid) (uid_t id0, uid_t id1, uid_t id2, int flags); int (*task_fix_setuid) (struct cred *new, const struct cred *old, int flags); @@ -1632,6 +1647,8 @@ int security_task_create(unsigned long clone_flags); void security_cred_free(struct cred *cred); int security_prepare_creds(struct cred *new, const struct cred *old, gfp_t gfp); void security_commit_creds(struct cred *new, const struct cred *old); +int security_kernel_act_as(struct cred *new, u32 secid); +int security_kernel_create_files_as(struct cred *new, struct inode *inode); int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags); int security_task_fix_setuid(struct cred *new, const struct cred *old, int flags); @@ -2151,6 +2168,17 @@ static inline void security_commit_creds(struct cred *new, { } +static inline int security_kernel_act_as(struct cred *cred, u32 secid) +{ + return 0; +} + +static inline int security_kernel_create_files_as(struct cred *cred, + struct inode *inode) +{ + return 0; +} + static inline int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags) {
diff --git a/kernel/cred.c b/kernel/cred.c index f3ca10660617..13697ca2bb38 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -462,3 +462,116 @@ void __init cred_init(void) cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); } + +/** + * prepare_kernel_cred - Prepare a set of credentials for a kernel service + * @daemon: A userspace daemon to be used as a reference + * + * Prepare a set of credentials for a kernel service. This can then be used to + * override a task's own credentials so that work can be done on behalf of that + * task that requires a different subjective context. + * + * @daemon is used to provide a base for the security record, but can be NULL. + * If @daemon is supplied, then the security data will be derived from that; + * otherwise they'll be set to 0 and no groups, full capabilities and no keys. + * + * The caller may change these controls afterwards if desired. + * + * Returns the new credentials or NULL if out of memory. + * + * Does not take, and does not return holding current->cred_replace_mutex.
+ */ +struct cred *prepare_kernel_cred(struct task_struct *daemon) +{ + const struct cred *old; + struct cred *new; + + new = kmem_cache_alloc(cred_jar, GFP_KERNEL); + if (!new) + return NULL; + + if (daemon) + old = get_task_cred(daemon); + else + old = get_cred(&init_cred); + + *new = *old; + get_uid(new->user); + get_group_info(new->group_info); + +#ifdef CONFIG_KEYS + atomic_inc(&init_tgcred.usage); + new->tgcred = &init_tgcred; + new->request_key_auth = NULL; + new->thread_keyring = NULL; + new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; +#endif + +#ifdef CONFIG_SECURITY + new->security = NULL; +#endif + if (security_prepare_creds(new, old, GFP_KERNEL) < 0) + goto error; + + atomic_set(&new->usage, 1); + put_cred(old); + return new; + +error: + put_cred(new); + return NULL; +} +EXPORT_SYMBOL(prepare_kernel_cred); + +/** + * set_security_override - Set the security ID in a set of credentials + * @new: The credentials to alter + * @secid: The LSM security ID to set + * + * Set the LSM security ID in a set of credentials so that the subjective + * security is overridden when an alternative set of credentials is used. + */ +int set_security_override(struct cred *new, u32 secid) +{ + return security_kernel_act_as(new, secid); +} +EXPORT_SYMBOL(set_security_override); + +/** + * set_security_override_from_ctx - Set the security ID in a set of credentials + * @new: The credentials to alter + * @secctx: The LSM security context to generate the security ID from. + * + * Set the LSM security ID in a set of credentials so that the subjective + * security is overridden when an alternative set of credentials is used. The + * security ID is specified in string form as a security context to be + * interpreted by the LSM. + */ +int set_security_override_from_ctx(struct cred *new, const char *secctx) +{ + u32 secid; + int ret; + + ret = security_secctx_to_secid(secctx, strlen(secctx), &secid); + if (ret < 0) + return ret; + + return set_security_override(new, secid); +} +EXPORT_SYMBOL(set_security_override_from_ctx); + +/** + * set_create_files_as - Set the LSM file create context in a set of credentials + * @new: The credentials to alter + * @inode: The inode to take the context from + * + * Change the LSM file creation context in a set of credentials to be the same + * as the object context of the specified inode, so that the new inodes have + * the same MAC context as that inode.
+ */ +int set_create_files_as(struct cred *new, struct inode *inode) +{ + new->fsuid = inode->i_uid; + new->fsgid = inode->i_gid; + return security_kernel_create_files_as(new, inode); +} +EXPORT_SYMBOL(set_create_files_as); diff --git a/security/capability.c b/security/capability.c index 185804f99ad1..b9e391425e6f 100644 --- a/security/capability.c +++ b/security/capability.c @@ -348,6 +348,16 @@ static void cap_cred_commit(struct cred *new, const struct cred *old) { } +static int cap_kernel_act_as(struct cred *new, u32 secid) +{ + return 0; +} + +static int cap_kernel_create_files_as(struct cred *new, struct inode *inode) +{ + return 0; +} + static int cap_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags) { return 0; @@ -889,6 +899,8 @@ void security_fixup_ops(struct security_operations *ops) set_to_cap_if_null(ops, cred_free); set_to_cap_if_null(ops, cred_prepare); set_to_cap_if_null(ops, cred_commit); + set_to_cap_if_null(ops, kernel_act_as); + set_to_cap_if_null(ops, kernel_create_files_as); set_to_cap_if_null(ops, task_setuid); set_to_cap_if_null(ops, task_fix_setuid); set_to_cap_if_null(ops, task_setgid); diff --git a/security/security.c b/security/security.c index dc5babb2d6d8..038ef04b2c7f 100644 --- a/security/security.c +++ b/security/security.c @@ -616,6 +616,16 @@ void security_commit_creds(struct cred *new, const struct cred *old) return security_ops->cred_commit(new, old); } +int security_kernel_act_as(struct cred *new, u32 secid) +{ + return security_ops->kernel_act_as(new, secid); +} + +int security_kernel_create_files_as(struct cred *new, struct inode *inode) +{ + return security_ops->kernel_create_files_as(new, inode); +} + int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags) { return security_ops->task_setuid(id0, id1, id2, flags); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 91b06f2aa963..520f82ab3fbf 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3277,6 +3277,50 @@ static void selinux_cred_commit(struct cred *new, const struct cred *old) secondary_ops->cred_commit(new, old); } +/* + * set the security data for a kernel service + * - all the creation contexts are set to unlabelled + */ +static int selinux_kernel_act_as(struct cred *new, u32 secid) +{ + struct task_security_struct *tsec = new->security; + u32 sid = current_sid(); + int ret; + + ret = avc_has_perm(sid, secid, + SECCLASS_KERNEL_SERVICE, + KERNEL_SERVICE__USE_AS_OVERRIDE, + NULL); + if (ret == 0) { + tsec->sid = secid; + tsec->create_sid = 0; + tsec->keycreate_sid = 0; + tsec->sockcreate_sid = 0; + } + return ret; +} + +/* + * set the file creation context in a security record to the same as the + * objective context of the specified inode + */ +static int selinux_kernel_create_files_as(struct cred *new, struct inode *inode) +{ + struct inode_security_struct *isec = inode->i_security; + struct task_security_struct *tsec = new->security; + u32 sid = current_sid(); + int ret; + + ret = avc_has_perm(sid, isec->sid, + SECCLASS_KERNEL_SERVICE, + KERNEL_SERVICE__CREATE_FILES_AS, + NULL); + + if (ret == 0) + tsec->create_sid = isec->sid; + return 0; +} + static int selinux_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags) { /* Since setuid only affects the current process, and @@ -5593,6 +5637,8 @@ static struct security_operations selinux_ops = { .cred_free = selinux_cred_free, .cred_prepare = selinux_cred_prepare, .cred_commit = selinux_cred_commit, + .kernel_act_as = selinux_kernel_act_as, + .kernel_create_files_as = 
selinux_kernel_create_files_as, .task_setuid = selinux_task_setuid, .task_fix_setuid = selinux_task_fix_setuid, .task_setgid = selinux_task_setgid, diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index de396742abf4..8ad48161cef5 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -1011,6 +1011,41 @@ static void smack_cred_commit(struct cred *new, const struct cred *old) { } +/** + * smack_kernel_act_as - Set the subjective context in a set of credentials + * @new points to the set of credentials to be modified. + * @secid specifies the security ID to be set + * + * Set the security data for a kernel service. + */ +static int smack_kernel_act_as(struct cred *new, u32 secid) +{ + char *smack = smack_from_secid(secid); + + if (smack == NULL) + return -EINVAL; + + new->security = smack; + return 0; +} + +/** + * smack_kernel_create_files_as - Set the file creation label in a set of creds + * @new points to the set of credentials to be modified + * @inode points to the inode to use as a reference + * + * Set the file creation context in a set of credentials to the same + * as the objective context of the specified inode + */ +static int smack_kernel_create_files_as(struct cred *new, + struct inode *inode) +{ + struct inode_smack *isp = inode->i_security; + + new->security = isp->smk_inode; + return 0; +} + /** * smack_task_setpgid - Smack check on setting pgid * @p: the task object @@ -2641,6 +2676,8 @@ struct security_operations smack_ops = { .cred_free = smack_cred_free, .cred_prepare = smack_cred_prepare, .cred_commit = smack_cred_commit, + .kernel_act_as = smack_kernel_act_as, + .kernel_create_files_as = smack_kernel_create_files_as, .task_fix_setuid = cap_task_fix_setuid, .task_setpgid = smack_task_setpgid, .task_getpgid = smack_task_getpgid, -- cgit v1.2.3 From f3c7ac40a99f4044b843e6e2c4f46ab2d354c563 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 14 Nov 2008 16:21:19 -0800 Subject: ftrace: remove condition from ftrace_record_ip Impact: let module functions be recorded when dyn ftrace not enabled When dynamic ftrace had a daemon and a hash to record the locations of mcount callers at run time, the recording needed to stop when ftrace was disabled. But now that the recording is done at compile time and the ftrace_record_ip is only called at boot up and when a module is loaded, we no longer need to check if ftrace_enabled is set. In fact, this breaks module load if it is not set because we skip over module functions. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 54cb9a7d15e5..3160254f6c7e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -334,7 +334,7 @@ ftrace_record_ip(unsigned long ip) { struct dyn_ftrace *rec; - if (!ftrace_enabled || ftrace_disabled) + if (ftrace_disabled) return NULL; rec = ftrace_alloc_dyn_node(ip); -- cgit v1.2.3 From b17e8a37a13d0e87165054714434534bb7e69f2d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 14 Nov 2008 16:21:19 -0800 Subject: ftrace: disable ftrace on anomalies in trace start and stop Impact: robust feature to disable ftrace on start or stop tracing on error Currently only the initial conversion to nops will disable ftrace on an anomaly. But if an anomaly happens on start or stopping of the tracer, it will silently fail. 
This patch adds a check there too, to disable ftrace and warn if the conversion fails. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 81 ++++++++++++++++++++++++++++----------------------- 1 file changed, 44 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3160254f6c7e..d5bd21f39524 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -348,6 +348,47 @@ ftrace_record_ip(unsigned long ip) return rec; } +static void print_ip_ins(const char *fmt, unsigned char *p) +{ + int i; + + printk(KERN_CONT "%s", fmt); + + for (i = 0; i < MCOUNT_INSN_SIZE; i++) + printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); +} + +static void ftrace_bug(int failed, unsigned long ip, + unsigned char *expected, + unsigned char *replace) +{ + switch (failed) { + case -EFAULT: + FTRACE_WARN_ON_ONCE(1); + pr_info("ftrace faulted on modifying "); + print_ip_sym(ip); + break; + case -EINVAL: + FTRACE_WARN_ON_ONCE(1); + pr_info("ftrace failed to modify "); + print_ip_sym(ip); + print_ip_ins(" expected: ", expected); + print_ip_ins(" actual: ", (unsigned char *)ip); + print_ip_ins(" replace: ", replace); + printk(KERN_CONT "\n"); + break; + case -EPERM: + FTRACE_WARN_ON_ONCE(1); + pr_info("ftrace faulted on writing "); + print_ip_sym(ip); + break; + default: + FTRACE_WARN_ON_ONCE(1); + pr_info("ftrace faulted on unknown error "); + print_ip_sym(ip); + } +} + #define FTRACE_ADDR ((long)(ftrace_caller)) static int @@ -465,22 +506,13 @@ static void ftrace_replace_code(int enable) if ((system_state == SYSTEM_BOOTING) || !core_kernel_text(rec->ip)) { ftrace_free_rec(rec); - } + } else + ftrace_bug(failed, rec->ip, old, new); } } } } -static void print_ip_ins(const char *fmt, unsigned char *p) -{ - int i; - - printk(KERN_CONT "%s", fmt); - - for (i = 0; i < MCOUNT_INSN_SIZE; i++) - printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); -} - static int ftrace_code_disable(struct dyn_ftrace *rec) { @@ -495,32 +527,7 @@ ftrace_code_disable(struct dyn_ftrace *rec) ret = ftrace_modify_code(ip, call, nop); if (ret) { - switch (ret) { - case -EFAULT: - FTRACE_WARN_ON_ONCE(1); - pr_info("ftrace faulted on modifying "); - print_ip_sym(ip); - break; - case -EINVAL: - FTRACE_WARN_ON_ONCE(1); - pr_info("ftrace failed to modify "); - print_ip_sym(ip); - print_ip_ins(" expected: ", call); - print_ip_ins(" actual: ", (unsigned char *)ip); - print_ip_ins(" replace: ", nop); - printk(KERN_CONT "\n"); - break; - case -EPERM: - FTRACE_WARN_ON_ONCE(1); - pr_info("ftrace faulted on writing "); - print_ip_sym(ip); - break; - default: - FTRACE_WARN_ON_ONCE(1); - pr_info("ftrace faulted on unknown error "); - print_ip_sym(ip); - } - + ftrace_bug(ret, ip, call, nop); rec->flags |= FTRACE_FL_FAILED; return 0; } -- cgit v1.2.3 From 918c115410c6cc57033835b6a401e57697f9ea4f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 14 Nov 2008 16:21:19 -0800 Subject: ftrace: do not process freed records Impact: keep from converting freed records When the tracer is started or stopped, it converts all code pointed to by the saved records into callers to ftrace or nops. When modules are unloaded, their records are freed, but they still exist within the record pages. This patch changes the code to skip over freed records. 
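The check added below can be read as this condensed sketch of the record walk (the patch itself tests the two flags separately):

	for (i = 0; i < pg->index; i++) {
		rec = &pg->records[i];

		/* records freed by module unload stay in the page, flagged free */
		if (rec->flags & (FTRACE_FL_FREE | FTRACE_FL_FAILED))
			continue;

		/* ... convert the mcount site at rec->ip ... */
	}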
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d5bd21f39524..3940c71ac2a2 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -488,8 +488,12 @@ static void ftrace_replace_code(int enable) for (i = 0; i < pg->index; i++) { rec = &pg->records[i]; - /* don't modify code that has already faulted */ - if (rec->flags & FTRACE_FL_FAILED) + /* + * Skip over free records and records that have + * failed. + */ + if (rec->flags & FTRACE_FL_FREE || + rec->flags & FTRACE_FL_FAILED) continue; /* ignore updates to this record's mcount site */ -- cgit v1.2.3
From d51ad7ac48f991c4a8834485727efa99a691cb87 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 15 Nov 2008 15:48:29 -0500 Subject: ftrace: replace raw_local_irq_save with local_irq_save Impact: fix lockdep disabling itself when function tracing is enabled The raw_local_irq_save calls used in ftrace are causing problems with lockdep. (it thinks the irq flags are out of sync and disables itself with a warning) The raw ops here are not needed, and the normal local_irq_save is fine. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 4 ++-- kernel/trace/trace_selftest.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel')
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4a904623e05d..dff4bee591b9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1051,7 +1051,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) * Need to use raw, since this must be called before the * recursive protection is performed. */ - raw_local_irq_save(flags); + local_irq_save(flags); cpu = raw_smp_processor_id(); data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); @@ -1062,7 +1062,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) } atomic_dec(&data->disabled); - raw_local_irq_restore(flags); + local_irq_restore(flags); } #ifdef CONFIG_FUNCTION_RET_TRACER
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 24e6e075e6d6..5cb64ea061b5 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -52,7 +52,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) int cpu, ret = 0; /* Don't allow flipping of max traces now */ - raw_local_irq_save(flags); + local_irq_save(flags); __raw_spin_lock(&ftrace_max_lock); cnt = ring_buffer_entries(tr->buffer); @@ -63,7 +63,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) break; } __raw_spin_unlock(&ftrace_max_lock); - raw_local_irq_restore(flags); + local_irq_restore(flags); if (count) *count = cnt; -- cgit v1.2.3
From 31e889098a80ceb3e9e3c555d522b2686a6663c6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 14 Nov 2008 16:21:19 -0800 Subject: ftrace: pass module struct to arch dynamic ftrace functions Impact: allow archs more flexibility on dynamic ftrace implementations Dynamic ftrace has largely been developed on x86. Since x86 does not have the same limitations as other architectures, the ftrace interaction between the generic code and the architecture specific code was not flexible enough to handle some of the issues that other architectures have. Most notably, module trampolines.
On archs with a limited branch distance, calls from modules cannot reach kernel core code directly, so the module load code must create a trampoline that makes the larger jump into core kernel code. The problem arises when this happens to a call to mcount. Ftrace checks all code before modifying it and makes sure the current code is what it expects. Right now, there is not enough information to handle modifying module trampolines. This patch changes the API between generic dynamic ftrace code and the arch dependent code. There are now two functions for modifying code: ftrace_make_nop(mod, rec, addr) - convert the code at rec->ip into a nop, where the original text is calling addr. (mod is the module struct if called by module init) ftrace_make_call(rec, addr) - convert the code at rec->ip that should be a nop into a call to addr. The record "rec" now has a new field called "arch" where the architecture can add any special attributes to each call site record.
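As a sketch of what this asks of an architecture, a port with module trampolines might implement the nop side like this; the arch_*() helpers are hypothetical, standing in for whatever instruction handling the port already has:

	int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec,
			    unsigned long addr)
	{
		unsigned char *old, *new;

		if (mod)	/* module code may reach addr via a trampoline */
			old = arch_trampoline_call_insn(mod, rec, addr);
		else		/* core kernel code calls addr directly */
			old = arch_direct_call_insn(rec->ip, addr);
		new = arch_nop_insn();

		/* compare the bytes at rec->ip with old, then write new */
		return arch_patch_text(rec->ip, old, new);
	}

Return values follow the convention documented in the header below: 0 on success, and -EFAULT, -EINVAL or -EPERM for read, compare and write failures respectively.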
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ftrace.h | 8 ++++++ arch/x86/kernel/ftrace.c | 29 +++++++++++++++++--- include/linux/ftrace.h | 53 +++++++++++++++++++++++++++--------- kernel/module.c | 2 +- kernel/trace/ftrace.c | 62 +++++++++++++++++-------------------------- 5 files changed, 100 insertions(+), 54 deletions(-) (limited to 'kernel')
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 9b6a1fa19e70..2bb43b433e07 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -17,6 +17,14 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) */ return addr - 1; } + +#ifdef CONFIG_DYNAMIC_FTRACE + +struct dyn_arch_ftrace { + /* No extra data needed for x86 */ +}; + +#endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* __ASSEMBLY__ */ #endif /* CONFIG_FUNCTION_TRACER */
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index fe832738e1e2..762222ad1387 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -166,7 +166,7 @@ static int ftrace_calc_offset(long ip, long addr) return (int)(addr - ip); } -unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) +static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) { static union ftrace_code_union calc; @@ -311,12 +311,12 @@ do_ftrace_mod_code(unsigned long ip, void *new_code) static unsigned char ftrace_nop[MCOUNT_INSN_SIZE]; -unsigned char *ftrace_nop_replace(void) +static unsigned char *ftrace_nop_replace(void) { return ftrace_nop; } -int +static int ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char *new_code) { @@ -349,6 +349,29 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, return 0; } +int ftrace_make_nop(struct module *mod, + struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned char *new, *old; + unsigned long ip = rec->ip; + + old = ftrace_call_replace(ip, addr); + new = ftrace_nop_replace(); + + return ftrace_modify_code(rec->ip, old, new); +} + +int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned char *new, *old; + unsigned long ip = rec->ip; + + old = ftrace_nop_replace(); + new = ftrace_call_replace(ip, addr); + + return ftrace_modify_code(rec->ip, old, new); +} + int ftrace_update_ftrace_func(ftrace_func_t func) { unsigned long ip = (unsigned long)(&ftrace_call);
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 4fbc4a8b86a5..166a2070ef65 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -74,6 +74,9 @@ static inline void ftrace_start(void) { } #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_DYNAMIC_FTRACE +/* asm/ftrace.h must be defined for archs supporting dynamic ftrace */ +#include <asm/ftrace.h> + enum { FTRACE_FL_FREE = (1 << 0), FTRACE_FL_FAILED = (1 << 1), @@ -88,6 +91,7 @@ struct dyn_ftrace { struct list_head list; unsigned long ip; /* address of mcount call-site */ unsigned long flags; + struct dyn_arch_ftrace arch; }; int ftrace_force_update(void); @@ -95,22 +99,40 @@ void ftrace_set_filter(unsigned char *buf, int len, int reset); /* defined in arch */ extern int ftrace_ip_converted(unsigned long ip); -extern unsigned char *ftrace_nop_replace(void); -extern unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr); extern int ftrace_dyn_arch_init(void *data); extern int ftrace_update_ftrace_func(ftrace_func_t func); extern void ftrace_caller(void); extern void ftrace_call(void); extern void mcount_call(void); -/* May be defined in arch */ -extern int ftrace_arch_read_dyn_info(char *buf, int size); +/** + * ftrace_make_nop - convert code into a nop + * @mod: module structure if called by module load initialization + * @rec: the mcount call site record + * @addr: the address that the call site should be calling + * + * This is a very sensitive operation and great care needs + * to be taken by the arch. The operation should carefully + * read the location, check to see if what is read is indeed + * what we expect it to be, and then on success of the compare, + * it should write to the location. + * + * The code segment at @rec->ip should be a caller to @addr + * + * Return must be: + * 0 on success + * -EFAULT on error reading the location + * -EINVAL on a failed compare of the contents + * -EPERM on error writing to the location + * Any other value will be considered a failure. + */ +extern int ftrace_make_nop(struct module *mod, + struct dyn_ftrace *rec, unsigned long addr); /** - * ftrace_modify_code - modify code segment - * @ip: the address of the code segment - * @old_code: the contents of what is expected to be there - * @new_code: the code to patch in + * ftrace_make_call - convert a nop call site into a call to addr + * @rec: the mcount call site record + * @addr: the address that the call site should call * * This is a very sensitive operation and great care needs * to be taken by the arch. The operation should carefully @@ -118,6 +140,8 @@ extern int ftrace_arch_read_dyn_info(char *buf, int size); * what we expect it to be, and then on success of the compare, * it should write to the location. * + * The code segment at @rec->ip should be a nop + * * Return must be: * 0 on success * -EFAULT on error reading the location @@ -125,8 +149,11 @@ extern int ftrace_arch_read_dyn_info(char *buf, int size); * -EPERM on error writing to the location * Any other value will be considered a failure.
*/ -extern int ftrace_modify_code(unsigned long ip, unsigned char *old_code, - unsigned char *new_code); +extern int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr); + + +/* May be defined in arch */ +extern int ftrace_arch_read_dyn_info(char *buf, int size); extern int skip_trace(unsigned long ip); @@ -259,11 +286,13 @@ static inline void ftrace_dump(void) { } #ifdef CONFIG_FTRACE_MCOUNT_RECORD extern void ftrace_init(void); -extern void ftrace_init_module(unsigned long *start, unsigned long *end); +extern void ftrace_init_module(struct module *mod, + unsigned long *start, unsigned long *end); #else static inline void ftrace_init(void) { } static inline void -ftrace_init_module(unsigned long *start, unsigned long *end) { } +ftrace_init_module(struct module *mod, + unsigned long *start, unsigned long *end) { } #endif diff --git a/kernel/module.c b/kernel/module.c index 1f4cc00e0c20..69791274e899 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2201,7 +2201,7 @@ static noinline struct module *load_module(void __user *umod, /* sechdrs[0].sh_size is always zero */ mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc", sizeof(*mseg), &num_mcount); - ftrace_init_module(mseg, mseg + num_mcount); + ftrace_init_module(mod, mseg, mseg + num_mcount); err = module_finalize(hdr, sechdrs, mod); if (err < 0) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3940c71ac2a2..e9a5fbfce08e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -358,9 +358,7 @@ static void print_ip_ins(const char *fmt, unsigned char *p) printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); } -static void ftrace_bug(int failed, unsigned long ip, - unsigned char *expected, - unsigned char *replace) +static void ftrace_bug(int failed, unsigned long ip) { switch (failed) { case -EFAULT: @@ -372,9 +370,7 @@ static void ftrace_bug(int failed, unsigned long ip, FTRACE_WARN_ON_ONCE(1); pr_info("ftrace failed to modify "); print_ip_sym(ip); - print_ip_ins(" expected: ", expected); print_ip_ins(" actual: ", (unsigned char *)ip); - print_ip_ins(" replace: ", replace); printk(KERN_CONT "\n"); break; case -EPERM: @@ -392,8 +388,7 @@ static void ftrace_bug(int failed, unsigned long ip, #define FTRACE_ADDR ((long)(ftrace_caller)) static int -__ftrace_replace_code(struct dyn_ftrace *rec, - unsigned char *old, unsigned char *new, int enable) +__ftrace_replace_code(struct dyn_ftrace *rec, int enable) { unsigned long ip, fl; @@ -435,12 +430,10 @@ __ftrace_replace_code(struct dyn_ftrace *rec, * otherwise enable it! 
*/ if (fl & FTRACE_FL_ENABLED) { - /* swap new and old */ - new = old; - old = ftrace_call_replace(ip, FTRACE_ADDR); + enable = 0; rec->flags &= ~FTRACE_FL_ENABLED; } else { - new = ftrace_call_replace(ip, FTRACE_ADDR); + enable = 1; rec->flags |= FTRACE_FL_ENABLED; } } else { @@ -453,10 +446,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, fl = rec->flags & (FTRACE_FL_NOTRACE | FTRACE_FL_ENABLED); if (fl == FTRACE_FL_NOTRACE) return 0; - - new = ftrace_call_replace(ip, FTRACE_ADDR); - } else - old = ftrace_call_replace(ip, FTRACE_ADDR); + } if (enable) { if (rec->flags & FTRACE_FL_ENABLED) @@ -469,21 +459,18 @@ __ftrace_replace_code(struct dyn_ftrace *rec, } } - return ftrace_modify_code(ip, old, new); + if (enable) + return ftrace_make_call(rec, FTRACE_ADDR); + else + return ftrace_make_nop(NULL, rec, FTRACE_ADDR); } static void ftrace_replace_code(int enable) { int i, failed; - unsigned char *new = NULL, *old = NULL; struct dyn_ftrace *rec; struct ftrace_page *pg; - if (enable) - old = ftrace_nop_replace(); - else - new = ftrace_nop_replace(); - for (pg = ftrace_pages_start; pg; pg = pg->next) { for (i = 0; i < pg->index; i++) { rec = &pg->records[i]; @@ -504,34 +491,30 @@ static void ftrace_replace_code(int enable) unfreeze_record(rec); } - failed = __ftrace_replace_code(rec, old, new, enable); + failed = __ftrace_replace_code(rec, enable); if (failed && (rec->flags & FTRACE_FL_CONVERTED)) { rec->flags |= FTRACE_FL_FAILED; if ((system_state == SYSTEM_BOOTING) || !core_kernel_text(rec->ip)) { ftrace_free_rec(rec); } else - ftrace_bug(failed, rec->ip, old, new); + ftrace_bug(failed, rec->ip); } } } } static int -ftrace_code_disable(struct dyn_ftrace *rec) +ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) { unsigned long ip; - unsigned char *nop, *call; int ret; ip = rec->ip; - nop = ftrace_nop_replace(); - call = ftrace_call_replace(ip, mcount_addr); - - ret = ftrace_modify_code(ip, call, nop); + ret = ftrace_make_nop(mod, rec, mcount_addr); if (ret) { - ftrace_bug(ret, ip, call, nop); + ftrace_bug(ret, ip); rec->flags |= FTRACE_FL_FAILED; return 0; } @@ -650,7 +633,7 @@ static cycle_t ftrace_update_time; static unsigned long ftrace_update_cnt; unsigned long ftrace_update_tot_cnt; -static int ftrace_update_code(void) +static int ftrace_update_code(struct module *mod) { struct dyn_ftrace *p, *t; cycle_t start, stop; @@ -667,7 +650,7 @@ static int ftrace_update_code(void) list_del_init(&p->list); /* convert record (i.e, patch mcount-call with NOP) */ - if (ftrace_code_disable(p)) { + if (ftrace_code_disable(mod, p)) { p->flags |= FTRACE_FL_CONVERTED; ftrace_update_cnt++; } else @@ -1309,7 +1292,8 @@ static __init int ftrace_init_debugfs(void) fs_initcall(ftrace_init_debugfs); -static int ftrace_convert_nops(unsigned long *start, +static int ftrace_convert_nops(struct module *mod, + unsigned long *start, unsigned long *end) { unsigned long *p; @@ -1325,18 +1309,19 @@ static int ftrace_convert_nops(unsigned long *start, /* disable interrupts to prevent kstop machine */ local_irq_save(flags); - ftrace_update_code(); + ftrace_update_code(mod); local_irq_restore(flags); mutex_unlock(&ftrace_start_lock); return 0; } -void ftrace_init_module(unsigned long *start, unsigned long *end) +void ftrace_init_module(struct module *mod, + unsigned long *start, unsigned long *end) { if (ftrace_disabled || start == end) return; - ftrace_convert_nops(start, end); + ftrace_convert_nops(mod, start, end); } extern unsigned long __start_mcount_loc[]; @@ -1366,7 +1351,8 @@ void __init 
ftrace_init(void) last_ftrace_enabled = ftrace_enabled = 1; - ret = ftrace_convert_nops(__start_mcount_loc, + ret = ftrace_convert_nops(NULL, + __start_mcount_loc, __stop_mcount_loc); return; -- cgit v1.2.3
From 20e5227e9f55ae1969934821ccbf581563785bbe Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 14 Nov 2008 16:21:19 -0800 Subject: ftrace: allow NULL pointers in mcount_loc Impact: make ftrace_convert_nops() more permissive Due to the way different architecture linkers combine the data sections of the mcount_loc (the section that lists all the locations that call mcount), there may be zeros added in that section. This is usually due to strange alignments that the linker performs, which pad in zeros. This patch makes the nop conversion code skip any pointer in the mcount_loc section that is NULL. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel')
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index e9a5fbfce08e..cc4219135dc9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1304,6 +1304,14 @@ static int ftrace_convert_nops(struct module *mod, p = start; while (p < end) { addr = ftrace_call_adjust(*p++); + /* + * Some architecture linkers will pad between + * the different mcount_loc sections of different + * object files to satisfy alignments. + * Skip any NULL pointers. + */ + if (!addr) + continue; ftrace_record_ip(addr); } -- cgit v1.2.3
From 982c350b9ec4b3564d67f3627a274ae61bbc7e95 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 15 Nov 2008 16:31:41 -0500 Subject: ftrace: fix dyn ftrace filter Impact: correct implementation of dyn ftrace filter The old decisions made by the filter algorithm were complex and incorrect. This led to inconsistent enabling or disabling of functions when the filter was used. This patch simplifies that code and in doing so, corrects the usage of the filters.
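One way to read the corrected rules is as a pure function of the record flags, giving the state each mcount site should end up in; this is a sketch, not the patch's transition-oriented code below:

	/* Desired state of one record: 1 = call ftrace, 0 = nop. */
	static int desired_state(unsigned long flags, int enable, int filtered)
	{
		if (flags & FTRACE_FL_NOTRACE)
			return 0;			/* never trace this site */
		if (filtered && enable)
			return !!(flags & FTRACE_FL_FILTER);	/* only filtered sites */
		return enable;				/* all sites follow enable */
	}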
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 83 ++++++++++++++++++++++----------------------------- 1 file changed, 36 insertions(+), 47 deletions(-) (limited to 'kernel')
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index cc4219135dc9..b9f2e22faf2e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -394,72 +394,62 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) ip = rec->ip; - if (ftrace_filtered && enable) { + /* + * If this record is not to be traced and + * it is not enabled then do nothing. + * + * If this record is not to be traced and + * it is enabled then disabled it. + * + */ + if (rec->flags & FTRACE_FL_NOTRACE) { + if (rec->flags & FTRACE_FL_ENABLED) + rec->flags &= ~FTRACE_FL_ENABLED; + else + return 0; + + } else if (ftrace_filtered && enable) { /* - * If filtering is on: - * - * If this record is set to be filtered and - * is enabled then do nothing. - * - * If this record is set to be filtered and - * it is not enabled, enable it. - * - * If this record is not set to be filtered - * and it is not enabled do nothing. - * - * If this record is set not to trace then - * do nothing. - * - * If this record is set not to trace and - * it is enabled then disable it. - * - * If this record is not set to be filtered and - * it is enabled, disable it. + * Filtering is on: */ - fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_NOTRACE | - FTRACE_FL_ENABLED); + fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED); - if ((fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) || - (fl == (FTRACE_FL_FILTER | FTRACE_FL_NOTRACE)) || - !fl || (fl == FTRACE_FL_NOTRACE)) + /* Record is filtered and enabled, do nothing */ + if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) return 0; - /* - * If it is enabled disable it, - * otherwise enable it! - */ - if (fl & FTRACE_FL_ENABLED) { - enable = 0; + /* Record is not filtered and is not enabled do nothing */ + if (!fl) + return 0; + + /* Record is not filtered but enabled, disable it */ + if (fl == FTRACE_FL_ENABLED) rec->flags &= ~FTRACE_FL_ENABLED; - } else { - enable = 1; + else + /* Otherwise record is filtered but not enabled, enable it */ rec->flags |= FTRACE_FL_ENABLED; - } } else { + /* Disable or not filtered */ if (enable) { - /* - * If this record is set not to trace and is - * not enabled, do nothing. - */ - fl = rec->flags & (FTRACE_FL_NOTRACE | FTRACE_FL_ENABLED); - if (fl == FTRACE_FL_NOTRACE) - return 0; - } - - if (enable) { + /* if record is enabled, do nothing */ if (rec->flags & FTRACE_FL_ENABLED) return 0; + rec->flags |= FTRACE_FL_ENABLED; + } else { + + /* if record is not enabled do nothing */ if (!(rec->flags & FTRACE_FL_ENABLED)) return 0; + rec->flags &= ~FTRACE_FL_ENABLED; } } - if (enable) + if (rec->flags & FTRACE_FL_ENABLED) return ftrace_make_call(rec, FTRACE_ADDR); else return ftrace_make_nop(NULL, rec, FTRACE_ADDR); @@ -554,8 +544,7 @@ static void ftrace_startup(void) mutex_lock(&ftrace_start_lock); ftrace_start_up++; - if (ftrace_start_up == 1) - command |= FTRACE_ENABLE_CALLS; + command |= FTRACE_ENABLE_CALLS; if (saved_ftrace_func != ftrace_trace_function) { saved_ftrace_func = ftrace_trace_function; -- cgit v1.2.3
From ee02a2e5c88ca2e4d6921f08d037b46d5bf82641 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 15 Nov 2008 16:31:41 -0500 Subject: ftrace: make filtered functions effective on setting Impact: set filtered functions at time the filter is set It can be confusing when the filter functions are set (or cleared) and the functions being recorded by the dynamic tracer do not match. This patch causes the code to be updated if the function tracer is enabled and the filter is changed. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel')
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b9f2e22faf2e..b42ec1de546b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1194,7 +1194,7 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) mutex_lock(&ftrace_sysctl_lock); mutex_lock(&ftrace_start_lock); - if (iter->filtered && ftrace_start_up && ftrace_enabled) + if (ftrace_start_up && ftrace_enabled) ftrace_run_update_code(FTRACE_ENABLE_CALLS); mutex_unlock(&ftrace_start_lock); mutex_unlock(&ftrace_sysctl_lock); -- cgit v1.2.3
From e6e7a65aabdb696cf05a56cfd495c49a11fd4cde Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 16 Nov 2008 05:53:19 +0100 Subject: tracing/ftrace: fix unexpected -EINVAL when longest tracer name is set Impact: fix confusing write() -EINVAL when changing the tracer Commit d9e540762f5cdd89f24e518ad1fd31142d0b9726 brought back the bug that made setting a new tracer return -EINVAL when the new tracer's name is the longest registered name. This patch corrects it.
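The underlying contract being restored is the usual write() one: on success the handler must report every byte the caller passed in, even bytes it truncated or stripped, otherwise userspace retries the remainder and the leftover fragment parses as a bogus tracer name. A sketch of the pattern, with consume_name() and MAX_NAME_LEN standing in as hypothetical names:

	ssize_t ret = cnt;	/* remember the full request before truncating */
	int err;

	if (cnt > MAX_NAME_LEN)
		cnt = MAX_NAME_LEN;
	/* ... copy in and NUL-terminate cnt bytes ... */

	err = consume_name(buf);
	if (err)
		return err;	/* propagate the real error */

	filp->f_pos += ret;
	return ret;		/* never a short count */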
Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel')
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dff4bee591b9..80898f4870cc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2655,6 +2655,9 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, char buf[max_tracer_type_len+1]; int i; size_t ret; + int err; + + ret = cnt; if (cnt > max_tracer_type_len) cnt = max_tracer_type_len; @@ -2668,12 +2671,11 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, for (i = cnt - 1; i > 0 && isspace(buf[i]); i--) buf[i] = 0; - ret = tracing_set_tracer(buf); - if (!ret) - ret = cnt; + err = tracing_set_tracer(buf); + if (err) + return err; - if (ret > 0) - filp->f_pos += ret; + filp->f_pos += ret; return ret; } -- cgit v1.2.3
From 1c80025a49855b12fa09bb6db71820e3367b1369 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 16 Nov 2008 05:57:26 +0100 Subject: tracing/ftrace: change the type of the init() callback Impact: extend the ->init() method with the ability to fail This brings a way to know whether the initialization of a tracer succeeded. A tracer must return 0 on success and a traditional error (ie: -ENOMEM) if it fails. If a tracer fails to init, it is free to print a detailed warning; the tracing API will not. A switch to a new tracer will just return the error from the init callback. Note: this will be used for the return tracer.
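A tracer opting in to failable initialization then follows this shape (a sketch; the buffer and start function are invented names):

	static int example_tracer_init(struct trace_array *tr)
	{
		example_buf = kmalloc(EXAMPLE_BUF_SIZE, GFP_KERNEL);
		if (!example_buf) {
			/* the tracer prints the detailed warning itself */
			printk(KERN_WARNING "example tracer: cannot allocate buffer\n");
			return -ENOMEM;	/* propagated by tracing_set_tracer() */
		}
		start_example_trace(tr);
		return 0;
	}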
Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 7 ++-- kernel/trace/trace.h | 3 +- kernel/trace/trace_boot.c | 3 +- kernel/trace/trace_branch.c | 3 +- kernel/trace/trace_functions.c | 3 +- kernel/trace/trace_functions_return.c | 3 +- kernel/trace/trace_irqsoff.c | 9 +++-- kernel/trace/trace_mmiotrace.c | 3 +- kernel/trace/trace_nop.c | 3 +- kernel/trace/trace_sched_switch.c | 3 +- kernel/trace/trace_sched_wakeup.c | 3 +- kernel/trace/trace_selftest.c | 66 ++++++++++++++++++++++++++++----- kernel/trace/trace_sysprof.c | 3 +- 13 files changed, 88 insertions(+), 24 deletions(-) (limited to 'kernel')
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 80898f4870cc..396fda034e3f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2638,8 +2638,11 @@ static int tracing_set_tracer(char *buf) current_trace->reset(tr); current_trace = t; - if (t->init) - t->init(tr); + if (t->init) { + ret = t->init(tr); + if (ret) + goto out; + } trace_branch_enable(tr); out:
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 790ea8c0e1f3..cdbd5cc22be8 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -264,7 +264,8 @@ enum print_line_t { */ struct tracer { const char *name; - void (*init)(struct trace_array *tr); + /* Your tracer should raise a warning if init fails */ + int (*init)(struct trace_array *tr); void (*reset)(struct trace_array *tr); void (*start)(struct trace_array *tr); void (*stop)(struct trace_array *tr);
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index cb333b7fd113..a4fa2c57e34e 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -47,7 +47,7 @@ static void reset_boot_trace(struct trace_array *tr) tracing_reset(tr, cpu); } -static void boot_trace_init(struct trace_array *tr) +static int boot_trace_init(struct trace_array *tr) { int cpu; boot_trace = tr; @@ -56,6 +56,7 @@ static void boot_trace_init(struct trace_array *tr) tracing_reset(tr, cpu); tracing_sched_switch_assign_trace(tr); + return 0; } static enum print_line_t
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 85265553918f..44bd39539d61 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -125,7 +125,7 @@ static void stop_branch_trace(struct trace_array *tr) disable_branch_tracing(); } -static void branch_trace_init(struct trace_array *tr) +static int branch_trace_init(struct trace_array *tr) { int cpu; @@ -133,6 +133,7 @@ static void branch_trace_init(struct trace_array *tr) tracing_reset(tr, cpu); start_branch_trace(tr); + return 0; } static void branch_trace_reset(struct trace_array *tr)
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 8693b7a0a5b2..e74f6d0a3216 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -42,9 +42,10 @@ static void stop_function_trace(struct trace_array *tr) tracing_stop_cmdline_record(); } -static void function_trace_init(struct trace_array *tr) +static int function_trace_init(struct trace_array *tr) { start_function_trace(tr); + return 0; } static void function_trace_reset(struct trace_array *tr)
diff --git a/kernel/trace/trace_functions_return.c b/kernel/trace/trace_functions_return.c index 7680b21537dd..61185f756a13 100644 --- a/kernel/trace/trace_functions_return.c +++ b/kernel/trace/trace_functions_return.c @@ -24,13 +24,14 @@ static void stop_return_trace(struct trace_array *tr) unregister_ftrace_return(); } -static void return_trace_init(struct trace_array *tr) +static int return_trace_init(struct trace_array *tr) { int cpu; for_each_online_cpu(cpu) tracing_reset(tr, cpu); start_return_trace(tr); + return 0; } static void return_trace_reset(struct trace_array *tr)
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index d919d4eaa7cc..7c2e326bbc8b 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -416,11 +416,12 @@ static void irqsoff_tracer_close(struct trace_iterator *iter) } #ifdef CONFIG_IRQSOFF_TRACER -static void irqsoff_tracer_init(struct trace_array *tr) +static int irqsoff_tracer_init(struct trace_array *tr) { trace_type = TRACER_IRQS_OFF; __irqsoff_tracer_init(tr); + return 0; } static struct tracer irqsoff_tracer __read_mostly = { @@ -442,11 +443,12 @@ static struct tracer irqsoff_tracer __read_mostly = #endif #ifdef CONFIG_PREEMPT_TRACER -static void preemptoff_tracer_init(struct trace_array *tr) +static int preemptoff_tracer_init(struct trace_array *tr) { trace_type = TRACER_PREEMPT_OFF; __irqsoff_tracer_init(tr); + return 0; } static struct tracer preemptoff_tracer __read_mostly = @@ -471,11 +473,12 @@ static struct tracer preemptoff_tracer __read_mostly = #if defined(CONFIG_IRQSOFF_TRACER) && \ defined(CONFIG_PREEMPT_TRACER) -static void preemptirqsoff_tracer_init(struct trace_array *tr) +static int preemptirqsoff_tracer_init(struct trace_array *tr) { trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF; __irqsoff_tracer_init(tr); + return 0; } static struct tracer preemptirqsoff_tracer __read_mostly =
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 51bcf370215e..433d650eda9f 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -30,13 +30,14 @@ static void mmio_reset_data(struct trace_array *tr) tracing_reset(tr, cpu); } -static void mmio_trace_init(struct trace_array *tr) +static int mmio_trace_init(struct trace_array *tr) { pr_debug("in %s\n", __func__); mmio_trace_array = tr;
mmio_reset_data(tr); enable_mmiotrace(); + return 0; } static void mmio_trace_reset(struct trace_array *tr) diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index 2ef1d227e7d8..0e77415caed3 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c @@ -24,7 +24,7 @@ static void stop_nop_trace(struct trace_array *tr) /* Nothing to do! */ } -static void nop_trace_init(struct trace_array *tr) +static int nop_trace_init(struct trace_array *tr) { int cpu; ctx_trace = tr; @@ -33,6 +33,7 @@ static void nop_trace_init(struct trace_array *tr) tracing_reset(tr, cpu); start_nop_trace(tr); + return 0; } static void nop_trace_reset(struct trace_array *tr) diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index be35bdfe2e38..863390557b44 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -206,10 +206,11 @@ static void stop_sched_trace(struct trace_array *tr) tracing_stop_sched_switch_record(); } -static void sched_switch_trace_init(struct trace_array *tr) +static int sched_switch_trace_init(struct trace_array *tr) { ctx_trace = tr; start_sched_trace(tr); + return 0; } static void sched_switch_trace_reset(struct trace_array *tr) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 983f2b1478c9..0067b49746c1 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -331,10 +331,11 @@ static void stop_wakeup_tracer(struct trace_array *tr) unregister_trace_sched_wakeup(probe_wakeup); } -static void wakeup_tracer_init(struct trace_array *tr) +static int wakeup_tracer_init(struct trace_array *tr) { wakeup_trace = tr; start_wakeup_tracer(tr); + return 0; } static void wakeup_tracer_reset(struct trace_array *tr) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 5cb64ea061b5..88c8eb70f54a 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -71,6 +71,11 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) return ret; } +static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret) +{ + printk(KERN_WARNING "Failed to init %s tracer, init returned %d\n", + trace->name, init_ret); +} #ifdef CONFIG_FUNCTION_TRACER #ifdef CONFIG_DYNAMIC_FTRACE @@ -111,7 +116,11 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, ftrace_set_filter(func_name, strlen(func_name), 1); /* enable tracing */ - trace->init(tr); + ret = trace->init(tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + goto out; + } /* Sleep for a 1/10 of a second */ msleep(100); @@ -181,7 +190,12 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) ftrace_enabled = 1; tracer_enabled = 1; - trace->init(tr); + ret = trace->init(tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + goto out; + } + /* Sleep for a 1/10 of a second */ msleep(100); /* stop the tracing. 
*/ @@ -223,7 +237,12 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) int ret; /* start the tracing */ - trace->init(tr); + ret = trace->init(tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; + } + /* reset the max latency */ tracing_max_latency = 0; /* disable interrupts for a bit */ @@ -272,7 +291,12 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) } /* start the tracing */ - trace->init(tr); + ret = trace->init(tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; + } + /* reset the max latency */ tracing_max_latency = 0; /* disable preemption for a bit */ @@ -321,7 +345,11 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * } /* start the tracing */ - trace->init(tr); + ret = trace->init(tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + goto out; + } /* reset the max latency */ tracing_max_latency = 0; @@ -449,7 +477,12 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) wait_for_completion(&isrt); /* start the tracing */ - trace->init(tr); + ret = trace->init(tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; + } + /* reset the max latency */ tracing_max_latency = 0; @@ -505,7 +538,12 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr int ret; /* start the tracing */ - trace->init(tr); + ret = trace->init(tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; + } + /* Sleep for a 1/10 of a second */ msleep(100); /* stop the tracing. */ @@ -532,7 +570,12 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr) int ret; /* start the tracing */ - trace->init(tr); + ret = trace->init(tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return 0; + } + /* Sleep for a 1/10 of a second */ msleep(100); /* stop the tracing. */ @@ -554,7 +597,12 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) int ret; /* start the tracing */ - trace->init(tr); + ret = trace->init(tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; + } + /* Sleep for a 1/10 of a second */ msleep(100); /* stop the tracing. */
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index 05f753422aea..54960edb96d0 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -261,11 +261,12 @@ static void stop_stack_trace(struct trace_array *tr) mutex_unlock(&sample_timer_lock); } -static void stack_trace_init(struct trace_array *tr) +static int stack_trace_init(struct trace_array *tr) { sysprof_trace = tr; start_stack_trace(tr); + return 0; } static void stack_trace_reset(struct trace_array *tr) -- cgit v1.2.3
From 072b40a15616fe6bea68466e6bffcfcbf5c8f26f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 16 Nov 2008 05:59:52 +0100 Subject: tracing/branch-tracer: fix a trace recursion on branch tracer Impact: fix crash when enabling the branch-tracer When the branch tracer inserts an event through probe_likely_condition(), it calls local_irq_save(), which then results in a trace recursion: local_irq_save() -> trace_hardirqs_off() -> trace_hardirqs_off_caller() -> unlikely() The trace_branch.c file is protected by DISABLE_BRANCH_PROFILING but that doesn't prevent external calls to functions that use unlikely(). My box crashed each time I tried to set this tracer (sudden and hard reboot).
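The fix below therefore switches to the raw accessors, the usual pattern for code reachable from the irq-tracing hooks themselves; a sketch of the idiom:

	static void record_branch_event(void)
	{
		unsigned long flags;

		/*
		 * raw variant: local_irq_save() would call
		 * trace_hardirqs_off(), which uses unlikely() and would
		 * re-enter this very probe.
		 */
		raw_local_irq_save(flags);
		/* ... write the event ... */
		raw_local_irq_restore(flags);
	}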
Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace_branch.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel')
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 85265553918f..2511e32572ca 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -41,7 +41,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) if (unlikely(!tr)) return; - local_irq_save(flags); + raw_local_irq_save(flags); cpu = raw_smp_processor_id(); if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) goto out; @@ -73,7 +73,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) out: atomic_dec(&tr->data[cpu]->disabled); - local_irq_restore(flags); + raw_local_irq_restore(flags); } static inline -- cgit v1.2.3
From e7d3737ea1b102030f44e96c97754101e41515f0 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 16 Nov 2008 06:02:06 +0100 Subject: tracing/function-return-tracer: support for dynamic ftrace on function return tracer This patch adds the support for dynamic tracing on the function return tracer. The main difference from normal dynamic function tracing is that we don't need to hook on a particular callback: all we want is to dynamically set or nop the calls to ftrace_caller (which is ftrace_return_caller here). Some sanity checks ensure that we are not trying to launch dynamic tracing for return tracing while normal function tracing is already running. An example of trace with getnstimeofday set as a filter: ktime_get_ts+0x22/0x50 -> getnstimeofday (2283 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1396 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1382 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1825 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1426 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1464 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1524 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1382 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1382 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1434 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1464 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1502 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1404 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1397 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1051 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1314 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1344 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1163 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1390 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1374 ns) Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 18 ++- arch/x86/kernel/ftrace.c | 258 +++++++++++++++++----------------- include/linux/ftrace.h | 16 ++- kernel/trace/Kconfig | 1 - kernel/trace/ftrace.c | 58 +++++++- kernel/trace/trace_functions_return.c | 15 +- 6 files changed, 211 insertions(+), 155 deletions(-) (limited to 'kernel')
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index f97621149839..74defe21ba42 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1190,7 +1190,7 @@ ENTRY(mcount) jnz trace #ifdef CONFIG_FUNCTION_RET_TRACER cmpl $ftrace_stub, ftrace_function_return - jnz trace_return + jnz ftrace_return_caller #endif .globl ftrace_stub ftrace_stub: @@ -1211,9 +1211,15 @@ trace: popl %ecx popl %eax jmp ftrace_stub +END(mcount) +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ #ifdef
CONFIG_FUNCTION_RET_TRACER -trace_return: +ENTRY(ftrace_return_caller) + cmpl $0, function_trace_stop + jne ftrace_stub + pushl %eax pushl %ecx pushl %edx @@ -1223,7 +1229,8 @@ trace_return: popl %edx popl %ecx popl %eax - jmp ftrace_stub + ret +END(ftrace_return_caller) .globl return_to_handler return_to_handler: @@ -1237,10 +1244,7 @@ return_to_handler: popl %ecx popl %eax ret -#endif /* CONFIG_FUNCTION_RET_TRACER */ -END(mcount) -#endif /* CONFIG_DYNAMIC_FTRACE */ -#endif /* CONFIG_FUNCTION_TRACER */ +#endif .section .rodata,"a" #include "syscall_table_32.S" diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index d98b5a8ecf4c..924153edd973 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -24,134 +24,6 @@ #include - -#ifdef CONFIG_FUNCTION_RET_TRACER - -/* - * These functions are picked from those used on - * this page for dynamic ftrace. They have been - * simplified to ignore all traces in NMI context. - */ -static atomic_t in_nmi; - -void ftrace_nmi_enter(void) -{ - atomic_inc(&in_nmi); -} - -void ftrace_nmi_exit(void) -{ - atomic_dec(&in_nmi); -} - -/* Add a function return address to the trace stack on thread info.*/ -static int push_return_trace(unsigned long ret, unsigned long long time, - unsigned long func) -{ - int index; - struct thread_info *ti = current_thread_info(); - - /* The return trace stack is full */ - if (ti->curr_ret_stack == FTRACE_RET_STACK_SIZE - 1) - return -EBUSY; - - index = ++ti->curr_ret_stack; - barrier(); - ti->ret_stack[index].ret = ret; - ti->ret_stack[index].func = func; - ti->ret_stack[index].calltime = time; - - return 0; -} - -/* Retrieve a function return address to the trace stack on thread info.*/ -static void pop_return_trace(unsigned long *ret, unsigned long long *time, - unsigned long *func) -{ - int index; - - struct thread_info *ti = current_thread_info(); - index = ti->curr_ret_stack; - *ret = ti->ret_stack[index].ret; - *func = ti->ret_stack[index].func; - *time = ti->ret_stack[index].calltime; - ti->curr_ret_stack--; -} - -/* - * Send the trace to the ring-buffer. - * @return the original return address. - */ -unsigned long ftrace_return_to_handler(void) -{ - struct ftrace_retfunc trace; - pop_return_trace(&trace.ret, &trace.calltime, &trace.func); - trace.rettime = cpu_clock(raw_smp_processor_id()); - ftrace_function_return(&trace); - - return trace.ret; -} - -/* - * Hook the return address and push it in the stack of return addrs - * in current thread info. - */ -void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) -{ - unsigned long old; - unsigned long long calltime; - int faulted; - unsigned long return_hooker = (unsigned long) - &return_to_handler; - - /* Nmi's are currently unsupported */ - if (atomic_read(&in_nmi)) - return; - - /* - * Protect against fault, even if it shouldn't - * happen. This tool is too much intrusive to - * ignore such a protection. 
- */ - asm volatile( - "1: movl (%[parent_old]), %[old]\n" - "2: movl %[return_hooker], (%[parent_replaced])\n" - " movl $0, %[faulted]\n" - - ".section .fixup, \"ax\"\n" - "3: movl $1, %[faulted]\n" - ".previous\n" - - ".section __ex_table, \"a\"\n" - " .long 1b, 3b\n" - " .long 2b, 3b\n" - ".previous\n" - - : [parent_replaced] "=r" (parent), [old] "=r" (old), - [faulted] "=r" (faulted) - : [parent_old] "0" (parent), [return_hooker] "r" (return_hooker) - : "memory" - ); - - if (WARN_ON(faulted)) { - unregister_ftrace_return(); - return; - } - - if (WARN_ON(!__kernel_text_address(old))) { - unregister_ftrace_return(); - *parent = old; - return; - } - - calltime = cpu_clock(raw_smp_processor_id()); - - if (push_return_trace(old, calltime, self_addr) == -EBUSY) - *parent = old; -} - -#endif - #ifdef CONFIG_DYNAMIC_FTRACE union ftrace_code_union { @@ -450,3 +322,133 @@ int __init ftrace_dyn_arch_init(void *data) return 0; } #endif + +#ifdef CONFIG_FUNCTION_RET_TRACER + +#ifndef CONFIG_DYNAMIC_FTRACE + +/* + * These functions are picked from those used on + * this page for dynamic ftrace. They have been + * simplified to ignore all traces in NMI context. + */ +static atomic_t in_nmi; + +void ftrace_nmi_enter(void) +{ + atomic_inc(&in_nmi); +} + +void ftrace_nmi_exit(void) +{ + atomic_dec(&in_nmi); +} +#endif /* !CONFIG_DYNAMIC_FTRACE */ + +/* Add a function return address to the trace stack on thread info.*/ +static int push_return_trace(unsigned long ret, unsigned long long time, + unsigned long func) +{ + int index; + struct thread_info *ti = current_thread_info(); + + /* The return trace stack is full */ + if (ti->curr_ret_stack == FTRACE_RET_STACK_SIZE - 1) + return -EBUSY; + + index = ++ti->curr_ret_stack; + barrier(); + ti->ret_stack[index].ret = ret; + ti->ret_stack[index].func = func; + ti->ret_stack[index].calltime = time; + + return 0; +} + +/* Retrieve a function return address to the trace stack on thread info.*/ +static void pop_return_trace(unsigned long *ret, unsigned long long *time, + unsigned long *func) +{ + int index; + + struct thread_info *ti = current_thread_info(); + index = ti->curr_ret_stack; + *ret = ti->ret_stack[index].ret; + *func = ti->ret_stack[index].func; + *time = ti->ret_stack[index].calltime; + ti->curr_ret_stack--; +} + +/* + * Send the trace to the ring-buffer. + * @return the original return address. + */ +unsigned long ftrace_return_to_handler(void) +{ + struct ftrace_retfunc trace; + pop_return_trace(&trace.ret, &trace.calltime, &trace.func); + trace.rettime = cpu_clock(raw_smp_processor_id()); + ftrace_function_return(&trace); + + return trace.ret; +} + +/* + * Hook the return address and push it in the stack of return addrs + * in current thread info. + */ +void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) +{ + unsigned long old; + unsigned long long calltime; + int faulted; + unsigned long return_hooker = (unsigned long) + &return_to_handler; + + /* Nmi's are currently unsupported */ + if (atomic_read(&in_nmi)) + return; + + /* + * Protect against fault, even if it shouldn't + * happen. This tool is too much intrusive to + * ignore such a protection. 
+ */ + asm volatile( + "1: movl (%[parent_old]), %[old]\n" + "2: movl %[return_hooker], (%[parent_replaced])\n" + " movl $0, %[faulted]\n" + + ".section .fixup, \"ax\"\n" + "3: movl $1, %[faulted]\n" + ".previous\n" + + ".section __ex_table, \"a\"\n" + " .long 1b, 3b\n" + " .long 2b, 3b\n" + ".previous\n" + + : [parent_replaced] "=r" (parent), [old] "=r" (old), + [faulted] "=r" (faulted) + : [parent_old] "0" (parent), [return_hooker] "r" (return_hooker) + : "memory" + ); + + if (WARN_ON(faulted)) { + unregister_ftrace_return(); + return; + } + + if (WARN_ON(!__kernel_text_address(old))) { + unregister_ftrace_return(); + *parent = old; + return; + } + + calltime = cpu_clock(raw_smp_processor_id()); + + if (push_return_trace(old, calltime, self_addr) == -EBUSY) + *parent = old; +} + +#endif /* CONFIG_FUNCTION_RET_TRACER */ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 166a2070ef65..f1af1aab00e6 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -25,6 +25,17 @@ struct ftrace_ops { extern int function_trace_stop; +/* + * Type of the current tracing. + */ +enum ftrace_tracing_type_t { + FTRACE_TYPE_ENTER = 0, /* Hook the call of the function */ + FTRACE_TYPE_RETURN, /* Hook the return of the function */ +}; + +/* Current tracing type, default is FTRACE_TYPE_ENTER */ +extern enum ftrace_tracing_type_t ftrace_tracing_type; + /** * ftrace_stop - stop function tracer. * @@ -104,6 +115,9 @@ extern int ftrace_update_ftrace_func(ftrace_func_t func); extern void ftrace_caller(void); extern void ftrace_call(void); extern void mcount_call(void); +#ifdef CONFIG_FUNCTION_RET_TRACER +extern void ftrace_return_caller(void); +#endif /** * ftrace_make_nop - convert code into top @@ -310,7 +324,7 @@ struct ftrace_retfunc { /* Type of a callback handler of tracing return function */ typedef void (*trace_function_return_t)(struct ftrace_retfunc *); -extern void register_ftrace_return(trace_function_return_t func); +extern int register_ftrace_return(trace_function_return_t func); /* The current handler in use */ extern trace_function_return_t ftrace_function_return; extern void unregister_ftrace_return(void); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 9c89526b6b7c..b8378fad29a3 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -59,7 +59,6 @@ config FUNCTION_TRACER config FUNCTION_RET_TRACER bool "Kernel Function return Tracer" - depends on !DYNAMIC_FTRACE depends on HAVE_FUNCTION_RET_TRACER depends on FUNCTION_TRACER help diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b42ec1de546b..2f78a45aac14 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -50,6 +50,9 @@ static int last_ftrace_enabled; /* Quick disabling of function tracer. */ int function_trace_stop; +/* By default, current tracing type is normal tracing. */ +enum ftrace_tracing_type_t ftrace_tracing_type = FTRACE_TYPE_ENTER; + /* * ftrace_disabled is set when an anomaly is discovered. * ftrace_disabled is much stronger than ftrace_enabled. 
@@ -385,12 +388,21 @@ static void ftrace_bug(int failed, unsigned long ip) } } -#define FTRACE_ADDR ((long)(ftrace_caller)) static int __ftrace_replace_code(struct dyn_ftrace *rec, int enable) { unsigned long ip, fl; + unsigned long ftrace_addr; + +#ifdef CONFIG_FUNCTION_RET_TRACER + if (ftrace_tracing_type == FTRACE_TYPE_ENTER) + ftrace_addr = (unsigned long)ftrace_caller; + else + ftrace_addr = (unsigned long)ftrace_return_caller; +#else + ftrace_addr = (unsigned long)ftrace_caller; +#endif ip = rec->ip; @@ -450,9 +462,9 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) } if (rec->flags & FTRACE_FL_ENABLED) - return ftrace_make_call(rec, FTRACE_ADDR); + return ftrace_make_call(rec, ftrace_addr); else - return ftrace_make_nop(NULL, rec, FTRACE_ADDR); + return ftrace_make_nop(NULL, rec, ftrace_addr); } static void ftrace_replace_code(int enable) @@ -1405,10 +1417,17 @@ int register_ftrace_function(struct ftrace_ops *ops) return -1; mutex_lock(&ftrace_sysctl_lock); + + if (ftrace_tracing_type == FTRACE_TYPE_RETURN) { + ret = -EBUSY; + goto out; + } + ret = __register_ftrace_function(ops); ftrace_startup(); - mutex_unlock(&ftrace_sysctl_lock); +out: + mutex_unlock(&ftrace_sysctl_lock); return ret; } @@ -1474,16 +1493,45 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, } #ifdef CONFIG_FUNCTION_RET_TRACER + +/* The callback that hooks the return of a function */ trace_function_return_t ftrace_function_return = (trace_function_return_t)ftrace_stub; -void register_ftrace_return(trace_function_return_t func) + +int register_ftrace_return(trace_function_return_t func) { + int ret = 0; + + mutex_lock(&ftrace_sysctl_lock); + + /* + * Don't launch return tracing if normal function + * tracing is already running. + */ + if (ftrace_trace_function != ftrace_stub) { + ret = -EBUSY; + goto out; + } + + ftrace_tracing_type = FTRACE_TYPE_RETURN; ftrace_function_return = func; + ftrace_startup(); + +out: + mutex_unlock(&ftrace_sysctl_lock); + return ret; } void unregister_ftrace_return(void) { + mutex_lock(&ftrace_sysctl_lock); + ftrace_function_return = (trace_function_return_t)ftrace_stub; + ftrace_shutdown(); + /* Restore normal tracing type */ + ftrace_tracing_type = FTRACE_TYPE_ENTER; + + mutex_unlock(&ftrace_sysctl_lock); } #endif diff --git a/kernel/trace/trace_functions_return.c b/kernel/trace/trace_functions_return.c index 61185f756a13..a68564af022b 100644 --- a/kernel/trace/trace_functions_return.c +++ b/kernel/trace/trace_functions_return.c @@ -14,29 +14,18 @@ #include "trace.h" -static void start_return_trace(struct trace_array *tr) -{ - register_ftrace_return(&trace_function_return); -} - -static void stop_return_trace(struct trace_array *tr) -{ - unregister_ftrace_return(); -} - static int return_trace_init(struct trace_array *tr) { int cpu; for_each_online_cpu(cpu) tracing_reset(tr, cpu); - start_return_trace(tr); - return 0; + return register_ftrace_return(&trace_function_return); } static void return_trace_reset(struct trace_array *tr) { - stop_return_trace(tr); + unregister_ftrace_return(); } -- cgit v1.2.3 From 2bdba316c989da028a59becf7516c6350ce3c173 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 14 Nov 2008 17:47:35 -0500 Subject: markers: fix unregister Impact: fix marker registers/unregister race get_marker() can return a NULL entry because the mutex is released in the middle of those functions. Make sure we check to see if it has been concurrently removed. 
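The corrected pattern, sketched with the names from the diff below (surrounding registration logic elided): the entry must be looked up again after the mutex is re-acquired, because it may have been removed while the lock was dropped.

	mutex_unlock(&markers_mutex);
	marker_update_probes();		/* runs without the mutex held */
	mutex_lock(&markers_mutex);
	entry = get_marker(name);	/* re-lookup: may be gone by now */
	if (!entry)
		goto end;		/* concurrently removed: bail out */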
Signed-off-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar --- kernel/marker.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/marker.c b/kernel/marker.c index 2898b647d415..de683a7799e7 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -653,10 +653,11 @@ int marker_probe_register(const char *name, const char *format, goto end; } mutex_unlock(&markers_mutex); - marker_update_probes(); /* may update entry */ + marker_update_probes(); mutex_lock(&markers_mutex); entry = get_marker(name); - WARN_ON(!entry); + if (!entry) + goto end; if (entry->rcu_pending) rcu_barrier_sched(); entry->oldptr = old; @@ -697,7 +698,7 @@ int marker_probe_unregister(const char *name, rcu_barrier_sched(); old = marker_entry_remove_probe(entry, probe, probe_private); mutex_unlock(&markers_mutex); - marker_update_probes(); /* may update entry */ + marker_update_probes(); mutex_lock(&markers_mutex); entry = get_marker(name); if (!entry) @@ -778,10 +779,11 @@ int marker_probe_unregister_private_data(marker_probe_func *probe, rcu_barrier_sched(); old = marker_entry_remove_probe(entry, NULL, probe_private); mutex_unlock(&markers_mutex); - marker_update_probes(); /* may update entry */ + marker_update_probes(); mutex_lock(&markers_mutex); entry = get_marker_from_private_data(probe, probe_private); - WARN_ON(!entry); + if (!entry) + goto end; if (entry->rcu_pending) rcu_barrier_sched(); entry->oldptr = old; -- cgit v1.2.3 From 021aeb057fc48af03fe5f37d3dda366c0d97aaf3 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 14 Nov 2008 17:47:37 -0500 Subject: markers: use rcu_*_sched_notrace and notrace Make marker critical code use notrace to make sure they can be used as an ftrace callback. Signed-off-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar --- kernel/marker.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/marker.c b/kernel/marker.c index de683a7799e7..22cd7bae63e0 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -81,7 +81,7 @@ struct marker_entry { * though the function pointer change and the marker enabling are two distinct * operations that modifies the execution flow of preemptible code. */ -void __mark_empty_function(void *probe_private, void *call_private, +notrace void __mark_empty_function(void *probe_private, void *call_private, const char *fmt, va_list *args) { } @@ -97,7 +97,8 @@ EXPORT_SYMBOL_GPL(__mark_empty_function); * need to put a full smp_rmb() in this branch. This is why we do not use * rcu_dereference() for the pointer read. */ -void marker_probe_cb(const struct marker *mdata, void *call_private, ...) +notrace void marker_probe_cb(const struct marker *mdata, + void *call_private, ...) { va_list args; char ptype; @@ -107,7 +108,7 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...) * sure the teardown of the callbacks can be done correctly when they * are in modules and they insure RCU read coherency. */ - rcu_read_lock_sched(); + rcu_read_lock_sched_notrace(); ptype = mdata->ptype; if (likely(!ptype)) { marker_probe_func *func; @@ -145,7 +146,7 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...) va_end(args); } } - rcu_read_unlock_sched(); + rcu_read_unlock_sched_notrace(); } EXPORT_SYMBOL_GPL(marker_probe_cb); @@ -157,12 +158,13 @@ EXPORT_SYMBOL_GPL(marker_probe_cb); * * Should be connected to markers "MARK_NOARGS". */ -static void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) 
+static notrace void marker_probe_cb_noarg(const struct marker *mdata, + void *call_private, ...) { va_list args; /* not initialized */ char ptype; - rcu_read_lock_sched(); + rcu_read_lock_sched_notrace(); ptype = mdata->ptype; if (likely(!ptype)) { marker_probe_func *func; @@ -195,7 +197,7 @@ static void marker_probe_cb_noarg(const struct marker *mdata, void *call_private multi[i].func(multi[i].probe_private, call_private, mdata->format, &args); } - rcu_read_unlock_sched(); + rcu_read_unlock_sched_notrace(); } static void free_old_closure(struct rcu_head *head) -- cgit v1.2.3 From a419246ac7c2d9282dfd843103702895bb3f3fd7 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 14 Nov 2008 17:47:38 -0500 Subject: markers: use module notifier Impact: cleanup Use module notifiers instead of adding a hook in module.c. Signed-off-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar --- kernel/marker.c | 29 +++++++++++++++++++++++++++++ kernel/module.c | 4 ---- 2 files changed, 29 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/marker.c b/kernel/marker.c index 22cd7bae63e0..348e70cc355a 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -846,3 +846,32 @@ void *marker_get_private_data(const char *name, marker_probe_func *probe, return ERR_PTR(-ENOENT); } EXPORT_SYMBOL_GPL(marker_get_private_data); + +int marker_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + + switch (val) { + case MODULE_STATE_COMING: + marker_update_probe_range(mod->markers, + mod->markers + mod->num_markers); + break; + case MODULE_STATE_GOING: + marker_update_probe_range(mod->markers, + mod->markers + mod->num_markers); + break; + } + return 0; +} + +struct notifier_block marker_module_nb = { + .notifier_call = marker_module_notify, + .priority = 0, +}; + +static int init_markers(void) +{ + return register_module_notifier(&marker_module_nb); +} +__initcall(init_markers); diff --git a/kernel/module.c b/kernel/module.c index 1f4cc00e0c20..72c6ca574211 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2184,10 +2184,6 @@ static noinline struct module *load_module(void __user *umod, struct mod_debug *debug; unsigned int num_debug; -#ifdef CONFIG_MARKERS - marker_update_probe_range(mod->markers, - mod->markers + mod->num_markers); -#endif debug = section_objs(hdr, sechdrs, secstrings, "__verbose", sizeof(*debug), &num_debug); dynamic_printk_setup(debug, num_debug); -- cgit v1.2.3 From c1df1bd2c4d4b20c83755a0f41956b57aec4842a Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 14 Nov 2008 17:47:39 -0500 Subject: markers: auto enable tracepoints (new API : trace_mark_tp()) Impact: new API Add a new API trace_mark_tp(), which declares a marker within a tracepoint probe. When the marker is activated, the tracepoint is automatically enabled. No branch test is used at the marker site, because it would be a duplicate of the branch already present in the tracepoint. 
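A hypothetical usage sketch (the probe name and format string are invented for illustration; the tracepoint signature matches the sched_switch declaration used elsewhere in this series). Enabling the marker registers the probe on the tracepoint automatically:

/* Must not be static: tp_cb needs a global symbol (see the macro below). */
void probe_sched_switch_mark(struct rq *rq, struct task_struct *prev,
			     struct task_struct *next)
{
	trace_mark_tp(sched_switch_mark, sched_switch,
		      probe_sched_switch_mark,
		      "prev_pid %d next_pid %d", prev->pid, next->pid);
}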
Signed-off-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar --- include/linux/marker.h | 45 +++++++++++++++++++++++++++++++++++++++++- init/Kconfig | 1 + kernel/marker.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 96 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/marker.h b/include/linux/marker.h index 05ec0df37089..57a307018ceb 100644 --- a/include/linux/marker.h +++ b/include/linux/marker.h @@ -49,6 +49,8 @@ struct marker { void (*call)(const struct marker *mdata, void *call_private, ...); struct marker_probe_closure single; struct marker_probe_closure *multi; + const char *tp_name; /* Optional tracepoint name */ + void *tp_cb; /* Optional tracepoint callback */ } __attribute__((aligned(8))); #ifdef CONFIG_MARKERS @@ -73,7 +75,7 @@ struct marker { __attribute__((section("__markers"), aligned(8))) = \ { __mstrtab_##name, &__mstrtab_##name[sizeof(#name)], \ 0, 0, marker_probe_cb, \ - { __mark_empty_function, NULL}, NULL }; \ + { __mark_empty_function, NULL}, NULL, NULL, NULL }; \ __mark_check_format(format, ## args); \ if (unlikely(__mark_##name.state)) { \ (*__mark_##name.call) \ @@ -81,11 +83,38 @@ struct marker { } \ } while (0) +#define __trace_mark_tp(name, call_private, tp_name, tp_cb, format, args...) \ + do { \ + void __check_tp_type(void) \ + { \ + register_trace_##tp_name(tp_cb); \ + } \ + static const char __mstrtab_##name[] \ + __attribute__((section("__markers_strings"))) \ + = #name "\0" format; \ + static struct marker __mark_##name \ + __attribute__((section("__markers"), aligned(8))) = \ + { __mstrtab_##name, &__mstrtab_##name[sizeof(#name)], \ + 0, 0, marker_probe_cb, \ + { __mark_empty_function, NULL}, NULL, #tp_name, tp_cb };\ + __mark_check_format(format, ## args); \ + (*__mark_##name.call)(&__mark_##name, call_private, \ + ## args); \ + } while (0) + extern void marker_update_probe_range(struct marker *begin, struct marker *end); #else /* !CONFIG_MARKERS */ #define __trace_mark(generic, name, call_private, format, args...) \ __mark_check_format(format, ## args) +#define __trace_mark_tp(name, call_private, tp_name, tp_cb, format, args...) \ + do { \ + void __check_tp_type(void) \ + { \ + register_trace_##tp_name(tp_cb); \ + } \ + __mark_check_format(format, ## args); \ + } while (0) static inline void marker_update_probe_range(struct marker *begin, struct marker *end) { } @@ -117,6 +146,20 @@ static inline void marker_update_probe_range(struct marker *begin, #define _trace_mark(name, format, args...) \ __trace_mark(1, name, NULL, format, ## args) +/** + * trace_mark_tp - Marker in a tracepoint callback + * @name: marker name, not quoted. + * @tp_name: tracepoint name, not quoted. + * @tp_cb: tracepoint callback. Should have an associated global symbol so it + * is not optimized away by the compiler (should not be static). + * @format: format string + * @args...: variable argument list + * + * Places a marker in a tracepoint callback. + */ +#define trace_mark_tp(name, tp_name, tp_cb, format, args...) \ + __trace_mark_tp(name, NULL, tp_name, tp_cb, format, ## args) + /** * MARK_NOARGS - Format string for a marker with no argument. */ diff --git a/init/Kconfig b/init/Kconfig index 86b00c53fade..f5bacb438711 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -808,6 +808,7 @@ config TRACEPOINTS config MARKERS bool "Activate markers" + depends on TRACEPOINTS help Place an empty function call at each marker site. Can be dynamically changed for a probe function. 
diff --git a/kernel/marker.c b/kernel/marker.c index 348e70cc355a..c14ec26a9b9f 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -479,7 +479,7 @@ static int marker_set_format(struct marker_entry *entry, const char *format) static int set_marker(struct marker_entry *entry, struct marker *elem, int active) { - int ret; + int ret = 0; WARN_ON(strcmp(entry->name, elem->name) != 0); if (entry->format) { @@ -531,9 +531,40 @@ static int set_marker(struct marker_entry *entry, struct marker *elem, */ smp_wmb(); elem->ptype = entry->ptype; + + if (elem->tp_name && (active ^ elem->state)) { + WARN_ON(!elem->tp_cb); + /* + * It is ok to directly call the probe registration because type + * checking has been done in the __trace_mark_tp() macro. + */ + + if (active) { + /* + * try_module_get should always succeed because we hold + * lock_module() to get the tp_cb address. + */ + ret = try_module_get(__module_text_address( + (unsigned long)elem->tp_cb)); + BUG_ON(!ret); + ret = tracepoint_probe_register_noupdate( + elem->tp_name, + elem->tp_cb); + } else { + ret = tracepoint_probe_unregister_noupdate( + elem->tp_name, + elem->tp_cb); + /* + * tracepoint_probe_update_all() must be called + * before the module containing tp_cb is unloaded. + */ + module_put(__module_text_address( + (unsigned long)elem->tp_cb)); + } + } elem->state = active; - return 0; + return ret; } /* @@ -544,7 +575,24 @@ static int set_marker(struct marker_entry *entry, struct marker *elem, */ static void disable_marker(struct marker *elem) { + int ret; + /* leave "call" as is. It is known statically. */ + if (elem->tp_name && elem->state) { + WARN_ON(!elem->tp_cb); + /* + * It is ok to directly call the probe registration because type + * checking has been done in the __trace_mark_tp() macro. + */ + ret = tracepoint_probe_unregister_noupdate(elem->tp_name, + elem->tp_cb); + WARN_ON(ret); + /* + * tracepoint_probe_update_all() must be called + * before the module containing tp_cb is unloaded. + */ + module_put(__module_text_address((unsigned long)elem->tp_cb)); + } elem->state = 0; elem->single.func = __mark_empty_function; /* Update the function before setting the ptype */ @@ -608,6 +656,7 @@ static void marker_update_probes(void) marker_update_probe_range(__start___markers, __stop___markers); /* Markers in modules. */ module_update_markers(); + tracepoint_probe_update_all(); } /** -- cgit v1.2.3 From de0baf9ad661ac630a45a50ea1717cc4f4b33ace Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 14 Nov 2008 17:47:42 -0500 Subject: tracepoints: fix disable Impact: fix race Set the probe array pointer to NULL when the tracepoint is disabled. The probe array pointer not being NULL could generate a race condition where the reader would dereference a freed pointer.
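For context, a simplified reader-side sketch (modeled on the __DO_TRACE() macro; proto and args stand for the expanded tracepoint signature) shows why a stale funcs pointer is fatal:

	rcu_read_lock_sched();
	it_func = rcu_dereference(tp->funcs);	/* may race with a disable */
	if (it_func) {				/* NULL once disabled, never stale */
		do {
			((void (*)(proto))(*it_func))(args);
		} while (*(++it_func));
	}
	rcu_read_unlock_sched();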
Signed-off-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar --- kernel/tracepoint.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index e96590f17de1..47a7303d6cd9 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -262,6 +262,7 @@ static void set_tracepoint(struct tracepoint_entry **entry, static void disable_tracepoint(struct tracepoint *elem) { elem->state = 0; + rcu_assign_pointer(elem->funcs, NULL); } /** -- cgit v1.2.3 From 32f85742778dfc2c74975cf0b9f5bdb13470cb32 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 14 Nov 2008 17:47:46 -0500 Subject: tracepoints: use modules notifiers Impact: cleanup Use module notifiers for tracepoint updates rather than adding a hook in module.c. Signed-off-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar --- kernel/module.c | 5 ----- kernel/tracepoint.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 72c6ca574211..fc1dff9a178c 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2187,11 +2187,6 @@ static noinline struct module *load_module(void __user *umod, debug = section_objs(hdr, sechdrs, secstrings, "__verbose", sizeof(*debug), &num_debug); dynamic_printk_setup(debug, num_debug); - -#ifdef CONFIG_TRACEPOINTS - tracepoint_update_probe_range(mod->tracepoints, - mod->tracepoints + mod->num_tracepoints); -#endif } /* sechdrs[0].sh_size is always zero */ diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 47a7303d6cd9..94ac4e35530d 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -541,3 +541,32 @@ void tracepoint_iter_reset(struct tracepoint_iter *iter) iter->tracepoint = NULL; } EXPORT_SYMBOL_GPL(tracepoint_iter_reset); + +int tracepoint_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + + switch (val) { + case MODULE_STATE_COMING: + tracepoint_update_probe_range(mod->tracepoints, + mod->tracepoints + mod->num_tracepoints); + break; + case MODULE_STATE_GOING: + tracepoint_update_probe_range(mod->tracepoints, + mod->tracepoints + mod->num_tracepoints); + break; + } + return 0; +} + +struct notifier_block tracepoint_module_nb = { + .notifier_call = tracepoint_module_notify, + .priority = 0, +}; + +static int init_tracepoints(void) +{ + return register_module_notifier(&tracepoint_module_nb); +} +__initcall(init_tracepoints); -- cgit v1.2.3 From 7e066fb870fcd1025ec3ba7bbde5d541094f4ce1 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 14 Nov 2008 17:47:47 -0500 Subject: tracepoints: add DECLARE_TRACE() and DEFINE_TRACE() Impact: API *CHANGE*. Must update all tracepoint users. Add DEFINE_TRACE() to tracepoints to let them declare the tracepoint structure in a single spot for all the kernel. It helps reducing memory consumption, especially when declaring a lot of tracepoints, e.g. for kmalloc tracing. *API CHANGE WARNING*: now, DECLARE_TRACE() must be used in headers for tracepoint declarations rather than DEFINE_TRACE(). This is the sane way to do it. The name previously used was misleading. Updates scheduler instrumentation to follow this API change. 
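In short, the new split looks like this (a sketch mirroring the Documentation/tracepoints.txt update below):

/* include/trace/subsys.h -- declaration only, safe to include anywhere */
DECLARE_TRACE(subsys_eventname,
	TPPROTO(int firstarg, struct task_struct *p),
	TPARGS(firstarg, p));

/* subsys/file.c -- exactly one definition for the whole kernel */
DEFINE_TRACE(subsys_eventname);
EXPORT_TRACEPOINT_SYMBOL_GPL(subsys_eventname);	/* only if modules call it */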
Signed-off-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar --- Documentation/tracepoints.txt | 7 ++++++- include/asm-generic/vmlinux.lds.h | 1 + include/linux/tracepoint.h | 35 +++++++++++++++++++++++---------- include/trace/sched.h | 24 +++++++++++----------- kernel/exit.c | 4 ++++ kernel/fork.c | 2 ++ kernel/kthread.c | 3 +++ kernel/sched.c | 6 ++++++ kernel/signal.c | 2 ++ samples/tracepoints/tp-samples-trace.h | 4 ++-- samples/tracepoints/tracepoint-sample.c | 3 +++ 11 files changed, 66 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/Documentation/tracepoints.txt b/Documentation/tracepoints.txt index 5d354e167494..e8ad47b437f3 100644 --- a/Documentation/tracepoints.txt +++ b/Documentation/tracepoints.txt @@ -42,7 +42,7 @@ In include/trace/subsys.h : #include -DEFINE_TRACE(subsys_eventname, +DECLARE_TRACE(subsys_eventname, TPPTOTO(int firstarg, struct task_struct *p), TPARGS(firstarg, p)); @@ -50,6 +50,8 @@ In subsys/file.c (where the tracing statement must be added) : #include +DEFINE_TRACE(subsys_eventname); + void somefct(void) { ... @@ -86,6 +88,9 @@ to limit collisions. Tracepoint names are global to the kernel: they are considered as being the same whether they are in the core kernel image or in modules. +If the tracepoint has to be used in kernel modules, an +EXPORT_TRACEPOINT_SYMBOL_GPL() or EXPORT_TRACEPOINT_SYMBOL() can be used to +export the defined tracepoints. * Probe / tracepoint example diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index a5e4ed9baec8..3b46ae464933 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -71,6 +71,7 @@ VMLINUX_SYMBOL(__start___markers) = .; \ *(__markers) \ VMLINUX_SYMBOL(__stop___markers) = .; \ + . = ALIGN(32); \ VMLINUX_SYMBOL(__start___tracepoints) = .; \ *(__tracepoints) \ VMLINUX_SYMBOL(__stop___tracepoints) = .; \ diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 7e9b42aeae0e..757005458366 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -24,8 +24,12 @@ struct tracepoint { const char *name; /* Tracepoint name */ int state; /* State. */ void **funcs; -} __attribute__((aligned(8))); - +} __attribute__((aligned(32))); /* + * Aligned on 32 bytes because it is + * globally visible and gcc happily + * align these on the structure size. + * Keep in sync with vmlinux.lds.h. + */ #define TPPROTO(args...) args #define TPARGS(args...) args @@ -55,15 +59,10 @@ struct tracepoint { * not add unwanted padding between the beginning of the section and the * structure. Force alignment to the same alignment as the section start. 
*/ -#define DEFINE_TRACE(name, proto, args) \ +#define DECLARE_TRACE(name, proto, args) \ + extern struct tracepoint __tracepoint_##name; \ static inline void trace_##name(proto) \ { \ - static const char __tpstrtab_##name[] \ - __attribute__((section("__tracepoints_strings"))) \ - = #name; \ - static struct tracepoint __tracepoint_##name \ - __attribute__((section("__tracepoints"), aligned(8))) = \ - { __tpstrtab_##name, 0, NULL }; \ if (unlikely(__tracepoint_##name.state)) \ __DO_TRACE(&__tracepoint_##name, \ TPPROTO(proto), TPARGS(args)); \ @@ -77,11 +76,23 @@ struct tracepoint { return tracepoint_probe_unregister(#name, (void *)probe);\ } +#define DEFINE_TRACE(name) \ + static const char __tpstrtab_##name[] \ + __attribute__((section("__tracepoints_strings"))) = #name; \ + struct tracepoint __tracepoint_##name \ + __attribute__((section("__tracepoints"), aligned(32))) = \ + { __tpstrtab_##name, 0, NULL } + +#define EXPORT_TRACEPOINT_SYMBOL_GPL(name) \ + EXPORT_SYMBOL_GPL(__tracepoint_##name) +#define EXPORT_TRACEPOINT_SYMBOL(name) \ + EXPORT_SYMBOL(__tracepoint_##name) + extern void tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end); #else /* !CONFIG_TRACEPOINTS */ -#define DEFINE_TRACE(name, proto, args) \ +#define DECLARE_TRACE(name, proto, args) \ static inline void _do_trace_##name(struct tracepoint *tp, proto) \ { } \ static inline void trace_##name(proto) \ @@ -95,6 +106,10 @@ extern void tracepoint_update_probe_range(struct tracepoint *begin, return -ENOSYS; \ } +#define DEFINE_TRACE(name) +#define EXPORT_TRACEPOINT_SYMBOL_GPL(name) +#define EXPORT_TRACEPOINT_SYMBOL(name) + static inline void tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end) { } diff --git a/include/trace/sched.h b/include/trace/sched.h index ad47369d01b5..9b2854abf7e2 100644 --- a/include/trace/sched.h +++ b/include/trace/sched.h @@ -4,52 +4,52 @@ #include #include -DEFINE_TRACE(sched_kthread_stop, +DECLARE_TRACE(sched_kthread_stop, TPPROTO(struct task_struct *t), TPARGS(t)); -DEFINE_TRACE(sched_kthread_stop_ret, +DECLARE_TRACE(sched_kthread_stop_ret, TPPROTO(int ret), TPARGS(ret)); -DEFINE_TRACE(sched_wait_task, +DECLARE_TRACE(sched_wait_task, TPPROTO(struct rq *rq, struct task_struct *p), TPARGS(rq, p)); -DEFINE_TRACE(sched_wakeup, +DECLARE_TRACE(sched_wakeup, TPPROTO(struct rq *rq, struct task_struct *p), TPARGS(rq, p)); -DEFINE_TRACE(sched_wakeup_new, +DECLARE_TRACE(sched_wakeup_new, TPPROTO(struct rq *rq, struct task_struct *p), TPARGS(rq, p)); -DEFINE_TRACE(sched_switch, +DECLARE_TRACE(sched_switch, TPPROTO(struct rq *rq, struct task_struct *prev, struct task_struct *next), TPARGS(rq, prev, next)); -DEFINE_TRACE(sched_migrate_task, +DECLARE_TRACE(sched_migrate_task, TPPROTO(struct rq *rq, struct task_struct *p, int dest_cpu), TPARGS(rq, p, dest_cpu)); -DEFINE_TRACE(sched_process_free, +DECLARE_TRACE(sched_process_free, TPPROTO(struct task_struct *p), TPARGS(p)); -DEFINE_TRACE(sched_process_exit, +DECLARE_TRACE(sched_process_exit, TPPROTO(struct task_struct *p), TPARGS(p)); -DEFINE_TRACE(sched_process_wait, +DECLARE_TRACE(sched_process_wait, TPPROTO(struct pid *pid), TPARGS(pid)); -DEFINE_TRACE(sched_process_fork, +DECLARE_TRACE(sched_process_fork, TPPROTO(struct task_struct *parent, struct task_struct *child), TPARGS(parent, child)); -DEFINE_TRACE(sched_signal_send, +DECLARE_TRACE(sched_signal_send, TPPROTO(int sig, struct task_struct *p), TPARGS(sig, p)); diff --git a/kernel/exit.c b/kernel/exit.c index ae2b92be5fae..f995d2418668 100644 --- 
a/kernel/exit.c +++ b/kernel/exit.c @@ -54,6 +54,10 @@ #include #include +DEFINE_TRACE(sched_process_free); +DEFINE_TRACE(sched_process_exit); +DEFINE_TRACE(sched_process_wait); + static void exit_mm(struct task_struct * tsk); static inline int task_detached(struct task_struct *p) diff --git a/kernel/fork.c b/kernel/fork.c index f6083561dfe0..0837d0deee5f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -79,6 +79,8 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0; __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ +DEFINE_TRACE(sched_process_fork); + int nr_processes(void) { int cpu; diff --git a/kernel/kthread.c b/kernel/kthread.c index 8e7a7ce3ed0a..4fbc456f393d 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -21,6 +21,9 @@ static DEFINE_SPINLOCK(kthread_create_lock); static LIST_HEAD(kthread_create_list); struct task_struct *kthreadd_task; +DEFINE_TRACE(sched_kthread_stop); +DEFINE_TRACE(sched_kthread_stop_ret); + struct kthread_create_info { /* Information passed to kthread() from kthreadd. */ diff --git a/kernel/sched.c b/kernel/sched.c index 50a21f964679..327f91c63c99 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -118,6 +118,12 @@ */ #define RUNTIME_INF ((u64)~0ULL) +DEFINE_TRACE(sched_wait_task); +DEFINE_TRACE(sched_wakeup); +DEFINE_TRACE(sched_wakeup_new); +DEFINE_TRACE(sched_switch); +DEFINE_TRACE(sched_migrate_task); + #ifdef CONFIG_SMP /* * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) diff --git a/kernel/signal.c b/kernel/signal.c index 4530fc654455..e9afe63da24b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -41,6 +41,8 @@ static struct kmem_cache *sigqueue_cachep; +DEFINE_TRACE(sched_signal_send); + static void __user *sig_handler(struct task_struct *t, int sig) { return t->sighand->action[sig - 1].sa.sa_handler; diff --git a/samples/tracepoints/tp-samples-trace.h b/samples/tracepoints/tp-samples-trace.h index 0216b55bd640..01724e04c556 100644 --- a/samples/tracepoints/tp-samples-trace.h +++ b/samples/tracepoints/tp-samples-trace.h @@ -4,10 +4,10 @@ #include /* for struct inode and struct file */ #include -DEFINE_TRACE(subsys_event, +DECLARE_TRACE(subsys_event, TPPROTO(struct inode *inode, struct file *file), TPARGS(inode, file)); -DEFINE_TRACE(subsys_eventb, +DECLARE_TRACE(subsys_eventb, TPPROTO(void), TPARGS()); #endif diff --git a/samples/tracepoints/tracepoint-sample.c b/samples/tracepoints/tracepoint-sample.c index 4ae4b7fcc043..00d169792a3e 100644 --- a/samples/tracepoints/tracepoint-sample.c +++ b/samples/tracepoints/tracepoint-sample.c @@ -13,6 +13,9 @@ #include #include "tp-samples-trace.h" +DEFINE_TRACE(subsys_event); +DEFINE_TRACE(subsys_eventb); + struct proc_dir_entry *pentry_example; static int my_open(struct inode *inode, struct file *file) -- cgit v1.2.3 From 227a837567e339c74d9d4243d03a29bd943a018c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 16 Nov 2008 09:50:34 +0100 Subject: markers/tracepoints: fix non-modular build fix: kernel/marker.c: In function 'marker_module_notify': kernel/marker.c:905: error: 'MODULE_STATE_COMING' undeclared (first use in this function) [...]
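The fix, sketched (mirroring the diff below): compile the notifier code out entirely when module support is disabled, since the MODULE_STATE_* constants only exist under CONFIG_MODULES.

#ifdef CONFIG_MODULES
static int init_markers(void)
{
	return register_module_notifier(&marker_module_nb);
}
__initcall(init_markers);
#endif /* CONFIG_MODULES */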
Signed-off-by: Ingo Molnar --- kernel/marker.c | 4 ++++ kernel/tracepoint.c | 4 ++++ 2 files changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/marker.c b/kernel/marker.c index c14ec26a9b9f..ea54f2647868 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -896,6 +896,8 @@ void *marker_get_private_data(const char *name, marker_probe_func *probe, } EXPORT_SYMBOL_GPL(marker_get_private_data); +#ifdef CONFIG_MODULES + int marker_module_notify(struct notifier_block *self, unsigned long val, void *data) { @@ -924,3 +926,5 @@ static int init_markers(void) return register_module_notifier(&marker_module_nb); } __initcall(init_markers); + +#endif /* CONFIG_MODULES */ diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 94ac4e35530d..79602740bbb5 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -542,6 +542,8 @@ void tracepoint_iter_reset(struct tracepoint_iter *iter) } EXPORT_SYMBOL_GPL(tracepoint_iter_reset); +#ifdef CONFIG_MODULES + int tracepoint_module_notify(struct notifier_block *self, unsigned long val, void *data) { @@ -570,3 +572,5 @@ static int init_tracepoints(void) return register_module_notifier(&tracepoint_module_nb); } __initcall(init_tracepoints); + +#endif /* CONFIG_MODULES */ -- cgit v1.2.3 From 0c726da983de0704254250ef6495ca152e7abcca Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sun, 16 Nov 2008 16:07:58 +0530 Subject: tracing: branch tracer, fix writing to trace/trace_options Impact: fix trace_options behavior writing to trace/trace_options uses the index of the array to find the value of the flag. With the branch tracer flag defined conditionally, this breaks writing to trace_options with the branch tracer disabled. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 2 -- kernel/trace/trace.h | 2 -- 2 files changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4a904623e05d..b04923b72ce0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -259,9 +259,7 @@ static const char *trace_options[] = { "sched-tree", "ftrace_printk", "ftrace_preempt", -#ifdef CONFIG_BRANCH_TRACER "branch", -#endif "annotate", NULL }; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 790ea8c0e1f3..b41d7b4c2cae 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -470,9 +470,7 @@ enum trace_iterator_flags { TRACE_ITER_SCHED_TREE = 0x200, TRACE_ITER_PRINTK = 0x400, TRACE_ITER_PREEMPTONLY = 0x800, -#ifdef CONFIG_BRANCH_TRACER TRACE_ITER_BRANCH = 0x1000, -#endif TRACE_ITER_ANNOTATE = 0x2000, }; -- cgit v1.2.3 From adf9f19574334c9a29a2bc956009fcac7edf1a6b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 17 Nov 2008 19:23:42 +0100 Subject: tracing/ftrace: implement a set_flag callback for tracers Impact: give a way to send specific messages to tracers The current implementation of tracing uses some flags to control the output of general tracers. But we have no way to implement custom flag handling for a specific tracer. This patch proposes a new callback for struct tracer, called set_flag, and a structure that represents a 32-bit flags variable. A tracer can implement a struct tracer_flags in which it puts the initial value of the flag integer. Then it can place a range of flags, each with a name and a flag mask, on the flag integer. The structure that implements a single flag is called struct tracer_opt. These custom flags will be available through the trace_options file like the general tracing flags.
Changing their value is done like the other general flags. For example, if you have a flag called "foo", you can activate it by writing "foo" or "nofoo" to trace_options. Note that the set_flag callback is optional and is only needed if you want flag changes to be signaled to your tracer so that it can accept or refuse the assignment. V2: Some arrangements in coding style.... Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++---- kernel/trace/trace.h | 26 +++++++++++++++ 2 files changed, 112 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2596b5a968c4..9531fddcfb8d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -43,6 +43,20 @@ unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; unsigned long __read_mostly tracing_thresh; +/* For tracers that don't implement custom flags */ +static struct tracer_opt dummy_tracer_opt[] = { + { } +}; + +static struct tracer_flags dummy_tracer_flags = { + .val = 0, + .opts = dummy_tracer_opt +}; + +static int dummy_set_flag(u32 old_flags, u32 bit, int set) +{ + return 0; +} /* * Kill all tracing for good (never come back). @@ -529,6 +543,14 @@ int register_tracer(struct tracer *type) } } + if (!type->set_flag) + type->set_flag = &dummy_set_flag; + if (!type->flags) + type->flags = &dummy_tracer_flags; + else + if (!type->flags->opts) + type->flags->opts = dummy_tracer_opt; + #ifdef CONFIG_FTRACE_STARTUP_TEST if (type->selftest) { struct tracer *saved_tracer = current_trace; @@ -2426,10 +2448,13 @@ static ssize_t tracing_trace_options_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { + int i; char *buf; int r = 0; int len = 0; - int i; + u32 tracer_flags = current_trace->flags->val; + struct tracer_opt *trace_opts = current_trace->flags->opts; + /* calulate max size */ for (i = 0; trace_options[i]; i++) { @@ -2437,6 +2462,15 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf, len += 3; /* "no" and space */ } + /* + * Increase the size with names of options specific + * of the current tracer.
+ */ + for (i = 0; trace_opts[i].name; i++) { + len += strlen(trace_opts[i].name); + len += 3; /* "no" and space */ + } + /* +2 for \n and \0 */ buf = kmalloc(len + 2, GFP_KERNEL); if (!buf) @@ -2449,6 +2483,15 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf, r += sprintf(buf + r, "no%s ", trace_options[i]); } + for (i = 0; trace_opts[i].name; i++) { + if (tracer_flags & trace_opts[i].bit) + r += sprintf(buf + r, "%s ", + trace_opts[i].name); + else + r += sprintf(buf + r, "no%s ", + trace_opts[i].name); + } + r += sprintf(buf + r, "\n"); WARN_ON(r >= len + 2); @@ -2459,6 +2502,40 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf, return r; } +/* Try to assign a tracer specific option */ +static int set_tracer_option(struct tracer *trace, char *cmp, int neg) +{ + struct tracer_flags *trace_flags = trace->flags; + struct tracer_opt *opts = NULL; + int ret = 0, i = 0; + int len; + + for (i = 0; trace_flags->opts[i].name; i++) { + opts = &trace_flags->opts[i]; + len = strlen(opts->name); + + if (strncmp(cmp, opts->name, len) == 0) { + ret = trace->set_flag(trace_flags->val, + opts->bit, !neg); + break; + } + } + /* Not found */ + if (!trace_flags->opts[i].name) + return -EINVAL; + + /* Refused to handle */ + if (ret) + return ret; + + if (neg) + trace_flags->val &= ~opts->bit; + else + trace_flags->val |= opts->bit; + + return 0; +} + static ssize_t tracing_trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) @@ -2466,6 +2543,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, char buf[64]; char *cmp = buf; int neg = 0; + int ret; int i; if (cnt >= sizeof(buf)) @@ -2492,11 +2570,13 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, break; } } - /* - * If no option could be set, return an error: - */ - if (!trace_options[i]) - return -EINVAL; + + /* If no option could be set, test the specific tracer options */ + if (!trace_options[i]) { + ret = set_tracer_option(current_trace, cmp, neg); + if (ret) + return ret; + } filp->f_pos += cnt; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 37947f6b92bf..9d22618bf99f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -259,6 +259,29 @@ enum print_line_t { TRACE_TYPE_UNHANDLED = 2 /* Relay to other output functions */ }; + +/* + * An option specific to a tracer. This is a boolean value. + * The bit is the bit index that sets its value on the + * flags value in struct tracer_flags. + */ +struct tracer_opt { + const char *name; /* Will appear on the trace_options file */ + u32 bit; /* Mask assigned in val field in tracer_flags */ +}; + +/* + * The set of specific options for a tracer. Your tracer + * have to set the initial value of the flags val. 
+ */ +struct tracer_flags { + u32 val; + struct tracer_opt *opts; +}; + +/* Makes more easy to define a tracer opt */ +#define TRACER_OPT(s, b) .name = #s, .bit = b + /* * A specific tracer, represented by methods that operate on a trace array: */ @@ -280,8 +303,11 @@ struct tracer { struct trace_array *tr); #endif enum print_line_t (*print_line)(struct trace_iterator *iter); + /* If you handled the flag setting, return 0 */ + int (*set_flag)(u32 old_flags, u32 bit, int set); struct tracer *next; int print_max; + struct tracer_flags *flags; }; struct trace_seq { -- cgit v1.2.3 From 0619faf657806b943e6acf51f60f1cd023a96c78 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 17 Nov 2008 19:26:30 +0100 Subject: tracing/ftrace: make nop tracer using tracer flags Impact: give an example on how to use specific tracer flags This patch propose to use the nop tracer to provide an example for using the tracer's custom flags implementation. V2: replace structures and defines just after the headers includes for cleanliness. V3: replace defines by enum values. Signed-off-by: Frederic Weisbecker Acked-by: Steven Noonan Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_nop.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index 0e77415caed3..b9767acd30ac 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c @@ -12,6 +12,27 @@ #include "trace.h" +/* Our two options */ +enum { + TRACE_NOP_OPT_ACCEPT = 0x1, + TRACE_NOP_OPT_REFUSE = 0x2 +}; + +/* Options for the tracer (see trace_options file) */ +static struct tracer_opt nop_opts[] = { + /* Option that will be accepted by set_flag callback */ + { TRACER_OPT(test_nop_accept, TRACE_NOP_OPT_ACCEPT) }, + /* Option that will be refused by set_flag callback */ + { TRACER_OPT(test_nop_refuse, TRACE_NOP_OPT_REFUSE) }, + { } /* Always set a last empty entry */ +}; + +static struct tracer_flags nop_flags = { + /* You can check your flags value here when you want. */ + .val = 0, /* By default: all flags disabled */ + .opts = nop_opts +}; + static struct trace_array *ctx_trace; static void start_nop_trace(struct trace_array *tr) @@ -41,6 +62,35 @@ static void nop_trace_reset(struct trace_array *tr) stop_nop_trace(tr); } +/* It only serves as a signal handler and a callback to + * accept or refuse tthe setting of a flag. + * If you don't implement it, then the flag setting will be + * automatically accepted. + */ +static int nop_set_flag(u32 old_flags, u32 bit, int set) +{ + /* + * Note that you don't need to update nop_flags.val yourself. + * The tracing Api will do it automatically if you return 0 + */ + if (bit == TRACE_NOP_OPT_ACCEPT) { + printk(KERN_DEBUG "nop_test_accept flag set to %d: we accept." + " Now cat trace_options to see the result\n", + set); + return 0; + } + + if (bit == TRACE_NOP_OPT_REFUSE) { + printk(KERN_DEBUG "nop_test_refuse flag set to %d: we refuse." 
+ "Now cat trace_options to see the result\n", + set); + return -EINVAL; + } + + return 0; +} + + struct tracer nop_trace __read_mostly = { .name = "nop", @@ -49,5 +99,7 @@ struct tracer nop_trace __read_mostly = #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_nop, #endif + .flags = &nop_flags, + .set_flag = nop_set_flag }; -- cgit v1.2.3 From 0231022cc32d5f2e7f3c06b75691dda0ad6aec33 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 17 Nov 2008 03:22:41 +0100 Subject: tracing/function-return-tracer: add the overrun field Impact: help to find the better depth of trace We decided to arbitrary define the depth of function return trace as "20". Perhaps this is not enough. To help finding an optimal depth, we measure now the overrun: the number of functions that have been missed for the current thread. By default this is not displayed, we have to do set a particular flag on the return tracer: echo overrun > /debug/tracing/trace_options And the overrun will be printed on the right. As the trace shows below, the current 20 depth is not enough. update_wall_time+0x37f/0x8c0 -> update_xtime_cache (345 ns) (Overruns: 2838) update_wall_time+0x384/0x8c0 -> clocksource_get_next (1141 ns) (Overruns: 2838) do_timer+0x23/0x100 -> update_wall_time (3882 ns) (Overruns: 2838) tick_do_update_jiffies64+0xbf/0x160 -> do_timer (5339 ns) (Overruns: 2838) tick_sched_timer+0x6a/0xf0 -> tick_do_update_jiffies64 (7209 ns) (Overruns: 2838) vgacon_set_cursor_size+0x98/0x120 -> native_io_delay (2613 ns) (Overruns: 274) vgacon_cursor+0x16e/0x1d0 -> vgacon_set_cursor_size (33151 ns) (Overruns: 274) set_cursor+0x5f/0x80 -> vgacon_cursor (36432 ns) (Overruns: 274) con_flush_chars+0x34/0x40 -> set_cursor (38790 ns) (Overruns: 274) release_console_sem+0x1ec/0x230 -> up (721 ns) (Overruns: 274) release_console_sem+0x225/0x230 -> wake_up_klogd (316 ns) (Overruns: 274) con_flush_chars+0x39/0x40 -> release_console_sem (2996 ns) (Overruns: 274) con_write+0x22/0x30 -> con_flush_chars (46067 ns) (Overruns: 274) n_tty_write+0x1cc/0x360 -> con_write (292670 ns) (Overruns: 274) smp_apic_timer_interrupt+0x2a/0x90 -> native_apic_mem_write (330 ns) (Overruns: 274) irq_enter+0x17/0x70 -> idle_cpu (413 ns) (Overruns: 274) smp_apic_timer_interrupt+0x2f/0x90 -> irq_enter (1525 ns) (Overruns: 274) ktime_get_ts+0x40/0x70 -> getnstimeofday (465 ns) (Overruns: 274) ktime_get_ts+0x60/0x70 -> set_normalized_timespec (436 ns) (Overruns: 274) ktime_get+0x16/0x30 -> ktime_get_ts (2501 ns) (Overruns: 274) hrtimer_interrupt+0x77/0x1a0 -> ktime_get (3439 ns) (Overruns: 274) Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/include/asm/thread_info.h | 7 +++++++ arch/x86/kernel/ftrace.c | 10 ++++++--- include/linux/ftrace.h | 2 ++ include/linux/sched.h | 1 + kernel/trace/trace.c | 1 + kernel/trace/trace.h | 1 + kernel/trace/trace_functions_return.c | 38 +++++++++++++++++++++++++++++------ 7 files changed, 51 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index a71158369fd4..e90e81ef6ab9 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -21,6 +21,7 @@ struct task_struct; struct exec_domain; #include #include +#include struct thread_info { struct task_struct *task; /* main task structure */ @@ -45,6 +46,11 @@ struct thread_info { int curr_ret_stack; /* Stack of return addresses for return function tracing */ struct ftrace_ret_stack 
ret_stack[FTRACE_RET_STACK_SIZE]; + /* + * Number of functions that haven't been traced + * because of depth overrun. + */ + atomic_t trace_overrun; #endif }; @@ -61,6 +67,7 @@ struct thread_info { .fn = do_no_restart_syscall, \ }, \ .curr_ret_stack = -1,\ + .trace_overrun = ATOMIC_INIT(0) \ } #else #define INIT_THREAD_INFO(tsk) \ diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 924153edd973..356bb1eb6e9a 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -353,8 +353,10 @@ static int push_return_trace(unsigned long ret, unsigned long long time, struct thread_info *ti = current_thread_info(); /* The return trace stack is full */ - if (ti->curr_ret_stack == FTRACE_RET_STACK_SIZE - 1) + if (ti->curr_ret_stack == FTRACE_RET_STACK_SIZE - 1) { + atomic_inc(&ti->trace_overrun); return -EBUSY; + } index = ++ti->curr_ret_stack; barrier(); @@ -367,7 +369,7 @@ static int push_return_trace(unsigned long ret, unsigned long long time, /* Retrieve a function return address to the trace stack on thread info.*/ static void pop_return_trace(unsigned long *ret, unsigned long long *time, - unsigned long *func) + unsigned long *func, unsigned long *overrun) { int index; @@ -376,6 +378,7 @@ static void pop_return_trace(unsigned long *ret, unsigned long long *time, *ret = ti->ret_stack[index].ret; *func = ti->ret_stack[index].func; *time = ti->ret_stack[index].calltime; + *overrun = atomic_read(&ti->trace_overrun); ti->curr_ret_stack--; } @@ -386,7 +389,8 @@ static void pop_return_trace(unsigned long *ret, unsigned long long *time, unsigned long ftrace_return_to_handler(void) { struct ftrace_retfunc trace; - pop_return_trace(&trace.ret, &trace.calltime, &trace.func); + pop_return_trace(&trace.ret, &trace.calltime, &trace.func, + &trace.overrun); trace.rettime = cpu_clock(raw_smp_processor_id()); ftrace_function_return(&trace); diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index f1af1aab00e6..f7ba4ea5e128 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -318,6 +318,8 @@ struct ftrace_retfunc { unsigned long func; /* Current function */ unsigned long long calltime; unsigned long long rettime; + /* Number of functions that overran the depth limit for current task */ + unsigned long overrun; }; #ifdef CONFIG_FUNCTION_RET_TRACER diff --git a/include/linux/sched.h b/include/linux/sched.h index 61c8cc36028a..c8e0db464206 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2016,6 +2016,7 @@ static inline void setup_thread_stack(struct task_struct *p, struct task_struct * used. 
*/ task_thread_info(p)->curr_ret_stack = -1; + atomic_set(&task_thread_info(p)->trace_overrun, 0); #endif } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9531fddcfb8d..e97c29a6e7b0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -853,6 +853,7 @@ static void __trace_function_return(struct trace_array *tr, entry->parent_ip = trace->ret; entry->rettime = trace->rettime; entry->calltime = trace->calltime; + entry->overrun = trace->overrun; ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags); } #endif diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9d22618bf99f..2cb12fd98f6b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -60,6 +60,7 @@ struct ftrace_ret_entry { unsigned long parent_ip; unsigned long long calltime; unsigned long long rettime; + unsigned long overrun; }; extern struct tracer boot_tracer; diff --git a/kernel/trace/trace_functions_return.c b/kernel/trace/trace_functions_return.c index a68564af022b..e00d64509c9c 100644 --- a/kernel/trace/trace_functions_return.c +++ b/kernel/trace/trace_functions_return.c @@ -14,6 +14,19 @@ #include "trace.h" +#define TRACE_RETURN_PRINT_OVERRUN 0x1 +static struct tracer_opt trace_opts[] = { + /* Display overruns or not */ + { TRACER_OPT(overrun, TRACE_RETURN_PRINT_OVERRUN) }, + { } /* Empty entry */ +}; + +static struct tracer_flags tracer_flags = { + .val = 0, /* Don't display overruns by default */ + .opts = trace_opts +}; + + static int return_trace_init(struct trace_array *tr) { int cpu; @@ -42,26 +55,39 @@ print_return_function(struct trace_iterator *iter) ret = trace_seq_printf(s, "%pF -> ", (void *)field->parent_ip); if (!ret) return TRACE_TYPE_PARTIAL_LINE; + ret = seq_print_ip_sym(s, field->ip, trace_flags & TRACE_ITER_SYM_MASK); if (!ret) return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, " (%llu ns)\n", + + ret = trace_seq_printf(s, " (%llu ns)", field->rettime - field->calltime); if (!ret) return TRACE_TYPE_PARTIAL_LINE; - else - return TRACE_TYPE_HANDLED; + + if (tracer_flags.val & TRACE_RETURN_PRINT_OVERRUN) { + ret = trace_seq_printf(s, " (Overruns: %lu)", + field->overrun); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + ret = trace_seq_printf(s, "\n"); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; } return TRACE_TYPE_UNHANDLED; } -static struct tracer return_trace __read_mostly = -{ +static struct tracer return_trace __read_mostly = { .name = "return", .init = return_trace_init, .reset = return_trace_reset, - .print_line = print_return_function + .print_line = print_return_function, + .flags = &tracer_flags, }; static __init int init_return_trace(void) -- cgit v1.2.3 From a22506347d038a66506c6f57e9b97104128e280d Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 18 Nov 2008 18:06:35 +0100 Subject: ftrace: preemptoff selftest not working Impact: fix preemptoff and preemptirqsoff tracer self-tests I was wondering why the preemptoff and preemptirqsoff tracer selftests don't work on s390. After all its just that they get called from non-preemptible context: kernel_init() will execute all initcalls, however the first line in kernel_init() is lock_kernel(), which causes the preempt_count to be increased. 
Any later calls to add_preempt_count() (especially those from the selftests) will therefore not result in a call to trace_preempt_off() since the check below in add_preempt_count() will be false: if (preempt_count() == val) trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); Hence the trace buffer will be empty. Fix this by releasing the BKL during the self-tests. Signed-off-by: Heiko Carstens Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 396fda034e3f..16892121cb7c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -532,6 +532,13 @@ int register_tracer(struct tracer *type) } #ifdef CONFIG_FTRACE_STARTUP_TEST + /* + * When this gets called we hold the BKL which means that preemption + * is disabled. Various trace selftests however need to disable + * and enable preemption for successful tests. So we drop the BKL here + * and grab it after the tests again. + */ + unlock_kernel(); if (type->selftest) { struct tracer *saved_tracer = current_trace; struct trace_array *tr = &global_trace; @@ -562,6 +569,7 @@ int register_tracer(struct tracer *type) } printk(KERN_CONT "PASSED\n"); } + lock_kernel(); #endif type->next = trace_types; -- cgit v1.2.3 From 86fa2f60674540df0b34f5c547ed0c1cf3a8f212 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 19 Nov 2008 10:00:15 +0100 Subject: ftrace: fix selftest locking Impact: fix self-test boot crash Self-test failure forgot to re-lock the BKL - crashing the next initcall: Testing tracer irqsoff: .. no entries found ..FAILED! initcall init_irqsoff_tracer+0x0/0x11 returned 0 after 3906 usecs calling init_mmio_trace+0x0/0xf @ 1 ------------[ cut here ]------------ Kernel BUG at c0c0a915 [verbose debug info unavailable] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC last sysfs file: Pid: 1, comm: swapper Not tainted (2.6.28-rc5-tip #53704) EIP: 0060:[] EFLAGS: 00010286 CPU: 1 EIP is at unlock_kernel+0x10/0x2b EAX: ffffffff EBX: 00000000 ECX: 00000000 EDX: f7030000 ESI: c12da19c EDI: 00000000 EBP: f7039f54 ESP: f7039f54 DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 Process swapper (pid: 1, ti=f7038000 task=f7030000 task.ti=f7038000) Stack: f7039f6c c0164d30 c013fed8 a7d8d7b4 00000000 00000000 f7039f74 c12fb78a f7039fd0 c0101132 c12fb77d 00000000 6f727200 6f632072 2d206564 c1002031 0000000f f7039fa2 f7039fb0 3531b171 00000000 00000000 0000002f c12ca480 Call Trace: [] ? register_tracer+0x66/0x13f [] ? ktime_get+0x19/0x1b [] ? init_mmio_trace+0xd/0xf [] ? do_one_initcall+0x4a/0x111 [] ? init_mmio_trace+0x0/0xf [] ? init_irq_proc+0x46/0x59 [] ? kernel_init+0x104/0x152 [] ? kernel_init+0x0/0x152 [] ? kernel_thread_helper+0x7/0x10 Code: 58 14 43 75 0a b8 00 9b 2d c1 e8 51 43 7a ff 64 a1 00 a0 37 c1 89 58 14 5b 5d c3 55 64 8b 15 00 a0 37 c1 83 7a 14 00 89 e5 79 04 <0f> 0b eb fe 8b 42 14 48 85 c0 89 42 14 79 0a b8 00 9b 2d c1 e8 EIP: [] unlock_kernel+0x10/0x2b SS:ESP 0068:f7039f54 ---[ end trace a7919e7f17c0a725 ]--- Kernel panic - not syncing: Attempted to kill init! So clean up the flow a bit. 
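For reference, the cleaned-up flow keeps the lock operations symmetric: the BKL is dropped once on entry and re-taken on every exit path, success or failure. A condensed sketch of the resulting structure (simplified from the diff below, not the literal kernel code):

	int register_tracer(struct tracer *type)
	{
		int ret = 0;

		/* selftests need preemption enabled, so drop the BKL up front */
		unlock_kernel();
		mutex_lock(&trace_types_lock);

		/* duplicate-name check and optional startup selftest;
		 * a failed selftest now exits through the label below */
	out:
		mutex_unlock(&trace_types_lock);
		/* re-taken on all paths, which is what fixes the crash */
		lock_kernel();
		return ret;
	}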
Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 16892121cb7c..24b6238884f0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -520,7 +520,15 @@ int register_tracer(struct tracer *type) return -1; } + /* + * When this gets called we hold the BKL which means that + * preemption is disabled. Various trace selftests however + * need to disable and enable preemption for successful tests. + * So we drop the BKL here and grab it after the tests again. + */ + unlock_kernel(); mutex_lock(&trace_types_lock); + for (t = trace_types; t; t = t->next) { if (strcmp(type->name, t->name) == 0) { /* already found */ @@ -532,13 +540,6 @@ int register_tracer(struct tracer *type) } #ifdef CONFIG_FTRACE_STARTUP_TEST - /* - * When this gets called we hold the BKL which means that preemption - * is disabled. Various trace selftests however need to disable - * and enable preemption for successful tests. So we drop the BKL here - * and grab it after the tests again. - */ - unlock_kernel(); if (type->selftest) { struct tracer *saved_tracer = current_trace; struct trace_array *tr = &global_trace; @@ -550,9 +551,9 @@ int register_tracer(struct tracer *type) * internal tracing to verify that everything is in order. * If we fail, we do not register this tracer. */ - for_each_tracing_cpu(i) { + for_each_tracing_cpu(i) tracing_reset(tr, i); - } + current_trace = type; /* the test is responsible for initializing and enabling */ pr_info("Testing tracer %s: ", type->name); @@ -564,12 +565,11 @@ int register_tracer(struct tracer *type) goto out; } /* Only reset on passing, to avoid touching corrupted buffers */ - for_each_tracing_cpu(i) { + for_each_tracing_cpu(i) tracing_reset(tr, i); - } + printk(KERN_CONT "PASSED\n"); } - lock_kernel(); #endif type->next = trace_types; @@ -580,6 +580,7 @@ int register_tracer(struct tracer *type) out: mutex_unlock(&trace_types_lock); + lock_kernel(); return ret; } -- cgit v1.2.3 From 60a515132086b2c28a8141d873297fdf7a180ca7 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 18 Nov 2008 22:20:10 -0800 Subject: profiling: clean up profile_nop() Impact: cleanup No point in inlining this. Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/profile.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index 5b7d1ac7124c..7f93a5042d3b 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -544,7 +544,7 @@ static const struct file_operations proc_profile_operations = { }; #ifdef CONFIG_SMP -static inline void profile_nop(void *unused) +static void profile_nop(void *unused) { } -- cgit v1.2.3 From ec4e0e2fe018992d980910db901637c814575914 Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Tue, 18 Nov 2008 22:41:57 -0800 Subject: sched: fix inconsistency when redistribute per-cpu tg->cfs_rq shares Impact: make load-balancing more consistent In the update_shares() path leading to tg_shares_up(), the calculation of per-cpu cfs_rq shares is rather erratic even under a moderate task wake-up rate. The problem is that the per-cpu tg->cfs_rq load weight used in the sd_rq_weight aggregation and actual redistribution of the cfs_rq->shares are collected at different times. Under moderate system load, we've seen quite a bit of variation on the cfs_rq->shares, which ultimately wildly affects the sched_entity load weights.
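Concretely, update_group_shares_cpu() computes per-cpu shares as

	shares_i = sd_shares * w_i / \Sum_j w_j

so if the weights w_i sampled for the \Sum_j w_j aggregation differ from the w_i read again at redistribution time, the shares_i no longer sum to sd_shares. With one cached snapshot of the weights, \Sum_i shares_i = sd_shares * (\Sum_i w_i) / (\Sum_j w_j) = sd_shares holds exactly.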
This patch caches the initial per-cpu load weight when doing the sum calculation, and then passes it down to update_group_shares_cpu() for redistributing per-cpu cfs_rq shares. This allows consistent total cfs_rq shares across all CPUs. It also simplifies the rounding and zero load weight check. Signed-off-by: Ken Chen Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 41 +++++++++++++++-------------------------- 1 file changed, 15 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index a4c156d9a4a5..93bfb086e60f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1453,27 +1453,13 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu, unsigned long sd_shares, unsigned long sd_rq_weight) { - int boost = 0; unsigned long shares; unsigned long rq_weight; if (!tg->se[cpu]) return; - rq_weight = tg->cfs_rq[cpu]->load.weight; - - /* - * If there are currently no tasks on the cpu pretend there is one of - * average load so that when a new task gets to run here it will not - * get delayed by group starvation. - */ - if (!rq_weight) { - boost = 1; - rq_weight = NICE_0_LOAD; - } - - if (unlikely(rq_weight > sd_rq_weight)) - rq_weight = sd_rq_weight; + rq_weight = tg->cfs_rq[cpu]->rq_weight; /* * \Sum shares * rq_weight @@ -1481,7 +1467,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu, * \Sum rq_weight * */ - shares = (sd_shares * rq_weight) / (sd_rq_weight + 1); + shares = (sd_shares * rq_weight) / sd_rq_weight; shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); if (abs(shares - tg->se[cpu]->load.weight) > @@ -1490,11 +1476,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu, unsigned long flags; spin_lock_irqsave(&rq->lock, flags); - /* - * record the actual number of shares, not the boosted amount. - */ - tg->cfs_rq[cpu]->shares = boost ? 0 : shares; - tg->cfs_rq[cpu]->rq_weight = rq_weight; + tg->cfs_rq[cpu]->shares = shares; __set_se_shares(tg->se[cpu], shares); spin_unlock_irqrestore(&rq->lock, flags); @@ -1508,13 +1490,23 @@ update_group_shares_cpu(struct task_group *tg, int cpu, */ static int tg_shares_up(struct task_group *tg, void *data) { - unsigned long rq_weight = 0; + unsigned long weight, rq_weight = 0; unsigned long shares = 0; struct sched_domain *sd = data; int i; for_each_cpu_mask(i, sd->span) { - rq_weight += tg->cfs_rq[i]->load.weight; + /* + * If there are currently no tasks on the cpu pretend there + * is one of average load so that when a new task gets to + * run here it will not get delayed by group starvation.
+ */ + weight = tg->cfs_rq[i]->load.weight; + if (!weight) + weight = NICE_0_LOAD; + + tg->cfs_rq[i]->rq_weight = weight; + rq_weight += weight; shares += tg->cfs_rq[i]->shares; } @@ -1524,9 +1516,6 @@ static int tg_shares_up(struct task_group *tg, void *data) if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) shares = tg->shares; - if (!rq_weight) - rq_weight = cpus_weight(sd->span) * NICE_0_LOAD; - for_each_cpu_mask(i, sd->span) update_group_shares_cpu(tg, i, shares, rq_weight); -- cgit v1.2.3 From 957ad0166e9f76a8561dafa5e14ef5bd3f5e9a3b Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Fri, 21 Nov 2008 01:30:36 +0100 Subject: sched: update comment for move_task_off_dead_cpu Impact: cleanup This commit: commit f7b4cddcc5aca03e80e357360c9424dfba5056c2 Author: Oleg Nesterov Date: Tue Oct 16 23:30:56 2007 -0700 do CPU_DEAD migrating under read_lock(tasklist) instead of write_lock_irq(ta Currently move_task_off_dead_cpu() is called under write_lock_irq(tasklist). This means it can't use task_lock() which is needed to improve migrating to take task's ->cpuset into account. Change the code to call move_task_off_dead_cpu() with irqs enabled, and change migrate_live_tasks() to use read_lock(tasklist). ...forgot to update the comment in front of move_task_off_dead_cpu. Reference: http://lkml.org/lkml/2008/6/23/135 Signed-off-by: Vegard Nossum Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 93bfb086e60f..a6085d5166dd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6094,7 +6094,6 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu) /* * Figure out where task on dead CPU should go, use force if necessary. - * NOTE: interrupts should be disabled by the caller */ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) { -- cgit v1.2.3 From f201ae2356c74bcae130b2177b3dca903ea98071 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 23 Nov 2008 06:22:56 +0100 Subject: tracing/function-return-tracer: store return stack into task_struct and allocate it dynamically Impact: use deeper function tracing depth safely Some tests showed that function return tracing needed a deeper depth of function calls. But it could be unsafe to store these return addresses on the stack. So these arrays will now be allocated dynamically into the task_struct of current only when the tracer is activated. Typical scheme when tracer is activated: - allocate a return stack for each task in global list. - fork: allocate the return stack for the newly created task - exit: free return stack of current - idle init: same as fork I chose a default depth of 50. I don't have overruns anymore.
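Condensed from the hunks below, the per-task lifecycle is roughly (a sketch of the patch, not additional code):

	/* register_ftrace_return(): every live task gets a stack */
	t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
			       * sizeof(struct ftrace_ret_stack), GFP_KERNEL);
	t->curr_ret_stack = -1;
	atomic_set(&t->trace_overrun, 0);

	/* fork and idle init: ftrace_retfunc_init_task() allocates the
	 * same thing, but only while the tracer is active */

	/* exit: ftrace_retfunc_exit_task() */
	kfree(t->ret_stack);
	t->ret_stack = NULL;

push_return_trace() bails out with -EBUSY whenever t->ret_stack is NULL, so a task that raced with activation is simply not traced rather than corrupting memory.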
Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ftrace.h | 1 - arch/x86/include/asm/thread_info.h | 29 ------------ arch/x86/kernel/ftrace.c | 29 ++++++------ include/linux/ftrace.h | 5 ++ include/linux/sched.h | 23 +++++---- kernel/exit.c | 5 +- kernel/fork.c | 4 ++ kernel/sched.c | 3 ++ kernel/trace/ftrace.c | 96 +++++++++++++++++++++++++++++++++++++- 9 files changed, 137 insertions(+), 58 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 2bb43b433e07..754a3e082f94 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -29,7 +29,6 @@ struct dyn_arch_ftrace { #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_FUNCTION_RET_TRACER -#define FTRACE_RET_STACK_SIZE 20 #ifndef __ASSEMBLY__ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index e90e81ef6ab9..0921b4018c11 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -40,36 +40,8 @@ struct thread_info { */ __u8 supervisor_stack[0]; #endif - -#ifdef CONFIG_FUNCTION_RET_TRACER - /* Index of current stored adress in ret_stack */ - int curr_ret_stack; - /* Stack of return addresses for return function tracing */ - struct ftrace_ret_stack ret_stack[FTRACE_RET_STACK_SIZE]; - /* - * Number of functions that haven't been traced - * because of depth overrun. - */ - atomic_t trace_overrun; -#endif }; -#ifdef CONFIG_FUNCTION_RET_TRACER -#define INIT_THREAD_INFO(tsk) \ -{ \ - .task = &tsk, \ - .exec_domain = &default_exec_domain, \ - .flags = 0, \ - .cpu = 0, \ - .preempt_count = 1, \ - .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ - .curr_ret_stack = -1,\ - .trace_overrun = ATOMIC_INIT(0) \ -} -#else #define INIT_THREAD_INFO(tsk) \ { \ .task = &tsk, \ @@ -82,7 +54,6 @@ struct thread_info { .fn = do_no_restart_syscall, \ }, \ } -#endif #define init_thread_info (init_thread_union.thread_info) #define init_stack (init_thread_union.stack) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 356bb1eb6e9a..bb137f7297ed 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -350,19 +350,21 @@ static int push_return_trace(unsigned long ret, unsigned long long time, unsigned long func) { int index; - struct thread_info *ti = current_thread_info(); + + if (!current->ret_stack) + return -EBUSY; /* The return trace stack is full */ - if (ti->curr_ret_stack == FTRACE_RET_STACK_SIZE - 1) { - atomic_inc(&ti->trace_overrun); + if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) { + atomic_inc(¤t->trace_overrun); return -EBUSY; } - index = ++ti->curr_ret_stack; + index = ++current->curr_ret_stack; barrier(); - ti->ret_stack[index].ret = ret; - ti->ret_stack[index].func = func; - ti->ret_stack[index].calltime = time; + current->ret_stack[index].ret = ret; + current->ret_stack[index].func = func; + current->ret_stack[index].calltime = time; return 0; } @@ -373,13 +375,12 @@ static void pop_return_trace(unsigned long *ret, unsigned long long *time, { int index; - struct thread_info *ti = current_thread_info(); - index = ti->curr_ret_stack; - *ret = ti->ret_stack[index].ret; - *func = ti->ret_stack[index].func; - *time = ti->ret_stack[index].calltime; - *overrun = atomic_read(&ti->trace_overrun); - ti->curr_ret_stack--; + index = current->curr_ret_stack; + *ret = current->ret_stack[index].ret; + *func = current->ret_stack[index].func; + *time = current->ret_stack[index].calltime; + 
*overrun = atomic_read(¤t->trace_overrun); + current->curr_ret_stack--; } /* diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index f7ba4ea5e128..2ba259b2defa 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -323,6 +323,8 @@ struct ftrace_retfunc { }; #ifdef CONFIG_FUNCTION_RET_TRACER +#define FTRACE_RETFUNC_DEPTH 50 +#define FTRACE_RETSTACK_ALLOC_SIZE 32 /* Type of a callback handler of tracing return function */ typedef void (*trace_function_return_t)(struct ftrace_retfunc *); @@ -330,6 +332,9 @@ extern int register_ftrace_return(trace_function_return_t func); /* The current handler in use */ extern trace_function_return_t ftrace_function_return; extern void unregister_ftrace_return(void); + +extern void ftrace_retfunc_init_task(struct task_struct *t); +extern void ftrace_retfunc_exit_task(struct task_struct *t); #endif #endif /* _LINUX_FTRACE_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index c8e0db464206..bee1e93c95ad 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1352,6 +1352,17 @@ struct task_struct { unsigned long default_timer_slack_ns; struct list_head *scm_work_list; +#ifdef CONFIG_FUNCTION_RET_TRACER + /* Index of current stored adress in ret_stack */ + int curr_ret_stack; + /* Stack of return addresses for return function tracing */ + struct ftrace_ret_stack *ret_stack; + /* + * Number of functions that haven't been traced + * because of depth overrun. + */ + atomic_t trace_overrun; +#endif }; /* @@ -2006,18 +2017,6 @@ static inline void setup_thread_stack(struct task_struct *p, struct task_struct { *task_thread_info(p) = *task_thread_info(org); task_thread_info(p)->task = p; - -#ifdef CONFIG_FUNCTION_RET_TRACER - /* - * When fork() creates a child process, this function is called. - * But the child task may not inherit the return adresses traced - * by the return function tracer because it will directly execute - * in userspace and will not return to kernel functions its parent - * used. - */ - task_thread_info(p)->curr_ret_stack = -1; - atomic_set(&task_thread_info(p)->trace_overrun, 0); -#endif } static inline unsigned long *end_of_stack(struct task_struct *p) diff --git a/kernel/exit.c b/kernel/exit.c index 35c8ec2ba03a..b9d446329da1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -1127,7 +1128,9 @@ NORET_TYPE void do_exit(long code) preempt_disable(); /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; - +#ifdef CONFIG_FUNCTION_RET_TRACER + ftrace_retfunc_exit_task(tsk); +#endif schedule(); BUG(); /* Avoid "noreturn function does return". 
*/ diff --git a/kernel/fork.c b/kernel/fork.c index ac62f43ee430..d1eb30e69ccc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -1269,6 +1270,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, total_forks++; spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); +#ifdef CONFIG_FUNCTION_RET_TRACER + ftrace_retfunc_init_task(p); +#endif proc_fork_connector(p); cgroup_post_fork(p); return p; diff --git a/kernel/sched.c b/kernel/sched.c index 4de56108c86f..fb17205950de 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5901,6 +5901,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) * The idle tasks have their own, simple scheduling class: */ idle->sched_class = &idle_sched_class; +#ifdef CONFIG_FUNCTION_RET_TRACER + ftrace_retfunc_init_task(idle); +#endif } /* diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f212da486689..90d99fb02ae4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1498,10 +1498,77 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, #ifdef CONFIG_FUNCTION_RET_TRACER +static atomic_t ftrace_retfunc_active; + /* The callback that hooks the return of a function */ trace_function_return_t ftrace_function_return = (trace_function_return_t)ftrace_stub; + +/* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */ +static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) +{ + int i; + int ret = 0; + unsigned long flags; + int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE; + struct task_struct *g, *t; + + for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) { + ret_stack_list[i] = kmalloc(FTRACE_RETFUNC_DEPTH + * sizeof(struct ftrace_ret_stack), + GFP_KERNEL); + if (!ret_stack_list[i]) { + start = 0; + end = i; + ret = -ENOMEM; + goto free; + } + } + + read_lock_irqsave(&tasklist_lock, flags); + do_each_thread(g, t) { + if (start == end) { + ret = -EAGAIN; + goto unlock; + } + + if (t->ret_stack == NULL) { + t->ret_stack = ret_stack_list[start++]; + t->curr_ret_stack = -1; + atomic_set(&t->trace_overrun, 0); + } + } while_each_thread(g, t); + +unlock: + read_unlock_irqrestore(&tasklist_lock, flags); +free: + for (i = start; i < end; i++) + kfree(ret_stack_list[i]); + return ret; +} + +/* Allocate a return stack for each task */ +static int start_return_tracing(void) +{ + struct ftrace_ret_stack **ret_stack_list; + int ret; + + ret_stack_list = kmalloc(FTRACE_RETSTACK_ALLOC_SIZE * + sizeof(struct ftrace_ret_stack *), + GFP_KERNEL); + + if (!ret_stack_list) + return -ENOMEM; + + do { + ret = alloc_retstack_tasklist(ret_stack_list); + } while (ret == -EAGAIN); + + kfree(ret_stack_list); + return ret; +} + int register_ftrace_return(trace_function_return_t func) { int ret = 0; @@ -1516,7 +1583,12 @@ int register_ftrace_return(trace_function_return_t func) ret = -EBUSY; goto out; } - + atomic_inc(&ftrace_retfunc_active); + ret = start_return_tracing(); + if (ret) { + atomic_dec(&ftrace_retfunc_active); + goto out; + } ftrace_tracing_type = FTRACE_TYPE_RETURN; ftrace_function_return = func; ftrace_startup(); @@ -1530,6 +1602,7 @@ void unregister_ftrace_return(void) { mutex_lock(&ftrace_sysctl_lock); + atomic_dec(&ftrace_retfunc_active); ftrace_function_return = (trace_function_return_t)ftrace_stub; ftrace_shutdown(); /* Restore normal tracing type */ @@ -1537,6 +1610,27 @@ void unregister_ftrace_return(void) mutex_unlock(&ftrace_sysctl_lock); } + +/* Allocate a return stack for newly 
created task */ +void ftrace_retfunc_init_task(struct task_struct *t) +{ + if (atomic_read(&ftrace_retfunc_active)) { + t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH + * sizeof(struct ftrace_ret_stack), + GFP_KERNEL); + if (!t->ret_stack) + return; + t->curr_ret_stack = -1; + atomic_set(&t->trace_overrun, 0); + } else + t->ret_stack = NULL; +} + +void ftrace_retfunc_exit_task(struct task_struct *t) +{ + kfree(t->ret_stack); + t->ret_stack = NULL; +} #endif -- cgit v1.2.3 From 82f60f0bc854aada696f27d863c03bef91f1509d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 23 Nov 2008 09:18:56 +0100 Subject: tracing/function-return-tracer: clean up task start/exit callbacks Impact: cleanup Eliminate #ifdefs in core code by using empty inline functions. Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 3 +++ kernel/exit.c | 2 -- kernel/fork.c | 2 -- kernel/sched.c | 2 -- 4 files changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 2ba259b2defa..938ca1942641 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -335,6 +335,9 @@ extern void unregister_ftrace_return(void); extern void ftrace_retfunc_init_task(struct task_struct *t); extern void ftrace_retfunc_exit_task(struct task_struct *t); +#else +static inline void ftrace_retfunc_init_task(struct task_struct *t) { } +static inline void ftrace_retfunc_exit_task(struct task_struct *t) { } #endif #endif /* _LINUX_FTRACE_H */ diff --git a/kernel/exit.c b/kernel/exit.c index b9d446329da1..ef04d03b3286 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1128,9 +1128,7 @@ NORET_TYPE void do_exit(long code) preempt_disable(); /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; -#ifdef CONFIG_FUNCTION_RET_TRACER ftrace_retfunc_exit_task(tsk); -#endif schedule(); BUG(); /* Avoid "noreturn function does return". */ diff --git a/kernel/fork.c b/kernel/fork.c index d1eb30e69ccc..fbf4a4c0a628 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1270,9 +1270,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, total_forks++; spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); -#ifdef CONFIG_FUNCTION_RET_TRACER ftrace_retfunc_init_task(p); -#endif proc_fork_connector(p); cgroup_post_fork(p); return p; diff --git a/kernel/sched.c b/kernel/sched.c index fb17205950de..388d9db044ab 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5901,9 +5901,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) * The idle tasks have their own, simple scheduling class: */ idle->sched_class = &idle_sched_class; -#ifdef CONFIG_FUNCTION_RET_TRACER ftrace_retfunc_init_task(idle); -#endif } /* -- cgit v1.2.3 From 02b67518e2b1c490787dac7f35e1204e74fe21ba Mon Sep 17 00:00:00 2001 From: Török Edwin Date: Sat, 22 Nov 2008 13:28:47 +0200 Subject: tracing: add support for userspace stacktraces in tracing/iter_ctrl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: add new (default-off) tracing visualization feature Usage example: mount -t debugfs nodev /sys/kernel/debug cd /sys/kernel/debug/tracing echo userstacktrace >iter_ctrl echo sched_switch >current_tracer echo 1 >tracing_enabled .... run application ... echo 0 >tracing_enabled Then read one of 'trace','latency_trace','trace_pipe'. To get the best output you can compile your userspace programs with frame pointers (at least glibc + the app you are tracing). 
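Frame pointers matter because the unwinder introduced here just follows the saved frame-pointer chain. Each user frame is assumed to start with this layout (taken from the patch):

	struct stack_frame {
		const void __user *next_fp;	/* caller's saved frame pointer */
		unsigned long return_address;	/* where this frame returns to */
	};

Starting from regs->bp, save_stack_trace_user() copies one frame at a time with __copy_from_user_inatomic() and stops as soon as a copy faults, the pointer drops below regs->sp, or the chain stops advancing. Binaries built with -fomit-frame-pointer break that chain, which is why rebuilding with frame pointers gives the best output.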
Signed-off-by: Török Edwin Signed-off-by: Ingo Molnar --- Documentation/ftrace.txt | 5 ++- arch/x86/kernel/stacktrace.c | 57 +++++++++++++++++++++++++++ include/linux/stacktrace.h | 8 ++++ kernel/trace/trace.c | 93 ++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace.h | 9 +++++ 5 files changed, 171 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/Documentation/ftrace.txt b/Documentation/ftrace.txt index 753f4de4b175..79a80f79c062 100644 --- a/Documentation/ftrace.txt +++ b/Documentation/ftrace.txt @@ -324,7 +324,7 @@ output. To see what is available, simply cat the file: cat /debug/tracing/trace_options print-parent nosym-offset nosym-addr noverbose noraw nohex nobin \ - noblock nostacktrace nosched-tree + noblock nostacktrace nosched-tree nouserstacktrace To disable one of the options, echo in the option prepended with "no". @@ -378,6 +378,9 @@ Here are the available options: When a trace is recorded, so is the stack of functions. This allows for back traces of trace sites. + userstacktrace - This option changes the trace. + It records a stacktrace of the current userspace thread. + sched-tree - TBD (any users??) diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index a03e7f6d90c3..b15153060417 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -6,6 +6,7 @@ #include #include #include +#include #include static void save_stack_warning(void *data, char *msg) @@ -83,3 +84,59 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) trace->entries[trace->nr_entries++] = ULONG_MAX; } EXPORT_SYMBOL_GPL(save_stack_trace_tsk); + +/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */ + +struct stack_frame { + const void __user *next_fp; + unsigned long return_address; +}; + +static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) +{ + int ret; + + if (!access_ok(VERIFY_READ, fp, sizeof(*frame))) + return 0; + + ret = 1; + pagefault_disable(); + if (__copy_from_user_inatomic(frame, fp, sizeof(*frame))) + ret = 0; + pagefault_enable(); + + return ret; +} + +void save_stack_trace_user(struct stack_trace *trace) +{ + /* + * Trace user stack if we are not a kernel thread + */ + if (current->mm) { + const struct pt_regs *regs = task_pt_regs(current); + const void __user *fp = (const void __user *)regs->bp; + + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = regs->ip; + + while (trace->nr_entries < trace->max_entries) { + struct stack_frame frame; + frame.next_fp = NULL; + frame.return_address = 0; + if (!copy_stack_frame(fp, &frame)) + break; + if ((unsigned long)fp < regs->sp) + break; + if (frame.return_address) + trace->entries[trace->nr_entries++] = + frame.return_address; + if (fp == frame.next_fp) + break; + fp = frame.next_fp; + } + } + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = ULONG_MAX; +} + diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h index b106fd8e0d5c..68de51468f5d 100644 --- a/include/linux/stacktrace.h +++ b/include/linux/stacktrace.h @@ -15,9 +15,17 @@ extern void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace); extern void print_stack_trace(struct stack_trace *trace, int spaces); + +#ifdef CONFIG_X86 +extern void save_stack_trace_user(struct stack_trace *trace); +#else +# define save_stack_trace_user(trace) do { } while (0) +#endif + #else # define save_stack_trace(trace) do { } while (0) # define save_stack_trace_tsk(tsk, trace) 
do { } while (0) +# define save_stack_trace_user(trace) do { } while (0) # define print_stack_trace(trace, spaces) do { } while (0) #endif diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4ee6f0375222..ced8b4fa9f51 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -275,6 +275,7 @@ static const char *trace_options[] = { "ftrace_preempt", "branch", "annotate", + "userstacktrace", NULL }; @@ -918,6 +919,44 @@ void __trace_stack(struct trace_array *tr, ftrace_trace_stack(tr, data, flags, skip, preempt_count()); } +static void ftrace_trace_userstack(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, int pc) +{ + struct userstack_entry *entry; + struct stack_trace trace; + struct ring_buffer_event *event; + unsigned long irq_flags; + + if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) + return; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, flags, pc); + entry->ent.type = TRACE_USER_STACK; + + memset(&entry->caller, 0, sizeof(entry->caller)); + + trace.nr_entries = 0; + trace.max_entries = FTRACE_STACK_ENTRIES; + trace.skip = 0; + trace.entries = entry->caller; + + save_stack_trace_user(&trace); + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); +} + +void __trace_userstack(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags) +{ + ftrace_trace_userstack(tr, data, flags, preempt_count()); +} + static void ftrace_trace_special(void *__tr, void *__data, unsigned long arg1, unsigned long arg2, unsigned long arg3, @@ -941,6 +980,7 @@ ftrace_trace_special(void *__tr, void *__data, entry->arg3 = arg3; ring_buffer_unlock_commit(tr->buffer, event, irq_flags); ftrace_trace_stack(tr, data, irq_flags, 4, pc); + ftrace_trace_userstack(tr, data, irq_flags, pc); trace_wake_up(); } @@ -979,6 +1019,7 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->next_cpu = task_cpu(next); ring_buffer_unlock_commit(tr->buffer, event, irq_flags); ftrace_trace_stack(tr, data, flags, 5, pc); + ftrace_trace_userstack(tr, data, flags, pc); } void @@ -1008,6 +1049,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->next_cpu = task_cpu(wakee); ring_buffer_unlock_commit(tr->buffer, event, irq_flags); ftrace_trace_stack(tr, data, flags, 6, pc); + ftrace_trace_userstack(tr, data, flags, pc); trace_wake_up(); } @@ -1387,6 +1429,31 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) return ret; } +static int +seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, + unsigned long sym_flags) +{ + int ret = 1; + unsigned i; + + for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { + unsigned long ip = entry->caller[i]; + + if (ip == ULONG_MAX || !ret) + break; + if (i) + ret = trace_seq_puts(s, " <- "); + if (!ip) { + ret = trace_seq_puts(s, "??"); + continue; + } + if (ret /*&& (sym_flags & TRACE_ITER_SYM_ADDR)*/) + ret = trace_seq_printf(s, " <" IP_FMT ">", ip); + } + + return ret; +} + static void print_lat_help_header(struct seq_file *m) { seq_puts(m, "# _------=> CPU# \n"); @@ -1702,6 +1769,16 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) field->line); break; } + case TRACE_USER_STACK: { + struct userstack_entry *field; + + trace_assign_type(field, entry); + + seq_print_userip_objs(field, s, sym_flags); + if (entry->flags & TRACE_FLAG_CONT) + trace_seq_print_cont(s, iter); + break; + } default: 
trace_seq_printf(s, "Unknown type %d\n", entry->type); } @@ -1853,6 +1930,19 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) field->line); break; } + case TRACE_USER_STACK: { + struct userstack_entry *field; + + trace_assign_type(field, entry); + + ret = seq_print_userip_objs(field, s, sym_flags); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + ret = trace_seq_putc(s, '\n'); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + break; + } } return TRACE_TYPE_HANDLED; } @@ -1912,6 +2002,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) break; } case TRACE_SPECIAL: + case TRACE_USER_STACK: case TRACE_STACK: { struct special_entry *field; @@ -2000,6 +2091,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) break; } case TRACE_SPECIAL: + case TRACE_USER_STACK: case TRACE_STACK: { struct special_entry *field; @@ -2054,6 +2146,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) break; } case TRACE_SPECIAL: + case TRACE_USER_STACK: case TRACE_STACK: { struct special_entry *field; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2cb12fd98f6b..17bb4c830b01 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -26,6 +26,7 @@ enum trace_type { TRACE_BOOT_CALL, TRACE_BOOT_RET, TRACE_FN_RET, + TRACE_USER_STACK, __TRACE_LAST_TYPE }; @@ -42,6 +43,7 @@ struct trace_entry { unsigned char flags; unsigned char preempt_count; int pid; + int tgid; }; /* @@ -99,6 +101,11 @@ struct stack_entry { unsigned long caller[FTRACE_STACK_ENTRIES]; }; +struct userstack_entry { + struct trace_entry ent; + unsigned long caller[FTRACE_STACK_ENTRIES]; +}; + /* * ftrace_printk entry: */ @@ -240,6 +247,7 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \ IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \ IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \ + IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ IF_ASSIGN(var, ent, struct special_entry, 0); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ @@ -500,6 +508,7 @@ enum trace_iterator_flags { TRACE_ITER_PREEMPTONLY = 0x800, TRACE_ITER_BRANCH = 0x1000, TRACE_ITER_ANNOTATE = 0x2000, + TRACE_ITER_USERSTACKTRACE = 0x4000 }; /* -- cgit v1.2.3 From b54d3de9f3b8956653b06f1a32e9f9321c6d9027 Mon Sep 17 00:00:00 2001 From: Török Edwin Date: Sat, 22 Nov 2008 13:28:48 +0200 Subject: tracing: identify which executable object the userspace address belongs to MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: modify+improve the userstacktrace tracing visualization feature Store thread group leader id, and use it to lookup the address in the process's map. We could have looked up the address on thread's map, but the thread might not exist by the time we are called. The process might not exist either, but if you are reading trace_pipe, that is unlikely. Example usage: mount -t debugfs nodev /sys/kernel/debug cd /sys/kernel/debug/tracing echo userstacktrace >iter_ctrl echo sym-userobj >iter_ctrl echo sched_switch >current_tracer echo 1 >tracing_enabled cat trace_pipe >/tmp/trace& .... run application ... 
echo 0 >tracing_enabled cat /tmp/trace You'll see stack entries like: /lib/libpthread-2.7.so[+0xd370] You can convert them to function/line using: addr2line -fie /lib/libpthread-2.7.so 0xd370 Or: addr2line -fie /usr/lib/debug/libpthread-2.7.so 0xd370 For non-PIC/PIE executables this won't work: a.out[+0x73b] You need to run the following: addr2line -fie a.out 0x40073b (where 0x400000 is the default load address of a.out) Signed-off-by: Török Edwin Signed-off-by: Ingo Molnar --- Documentation/ftrace.txt | 13 +++++++- kernel/trace/trace.c | 86 ++++++++++++++++++++++++++++++++++++++++++++---- kernel/trace/trace.h | 3 +- 3 files changed, 93 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/Documentation/ftrace.txt b/Documentation/ftrace.txt index 79a80f79c062..35a78bc6651d 100644 --- a/Documentation/ftrace.txt +++ b/Documentation/ftrace.txt @@ -324,7 +324,7 @@ output. To see what is available, simply cat the file: cat /debug/tracing/trace_options print-parent nosym-offset nosym-addr noverbose noraw nohex nobin \ - noblock nostacktrace nosched-tree nouserstacktrace + noblock nostacktrace nosched-tree nouserstacktrace nosym-userobj To disable one of the options, echo in the option prepended with "no". @@ -381,6 +381,17 @@ Here are the available options: userstacktrace - This option changes the trace. It records a stacktrace of the current userspace thread. + sym-userobj - when user stacktrace are enabled, look up which object the + address belongs to, and print a relative address + This is especially useful when ASLR is on, otherwise you don't + get a chance to resolve the address to object/file/line after the app is no + longer running + + The lookup is performed when you read trace,trace_pipe,latency_trace. Example: + + a.out-1623 [000] 40874.465068: /root/a.out[+0x480] <-/root/a.out[+0 +x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6] + sched-tree - TBD (any users??) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ced8b4fa9f51..62776b71b1c5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -276,6 +277,7 @@ static const char *trace_options[] = { "branch", "annotate", "userstacktrace", + "sym-userobj", NULL }; @@ -422,6 +424,28 @@ trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len) return trace_seq_putmem(s, hex, j); } +static int +trace_seq_path(struct trace_seq *s, struct path *path) +{ + unsigned char *p; + + if (s->len >= (PAGE_SIZE - 1)) + return 0; + p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); + if (!IS_ERR(p)) { + p = mangle_path(s->buffer + s->len, p, "\n"); + if (p) { + s->len = p - s->buffer; + return 1; + } + } else { + s->buffer[s->len++] = '?'; + return 1; + } + + return 0; +} + static void trace_seq_reset(struct trace_seq *s) { @@ -802,6 +826,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, entry->preempt_count = pc & 0xff; entry->pid = (tsk) ? tsk->pid : 0; + entry->tgid = (tsk) ? tsk->tgid : 0; entry->flags = #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT (irqs_disabled_flags(flags) ? 
TRACE_FLAG_IRQS_OFF : 0) | @@ -1429,28 +1454,73 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) return ret; } +static inline int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, + unsigned long ip, unsigned long sym_flags) +{ + struct file *file = NULL; + unsigned long vmstart = 0; + int ret = 1; + + if (mm) { + const struct vm_area_struct *vma = find_vma(mm, ip); + if (vma) { + file = vma->vm_file; + vmstart = vma->vm_start; + } + } + if (file) { + ret = trace_seq_path(s, &file->f_path); + if (ret) + ret = trace_seq_printf(s, "[+0x%lx]", + ip - vmstart); + } + if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file)) + ret = trace_seq_printf(s, " <" IP_FMT ">", ip); + return ret; +} + static int seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, - unsigned long sym_flags) + unsigned long sym_flags) { + struct mm_struct *mm = NULL; int ret = 1; unsigned i; + if (trace_flags & TRACE_ITER_SYM_USEROBJ) { + struct task_struct *task; + /* + * we do the lookup on the thread group leader, + * since individual threads might have already quit! + */ + rcu_read_lock(); + task = find_task_by_vpid(entry->ent.tgid); + rcu_read_unlock(); + + if (task) + mm = get_task_mm(task); + } + for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { unsigned long ip = entry->caller[i]; if (ip == ULONG_MAX || !ret) break; - if (i) + if (i && ret) ret = trace_seq_puts(s, " <- "); if (!ip) { - ret = trace_seq_puts(s, "??"); + if (ret) + ret = trace_seq_puts(s, "??"); continue; } - if (ret /*&& (sym_flags & TRACE_ITER_SYM_ADDR)*/) - ret = trace_seq_printf(s, " <" IP_FMT ">", ip); + if (!ret) + break; + if (ret) + ret = seq_print_user_ip(s, mm, ip, sym_flags); } + if (mm) + mmput(mm); return ret; } @@ -1775,8 +1845,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) trace_assign_type(field, entry); seq_print_userip_objs(field, s, sym_flags); - if (entry->flags & TRACE_FLAG_CONT) - trace_seq_print_cont(s, iter); + trace_seq_putc(s, '\n'); break; } default: @@ -3581,6 +3650,9 @@ void ftrace_dump(void) atomic_inc(&global_trace.data[cpu]->disabled); } + /* don't look at user memory in panic mode */ + trace_flags &= ~TRACE_ITER_SYM_USEROBJ; + printk(KERN_TRACE "Dumping ftrace buffer:\n"); iter.tr = &global_trace; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 17bb4c830b01..28c15c2ebc22 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -508,7 +508,8 @@ enum trace_iterator_flags { TRACE_ITER_PREEMPTONLY = 0x800, TRACE_ITER_BRANCH = 0x1000, TRACE_ITER_ANNOTATE = 0x2000, - TRACE_ITER_USERSTACKTRACE = 0x4000 + TRACE_ITER_USERSTACKTRACE = 0x4000, + TRACE_ITER_SYM_USEROBJ = 0x8000 }; /* -- cgit v1.2.3 From cbe2f5a6e84eebb98ab42fc5e58c3cd5b7767349 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 23 Nov 2008 10:37:12 +0100 Subject: tracing: allow tracing of suspend/resume & hibernation code again Impact: widen function-tracing to suspend+resume (and hibernation) sequences Now that the ftrace kernel thread is gone, we can allow tracing during suspend/resume again. So revert these two commits: f42ac38c5 "ftrace: disable tracing for suspend to ram" 41108eb10 "ftrace: disable tracing for hibernation" This should be tested very carefully, as it could interact with alternatives instruction patching, etc.
Signed-off-by: Ingo Molnar --- kernel/power/disk.c | 13 +++---------- kernel/power/main.c | 5 +---- 2 files changed, 4 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index c9d74083746f..f77d3819ef57 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -22,7 +22,6 @@ #include #include #include -#include #include "power.h" @@ -257,7 +256,7 @@ static int create_image(int platform_mode) int hibernation_snapshot(int platform_mode) { - int error, ftrace_save; + int error; /* Free memory before shutting down devices. */ error = swsusp_shrink_memory(); @@ -269,7 +268,6 @@ int hibernation_snapshot(int platform_mode) goto Close; suspend_console(); - ftrace_save = __ftrace_enabled_save(); error = device_suspend(PMSG_FREEZE); if (error) goto Recover_platform; @@ -299,7 +297,6 @@ int hibernation_snapshot(int platform_mode) Resume_devices: device_resume(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); - __ftrace_enabled_restore(ftrace_save); resume_console(); Close: platform_end(platform_mode); @@ -370,11 +367,10 @@ static int resume_target_kernel(void) int hibernation_restore(int platform_mode) { - int error, ftrace_save; + int error; pm_prepare_console(); suspend_console(); - ftrace_save = __ftrace_enabled_save(); error = device_suspend(PMSG_QUIESCE); if (error) goto Finish; @@ -389,7 +385,6 @@ int hibernation_restore(int platform_mode) platform_restore_cleanup(platform_mode); device_resume(PMSG_RECOVER); Finish: - __ftrace_enabled_restore(ftrace_save); resume_console(); pm_restore_console(); return error; @@ -402,7 +397,7 @@ int hibernation_restore(int platform_mode) int hibernation_platform_enter(void) { - int error, ftrace_save; + int error; if (!hibernation_ops) return -ENOSYS; @@ -417,7 +412,6 @@ int hibernation_platform_enter(void) goto Close; suspend_console(); - ftrace_save = __ftrace_enabled_save(); error = device_suspend(PMSG_HIBERNATE); if (error) { if (hibernation_ops->recover) @@ -452,7 +446,6 @@ int hibernation_platform_enter(void) hibernation_ops->finish(); Resume_devices: device_resume(PMSG_RESTORE); - __ftrace_enabled_restore(ftrace_save); resume_console(); Close: hibernation_ops->end(); diff --git a/kernel/power/main.c b/kernel/power/main.c index b8f7ce9473e8..613f16941b85 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -22,7 +22,6 @@ #include #include #include -#include #include "power.h" @@ -317,7 +316,7 @@ static int suspend_enter(suspend_state_t state) */ int suspend_devices_and_enter(suspend_state_t state) { - int error, ftrace_save; + int error; if (!suspend_ops) return -ENOSYS; @@ -328,7 +327,6 @@ int suspend_devices_and_enter(suspend_state_t state) goto Close; } suspend_console(); - ftrace_save = __ftrace_enabled_save(); suspend_test_start(); error = device_suspend(PMSG_SUSPEND); if (error) { @@ -360,7 +358,6 @@ int suspend_devices_and_enter(suspend_state_t state) suspend_test_start(); device_resume(PMSG_RESUME); suspend_test_finish("resume devices"); - __ftrace_enabled_restore(ftrace_save); resume_console(); Close: if (suspend_ops->end) -- cgit v1.2.3 From 45b797492a0758e64dff74e9db70e1f65e0603a5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 21 Nov 2008 00:40:40 -0500 Subject: trace: consolidate unlikely and likely profiler Impact: clean up to make one profiler for the likely and unlikely tracers The likely and unlikely profiler prints out the file and line numbers of the annotated branches that it is profiling.
It shows the number of times it was correct or incorrect in its guess. Having two different files or sections for that matter to tell us if it was a likely or unlikely is pretty pointless. We really only care if it was correct or not. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/asm-generic/vmlinux.lds.h | 9 +++------ include/linux/compiler.h | 24 +++++------------------- kernel/trace/Kconfig | 3 +-- kernel/trace/trace_branch.c | 39 +++++++++++++-------------------------- 4 files changed, 22 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 3b46ae464933..8bccb49981e5 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -46,12 +46,9 @@ #endif #ifdef CONFIG_TRACE_BRANCH_PROFILING -#define LIKELY_PROFILE() VMLINUX_SYMBOL(__start_likely_profile) = .; \ - *(_ftrace_likely) \ - VMLINUX_SYMBOL(__stop_likely_profile) = .; \ - VMLINUX_SYMBOL(__start_unlikely_profile) = .; \ - *(_ftrace_unlikely) \ - VMLINUX_SYMBOL(__stop_unlikely_profile) = .; +#define LIKELY_PROFILE() VMLINUX_SYMBOL(__start_annotated_branch_profile) = .; \ + *(_ftrace_annotated_branch) \ + VMLINUX_SYMBOL(__stop_annotated_branch_profile) = .; #else #define LIKELY_PROFILE() #endif diff --git a/include/linux/compiler.h b/include/linux/compiler.h index c25e525121f0..0628a2013fae 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -77,32 +77,18 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); #define likely_notrace(x) __builtin_expect(!!(x), 1) #define unlikely_notrace(x) __builtin_expect(!!(x), 0) -#define likely_check(x) ({ \ +#define __branch_check__(x, expect) ({ \ int ______r; \ static struct ftrace_branch_data \ __attribute__((__aligned__(4))) \ - __attribute__((section("_ftrace_likely"))) \ + __attribute__((section("_ftrace_annotated_branch"))) \ ______f = { \ .func = __func__, \ .file = __FILE__, \ .line = __LINE__, \ }; \ ______r = likely_notrace(x); \ - ftrace_likely_update(&______f, ______r, 1); \ - ______r; \ - }) -#define unlikely_check(x) ({ \ - int ______r; \ - static struct ftrace_branch_data \ - __attribute__((__aligned__(4))) \ - __attribute__((section("_ftrace_unlikely"))) \ - ______f = { \ - .func = __func__, \ - .file = __FILE__, \ - .line = __LINE__, \ - }; \ - ______r = unlikely_notrace(x); \ - ftrace_likely_update(&______f, ______r, 0); \ + ftrace_likely_update(&______f, ______r, expect); \ ______r; \ }) @@ -112,10 +98,10 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); * written by Daniel Walker. */ # ifndef likely -# define likely(x) (__builtin_constant_p(x) ? !!(x) : likely_check(x)) +# define likely(x) (__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 1)) # endif # ifndef unlikely -# define unlikely(x) (__builtin_constant_p(x) ? !!(x) : unlikely_check(x)) +# define unlikely(x) (__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 0)) # endif #else # define likely(x) __builtin_expect(!!(x), 1) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index b8378fad29a3..7e3548705708 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -166,8 +166,7 @@ config TRACE_BRANCH_PROFILING This tracer profiles all the the likely and unlikely macros in the kernel. 
It will display the results in: - /debugfs/tracing/profile_likely - /debugfs/tracing/profile_unlikely + /debugfs/tracing/profile_annotated_branch Note: this will add a significant overhead, only turn this on if you need to profile the system's use of these macros. diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 23f9b02ce967..21dedc8b50a4 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -261,7 +261,7 @@ static struct seq_operations tracing_likely_seq_ops = { .show = t_show, }; -static int tracing_likely_open(struct inode *inode, struct file *file) +static int tracing_branch_open(struct inode *inode, struct file *file) { int ret; @@ -274,25 +274,18 @@ static int tracing_likely_open(struct inode *inode, struct file *file) return ret; } -static struct file_operations tracing_likely_fops = { - .open = tracing_likely_open, +static const struct file_operations tracing_branch_fops = { + .open = tracing_branch_open, .read = seq_read, .llseek = seq_lseek, }; -extern unsigned long __start_likely_profile[]; -extern unsigned long __stop_likely_profile[]; -extern unsigned long __start_unlikely_profile[]; -extern unsigned long __stop_unlikely_profile[]; +extern unsigned long __start_annotated_branch_profile[]; +extern unsigned long __stop_annotated_branch_profile[]; -static struct ftrace_pointer ftrace_likely_pos = { - .start = __start_likely_profile, - .stop = __stop_likely_profile, -}; - -static struct ftrace_pointer ftrace_unlikely_pos = { - .start = __start_unlikely_profile, - .stop = __stop_unlikely_profile, +static const struct ftrace_pointer ftrace_annotated_branch_pos = { + .start = __start_annotated_branch_profile, + .stop = __stop_annotated_branch_profile, }; static __init int ftrace_branch_init(void) @@ -302,18 +295,12 @@ static __init int ftrace_branch_init(void) d_tracer = tracing_init_dentry(); - entry = debugfs_create_file("profile_likely", 0444, d_tracer, - &ftrace_likely_pos, - &tracing_likely_fops); - if (!entry) - pr_warning("Could not create debugfs 'profile_likely' entry\n"); - - entry = debugfs_create_file("profile_unlikely", 0444, d_tracer, - &ftrace_unlikely_pos, - &tracing_likely_fops); + entry = debugfs_create_file("profile_annotated_branch", 0444, d_tracer, + &ftrace_annotated_branch_pos, + &tracing_branch_fops); if (!entry) - pr_warning("Could not create debugfs" - " 'profile_unlikely' entry\n"); + pr_warning("Could not create debugfs " + "'profile_annotatet_branch' entry\n"); return 0; } -- cgit v1.2.3 From bac28bfe42ba98ee67503f78984d1d5e1ebbbb78 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 21 Nov 2008 01:51:53 -0500 Subject: trace: branch profiling should not print percent without data Impact: cleanup on output of branch profiler When a branch has not been taken, it does not make sense to show a percentage incorrect or hit. This patch changes the behaviour to print out an 'X' when the branch has not been executed yet.
For example: correct incorrect % Function File Line ------- --------- - -------- ---- ---- 2096 0 0 do_arch_prctl process_64.c 832 0 0 X do_arch_prctl process_64.c 804 2604 0 0 IS_ERR err.h 34 130228 5765 4 __switch_to process_64.c 673 0 0 X enable_TSC process_64.c 448 0 0 X disable_TSC process_64.c 431 Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_branch.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 21dedc8b50a4..142acb3b4e00 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -225,7 +225,7 @@ static int t_show(struct seq_file *m, void *v) { struct ftrace_branch_data *p = v; const char *f; - unsigned long percent; + long percent; if (v == (void *)1) { seq_printf(m, " correct incorrect %% " @@ -247,9 +247,13 @@ static int t_show(struct seq_file *m, void *v) percent = p->incorrect * 100; percent /= p->correct + p->incorrect; } else - percent = p->incorrect ? 100 : 0; + percent = p->incorrect ? 100 : -1; - seq_printf(m, "%8lu %8lu %3lu ", p->correct, p->incorrect, percent); + seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect); + if (percent < 0) + seq_printf(m, " X "); + else + seq_printf(m, "%3ld ", percent); seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line); return 0; } -- cgit v1.2.3 From 2bcd521a684cc94befbe2ce7d5b613c841b0d304 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 21 Nov 2008 01:30:54 -0500 Subject: trace: profile all if conditionals Impact: feature to profile if statements This patch adds a branch profiler for all if () statements. The results will be found in: /debugfs/tracing/profile_branch For example: miss hit % Function File Line ------- --------- - -------- ---- ---- 0 1 100 x86_64_start_reservations head64.c 127 0 1 100 copy_bootdata head64.c 69 1 0 0 x86_64_start_kernel head64.c 111 32 0 0 set_intr_gate desc.h 319 1 0 0 reserve_ebda_region head.c 51 1 0 0 reserve_ebda_region head.c 47 0 1 100 reserve_ebda_region head.c 42 0 0 X maxcpus main.c 165 Miss means the branch was not taken. Hit means the branch was taken. The percent is the percentage the branch was taken. This adds a significant amount of overhead and should only be used by those analyzing their system. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/asm-generic/vmlinux.lds.h | 11 ++++++++++- include/linux/compiler.h | 38 ++++++++++++++++++++++++++++++++++++-- kernel/trace/Kconfig | 16 ++++++++++++++++ kernel/trace/trace_branch.c | 33 +++++++++++++++++++++++++++++++-- 4 files changed, 93 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 8bccb49981e5..eba835a2c2cd 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -53,6 +53,14 @@ #define LIKELY_PROFILE() #endif +#ifdef CONFIG_PROFILE_ALL_BRANCHES +#define BRANCH_PROFILE() VMLINUX_SYMBOL(__start_branch_profile) = .; \ + *(_ftrace_branch) \ + VMLINUX_SYMBOL(__stop_branch_profile) = .; +#else +#define BRANCH_PROFILE() +#endif + /* .data section */ #define DATA_DATA \ *(.data) \ @@ -72,7 +80,8 @@ VMLINUX_SYMBOL(__start___tracepoints) = .; \ *(__tracepoints) \ VMLINUX_SYMBOL(__stop___tracepoints) = .; \ - LIKELY_PROFILE() + LIKELY_PROFILE() \ + BRANCH_PROFILE() #define RO_DATA(align) \ . 
= ALIGN((align)); \ diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 0628a2013fae..ea7c6be354b7 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -63,8 +63,16 @@ struct ftrace_branch_data { const char *func; const char *file; unsigned line; - unsigned long correct; - unsigned long incorrect; + union { + struct { + unsigned long correct; + unsigned long incorrect; + }; + struct { + unsigned long miss; + unsigned long hit; + }; + }; }; /* @@ -103,6 +111,32 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); # ifndef unlikely # define unlikely(x) (__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 0)) # endif + +#ifdef CONFIG_PROFILE_ALL_BRANCHES +/* + * "Define 'is'", Bill Clinton + * "Define 'if'", Steven Rostedt + */ +#define if(cond) if (__builtin_constant_p((cond)) ? !!(cond) : \ + ({ \ + int ______r; \ + static struct ftrace_branch_data \ + __attribute__((__aligned__(4))) \ + __attribute__((section("_ftrace_branch"))) \ + ______f = { \ + .func = __func__, \ + .file = __FILE__, \ + .line = __LINE__, \ + }; \ + ______r = !!(cond); \ + if (______r) \ + ______f.hit++; \ + else \ + ______f.miss++; \ + ______r; \ + })) +#endif /* CONFIG_PROFILE_ALL_BRANCHES */ + #else # define likely(x) __builtin_expect(!!(x), 1) # define unlikely(x) __builtin_expect(!!(x), 0) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 7e3548705708..61e8cca6ff45 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -173,6 +173,22 @@ config TRACE_BRANCH_PROFILING Say N if unsure. +config PROFILE_ALL_BRANCHES + bool "Profile all if conditionals" + depends on TRACE_BRANCH_PROFILING + help + This tracer profiles all branch conditions. Every if () + taken in the kernel is recorded whether it hit or miss. + The results will be displayed in: + + /debugfs/tracing/profile_branch + + This configuration, when enabled, will impose a great overhead + on the system. This should only be enabled when the system + is to be analyzed + + Say N if unsure. + config TRACING_BRANCHES bool help diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 142acb3b4e00..85792aec64d2 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -185,6 +185,7 @@ EXPORT_SYMBOL(ftrace_likely_update); struct ftrace_pointer { void *start; void *stop; + int hit; }; static void * @@ -223,13 +224,17 @@ static void t_stop(struct seq_file *m, void *p) static int t_show(struct seq_file *m, void *v) { + struct ftrace_pointer *fp = m->private; struct ftrace_branch_data *p = v; const char *f; long percent; if (v == (void *)1) { - seq_printf(m, " correct incorrect %% " - " Function " + if (fp->hit) + seq_printf(m, " miss hit %% "); + else + seq_printf(m, " correct incorrect %% "); + seq_printf(m, " Function " " File Line\n" " ------- --------- - " " -------- " @@ -243,6 +248,9 @@ static int t_show(struct seq_file *m, void *v) f--; f++; + /* + * The miss is overlayed on correct, and hit on incorrect. 
+ */ if (p->correct) { percent = p->incorrect * 100; percent /= p->correct + p->incorrect; @@ -284,6 +292,18 @@ static const struct file_operations tracing_branch_fops = { .llseek = seq_lseek, }; +#ifdef CONFIG_PROFILE_ALL_BRANCHES +extern unsigned long __start_branch_profile[]; +extern unsigned long __stop_branch_profile[]; + +static struct ftrace_pointer ftrace_branch_pos = { + .start = __start_branch_profile, + .stop = __stop_branch_profile, + .hit = 1, +}; + +#endif /* CONFIG_PROFILE_ALL_BRANCHES */ + extern unsigned long __start_annotated_branch_profile[]; extern unsigned long __stop_annotated_branch_profile[]; @@ -306,6 +326,15 @@ static __init int ftrace_branch_init(void) pr_warning("Could not create debugfs " "'profile_annotatet_branch' entry\n"); +#ifdef CONFIG_PROFILE_ALL_BRANCHES + entry = debugfs_create_file("profile_branch", 0444, d_tracer, + &ftrace_branch_pos, + &tracing_branch_fops); + if (!entry) + pr_warning("Could not create debugfs" + " 'profile_branch' entry\n"); +#endif + return 0; } -- cgit v1.2.3 From 033601a32b2012b6948e80e739cca40bff4de4a0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 21 Nov 2008 12:41:55 -0500 Subject: ring-buffer: add tracing_off_permanent Impact: feature to permanently disable ring buffer This patch adds a API to the ring buffer code that will permanently disable the ring buffer from ever recording. This should only be called when some serious anomaly is detected, and the system may be in an unstable state. When that happens, shutting down the recording to the ring buffers may be appropriate. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/ring_buffer.h | 1 + kernel/trace/ring_buffer.c | 79 +++++++++++++++++++++++++++++++++++++-------- 2 files changed, 67 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index e097c2e6b6dc..3bb87a753fa3 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -122,6 +122,7 @@ void ring_buffer_normalize_time_stamp(int cpu, u64 *ts); void tracing_on(void); void tracing_off(void); +void tracing_off_permanent(void); enum ring_buffer_flags { RB_FL_OVERWRITE = 1 << 0, diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 85ced143c2c4..e206951603c1 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -18,8 +18,46 @@ #include "trace.h" -/* Global flag to disable all recording to ring buffers */ -static int ring_buffers_off __read_mostly; +/* + * A fast way to enable or disable all ring buffers is to + * call tracing_on or tracing_off. Turning off the ring buffers + * prevents all ring buffers from being recorded to. + * Turning this switch on, makes it OK to write to the + * ring buffer, if the ring buffer is enabled itself. + * + * There's three layers that must be on in order to write + * to the ring buffer. + * + * 1) This global flag must be set. + * 2) The ring buffer must be enabled for recording. + * 3) The per cpu buffer must be enabled for recording. + * + * In case of an anomaly, this global flag has a bit set that + * will permantly disable all ring buffers. 
+ */ + +/* + * Global flag to disable all recording to ring buffers + * This has two bits: ON, DISABLED + * + * ON DISABLED + * ---- ---------- + * 0 0 : ring buffers are off + * 1 0 : ring buffers are on + * X 1 : ring buffers are permanently disabled + */ + +enum { + RB_BUFFERS_ON_BIT = 0, + RB_BUFFERS_DISABLED_BIT = 1, +}; + +enum { + RB_BUFFERS_ON = 1 << RB_BUFFERS_ON_BIT, + RB_BUFFERS_DISABLED = 1 << RB_BUFFERS_DISABLED_BIT, +}; + +static long ring_buffer_flags __read_mostly = RB_BUFFERS_ON; /** * tracing_on - enable all tracing buffers @@ -29,7 +67,7 @@ static int ring_buffers_off __read_mostly; */ void tracing_on(void) { - ring_buffers_off = 0; + set_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags); } /** @@ -42,7 +80,18 @@ void tracing_on(void) */ void tracing_off(void) { - ring_buffers_off = 1; + clear_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags); +} + +/** + * tracing_off_permanent - permanently disable ring buffers + * + * This function, once called, will disable all ring buffers + * permanenty. + */ +void tracing_off_permanent(void) +{ + set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags); } #include "trace.h" @@ -1185,7 +1234,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, struct ring_buffer_event *event; int cpu, resched; - if (ring_buffers_off) + if (ring_buffer_flags != RB_BUFFERS_ON) return NULL; if (atomic_read(&buffer->record_disabled)) @@ -1297,7 +1346,7 @@ int ring_buffer_write(struct ring_buffer *buffer, int ret = -EBUSY; int cpu, resched; - if (ring_buffers_off) + if (ring_buffer_flags != RB_BUFFERS_ON) return -EBUSY; if (atomic_read(&buffer->record_disabled)) @@ -2178,12 +2227,14 @@ static ssize_t rb_simple_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - int *p = filp->private_data; + long *p = filp->private_data; char buf[64]; int r; - /* !ring_buffers_off == tracing_on */ - r = sprintf(buf, "%d\n", !*p); + if (test_bit(RB_BUFFERS_DISABLED_BIT, p)) + r = sprintf(buf, "permanently disabled\n"); + else + r = sprintf(buf, "%d\n", test_bit(RB_BUFFERS_ON_BIT, p)); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } @@ -2192,7 +2243,7 @@ static ssize_t rb_simple_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - int *p = filp->private_data; + long *p = filp->private_data; char buf[64]; long val; int ret; @@ -2209,8 +2260,10 @@ rb_simple_write(struct file *filp, const char __user *ubuf, if (ret < 0) return ret; - /* !ring_buffers_off == tracing_on */ - *p = !val; + if (val) + set_bit(RB_BUFFERS_ON_BIT, p); + else + clear_bit(RB_BUFFERS_ON_BIT, p); (*ppos)++; @@ -2232,7 +2285,7 @@ static __init int rb_init_debugfs(void) d_tracer = tracing_init_dentry(); entry = debugfs_create_file("tracing_on", 0644, d_tracer, - &ring_buffers_off, &rb_simple_fops); + &ring_buffer_flags, &rb_simple_fops); if (!entry) pr_warning("Could not create debugfs 'tracing_on' entry\n"); -- cgit v1.2.3 From 69bb54ec05f57da7f6fac2cec0820cbc970df20f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 21 Nov 2008 12:59:38 -0500 Subject: ftrace: add ftrace_off_permanent Impact: add new API to disable all of ftrace on anomalies In case of a serious anomaly being detected (like something caught by lockdep) it is a good idea to disable all tracing immediately, without grabbing any locks. This patch adds ftrace_off_permanent that disables the tracers, function tracing and ring buffers without a way to enable them again. This should only be used when something serious has been detected.
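The gating logic of the ring-buffer patch above is worth seeing in isolation. Below is a minimal user-space sketch of the two-bit ON/DISABLED scheme; the names mirror the patch, but the kernel's atomic set_bit()/clear_bit() helpers are replaced by plain bit operations, so this is illustrative only, not the kernel code:

#include <stdio.h>

enum {
	RB_BUFFERS_ON		= 1 << 0,
	RB_BUFFERS_DISABLED	= 1 << 1,
};

static long buffer_flags = RB_BUFFERS_ON;

static void tracing_on(void)		{ buffer_flags |= RB_BUFFERS_ON; }
static void tracing_off(void)		{ buffer_flags &= ~RB_BUFFERS_ON; }
static void tracing_off_permanent(void)	{ buffer_flags |= RB_BUFFERS_DISABLED; }

/* Writers test the whole word: any state other than exactly ON blocks
 * recording, which is why the DISABLED bit can never be undone by
 * tracing_on() alone. */
static int recording_allowed(void)
{
	return buffer_flags == RB_BUFFERS_ON;
}

int main(void)
{
	printf("%d\n", recording_allowed());	/* 1 */
	tracing_off_permanent();
	tracing_on();				/* sets ON, but DISABLED stays */
	printf("%d\n", recording_allowed());	/* 0 */
	return 0;
}

The design choice is that writers compare against the full word rather than testing one bit, so the permanent-disable path needs no extra branch in the hot path.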
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 2 ++ kernel/trace/trace.c | 15 +++++++++++++++ 2 files changed, 17 insertions(+) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index f7ba4ea5e128..13e9cfc09928 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -257,6 +257,7 @@ extern int ftrace_dump_on_oops; extern void tracing_start(void); extern void tracing_stop(void); +extern void ftrace_off_permanent(void); extern void ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); @@ -290,6 +291,7 @@ ftrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 0))); static inline void tracing_start(void) { } static inline void tracing_stop(void) { } +static inline void ftrace_off_permanent(void) { } static inline int ftrace_printk(const char *fmt, ...) { diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4ee6f0375222..0dbfb23ced97 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -660,6 +660,21 @@ static void trace_init_cmdlines(void) static int trace_stop_count; static DEFINE_SPINLOCK(tracing_start_lock); +/** + * ftrace_off_permanent - disable all ftrace code permanently + * + * This should only be called when a serious anomally has + * been detected. This will turn off the function tracing, + * ring buffers, and other tracing utilites. It takes no + * locks and can be called from any context. + */ +void ftrace_off_permanent(void) +{ + tracing_disabled = 1; + ftrace_stop(); + tracing_off_permanent(); +} + /** * tracing_start - quick start of the tracer * -- cgit v1.2.3 From 0429149fb5e01edc410648591c19095d2074ee00 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 21 Nov 2008 14:44:57 -0500 Subject: trace: fix compiler warning in branch profiler Impact: fix compiler warning The ftrace_pointers used in the branch profiler are constant values. They should never change. But the compiler complains when they are passed into the debugfs_create_file as a data pointer, because the function discards the qualifier. This patch typecasts the parameter to debugfs_create_file back to a void pointer. To remind the callbacks that they are pointing to a constant value, I also modified the callback local pointers to be const struct ftrace_pointer * as well. 
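The underlying C issue is easy to reproduce outside the kernel. In the hypothetical sketch below, register_file() stands in for debugfs_create_file(): its plain void *data parameter cannot accept a pointer-to-const without an explicit cast, which is exactly the warning the patch silences. All names here are illustrative:

#include <stdio.h>

struct cookie {
	int hit;
};

static const struct cookie branch_pos = { .hit = 1 };

/* Stand-in for debugfs_create_file(): stores an untyped cookie. */
static void register_file(const char *name, void *data)
{
	/* The consumer side restores the const qualifier, mirroring the
	 * const struct ftrace_pointer * callbacks in the patch. */
	const struct cookie *c = data;
	printf("%s: hit=%d\n", name, c->hit);
}

int main(void)
{
	/* Without the (void *) cast, gcc warns that the argument
	 * discards the const qualifier. */
	register_file("profile_branch", (void *)&branch_pos);
	return 0;
}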
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_branch.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 85792aec64d2..877ee88e6a74 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -191,7 +191,7 @@ struct ftrace_pointer { static void * t_next(struct seq_file *m, void *v, loff_t *pos) { - struct ftrace_pointer *f = m->private; + const struct ftrace_pointer *f = m->private; struct ftrace_branch_data *p = v; (*pos)++; @@ -224,7 +224,7 @@ static void t_stop(struct seq_file *m, void *p) static int t_show(struct seq_file *m, void *v) { - struct ftrace_pointer *fp = m->private; + const struct ftrace_pointer *fp = m->private; struct ftrace_branch_data *p = v; const char *f; long percent; @@ -296,7 +296,7 @@ static const struct file_operations tracing_branch_fops = { extern unsigned long __start_branch_profile[]; extern unsigned long __stop_branch_profile[]; -static struct ftrace_pointer ftrace_branch_pos = { +static const struct ftrace_pointer ftrace_branch_pos = { .start = __start_branch_profile, .stop = __stop_branch_profile, .hit = 1, @@ -320,7 +320,7 @@ static __init int ftrace_branch_init(void) d_tracer = tracing_init_dentry(); entry = debugfs_create_file("profile_annotated_branch", 0444, d_tracer, - &ftrace_annotated_branch_pos, + (void *)&ftrace_annotated_branch_pos, &tracing_branch_fops); if (!entry) pr_warning("Could not create debugfs " @@ -328,7 +328,7 @@ static __init int ftrace_branch_init(void) #ifdef CONFIG_PROFILE_ALL_BRANCHES entry = debugfs_create_file("profile_branch", 0444, d_tracer, - &ftrace_branch_pos, + (void *)&ftrace_branch_pos, &tracing_branch_fops); if (!entry) pr_warning("Could not create debugfs" -- cgit v1.2.3 From 8d7c6a96164651dbbab449ef0b5c20ae1f76a3a1 Mon Sep 17 00:00:00 2001 From: Török Edwin Date: Sun, 23 Nov 2008 12:39:06 +0200 Subject: tracing/stack-tracer: fix style issues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: cleanup Signed-off-by: Török Edwin Signed-off-by: Ingo Molnar --- arch/x86/kernel/stacktrace.c | 51 +++++++++++++++++++++++++------------------- include/linux/stacktrace.h | 2 +- kernel/trace/trace.c | 7 +++--- 3 files changed, 33 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index b15153060417..10786af95545 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -89,7 +89,7 @@ EXPORT_SYMBOL_GPL(save_stack_trace_tsk); struct stack_frame { const void __user *next_fp; - unsigned long return_address; + unsigned long ret_addr; }; static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) @@ -108,33 +108,40 @@ static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) return ret; } +static inline void __save_stack_trace_user(struct stack_trace *trace) +{ + const struct pt_regs *regs = task_pt_regs(current); + const void __user *fp = (const void __user *)regs->bp; + + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = regs->ip; + + while (trace->nr_entries < trace->max_entries) { + struct stack_frame frame; + + frame.next_fp = NULL; + frame.ret_addr = 0; + if (!copy_stack_frame(fp, &frame)) + break; + if ((unsigned long)fp < regs->sp) + break; + if (frame.ret_addr) { + trace->entries[trace->nr_entries++] = + frame.ret_addr; + } + if (fp == frame.next_fp) 
+ break; + fp = frame.next_fp; + } +} + void save_stack_trace_user(struct stack_trace *trace) { /* * Trace user stack if we are not a kernel thread */ if (current->mm) { - const struct pt_regs *regs = task_pt_regs(current); - const void __user *fp = (const void __user *)regs->bp; - - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = regs->ip; - - while (trace->nr_entries < trace->max_entries) { - struct stack_frame frame; - frame.next_fp = NULL; - frame.return_address = 0; - if (!copy_stack_frame(fp, &frame)) - break; - if ((unsigned long)fp < regs->sp) - break; - if (frame.return_address) - trace->entries[trace->nr_entries++] = - frame.return_address; - if (fp == frame.next_fp) - break; - fp = frame.next_fp; - } + __save_stack_trace_user(trace); } if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h index 68de51468f5d..fd42d6851109 100644 --- a/include/linux/stacktrace.h +++ b/include/linux/stacktrace.h @@ -25,7 +25,7 @@ extern void save_stack_trace_user(struct stack_trace *trace); #else # define save_stack_trace(trace) do { } while (0) # define save_stack_trace_tsk(tsk, trace) do { } while (0) -# define save_stack_trace_user(trace) do { } while (0) +# define save_stack_trace_user(trace) do { } while (0) # define print_stack_trace(trace, spaces) do { } while (0) #endif diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 62776b71b1c5..dedf35f36971 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -948,9 +948,9 @@ static void ftrace_trace_userstack(struct trace_array *tr, struct trace_array_cpu *data, unsigned long flags, int pc) { + struct ring_buffer_event *event; struct userstack_entry *entry; struct stack_trace trace; - struct ring_buffer_event *event; unsigned long irq_flags; if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) @@ -1471,8 +1471,7 @@ static inline int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, if (file) { ret = trace_seq_path(s, &file->f_path); if (ret) - ret = trace_seq_printf(s, "[+0x%lx]", - ip - vmstart); + ret = trace_seq_printf(s, "[+0x%lx]", ip - vmstart); } if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file)) ret = trace_seq_printf(s, " <" IP_FMT ">", ip); @@ -1485,7 +1484,7 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, { struct mm_struct *mm = NULL; int ret = 1; - unsigned i; + unsigned int i; if (trace_flags & TRACE_ITER_SYM_USEROBJ) { struct task_struct *task; -- cgit v1.2.3 From cffa10aecb6891f090a4d53a075bc40c082c45fc Mon Sep 17 00:00:00 2001 From: Török Edwin Date: Sun, 23 Nov 2008 12:39:07 +0200 Subject: tracing/stack-tracer: fix locking and refcounts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: fix refcounting/object-access bug Hold mmap_sem while looking up/accessing vma. Hold the RCU lock while using the task we looked up. 
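The general rule this fix applies, take your reference while the lookup lock is still held, can be sketched in user space. In the toy code below a mutex stands in for the RCU read side, lookup_and_get() plays the role of find_task_by_vpid() plus get_task_mm(), and every name is hypothetical:

#include <pthread.h>
#include <stdio.h>

struct task {
	int refcount;		/* mutated only under table_lock here */
	const char *name;
};

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static struct task the_task = { .refcount = 1, .name = "cat" };
static struct task *table_entry = &the_task;	/* a reaper may NULL this */

static struct task *lookup_and_get(void)
{
	struct task *t;

	pthread_mutex_lock(&table_lock);
	t = table_entry;		/* the lookup */
	if (t)
		t->refcount++;		/* take the ref BEFORE unlocking */
	pthread_mutex_unlock(&table_lock);
	return t;
}

int main(void)
{
	struct task *t = lookup_and_get();

	if (t) {
		printf("using %s with a stable reference\n", t->name);
		t->refcount--;		/* put */
	}
	return 0;
}

Taking the reference after the unlock, as the buggy code did, leaves a window where the object can be freed out from under the caller.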
Signed-off-by: Török Edwin Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dedf35f36971..4c3bd82cec49 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1462,11 +1462,15 @@ static inline int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, int ret = 1; if (mm) { - const struct vm_area_struct *vma = find_vma(mm, ip); + const struct vm_area_struct *vma; + + down_read(&mm->mmap_sem); + vma = find_vma(mm, ip); if (vma) { file = vma->vm_file; vmstart = vma->vm_start; } + up_read(&mm->mmap_sem); } if (file) { ret = trace_seq_path(s, &file->f_path); @@ -1494,10 +1498,9 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, */ rcu_read_lock(); task = find_task_by_vpid(entry->ent.tgid); - rcu_read_unlock(); - if (task) mm = get_task_mm(task); + rcu_read_unlock(); } for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { -- cgit v1.2.3 From 8d26487fd4ddda7a0237da418fb8669fb06ae557 Mon Sep 17 00:00:00 2001 From: Török Edwin Date: Sun, 23 Nov 2008 12:39:08 +0200 Subject: tracing/stack-tracer: introduce CONFIG_USER_STACKTRACE_SUPPORT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: cleanup User stack tracing is just implemented for x86, but it is not x86 specific. Introduce a generic config flag, that is currently enabled only for x86. When other arches implement it, they will have to SELECT USER_STACKTRACE_SUPPORT. Signed-off-by: Török Edwin Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + include/linux/stacktrace.h | 2 +- kernel/trace/Kconfig | 3 +++ 3 files changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7a146baaa990..e49a4fd718fe 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -36,6 +36,7 @@ config X86 select HAVE_ARCH_TRACEHOOK select HAVE_GENERIC_DMA_COHERENT if X86_32 select HAVE_EFFICIENT_UNALIGNED_ACCESS + select USER_STACKTRACE_SUPPORT config ARCH_DEFCONFIG string diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h index fd42d6851109..1a8cecc4f38c 100644 --- a/include/linux/stacktrace.h +++ b/include/linux/stacktrace.h @@ -16,7 +16,7 @@ extern void save_stack_trace_tsk(struct task_struct *tsk, extern void print_stack_trace(struct stack_trace *trace, int spaces); -#ifdef CONFIG_X86 +#ifdef CONFIG_USER_STACKTRACE_SUPPORT extern void save_stack_trace_user(struct stack_trace *trace); #else # define save_stack_trace_user(trace) do { } while (0) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index b8378fad29a3..87fc34a1bb91 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -3,6 +3,9 @@ # select HAVE_FUNCTION_TRACER: # +config USER_STACKTRACE_SUPPORT + bool + config NOP_TRACER bool -- cgit v1.2.3 From e38da59269be8c0196d16dff1be5bb26076afc6a Mon Sep 17 00:00:00 2001 From: Török Edwin Date: Sun, 23 Nov 2008 13:08:10 +0200 Subject: tracing/stack-tracer: avoid races accessing file Impact: fix race vma->vm_file reference is only stable while holding the mmap_sem, so move usage of it to within the critical section. 
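The same idea in miniature: a pointer borrowed from a lock-protected structure must be used, or copied out, before the lock is dropped. A hedged user-space sketch, with the lock and fields only loosely modeled on mmap_sem and vma->vm_file:

#include <pthread.h>
#include <stdio.h>

struct region {
	const char *file_name;	/* only stable while region_lock is held */
};

static pthread_mutex_t region_lock = PTHREAD_MUTEX_INITIALIZER;
static struct region region = { .file_name = "/bin/cat" };

static void print_region(void)
{
	char name[64];

	pthread_mutex_lock(&region_lock);
	/* Copy what we need inside the critical section; after unlock
	 * the pointer may be freed or reassigned by another thread. */
	snprintf(name, sizeof(name), "%s", region.file_name);
	pthread_mutex_unlock(&region_lock);

	printf("region backed by %s\n", name);
}

int main(void)
{
	print_region();
	return 0;
}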
Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4c3bd82cec49..48d1536f1ca4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1470,13 +1470,13 @@ static inline int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, file = vma->vm_file; vmstart = vma->vm_start; } + if (file) { + ret = trace_seq_path(s, &file->f_path); + if (ret) + ret = trace_seq_printf(s, "[+0x%lx]", ip - vmstart); + } up_read(&mm->mmap_sem); } - if (file) { - ret = trace_seq_path(s, &file->f_path); - if (ret) - ret = trace_seq_printf(s, "[+0x%lx]", ip - vmstart); - } if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file)) ret = trace_seq_printf(s, " <" IP_FMT ">", ip); return ret; -- cgit v1.2.3 From eae849ca034c7f1015f0a6f17421ebc737f0a069 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 23 Nov 2008 17:33:12 +0100 Subject: tracing/function-return-tracer: don't trace kfree while it frees the return stack Impact: fix a crash While I killed the cat process, I got sometimes the following (but rare) crash: [ 65.689027] Pid: 2969, comm: cat Not tainted (2.6.28-rc6-tip #83) AMILO Li 2727 [ 65.689027] EIP: 0060:[<00000000>] EFLAGS: 00010082 CPU: 1 [ 65.689027] EIP is at 0x0 [ 65.689027] EAX: 00000000 EBX: f66cd780 ECX: c019a64a EDX: f66cd780 [ 65.689027] ESI: 00000286 EDI: f66cd780 EBP: f630be2c ESP: f630be24 [ 65.689027] DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 [ 65.689027] Process cat (pid: 2969, ti=f630a000 task=f66cd780 task.ti=f630a000) [ 65.689027] Stack: [ 65.689027] 00000012 f630bd54 f630be7c c012c853 00000000 c0133cc9 f66cda54 f630be5c [ 65.689027] f630be68 f66cda54 f66cd88c f66cd878 f7070000 00000001 f630be90 c0135dbc [ 65.689027] f614a614 f630be68 f630be68 f65ba200 00000002 f630bf10 f630be90 c012cad6 [ 65.689027] Call Trace: [ 65.689027] [] ? do_exit+0x603/0x850 [ 65.689027] [] ? next_signal+0x9/0x40 [ 65.689027] [] ? dequeue_signal+0x8c/0x180 [ 65.689027] [] ? do_group_exit+0x36/0x90 [ 65.689027] [] ? get_signal_to_deliver+0x20c/0x390 [ 65.689027] [] ? do_notify_resume+0x99/0x8b0 [ 65.689027] [] ? tty_ldisc_deref+0x5a/0x80 [ 65.689027] [] ? trace_hardirqs_on+0xb/0x10 [ 65.689027] [] ? tty_ldisc_deref+0x5a/0x80 [ 65.689027] [] ? n_tty_write+0x0/0x340 [ 65.689027] [] ? redirected_tty_write+0x82/0x90 [ 65.689027] [] ? vfs_write+0x99/0xd0 [ 65.689027] [] ? redirected_tty_write+0x0/0x90 [ 65.689027] [] ? sys_write+0x42/0x70 [ 65.689027] [] ? work_notifysig+0x13/0x19 [ 65.689027] Code: Bad EIP value. [ 65.689027] EIP: [<00000000>] 0x0 SS:ESP 0068:f630be24 This is because on do_exit(), kfree is called to free the return addresses stack but kfree is traced and stored its return address in this stack. This patch fixes it. 
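The fix below follows a common unpublish-then-free pattern: clear the published pointer, insert a compiler barrier so the store cannot be reordered past the free, and only then release the memory. A user-space approximation, with the kernel's barrier() spelled out as an asm memory clobber:

#include <stdlib.h>

struct ret_stack {
	int depth;
};

struct task {
	struct ret_stack *ret_stack;
};

#define barrier() __asm__ __volatile__("" : : : "memory")

static void exit_task(struct task *t)
{
	struct ret_stack *rs = t->ret_stack;

	t->ret_stack = NULL;	/* anything that fires now sees NULL... */
	barrier();		/* ...never a pointer to freed memory */
	free(rs);
}

int main(void)
{
	struct task t = { .ret_stack = malloc(sizeof(*t.ret_stack)) };

	exit_task(&t);
	return 0;
}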
Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 90d99fb02ae4..53042f118f23 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1628,8 +1628,13 @@ void ftrace_retfunc_init_task(struct task_struct *t) void ftrace_retfunc_exit_task(struct task_struct *t) { - kfree(t->ret_stack); + struct ftrace_ret_stack *ret_stack = t->ret_stack; + t->ret_stack = NULL; + /* NULL must become visible to IRQs before we free it: */ + barrier(); + + kfree(ret_stack); } #endif -- cgit v1.2.3 From 65afa5e603d507014580ead016ec887b49e1afa6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 23 Nov 2008 18:43:39 +0100 Subject: tracing/function-return-tracer: free the return stack on free_task() Impact: avoid losing some traces when a task is freed do_exit() is not the last function called when a task finishes. There are still some functions which are to be called, such as free_task(). So we delay the freeing of the return stack to the last moment. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/exit.c | 2 -- kernel/fork.c | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index ef04d03b3286..e5ae36ebe8af 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -47,7 +47,6 @@ #include #include #include -#include #include #include @@ -1128,7 +1127,6 @@ NORET_TYPE void do_exit(long code) preempt_disable(); /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; - ftrace_retfunc_exit_task(tsk); schedule(); BUG(); /* Avoid "noreturn function does return". */ diff --git a/kernel/fork.c b/kernel/fork.c index fbf4a4c0a628..d6e1a3205f62 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -140,6 +140,7 @@ void free_task(struct task_struct *tsk) prop_local_destroy_single(&tsk->dirties); free_thread_info(tsk->stack); rt_mutex_debug_task_free(tsk); + ftrace_retfunc_exit_task(tsk); free_task_struct(tsk); } EXPORT_SYMBOL(free_task); -- cgit v1.2.3 From eccdaeafaea3ed115068ba55d01f22e486e5437d Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Mon, 24 Nov 2008 15:46:31 +0100 Subject: posix-cpu-timers: fix clock_gettime with CLOCK_PROCESS_CPUTIME_ID Since CLOCK_PROCESS_CPUTIME_ID is in fact translated to -6, the switch statement in cpu_clock_sample_group() must first mask off the irrelevant bits, similar to cpu_clock_sample(). Signed-off-by: Petr Tesarik Signed-off-by: Thomas Gleixner -- posix-cpu-timers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) --- kernel/posix-cpu-timers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 895337b16a24..4e5288a831de 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -311,7 +311,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock, struct task_cputime cputime; thread_group_cputime(p, &cputime); - switch (which_clock) { + switch (CPUCLOCK_WHICH(which_clock)) { default: return -EINVAL; case CPUCLOCK_PROF: -- cgit v1.2.3 From 18b6e0414e42d95183f07d8177e3ff0241abd825 Mon Sep 17 00:00:00 2001 From: Serge Hallyn Date: Wed, 15 Oct 2008 16:38:45 -0500 Subject: User namespaces: set of cleanups (v2) The user_ns is moved from nsproxy to user_struct, so that a struct cred by itself is sufficient to determine access (which it otherwise would not be).
Corresponding ecryptfs fixes (by David Howells) are here as well. Fix refcounting. The following rules now apply: 1. The task pins the user struct. 2. The user struct pins its user namespace. 3. The user namespace pins the struct user which created it. User namespaces are cloned during copy_creds(). Unsharing a new user_ns is no longer possible. (We could re-add that, but it'll cause code duplication and doesn't seem useful if PAM doesn't need to clone user namespaces). When a user namespace is created, its first user (uid 0) gets empty keyrings and a clean group_info. This incorporates a previous patch by David Howells. Here is his original patch description: >I suggest adding the attached incremental patch. It makes the following >changes: > > (1) Provides a current_user_ns() macro to wrap accesses to current's user > namespace. > > (2) Fixes eCryptFS. > > (3) Renames create_new_userns() to create_user_ns() to be more consistent > with the other associated functions and because the 'new' in the name is > superfluous. > > (4) Moves the argument and permission checks made for CLONE_NEWUSER to the > beginning of do_fork() so that they're done prior to making any attempts > at allocation. > > (5) Calls create_user_ns() after prepare_creds(), and gives it the new creds > to fill in rather than have it return the new root user. I don't imagine > the new root user being used for anything other than filling in a cred > struct. > > This also permits me to get rid of a get_uid() and a free_uid(), as the > reference the creds were holding on the old user_struct can just be > transferred to the new namespace's creator pointer. > > (6) Makes create_user_ns() reset the UIDs and GIDs of the creds under > preparation rather than doing it in copy_creds(). > >David >Signed-off-by: David Howells Changelog: Oct 20: integrate dhowells comments 1. leave thread_keyring alone 2. 
use current_user_ns() in set_user() Signed-off-by: Serge Hallyn --- fs/ecryptfs/messaging.c | 13 ++++---- fs/ecryptfs/miscdev.c | 19 ++++------- include/linux/cred.h | 2 ++ include/linux/init_task.h | 1 - include/linux/nsproxy.h | 1 - include/linux/sched.h | 1 + include/linux/user_namespace.h | 13 +++----- kernel/cred.c | 15 +++++++-- kernel/fork.c | 19 +++++++++-- kernel/nsproxy.c | 15 ++------- kernel/sys.c | 4 +-- kernel/user.c | 47 ++++++++------------------ kernel/user_namespace.c | 75 +++++++++++++++++------------------------- 13 files changed, 96 insertions(+), 129 deletions(-) (limited to 'kernel') diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index e0b0a4e28b9b..6913f727624d 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -360,7 +360,7 @@ int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid, struct ecryptfs_msg_ctx *msg_ctx; size_t msg_size; struct nsproxy *nsproxy; - struct user_namespace *current_user_ns; + struct user_namespace *tsk_user_ns; uid_t ctx_euid; int rc; @@ -385,9 +385,9 @@ int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid, mutex_unlock(&ecryptfs_daemon_hash_mux); goto wake_up; } - current_user_ns = nsproxy->user_ns; + tsk_user_ns = __task_cred(msg_ctx->task)->user->user_ns; ctx_euid = task_euid(msg_ctx->task); - rc = ecryptfs_find_daemon_by_euid(&daemon, ctx_euid, current_user_ns); + rc = ecryptfs_find_daemon_by_euid(&daemon, ctx_euid, tsk_user_ns); rcu_read_unlock(); mutex_unlock(&ecryptfs_daemon_hash_mux); if (rc) { @@ -405,11 +405,11 @@ int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid, euid, ctx_euid); goto unlock; } - if (current_user_ns != user_ns) { + if (tsk_user_ns != user_ns) { rc = -EBADMSG; printk(KERN_WARNING "%s: Received message from user_ns " "[0x%p]; expected message from user_ns [0x%p]\n", - __func__, user_ns, nsproxy->user_ns); + __func__, user_ns, tsk_user_ns); goto unlock; } if (daemon->pid != pid) { @@ -468,8 +468,7 @@ ecryptfs_send_message_locked(char *data, int data_len, u8 msg_type, uid_t euid = current_euid(); int rc; - rc = ecryptfs_find_daemon_by_euid(&daemon, euid, - current->nsproxy->user_ns); + rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns()); if (rc || !daemon) { rc = -ENOTCONN; printk(KERN_ERR "%s: User [%d] does not have a daemon " diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c index 047ac609695b..efd95a0ed1ea 100644 --- a/fs/ecryptfs/miscdev.c +++ b/fs/ecryptfs/miscdev.c @@ -47,8 +47,7 @@ ecryptfs_miscdev_poll(struct file *file, poll_table *pt) mutex_lock(&ecryptfs_daemon_hash_mux); /* TODO: Just use file->private_data? 
*/ - rc = ecryptfs_find_daemon_by_euid(&daemon, euid, - current->nsproxy->user_ns); + rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns()); BUG_ON(rc || !daemon); mutex_lock(&daemon->mux); mutex_unlock(&ecryptfs_daemon_hash_mux); @@ -95,11 +94,9 @@ ecryptfs_miscdev_open(struct inode *inode, struct file *file) "count; rc = [%d]\n", __func__, rc); goto out_unlock_daemon_list; } - rc = ecryptfs_find_daemon_by_euid(&daemon, euid, - current->nsproxy->user_ns); + rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns()); if (rc || !daemon) { - rc = ecryptfs_spawn_daemon(&daemon, euid, - current->nsproxy->user_ns, + rc = ecryptfs_spawn_daemon(&daemon, euid, current_user_ns(), task_pid(current)); if (rc) { printk(KERN_ERR "%s: Error attempting to spawn daemon; " @@ -153,8 +150,7 @@ ecryptfs_miscdev_release(struct inode *inode, struct file *file) int rc; mutex_lock(&ecryptfs_daemon_hash_mux); - rc = ecryptfs_find_daemon_by_euid(&daemon, euid, - current->nsproxy->user_ns); + rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns()); BUG_ON(rc || !daemon); mutex_lock(&daemon->mux); BUG_ON(daemon->pid != task_pid(current)); @@ -254,8 +250,7 @@ ecryptfs_miscdev_read(struct file *file, char __user *buf, size_t count, mutex_lock(&ecryptfs_daemon_hash_mux); /* TODO: Just use file->private_data? */ - rc = ecryptfs_find_daemon_by_euid(&daemon, euid, - current->nsproxy->user_ns); + rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns()); BUG_ON(rc || !daemon); mutex_lock(&daemon->mux); if (daemon->flags & ECRYPTFS_DAEMON_ZOMBIE) { @@ -295,7 +290,7 @@ check_list: goto check_list; } BUG_ON(euid != daemon->euid); - BUG_ON(current->nsproxy->user_ns != daemon->user_ns); + BUG_ON(current_user_ns() != daemon->user_ns); BUG_ON(task_pid(current) != daemon->pid); msg_ctx = list_first_entry(&daemon->msg_ctx_out_queue, struct ecryptfs_msg_ctx, daemon_out_list); @@ -468,7 +463,7 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf, goto out_free; } rc = ecryptfs_miscdev_response(&data[i], packet_size, - euid, current->nsproxy->user_ns, + euid, current_user_ns(), task_pid(current), seq); if (rc) printk(KERN_WARNING "%s: Failed to deliver miscdev " diff --git a/include/linux/cred.h b/include/linux/cred.h index 26c1ab179946..3282ee4318e7 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -60,6 +60,7 @@ do { \ } while (0) extern struct group_info *groups_alloc(int); +extern struct group_info init_groups; extern void groups_free(struct group_info *); extern int set_current_groups(struct group_info *); extern int set_groups(struct cred *, struct group_info *); @@ -315,6 +316,7 @@ static inline void put_cred(const struct cred *_cred) #define current_fsgid() (current_cred_xxx(fsgid)) #define current_cap() (current_cred_xxx(cap_effective)) #define current_user() (current_cred_xxx(user)) +#define current_user_ns() (current_cred_xxx(user)->user_ns) #define current_security() (current_cred_xxx(security)) #define current_uid_gid(_uid, _gid) \ diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 2597858035cd..959f5522d10a 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -57,7 +57,6 @@ extern struct nsproxy init_nsproxy; .mnt_ns = NULL, \ INIT_NET_NS(net_ns) \ INIT_IPC_NS(ipc_ns) \ - .user_ns = &init_user_ns, \ } #define INIT_SIGHAND(sighand) { \ diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index c8a768e59640..afad7dec1b36 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -27,7 
+27,6 @@ struct nsproxy { struct ipc_namespace *ipc_ns; struct mnt_namespace *mnt_ns; struct pid_namespace *pid_ns; - struct user_namespace *user_ns; struct net *net_ns; }; extern struct nsproxy init_nsproxy; diff --git a/include/linux/sched.h b/include/linux/sched.h index 2036e9f26020..7f8015a3082e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -638,6 +638,7 @@ struct user_struct { /* Hash table maintenance information */ struct hlist_node uidhash_node; uid_t uid; + struct user_namespace *user_ns; #ifdef CONFIG_USER_SCHED struct task_group *tg; diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index b5f41d4c2eec..315bcd375224 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -12,7 +12,7 @@ struct user_namespace { struct kref kref; struct hlist_head uidhash_table[UIDHASH_SZ]; - struct user_struct *root_user; + struct user_struct *creator; }; extern struct user_namespace init_user_ns; @@ -26,8 +26,7 @@ static inline struct user_namespace *get_user_ns(struct user_namespace *ns) return ns; } -extern struct user_namespace *copy_user_ns(int flags, - struct user_namespace *old_ns); +extern int create_user_ns(struct cred *new); extern void free_user_ns(struct kref *kref); static inline void put_user_ns(struct user_namespace *ns) @@ -43,13 +42,9 @@ static inline struct user_namespace *get_user_ns(struct user_namespace *ns) return &init_user_ns; } -static inline struct user_namespace *copy_user_ns(int flags, - struct user_namespace *old_ns) +static inline int create_user_ns(struct cred *new) { - if (flags & CLONE_NEWUSER) - return ERR_PTR(-EINVAL); - - return old_ns; + return -EINVAL; } static inline void put_user_ns(struct user_namespace *ns) diff --git a/kernel/cred.c b/kernel/cred.c index 13697ca2bb38..ff7bc071991c 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -274,6 +274,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) struct thread_group_cred *tgcred; #endif struct cred *new; + int ret; mutex_init(&p->cred_exec_mutex); @@ -293,6 +294,12 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) if (!new) return -ENOMEM; + if (clone_flags & CLONE_NEWUSER) { + ret = create_user_ns(new); + if (ret < 0) + goto error_put; + } + #ifdef CONFIG_KEYS /* new threads get their own thread keyrings if their parent already * had one */ @@ -309,8 +316,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) if (!(clone_flags & CLONE_THREAD)) { tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); if (!tgcred) { - put_cred(new); - return -ENOMEM; + ret = -ENOMEM; + goto error_put; } atomic_set(&tgcred->usage, 1); spin_lock_init(&tgcred->lock); @@ -325,6 +332,10 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) atomic_inc(&new->user->processes); p->cred = p->real_cred = get_cred(new); return 0; + +error_put: + put_cred(new); + return ret; } /** diff --git a/kernel/fork.c b/kernel/fork.c index 29c18c14812d..1dd89451fae4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -976,7 +976,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (atomic_read(&p->real_cred->user->processes) >= p->signal->rlim[RLIMIT_NPROC].rlim_cur) { if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && - p->real_cred->user != current->nsproxy->user_ns->root_user) + p->real_cred->user != INIT_USER) goto bad_fork_free; } @@ -1334,6 +1334,20 @@ long do_fork(unsigned long clone_flags, int trace = 0; long nr; + /* + * Do some preliminary argument and permissions checking before we 
+ * actually start allocating stuff + */ + if (clone_flags & CLONE_NEWUSER) { + if (clone_flags & CLONE_THREAD) + return -EINVAL; + /* hopefully this check will go away when userns support is + * complete + */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + } + /* * We hope to recycle these flags after 2.6.26 */ @@ -1581,8 +1595,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) err = -EINVAL; if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| - CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER| - CLONE_NEWNET)) + CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) goto bad_unshare_out; /* diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 1d3ef29a2583..63598dca2d0c 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -80,12 +80,6 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, goto out_pid; } - new_nsp->user_ns = copy_user_ns(flags, tsk->nsproxy->user_ns); - if (IS_ERR(new_nsp->user_ns)) { - err = PTR_ERR(new_nsp->user_ns); - goto out_user; - } - new_nsp->net_ns = copy_net_ns(flags, tsk->nsproxy->net_ns); if (IS_ERR(new_nsp->net_ns)) { err = PTR_ERR(new_nsp->net_ns); @@ -95,9 +89,6 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, return new_nsp; out_net: - if (new_nsp->user_ns) - put_user_ns(new_nsp->user_ns); -out_user: if (new_nsp->pid_ns) put_pid_ns(new_nsp->pid_ns); out_pid: @@ -130,7 +121,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) get_nsproxy(old_ns); if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNET))) + CLONE_NEWPID | CLONE_NEWNET))) return 0; if (!capable(CAP_SYS_ADMIN)) { @@ -173,8 +164,6 @@ void free_nsproxy(struct nsproxy *ns) put_ipc_ns(ns->ipc_ns); if (ns->pid_ns) put_pid_ns(ns->pid_ns); - if (ns->user_ns) - put_user_ns(ns->user_ns); put_net(ns->net_ns); kmem_cache_free(nsproxy_cachep, ns); } @@ -189,7 +178,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, int err = 0; if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWUSER | CLONE_NEWNET))) + CLONE_NEWNET))) return 0; if (!capable(CAP_SYS_ADMIN)) diff --git a/kernel/sys.c b/kernel/sys.c index ab735040468a..ebe65c2c9873 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -565,13 +565,13 @@ static int set_user(struct cred *new) { struct user_struct *new_user; - new_user = alloc_uid(current->nsproxy->user_ns, new->uid); + new_user = alloc_uid(current_user_ns(), new->uid); if (!new_user) return -EAGAIN; if (atomic_read(&new_user->processes) >= current->signal->rlim[RLIMIT_NPROC].rlim_cur && - new_user != current->nsproxy->user_ns->root_user) { + new_user != INIT_USER) { free_uid(new_user); return -EAGAIN; } diff --git a/kernel/user.c b/kernel/user.c index d476307dd4b0..c0ef3a464438 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -20,9 +20,9 @@ struct user_namespace init_user_ns = { .kref = { - .refcount = ATOMIC_INIT(2), + .refcount = ATOMIC_INIT(1), }, - .root_user = &root_user, + .creator = &root_user, }; EXPORT_SYMBOL_GPL(init_user_ns); @@ -48,12 +48,14 @@ static struct kmem_cache *uid_cachep; */ static DEFINE_SPINLOCK(uidhash_lock); +/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->creator */ struct user_struct root_user = { - .__count = ATOMIC_INIT(1), + .__count = ATOMIC_INIT(2), .processes = ATOMIC_INIT(1), .files = ATOMIC_INIT(0), .sigpending = ATOMIC_INIT(0), .locked_shm = 0, + .user_ns = &init_user_ns, #ifdef CONFIG_USER_SCHED .tg = &init_task_group, #endif @@ -314,12 +316,13 @@ done: 
* IRQ state (as stored in flags) is restored and uidhash_lock released * upon function exit. */ -static inline void free_user(struct user_struct *up, unsigned long flags) +static void free_user(struct user_struct *up, unsigned long flags) { /* restore back the count */ atomic_inc(&up->__count); spin_unlock_irqrestore(&uidhash_lock, flags); + put_user_ns(up->user_ns); INIT_WORK(&up->work, remove_user_sysfs_dir); schedule_work(&up->work); } @@ -335,13 +338,14 @@ static inline void uids_mutex_unlock(void) { } * IRQ state (as stored in flags) is restored and uidhash_lock released * upon function exit. */ -static inline void free_user(struct user_struct *up, unsigned long flags) +static void free_user(struct user_struct *up, unsigned long flags) { uid_hash_remove(up); spin_unlock_irqrestore(&uidhash_lock, flags); sched_destroy_user(up); key_put(up->uid_keyring); key_put(up->session_keyring); + put_user_ns(up->user_ns); kmem_cache_free(uid_cachep, up); } @@ -357,7 +361,7 @@ struct user_struct *find_user(uid_t uid) { struct user_struct *ret; unsigned long flags; - struct user_namespace *ns = current->nsproxy->user_ns; + struct user_namespace *ns = current_user()->user_ns; spin_lock_irqsave(&uidhash_lock, flags); ret = uid_hash_find(uid, uidhashentry(ns, uid)); @@ -404,6 +408,8 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) if (sched_create_user(new) < 0) goto out_free_user; + new->user_ns = get_user_ns(ns); + if (uids_user_create(new)) goto out_destoy_sched; @@ -427,7 +433,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) up = new; } spin_unlock_irq(&uidhash_lock); - } uids_mutex_unlock(); @@ -436,6 +441,7 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) out_destoy_sched: sched_destroy_user(new); + put_user_ns(new->user_ns); out_free_user: kmem_cache_free(uid_cachep, new); out_unlock: @@ -443,33 +449,6 @@ out_unlock: return NULL; } -#ifdef CONFIG_USER_NS -void release_uids(struct user_namespace *ns) -{ - int i; - unsigned long flags; - struct hlist_head *head; - struct hlist_node *nd; - - spin_lock_irqsave(&uidhash_lock, flags); - /* - * collapse the chains so that the user_struct-s will - * be still alive, but not in hashes. subsequent free_uid() - * will free them. - */ - for (i = 0; i < UIDHASH_SZ; i++) { - head = ns->uidhash_table + i; - while (!hlist_empty(head)) { - nd = head->first; - hlist_del_init(nd); - } - } - spin_unlock_irqrestore(&uidhash_lock, flags); - - free_uid(ns->root_user); -} -#endif - static int __init uid_cache_init(void) { int n; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 0d9c51d67333..79084311ee57 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -9,70 +9,55 @@ #include #include #include +#include /* - * Clone a new ns copying an original user ns, setting refcount to 1 - * @old_ns: namespace to clone - * Return NULL on error (failure to kmalloc), new ns otherwise + * Create a new user namespace, deriving the creator from the user in the + * passed credentials, and replacing that user with the new root user for the + * new namespace. + * + * This is called by copy_creds(), which will finish setting the target task's + * credentials. 
*/ -static struct user_namespace *clone_user_ns(struct user_namespace *old_ns) +int create_user_ns(struct cred *new) { struct user_namespace *ns; - struct user_struct *new_user; - struct cred *new; + struct user_struct *root_user; int n; ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL); if (!ns) - return ERR_PTR(-ENOMEM); + return -ENOMEM; kref_init(&ns->kref); for (n = 0; n < UIDHASH_SZ; ++n) INIT_HLIST_HEAD(ns->uidhash_table + n); - /* Insert new root user. */ - ns->root_user = alloc_uid(ns, 0); - if (!ns->root_user) { + /* Alloc new root user. */ + root_user = alloc_uid(ns, 0); + if (!root_user) { kfree(ns); - return ERR_PTR(-ENOMEM); + return -ENOMEM; } - /* Reset current->user with a new one */ - new_user = alloc_uid(ns, current_uid()); - if (!new_user) { - free_uid(ns->root_user); - kfree(ns); - return ERR_PTR(-ENOMEM); - } - - /* Install the new user */ - new = prepare_creds(); - if (!new) { - free_uid(new_user); - free_uid(ns->root_user); - kfree(ns); - } - free_uid(new->user); - new->user = new_user; - commit_creds(new); - return ns; -} - -struct user_namespace * copy_user_ns(int flags, struct user_namespace *old_ns) -{ - struct user_namespace *new_ns; - - BUG_ON(!old_ns); - get_user_ns(old_ns); - - if (!(flags & CLONE_NEWUSER)) - return old_ns; + /* set the new root user in the credentials under preparation */ + ns->creator = new->user; + new->user = root_user; + new->uid = new->euid = new->suid = new->fsuid = 0; + new->gid = new->egid = new->sgid = new->fsgid = 0; + put_group_info(new->group_info); + new->group_info = get_group_info(&init_groups); +#ifdef CONFIG_KEYS + key_put(new->request_key_auth); + new->request_key_auth = NULL; +#endif + /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ - new_ns = clone_user_ns(old_ns); + /* alloc_uid() incremented the userns refcount. Just set it to 1 */ + kref_set(&ns->kref, 1); - put_user_ns(old_ns); - return new_ns; + return 0; } void free_user_ns(struct kref *kref) @@ -80,7 +65,7 @@ void free_user_ns(struct kref *kref) struct user_namespace *ns; ns = container_of(kref, struct user_namespace, kref); - release_uids(ns); + free_uid(ns->creator); kfree(ns); } EXPORT_SYMBOL(free_user_ns); -- cgit v1.2.3 From 6ded6ab9be4f6164aef1c527407c1b94f0929799 Mon Sep 17 00:00:00 2001 From: Serge Hallyn Date: Mon, 24 Nov 2008 16:24:10 -0500 Subject: User namespaces: use the current_user_ns() macro Fix up the last current_user()->user_ns instance to use current_user_ns(). Signed-off-by: Serge E. Hallyn --- kernel/user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index c0ef3a464438..97202cb29adc 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -361,7 +361,7 @@ struct user_struct *find_user(uid_t uid) { struct user_struct *ret; unsigned long flags; - struct user_namespace *ns = current_user()->user_ns; + struct user_namespace *ns = current_user_ns(); spin_lock_irqsave(&uidhash_lock, flags); ret = uid_hash_find(uid, uidhashentry(ns, uid)); -- cgit v1.2.3 From 8bba1bf5e2434c83f2fe8b1422604ace9bbe4cb8 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Tue, 25 Nov 2008 09:12:31 +0100 Subject: x86, ftrace: call trace->open() before stopping tracing; add trace->print_header() Add a callback to allow an ftrace plug-in to write its own header. Move the call to trace->open() up a few lines. The changes are required by the BTS ftrace plug-in. 
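struct tracer is a classic ops table with optional hooks, and print_header simply joins open as one more NULL-checked callback. A reduced sketch of that dispatch pattern follows; the names are illustrative, not the kernel's exact types:

#include <stdio.h>

struct tracer_ops {
	const char *name;
	void (*open)(void);
	void (*print_header)(void);	/* optional: may be NULL */
};

static void bts_header(void)
{
	puts("# CPU#  FROM         TO           FUNCTION");
}

static const struct tracer_ops bts = {
	.name		= "bts",
	.print_header	= bts_header,	/* .open left NULL */
};

static void show_trace(const struct tracer_ops *t)
{
	if (t->open)			/* notify the tracer early */
		t->open();
	if (t->print_header)		/* plug-in supplies its own header */
		t->print_header();
	else
		puts("# default latency-format header");
}

int main(void)
{
	show_trace(&bts);
	return 0;
}

Checking each hook for NULL keeps the core loop oblivious to which features a given plug-in implements, which is why adding print_header needs no changes to existing tracers.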
Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 11 +++++++---- kernel/trace/trace.h | 1 + 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a45b59e53fbc..8df8fdd69c95 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2298,7 +2298,9 @@ static int s_show(struct seq_file *m, void *v) seq_printf(m, "# tracer: %s\n", iter->trace->name); seq_puts(m, "#\n"); } - if (iter->iter_flags & TRACE_FILE_LAT_FMT) { + if (iter->trace && iter->trace->print_header) + iter->trace->print_header(m); + else if (iter->iter_flags & TRACE_FILE_LAT_FMT) { /* print nothing if the buffers are empty */ if (trace_empty(iter)) return 0; @@ -2350,6 +2352,10 @@ __tracing_open(struct inode *inode, struct file *file, int *ret) iter->trace = current_trace; iter->pos = -1; + /* Notify the tracer early; before we stop tracing. */ + if (iter->trace && iter->trace->open) + iter->trace->open(iter); + /* Annotate start of buffers if we had overruns */ if (ring_buffer_overruns(iter->tr->buffer)) iter->iter_flags |= TRACE_FILE_ANNOTATE; @@ -2375,9 +2381,6 @@ __tracing_open(struct inode *inode, struct file *file, int *ret) /* stop the trace while dumping */ tracing_stop(); - if (iter->trace && iter->trace->open) - iter->trace->open(iter); - mutex_unlock(&trace_types_lock); out: diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 28c15c2ebc22..717f9f045c6f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -311,6 +311,7 @@ struct tracer { int (*selftest)(struct tracer *trace, struct trace_array *tr); #endif + void (*print_header)(struct seq_file *m); enum print_line_t (*print_line)(struct trace_iterator *iter); /* If you handled the flag setting, return 0 */ int (*set_flag)(u32 old_flags, u32 bit, int set); -- cgit v1.2.3 From 1e9b51c28312f7334394aa30be56ff52c2b65b7e Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Tue, 25 Nov 2008 09:24:15 +0100 Subject: x86, bts, ftrace: a BTS ftrace plug-in prototype Impact: add new ftrace plugin A prototype for a BTS ftrace plug-in. The tracer collects branch trace in a cyclic buffer for each cpu. The tracer is not configurable and the trace for each snapshot is appended when doing cat /debug/tracing/trace. This is a proof of concept that will be extended with future patches to become a (hopefully) useful tool. Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.cpu | 1 + kernel/trace/Kconfig | 11 ++ kernel/trace/Makefile | 1 + kernel/trace/trace.h | 12 +++ kernel/trace/trace_bts.c | 276 +++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 301 insertions(+) create mode 100644 kernel/trace/trace_bts.c (limited to 'kernel') diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index b815664fe370..85a78575956c 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -515,6 +515,7 @@ config CPU_SUP_UMC_32 config X86_DS def_bool X86_PTRACE_BTS depends on X86_DEBUGCTLMSR + select HAVE_HW_BRANCH_TRACER config X86_PTRACE_BTS bool "Branch Trace Store" diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 9cbf7761f498..620feadff67a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -28,6 +28,9 @@ config HAVE_DYNAMIC_FTRACE config HAVE_FTRACE_MCOUNT_RECORD bool +config HAVE_HW_BRANCH_TRACER + bool + config TRACER_MAX_TRACE bool @@ -233,6 +236,14 @@ config STACK_TRACER Say N if unsure. 
+config BTS_TRACER + depends on HAVE_HW_BRANCH_TRACER + bool "Trace branches" + select TRACING + help + This tracer records all branches on the system in a circular + buffer giving access to the last N branches for each cpu. + config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" depends on FUNCTION_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 1a8c9259dc69..cef4bcb4e822 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -31,5 +31,6 @@ obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o obj-$(CONFIG_BOOT_TRACER) += trace_boot.o obj-$(CONFIG_FUNCTION_RET_TRACER) += trace_functions_return.o obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o +obj-$(CONFIG_BTS_TRACER) += trace_bts.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 717f9f045c6f..3abd645e8af2 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -27,6 +27,7 @@ enum trace_type { TRACE_BOOT_RET, TRACE_FN_RET, TRACE_USER_STACK, + TRACE_BTS, __TRACE_LAST_TYPE }; @@ -153,6 +154,12 @@ struct trace_branch { char correct; }; +struct bts_entry { + struct trace_entry ent; + unsigned long from; + unsigned long to; +}; + /* * trace_flag_type is an enumeration that holds different * states when a trace occurs. These are: @@ -258,6 +265,7 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\ IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \ IF_ASSIGN(var, ent, struct ftrace_ret_entry, TRACE_FN_RET);\ + IF_ASSIGN(var, ent, struct bts_entry, TRACE_BTS);\ __ftrace_bad_type(); \ } while (0) @@ -392,6 +400,10 @@ void trace_function(struct trace_array *tr, void trace_function_return(struct ftrace_retfunc *trace); +void trace_bts(struct trace_array *tr, + unsigned long from, + unsigned long to); + void tracing_start_cmdline_record(void); void tracing_stop_cmdline_record(void); void tracing_sched_switch_assign_trace(struct trace_array *tr); diff --git a/kernel/trace/trace_bts.c b/kernel/trace/trace_bts.c new file mode 100644 index 000000000000..23b76e4690ef --- /dev/null +++ b/kernel/trace/trace_bts.c @@ -0,0 +1,276 @@ +/* + * BTS tracer + * + * Copyright (C) 2008 Markus Metzger + * + */ + +#include +#include +#include +#include +#include + +#include + +#include "trace.h" + + +#define SIZEOF_BTS (1 << 13) + +static DEFINE_PER_CPU(struct bts_tracer *, tracer); +static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer); + +#define this_tracer per_cpu(tracer, smp_processor_id()) +#define this_buffer per_cpu(buffer, smp_processor_id()) + + +/* + * Information to interpret a BTS record. + * This will go into an in-kernel BTS interface. + */ +static unsigned char sizeof_field; +static unsigned long debugctl_mask; + +#define sizeof_bts (3 * sizeof_field) + +static void bts_trace_cpuinit(struct cpuinfo_x86 *c) +{ + switch (c->x86) { + case 0x6: + switch (c->x86_model) { + case 0x0 ... 
0xC: + break; + case 0xD: + case 0xE: /* Pentium M */ + sizeof_field = sizeof(long); + debugctl_mask = (1<<6)|(1<<7); + break; + default: + sizeof_field = 8; + debugctl_mask = (1<<6)|(1<<7); + break; + } + break; + case 0xF: + switch (c->x86_model) { + case 0x0: + case 0x1: + case 0x2: /* Netburst */ + sizeof_field = sizeof(long); + debugctl_mask = (1<<2)|(1<<3); + break; + default: + /* sorry, don't know about them */ + break; + } + break; + default: + /* sorry, don't know about them */ + break; + } +} + +static inline void bts_enable(void) +{ + unsigned long debugctl; + + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl | debugctl_mask); +} + +static inline void bts_disable(void) +{ + unsigned long debugctl; + + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl & ~debugctl_mask); +} + +static void bts_trace_reset(struct trace_array *tr) +{ + int cpu; + + tr->time_start = ftrace_now(tr->cpu); + + for_each_online_cpu(cpu) + tracing_reset(tr, cpu); +} + +static void bts_trace_start_cpu(void *arg) +{ + this_tracer = + ds_request_bts(/* task = */ NULL, this_buffer, SIZEOF_BTS, + /* ovfl = */ NULL, /* th = */ (size_t)-1); + if (IS_ERR(this_tracer)) { + this_tracer = NULL; + return; + } + + bts_enable(); +} + +static void bts_trace_start(struct trace_array *tr) +{ + int cpu; + + bts_trace_reset(tr); + + for_each_cpu_mask(cpu, cpu_possible_map) + smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1); +} + +static void bts_trace_stop_cpu(void *arg) +{ + if (this_tracer) { + bts_disable(); + + ds_release_bts(this_tracer); + this_tracer = NULL; + } +} + +static void bts_trace_stop(struct trace_array *tr) +{ + int cpu; + + for_each_cpu_mask(cpu, cpu_possible_map) + smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1); +} + +static int bts_trace_init(struct trace_array *tr) +{ + bts_trace_cpuinit(&boot_cpu_data); + bts_trace_reset(tr); + bts_trace_start(tr); + + return 0; +} + +static void bts_trace_print_header(struct seq_file *m) +{ +#ifdef __i386__ + seq_puts(m, "# CPU# FROM TO FUNCTION\n"); + seq_puts(m, "# | | | |\n"); +#else + seq_puts(m, + "# CPU# FROM TO FUNCTION\n"); + seq_puts(m, + "# | | | |\n"); +#endif +} + +static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + struct trace_seq *seq = &iter->seq; + struct bts_entry *it; + + trace_assign_type(it, entry); + + if (entry->type == TRACE_BTS) { + int ret; +#ifdef CONFIG_KALLSYMS + char function[KSYM_SYMBOL_LEN]; + sprint_symbol(function, it->from); +#else + char *function = ""; +#endif + + ret = trace_seq_printf(seq, "%4d 0x%lx -> 0x%lx [%s]\n", + entry->cpu, it->from, it->to, function); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE;; + return TRACE_TYPE_HANDLED; + } + return TRACE_TYPE_UNHANDLED; +} + +void trace_bts(struct trace_array *tr, unsigned long from, unsigned long to) +{ + struct ring_buffer_event *event; + struct bts_entry *entry; + unsigned long irq; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, from); + entry->ent.type = TRACE_BTS; + entry->ent.cpu = smp_processor_id(); + entry->from = from; + entry->to = to; + ring_buffer_unlock_commit(tr->buffer, event, irq); +} + +static void trace_bts_at(struct trace_array *tr, size_t index) +{ + const void *raw = NULL; + unsigned long from, to; + int err; + + err = ds_access_bts(this_tracer, index, &raw); + if (err < 0) 
+ return; + + from = *(const unsigned long *)raw; + to = *(const unsigned long *)((const char *)raw + sizeof_field); + + trace_bts(tr, from, to); +} + +static void trace_bts_cpu(void *arg) +{ + struct trace_array *tr = (struct trace_array *) arg; + size_t index = 0, end = 0, i; + int err; + + if (!this_tracer) + return; + + bts_disable(); + + err = ds_get_bts_index(this_tracer, &index); + if (err < 0) + goto out; + + err = ds_get_bts_end(this_tracer, &end); + if (err < 0) + goto out; + + for (i = index; i < end; i++) + trace_bts_at(tr, i); + + for (i = 0; i < index; i++) + trace_bts_at(tr, i); + +out: + bts_enable(); +} + +static void trace_bts_prepare(struct trace_iterator *iter) +{ + int cpu; + + for_each_cpu_mask(cpu, cpu_possible_map) + smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1); +} + +struct tracer bts_tracer __read_mostly = +{ + .name = "bts", + .init = bts_trace_init, + .reset = bts_trace_stop, + .print_header = bts_trace_print_header, + .print_line = bts_trace_print_line, + .start = bts_trace_start, + .stop = bts_trace_stop, + .open = trace_bts_prepare +}; + +__init static int init_bts_trace(void) +{ + return register_tracer(&bts_tracer); +} +device_initcall(init_bts_trace); -- cgit v1.2.3 From fb52607afcd0629776f1dc9e657647ceae81dd50 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 25 Nov 2008 21:07:04 +0100 Subject: tracing/function-return-tracer: change the name into function-graph-tracer Impact: cleanup This patch changes the name of the "return function tracer" to function-graph-tracer, a more suitable name for a tracer that lets one retrieve the ordered call stack during the code flow. Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- arch/x86/include/asm/ftrace.h | 4 +- arch/x86/kernel/Makefile | 4 +- arch/x86/kernel/entry_32.S | 12 ++--- arch/x86/kernel/ftrace.c | 12 ++--- include/linux/ftrace.h | 24 ++++----- include/linux/ftrace_irq.h | 2 +- include/linux/sched.h | 2 +- kernel/Makefile | 2 +- kernel/fork.c | 4 +- kernel/sched.c | 2 +- kernel/trace/Kconfig | 19 ++++--- kernel/trace/Makefile | 2 +- kernel/trace/ftrace.c | 26 +++++----- kernel/trace/trace.c | 18 +++---- kernel/trace/trace.h | 12 ++--- kernel/trace/trace_functions_graph.c | 98 ++++++++++++++++++++++++++++++++++++ 17 files changed, 173 insertions(+), 72 deletions(-) create mode 100644 kernel/trace/trace_functions_graph.c (limited to 'kernel') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e49a4fd718fe..0842b1127684 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -29,7 +29,7 @@ config X86 select HAVE_FTRACE_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_TRACER - select HAVE_FUNCTION_RET_TRACER if X86_32 + select HAVE_FUNCTION_GRAPH_TRACER if X86_32 select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) select HAVE_ARCH_KGDB if !X86_VOYAGER diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 754a3e082f94..7e61b4ceb9a4 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -28,7 +28,7 @@ struct dyn_arch_ftrace { #endif /* __ASSEMBLY__ */ #endif /* CONFIG_FUNCTION_TRACER */ -#ifdef CONFIG_FUNCTION_RET_TRACER +#ifdef CONFIG_FUNCTION_GRAPH_TRACER #ifndef __ASSEMBLY__ @@ -51,6 +51,6 @@ struct ftrace_ret_stack { extern void return_to_handler(void); #endif /* __ASSEMBLY__ */ -#endif /* CONFIG_FUNCTION_RET_TRACER */ +#endif /* CONFIG_FUNCTION_GRAPH_TRACER
*/ #endif /* _ASM_X86_FTRACE_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index af2bc36ca1c4..64939a0c3986 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -14,7 +14,7 @@ CFLAGS_REMOVE_paravirt-spinlocks.o = -pg CFLAGS_REMOVE_ftrace.o = -pg endif -ifdef CONFIG_FUNCTION_RET_TRACER +ifdef CONFIG_FUNCTION_GRAPH_TRACER # Don't trace __switch_to() but let it for function tracer CFLAGS_REMOVE_process_32.o = -pg endif @@ -70,7 +70,7 @@ obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o -obj-$(CONFIG_FUNCTION_RET_TRACER) += ftrace.o +obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 74defe21ba42..2b1f0f081a6b 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1188,9 +1188,9 @@ ENTRY(mcount) cmpl $ftrace_stub, ftrace_trace_function jnz trace -#ifdef CONFIG_FUNCTION_RET_TRACER - cmpl $ftrace_stub, ftrace_function_return - jnz ftrace_return_caller +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + cmpl $ftrace_stub, ftrace_graph_function + jnz ftrace_graph_caller #endif .globl ftrace_stub ftrace_stub: @@ -1215,8 +1215,8 @@ END(mcount) #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* CONFIG_FUNCTION_TRACER */ -#ifdef CONFIG_FUNCTION_RET_TRACER -ENTRY(ftrace_return_caller) +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +ENTRY(ftrace_graph_caller) cmpl $0, function_trace_stop jne ftrace_stub @@ -1230,7 +1230,7 @@ ENTRY(ftrace_return_caller) popl %ecx popl %eax ret -END(ftrace_return_caller) +END(ftrace_graph_caller) .globl return_to_handler return_to_handler: diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index bb137f7297ed..3595a4c14aba 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -323,7 +323,7 @@ int __init ftrace_dyn_arch_init(void *data) } #endif -#ifdef CONFIG_FUNCTION_RET_TRACER +#ifdef CONFIG_FUNCTION_GRAPH_TRACER #ifndef CONFIG_DYNAMIC_FTRACE @@ -389,11 +389,11 @@ static void pop_return_trace(unsigned long *ret, unsigned long long *time, */ unsigned long ftrace_return_to_handler(void) { - struct ftrace_retfunc trace; + struct ftrace_graph_ret trace; pop_return_trace(&trace.ret, &trace.calltime, &trace.func, &trace.overrun); trace.rettime = cpu_clock(raw_smp_processor_id()); - ftrace_function_return(&trace); + ftrace_graph_function(&trace); return trace.ret; } @@ -440,12 +440,12 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) ); if (WARN_ON(faulted)) { - unregister_ftrace_return(); + unregister_ftrace_graph(); return; } if (WARN_ON(!__kernel_text_address(old))) { - unregister_ftrace_return(); + unregister_ftrace_graph(); *parent = old; return; } @@ -456,4 +456,4 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) *parent = old; } -#endif /* CONFIG_FUNCTION_RET_TRACER */ +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 7854d87b97b2..b4ac734ad8d6 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -115,8 +115,8 @@ extern int ftrace_update_ftrace_func(ftrace_func_t func); extern void ftrace_caller(void); extern void ftrace_call(void); extern void mcount_call(void); -#ifdef CONFIG_FUNCTION_RET_TRACER -extern void 
ftrace_return_caller(void); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +extern void ftrace_graph_caller(void); #endif /** @@ -315,7 +315,7 @@ ftrace_init_module(struct module *mod, /* * Structure that defines a return function trace. */ -struct ftrace_retfunc { +struct ftrace_graph_ret { unsigned long ret; /* Return address */ unsigned long func; /* Current function */ unsigned long long calltime; @@ -324,22 +324,22 @@ struct ftrace_retfunc { unsigned long overrun; }; -#ifdef CONFIG_FUNCTION_RET_TRACER +#ifdef CONFIG_FUNCTION_GRAPH_TRACER #define FTRACE_RETFUNC_DEPTH 50 #define FTRACE_RETSTACK_ALLOC_SIZE 32 /* Type of a callback handler of tracing return function */ -typedef void (*trace_function_return_t)(struct ftrace_retfunc *); +typedef void (*trace_function_graph_t)(struct ftrace_graph_ret *); -extern int register_ftrace_return(trace_function_return_t func); +extern int register_ftrace_graph(trace_function_graph_t func); /* The current handler in use */ -extern trace_function_return_t ftrace_function_return; -extern void unregister_ftrace_return(void); +extern trace_function_graph_t ftrace_graph_function; +extern void unregister_ftrace_graph(void); -extern void ftrace_retfunc_init_task(struct task_struct *t); -extern void ftrace_retfunc_exit_task(struct task_struct *t); +extern void ftrace_graph_init_task(struct task_struct *t); +extern void ftrace_graph_exit_task(struct task_struct *t); #else -static inline void ftrace_retfunc_init_task(struct task_struct *t) { } -static inline void ftrace_retfunc_exit_task(struct task_struct *t) { } +static inline void ftrace_graph_init_task(struct task_struct *t) { } +static inline void ftrace_graph_exit_task(struct task_struct *t) { } #endif #endif /* _LINUX_FTRACE_H */ diff --git a/include/linux/ftrace_irq.h b/include/linux/ftrace_irq.h index 0b4df55d7a74..366a054d0b05 100644 --- a/include/linux/ftrace_irq.h +++ b/include/linux/ftrace_irq.h @@ -2,7 +2,7 @@ #define _LINUX_FTRACE_IRQ_H -#if defined(CONFIG_DYNAMIC_FTRACE) || defined(CONFIG_FUNCTION_RET_TRACER) +#if defined(CONFIG_DYNAMIC_FTRACE) || defined(CONFIG_FUNCTION_GRAPH_TRACER) extern void ftrace_nmi_enter(void); extern void ftrace_nmi_exit(void); #else diff --git a/include/linux/sched.h b/include/linux/sched.h index d02a0ca70ee9..7ad48f2a2758 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1365,7 +1365,7 @@ struct task_struct { unsigned long default_timer_slack_ns; struct list_head *scm_work_list; -#ifdef CONFIG_FUNCTION_RET_TRACER +#ifdef CONFIG_FUNCTION_GRAPH_TRACER /* Index of current stored address in ret_stack */ int curr_ret_stack; /* Stack of return addresses for return function tracing */ diff --git a/kernel/Makefile b/kernel/Makefile index 03a45e7e87b7..703cf3b7389c 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -21,7 +21,7 @@ CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_sched_clock.o = -pg CFLAGS_REMOVE_sched.o = -pg endif -ifdef CONFIG_FUNCTION_RET_TRACER +ifdef CONFIG_FUNCTION_GRAPH_TRACER CFLAGS_REMOVE_extable.o = -pg # For __kernel_text_address() CFLAGS_REMOVE_module.o = -pg # For __module_text_address() endif diff --git a/kernel/fork.c b/kernel/fork.c index d6e1a3205f62..5f82a999c032 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -140,7 +140,7 @@ void free_task(struct task_struct *tsk) prop_local_destroy_single(&tsk->dirties); free_thread_info(tsk->stack); rt_mutex_debug_task_free(tsk); - ftrace_retfunc_exit_task(tsk); + ftrace_graph_exit_task(tsk); free_task_struct(tsk); } EXPORT_SYMBOL(free_task); @@ -1271,7 +1271,7 @@ static struct task_struct
*copy_process(unsigned long clone_flags, total_forks++; spin_unlock(&current->sighand->siglock); write_unlock_irq(&tasklist_lock); - ftrace_retfunc_init_task(p); + ftrace_graph_init_task(p); proc_fork_connector(p); cgroup_post_fork(p); return p; diff --git a/kernel/sched.c b/kernel/sched.c index 388d9db044ab..52490bf6b884 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5901,7 +5901,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) * The idle tasks have their own, simple scheduling class: */ idle->sched_class = &idle_sched_class; - ftrace_retfunc_init_task(idle); + ftrace_graph_init_task(idle); } /* diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 620feadff67a..eb9b901e0777 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -12,7 +12,7 @@ config NOP_TRACER config HAVE_FUNCTION_TRACER bool -config HAVE_FUNCTION_RET_TRACER +config HAVE_FUNCTION_GRAPH_TRACER bool config HAVE_FUNCTION_TRACE_MCOUNT_TEST @@ -63,15 +63,18 @@ config FUNCTION_TRACER (the bootup default), then the overhead of the instructions is very small and not measurable even in micro-benchmarks. -config FUNCTION_RET_TRACER - bool "Kernel Function return Tracer" - depends on HAVE_FUNCTION_RET_TRACER +config FUNCTION_GRAPH_TRACER + bool "Kernel Function Graph Tracer" + depends on HAVE_FUNCTION_GRAPH_TRACER depends on FUNCTION_TRACER help - Enable the kernel to trace a function at its return. - It's first purpose is to trace the duration of functions. - This is done by setting the current return address on the thread - info structure of the current task. + Enable the kernel to trace a function at both its return + and its entry. + Its first purpose is to trace the duration of functions and + draw a call graph for each thread with some information like + the return value. + This is done by setting the current return address on the current + task structure into a stack of calls.
config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index cef4bcb4e822..08c5fe6ddc09 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -29,7 +29,7 @@ obj-$(CONFIG_NOP_TRACER) += trace_nop.o obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o obj-$(CONFIG_BOOT_TRACER) += trace_boot.o -obj-$(CONFIG_FUNCTION_RET_TRACER) += trace_functions_return.o +obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o obj-$(CONFIG_BTS_TRACER) += trace_bts.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 53042f118f23..9e19976af727 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -395,11 +395,11 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) unsigned long ip, fl; unsigned long ftrace_addr; -#ifdef CONFIG_FUNCTION_RET_TRACER +#ifdef CONFIG_FUNCTION_GRAPH_TRACER if (ftrace_tracing_type == FTRACE_TYPE_ENTER) ftrace_addr = (unsigned long)ftrace_caller; else - ftrace_addr = (unsigned long)ftrace_return_caller; + ftrace_addr = (unsigned long)ftrace_graph_caller; #else ftrace_addr = (unsigned long)ftrace_caller; #endif @@ -1496,13 +1496,13 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, return ret; } -#ifdef CONFIG_FUNCTION_RET_TRACER +#ifdef CONFIG_FUNCTION_GRAPH_TRACER static atomic_t ftrace_retfunc_active; /* The callback that hooks the return of a function */ -trace_function_return_t ftrace_function_return = - (trace_function_return_t)ftrace_stub; +trace_function_graph_t ftrace_graph_function = + (trace_function_graph_t)ftrace_stub; /* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */ @@ -1549,7 +1549,7 @@ free: } /* Allocate a return stack for each task */ -static int start_return_tracing(void) +static int start_graph_tracing(void) { struct ftrace_ret_stack **ret_stack_list; int ret; @@ -1569,7 +1569,7 @@ static int start_return_tracing(void) return ret; } -int register_ftrace_return(trace_function_return_t func) +int register_ftrace_graph(trace_function_graph_t func) { int ret = 0; @@ -1584,13 +1584,13 @@ int register_ftrace_return(trace_function_return_t func) goto out; } atomic_inc(&ftrace_retfunc_active); - ret = start_return_tracing(); + ret = start_graph_tracing(); if (ret) { atomic_dec(&ftrace_retfunc_active); goto out; } ftrace_tracing_type = FTRACE_TYPE_RETURN; - ftrace_function_return = func; + ftrace_graph_function = func; ftrace_startup(); out: @@ -1598,12 +1598,12 @@ out: return ret; } -void unregister_ftrace_return(void) +void unregister_ftrace_graph(void) { mutex_lock(&ftrace_sysctl_lock); atomic_dec(&ftrace_retfunc_active); - ftrace_function_return = (trace_function_return_t)ftrace_stub; + ftrace_graph_function = (trace_function_graph_t)ftrace_stub; ftrace_shutdown(); /* Restore normal tracing type */ ftrace_tracing_type = FTRACE_TYPE_ENTER; @@ -1612,7 +1612,7 @@ void unregister_ftrace_return(void) } /* Allocate a return stack for newly created task */ -void ftrace_retfunc_init_task(struct task_struct *t) +void ftrace_graph_init_task(struct task_struct *t) { if (atomic_read(&ftrace_retfunc_active)) { t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH @@ -1626,7 +1626,7 @@ void ftrace_retfunc_init_task(struct task_struct *t) t->ret_stack = NULL; } -void ftrace_retfunc_exit_task(struct task_struct *t) +void ftrace_graph_exit_task(struct task_struct *t) { struct ftrace_ret_stack *ret_stack = t->ret_stack; diff --git a/kernel/trace/trace.c 
b/kernel/trace/trace.c index 8df8fdd69c95..f21ab2c68fd4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -878,15 +878,15 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data, ring_buffer_unlock_commit(tr->buffer, event, irq_flags); } -#ifdef CONFIG_FUNCTION_RET_TRACER -static void __trace_function_return(struct trace_array *tr, +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static void __trace_function_graph(struct trace_array *tr, struct trace_array_cpu *data, - struct ftrace_retfunc *trace, + struct ftrace_graph_ret *trace, unsigned long flags, int pc) { struct ring_buffer_event *event; - struct ftrace_ret_entry *entry; + struct ftrace_graph_entry *entry; unsigned long irq_flags; if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) @@ -1177,8 +1177,8 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) local_irq_restore(flags); } -#ifdef CONFIG_FUNCTION_RET_TRACER -void trace_function_return(struct ftrace_retfunc *trace) +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +void trace_function_graph(struct ftrace_graph_ret *trace) { struct trace_array *tr = &global_trace; struct trace_array_cpu *data; @@ -1193,12 +1193,12 @@ void trace_function_return(struct ftrace_retfunc *trace) disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { pc = preempt_count(); - __trace_function_return(tr, data, trace, flags, pc); + __trace_function_graph(tr, data, trace, flags, pc); } atomic_dec(&data->disabled); raw_local_irq_restore(flags); } -#endif /* CONFIG_FUNCTION_RET_TRACER */ +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ static struct ftrace_ops trace_ops __read_mostly = { @@ -2001,7 +2001,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) break; } case TRACE_FN_RET: { - return print_return_function(iter); + return print_graph_function(iter); break; } case TRACE_BRANCH: { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3abd645e8af2..72b5ef868765 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -57,7 +57,7 @@ struct ftrace_entry { }; /* Function return entry */ -struct ftrace_ret_entry { +struct ftrace_graph_entry { struct trace_entry ent; unsigned long ip; unsigned long parent_ip; @@ -264,7 +264,7 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\ IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\ IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \ - IF_ASSIGN(var, ent, struct ftrace_ret_entry, TRACE_FN_RET);\ + IF_ASSIGN(var, ent, struct ftrace_graph_entry, TRACE_FN_RET);\ IF_ASSIGN(var, ent, struct bts_entry, TRACE_BTS);\ __ftrace_bad_type(); \ } while (0) @@ -398,7 +398,7 @@ void trace_function(struct trace_array *tr, unsigned long parent_ip, unsigned long flags, int pc); void -trace_function_return(struct ftrace_retfunc *trace); +trace_function_graph(struct ftrace_graph_ret *trace); void trace_bts(struct trace_array *tr, unsigned long from, @@ -489,11 +489,11 @@ extern int trace_vprintk(unsigned long ip, const char *fmt, va_list args); extern unsigned long trace_flags; /* Standard output formatting function used for function return traces */ -#ifdef CONFIG_FUNCTION_RET_TRACER -extern enum print_line_t print_return_function(struct trace_iterator *iter); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +extern enum print_line_t print_graph_function(struct trace_iterator *iter); #else static inline enum print_line_t -print_return_function(struct trace_iterator *iter) +print_graph_function(struct trace_iterator *iter) { return 
TRACE_TYPE_UNHANDLED; } diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c new file mode 100644 index 000000000000..f5bad4624d2b --- /dev/null +++ b/kernel/trace/trace_functions_graph.c @@ -0,0 +1,98 @@ +/* + * + * Function graph tracer. + * Copyright (c) 2008 Frederic Weisbecker + * Mostly borrowed from function tracer which + * is Copyright (c) Steven Rostedt + * + */ +#include +#include +#include +#include + +#include "trace.h" + + +#define TRACE_GRAPH_PRINT_OVERRUN 0x1 +static struct tracer_opt trace_opts[] = { + /* Display overruns or not */ + { TRACER_OPT(overrun, TRACE_GRAPH_PRINT_OVERRUN) }, + { } /* Empty entry */ +}; + +static struct tracer_flags tracer_flags = { + .val = 0, /* Don't display overruns by default */ + .opts = trace_opts +}; + + +static int graph_trace_init(struct trace_array *tr) +{ + int cpu; + for_each_online_cpu(cpu) + tracing_reset(tr, cpu); + + return register_ftrace_graph(&trace_function_graph); +} + +static void graph_trace_reset(struct trace_array *tr) +{ + unregister_ftrace_graph(); +} + + +enum print_line_t +print_graph_function(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + struct ftrace_graph_entry *field; + int ret; + + if (entry->type == TRACE_FN_RET) { + trace_assign_type(field, entry); + ret = trace_seq_printf(s, "%pF -> ", (void *)field->parent_ip); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + ret = seq_print_ip_sym(s, field->ip, + trace_flags & TRACE_ITER_SYM_MASK); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, " (%llu ns)", + field->rettime - field->calltime); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { + ret = trace_seq_printf(s, " (Overruns: %lu)", + field->overrun); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + ret = trace_seq_printf(s, "\n"); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; + } + return TRACE_TYPE_UNHANDLED; +} + +static struct tracer graph_trace __read_mostly = { + .name = "function-graph", + .init = graph_trace_init, + .reset = graph_trace_reset, + .print_line = print_graph_function, + .flags = &tracer_flags, +}; + +static __init int init_graph_trace(void) +{ + return register_tracer(&graph_trace); +} + +device_initcall(init_graph_trace); -- cgit v1.2.3 From 287b6e68ca7209caec40b2f44f837c580a413bae Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 26 Nov 2008 00:57:25 +0100 Subject: tracing/function-return-tracer: set a more human readable output Impact: feature This patch sets a C-like output for the function graph tracing. To this end, we now call two handlers for each function: one on entry and another on return. This way we can draw a well-ordered call stack. The pid of the previous trace is loosely stored and compared against the pid of the current trace to detect a context switch. Without this little feature, the call tree would appear broken in places. We could use the sched_tracer to capture these sched events, but this way of processing is much simpler. Two spaces were chosen for indentation so that deep call chains still fit on the screen. The execution time in nanoseconds is printed just after the closing brace; this makes it easier to find the corresponding function. If the time were printed in a leading column, it would be much harder to match it to a function called at a deep depth.
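To illustrate how the paired entry/return handlers and a depth counter produce this bracketed view, here is a minimal userspace simulation (illustrative only, not kernel code; every name in it is hypothetical):

#include <stdio.h>
#include <time.h>

static int depth;

static unsigned long long now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (unsigned long long)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

/* entry handler: print "func() {" indented by two spaces per level */
static unsigned long long graph_entry(const char *func)
{
	printf("%*s%s() {\n", 2 * depth++, "", func);
	return now_ns();
}

/* return handler: close the brace at the same level, then the duration */
static void graph_return(unsigned long long calltime)
{
	printf("%*s} %llu\n", 2 * --depth, "", now_ns() - calltime);
}

int main(void)
{
	unsigned long long t0 = graph_entry("sys_read");
	unsigned long long t1 = graph_entry("vfs_read");

	graph_return(t1);
	graph_return(t0);
	return 0;
}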
I plan to output the return value, but on a 32-bit CPU the return value can be 32 or 64 bits wide, and it is difficult to know which case applies. I don't know what the better solution on x86-32 would be: print only eax (the low part), or edx (the high part) as well. Actually it's the same problem when a function returns an 8-bit value: the high part of eax could contain junk values... Here is an example trace: sys_read() { fget_light() { } 526 vfs_read() { rw_verify_area() { security_file_permission() { cap_file_permission() { } 519 } 1564 } 2640 do_sync_read() { pipe_read() { __might_sleep() { } 511 pipe_wait() { prepare_to_wait() { } 760 deactivate_task() { dequeue_task() { dequeue_task_fair() { dequeue_entity() { update_curr() { update_min_vruntime() { } 504 } 1587 clear_buddies() { } 512 add_cfs_task_weight() { } 519 update_min_vruntime() { } 511 } 5602 dequeue_entity() { update_curr() { update_min_vruntime() { } 496 } 1631 clear_buddies() { } 496 update_min_vruntime() { } 527 } 4580 hrtick_update() { hrtick_start_fair() { } 488 } 1489 } 13700 } 14949 } 16016 msecs_to_jiffies() { } 496 put_prev_task_fair() { } 504 pick_next_task_fair() { } 489 pick_next_task_rt() { } 496 pick_next_task_fair() { } 489 pick_next_task_idle() { } 489 ------------8<---------- thread 4 ------------8<---------- finish_task_switch() { } 1203 do_softirq() { __do_softirq() { __local_bh_disable() { } 669 rcu_process_callbacks() { __rcu_process_callbacks() { cpu_quiet() { rcu_start_batch() { } 503 } 1647 } 3128 __rcu_process_callbacks() { } 542 } 5362 _local_bh_enable() { } 587 } 8880 } 9986 kthread_should_stop() { } 669 deactivate_task() { dequeue_task() { dequeue_task_fair() { dequeue_entity() { update_curr() { calc_delta_mine() { } 511 update_min_vruntime() { } 511 } 2813 Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 32 +++++++---- include/linux/ftrace.h | 25 ++++++-- kernel/trace/ftrace.c | 30 +++++----- kernel/trace/trace.c | 67 ++++++++++++++++++---- kernel/trace/trace.h | 28 +++++---- kernel/trace/trace_functions_graph.c | 104 ++++++++++++++++++++++++++-------- kernel/trace/trace_functions_return.c | 98 -------------------------------- 7 files changed, 208 insertions(+), 176 deletions(-) delete mode 100644 kernel/trace/trace_functions_return.c (limited to 'kernel') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 3595a4c14aba..26b2d92d48b3 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -347,7 +347,7 @@ void ftrace_nmi_exit(void) /* Add a function return address to the trace stack on thread info.*/ static int push_return_trace(unsigned long ret, unsigned long long time, - unsigned long func) + unsigned long func, int *depth) { int index; @@ -365,21 +365,22 @@ static int push_return_trace(unsigned long ret, unsigned long long time, current->ret_stack[index].ret = ret; current->ret_stack[index].func = func; current->ret_stack[index].calltime = time; + *depth = index; return 0; } /* Retrieve a function return address to the trace stack on thread info.*/ -static void pop_return_trace(unsigned long *ret, unsigned long long *time, - unsigned long *func, unsigned long *overrun) +static void pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) { int index; index = current->curr_ret_stack; *ret = current->ret_stack[index].ret; - *func = current->ret_stack[index].func; - *time = current->ret_stack[index].calltime; - *overrun = atomic_read(&current->trace_overrun); + trace->func =
current->ret_stack[index].func; + trace->calltime = current->ret_stack[index].calltime; + trace->overrun = atomic_read(&current->trace_overrun); + trace->depth = index; current->curr_ret_stack--; } @@ -390,12 +391,13 @@ static void pop_return_trace(unsigned long *ret, unsigned long long *time, unsigned long ftrace_return_to_handler(void) { struct ftrace_graph_ret trace; - pop_return_trace(&trace.ret, &trace.calltime, &trace.func, - &trace.overrun); + unsigned long ret; + + pop_return_trace(&trace, &ret); trace.rettime = cpu_clock(raw_smp_processor_id()); - ftrace_graph_function(&trace); + ftrace_graph_return(&trace); - return trace.ret; + return ret; } /* @@ -407,6 +409,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) unsigned long old; unsigned long long calltime; int faulted; + struct ftrace_graph_ent trace; unsigned long return_hooker = (unsigned long) &return_to_handler; @@ -452,8 +455,15 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) calltime = cpu_clock(raw_smp_processor_id()); - if (push_return_trace(old, calltime, self_addr) == -EBUSY) + if (push_return_trace(old, calltime, + self_addr, &trace.depth) == -EBUSY) { *parent = old; + return; + } + + trace.func = self_addr; + ftrace_graph_entry(&trace); + } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index b4ac734ad8d6..fc2d54987198 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -312,27 +312,40 @@ ftrace_init_module(struct module *mod, #endif +/* + * Structure that defines an entry function trace. + */ +struct ftrace_graph_ent { + unsigned long func; /* Current function */ + int depth; +}; + /* * Structure that defines a return function trace. */ struct ftrace_graph_ret { - unsigned long ret; /* Return address */ unsigned long func; /* Current function */ unsigned long long calltime; unsigned long long rettime; /* Number of functions that overran the depth limit for current task */ unsigned long overrun; + int depth; }; #ifdef CONFIG_FUNCTION_GRAPH_TRACER #define FTRACE_RETFUNC_DEPTH 50 #define FTRACE_RETSTACK_ALLOC_SIZE 32 -/* Type of a callback handler of tracing return function */ -typedef void (*trace_function_graph_t)(struct ftrace_graph_ret *); +/* Type of the callback handlers for tracing function graph */ +typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *); /* return */ +typedef void (*trace_func_graph_ent_t)(struct ftrace_graph_ent *); /* entry */ + +extern int register_ftrace_graph(trace_func_graph_ret_t retfunc, + trace_func_graph_ent_t entryfunc); + +/* The current handlers in use */ +extern trace_func_graph_ret_t ftrace_graph_return; +extern trace_func_graph_ent_t ftrace_graph_entry; -extern int register_ftrace_graph(trace_function_graph_t func); -/* The current handler in use */ -extern trace_function_graph_t ftrace_graph_function; extern void unregister_ftrace_graph(void); extern void ftrace_graph_init_task(struct task_struct *t); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9e19976af727..7e2d3b91692d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1498,12 +1498,13 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, #ifdef CONFIG_FUNCTION_GRAPH_TRACER -static atomic_t ftrace_retfunc_active; - -/* The callback that hooks the return of a function */ -trace_function_graph_t ftrace_graph_function = - (trace_function_graph_t)ftrace_stub; +static atomic_t ftrace_graph_active; +/* The callbacks that hook a function */
+trace_func_graph_ret_t ftrace_graph_return = + (trace_func_graph_ret_t)ftrace_stub; +trace_func_graph_ent_t ftrace_graph_entry = + (trace_func_graph_ent_t)ftrace_stub; /* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) @@ -1569,7 +1570,8 @@ static int start_graph_tracing(void) return ret; } -int register_ftrace_graph(trace_function_graph_t func) +int register_ftrace_graph(trace_func_graph_ret_t retfunc, + trace_func_graph_ent_t entryfunc) { int ret = 0; @@ -1583,14 +1585,15 @@ int register_ftrace_graph(trace_function_graph_t func) ret = -EBUSY; goto out; } - atomic_inc(&ftrace_retfunc_active); + atomic_inc(&ftrace_graph_active); ret = start_graph_tracing(); if (ret) { - atomic_dec(&ftrace_retfunc_active); + atomic_dec(&ftrace_graph_active); goto out; } ftrace_tracing_type = FTRACE_TYPE_RETURN; - ftrace_graph_function = func; + ftrace_graph_return = retfunc; + ftrace_graph_entry = entryfunc; ftrace_startup(); out: @@ -1602,8 +1605,9 @@ void unregister_ftrace_graph(void) { mutex_lock(&ftrace_sysctl_lock); - atomic_dec(&ftrace_retfunc_active); - ftrace_graph_function = (trace_function_graph_t)ftrace_stub; + atomic_dec(&ftrace_graph_active); + ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; + ftrace_graph_entry = (trace_func_graph_ent_t)ftrace_stub; ftrace_shutdown(); /* Restore normal tracing type */ ftrace_tracing_type = FTRACE_TYPE_ENTER; @@ -1614,7 +1618,7 @@ void unregister_ftrace_graph(void) /* Allocate a return stack for newly created task */ void ftrace_graph_init_task(struct task_struct *t) { - if (atomic_read(&ftrace_retfunc_active)) { + if (atomic_read(&ftrace_graph_active)) { t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH * sizeof(struct ftrace_ret_stack), GFP_KERNEL); @@ -1638,5 +1642,3 @@ void ftrace_graph_exit_task(struct task_struct *t) } #endif - - diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f21ab2c68fd4..9d5f7c94f251 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -879,14 +879,38 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data, } #ifdef CONFIG_FUNCTION_GRAPH_TRACER -static void __trace_function_graph(struct trace_array *tr, +static void __trace_graph_entry(struct trace_array *tr, + struct trace_array_cpu *data, + struct ftrace_graph_ent *trace, + unsigned long flags, + int pc) +{ + struct ring_buffer_event *event; + struct ftrace_graph_ent_entry *entry; + unsigned long irq_flags; + + if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) + return; + + event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry), + &irq_flags); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, flags, pc); + entry->ent.type = TRACE_GRAPH_ENT; + entry->graph_ent = *trace; + ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags); +} + +static void __trace_graph_return(struct trace_array *tr, struct trace_array_cpu *data, struct ftrace_graph_ret *trace, unsigned long flags, int pc) { struct ring_buffer_event *event; - struct ftrace_graph_entry *entry; + struct ftrace_graph_ret_entry *entry; unsigned long irq_flags; if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) @@ -898,12 +922,8 @@ static void __trace_function_graph(struct trace_array *tr, return; entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, flags, pc); - entry->ent.type = TRACE_FN_RET; - entry->ip = trace->func; - entry->parent_ip = trace->ret; 
- entry->rettime = trace->rettime; - entry->calltime = trace->calltime; - entry->overrun = trace->overrun; + entry->ent.type = TRACE_GRAPH_RET; + entry->ret = *trace; ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags); } #endif @@ -1178,7 +1198,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) } #ifdef CONFIG_FUNCTION_GRAPH_TRACER -void trace_function_graph(struct ftrace_graph_ret *trace) +void trace_graph_entry(struct ftrace_graph_ent *trace) { struct trace_array *tr = &global_trace; struct trace_array_cpu *data; @@ -1193,7 +1213,28 @@ void trace_function_graph(struct ftrace_graph_ret *trace) disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { pc = preempt_count(); - __trace_function_graph(tr, data, trace, flags, pc); + __trace_graph_entry(tr, data, trace, flags, pc); + } + atomic_dec(&data->disabled); + raw_local_irq_restore(flags); +} + +void trace_graph_return(struct ftrace_graph_ret *trace) +{ + struct trace_array *tr = &global_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + int pc; + + raw_local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + if (likely(disabled == 1)) { + pc = preempt_count(); + __trace_graph_return(tr, data, trace, flags, pc); } atomic_dec(&data->disabled); raw_local_irq_restore(flags); @@ -2000,9 +2041,11 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) trace_seq_print_cont(s, iter); break; } - case TRACE_FN_RET: { + case TRACE_GRAPH_RET: { + return print_graph_function(iter); + } + case TRACE_GRAPH_ENT: { return print_graph_function(iter); - break; } case TRACE_BRANCH: { struct trace_branch *field; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 72b5ef868765..ffe1bb1eb620 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -25,7 +25,8 @@ enum trace_type { TRACE_BRANCH, TRACE_BOOT_CALL, TRACE_BOOT_RET, - TRACE_FN_RET, + TRACE_GRAPH_RET, + TRACE_GRAPH_ENT, TRACE_USER_STACK, TRACE_BTS, @@ -56,14 +57,16 @@ struct ftrace_entry { unsigned long parent_ip; }; +/* Function call entry */ +struct ftrace_graph_ent_entry { + struct trace_entry ent; + struct ftrace_graph_ent graph_ent; +}; + /* Function return entry */ -struct ftrace_graph_entry { - struct trace_entry ent; - unsigned long ip; - unsigned long parent_ip; - unsigned long long calltime; - unsigned long long rettime; - unsigned long overrun; +struct ftrace_graph_ret_entry { + struct trace_entry ent; + struct ftrace_graph_ret ret; }; extern struct tracer boot_tracer; @@ -264,7 +267,10 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\ IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\ IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \ - IF_ASSIGN(var, ent, struct ftrace_graph_entry, TRACE_FN_RET);\ + IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \ + TRACE_GRAPH_ENT); \ + IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ + TRACE_GRAPH_RET); \ IF_ASSIGN(var, ent, struct bts_entry, TRACE_BTS);\ __ftrace_bad_type(); \ } while (0) @@ -397,9 +403,9 @@ void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc); -void -trace_function_graph(struct ftrace_graph_ret *trace); +void trace_graph_return(struct ftrace_graph_ret *trace); +void trace_graph_entry(struct ftrace_graph_ent *trace); void trace_bts(struct trace_array *tr, unsigned long from, unsigned long to); diff 
--git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index f5bad4624d2b..b6f0cc2a00cb 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -13,6 +13,7 @@ #include "trace.h" +#define TRACE_GRAPH_INDENT 2 #define TRACE_GRAPH_PRINT_OVERRUN 0x1 static struct tracer_opt trace_opts[] = { @@ -26,6 +27,8 @@ static struct tracer_flags tracer_flags = { .opts = trace_opts }; +/* pid on the last trace processed */ +static pid_t last_pid = -1; static int graph_trace_init(struct trace_array *tr) { @@ -33,7 +36,8 @@ static int graph_trace_init(struct trace_array *tr) for_each_online_cpu(cpu) tracing_reset(tr, cpu); - return register_ftrace_graph(&trace_function_graph); + return register_ftrace_graph(&trace_graph_return, + &trace_graph_entry); } static void graph_trace_reset(struct trace_array *tr) @@ -41,45 +45,97 @@ static void graph_trace_reset(struct trace_array *tr) unregister_ftrace_graph(); } +/* If the pid changed since the last trace, output this event */ +static int verif_pid(struct trace_seq *s, pid_t pid) +{ + if (last_pid != -1 && last_pid == pid) + return 1; -enum print_line_t -print_graph_function(struct trace_iterator *iter) + last_pid = pid; + return trace_seq_printf(s, "\n------------8<---------- thread %d" + " ------------8<----------\n\n", + pid); +} + +static enum print_line_t +print_graph_entry(struct ftrace_graph_ent *call, struct trace_seq *s, + struct trace_entry *ent) { - struct trace_seq *s = &iter->seq; - struct trace_entry *entry = iter->ent; - struct ftrace_graph_entry *field; + int i; int ret; - if (entry->type == TRACE_FN_RET) { - trace_assign_type(field, entry); - ret = trace_seq_printf(s, "%pF -> ", (void *)field->parent_ip); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + if (!verif_pid(s, ent->pid)) + return TRACE_TYPE_PARTIAL_LINE; - ret = seq_print_ip_sym(s, field->ip, - trace_flags & TRACE_ITER_SYM_MASK); + for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { + ret = trace_seq_printf(s, " "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; + } + + ret = seq_print_ip_sym(s, call->func, 0); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, "() {\n"); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t +print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, + struct trace_entry *ent) +{ + int i; + int ret; + + if (!verif_pid(s, ent->pid)) + return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, " (%llu ns)", - field->rettime - field->calltime); + for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { + ret = trace_seq_printf(s, " "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; + } + + ret = trace_seq_printf(s, "} "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; - if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { - ret = trace_seq_printf(s, " (Overruns: %lu)", - field->overrun); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + ret = trace_seq_printf(s, "%llu\n", trace->rettime - trace->calltime); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, "\n"); + if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { + ret = trace_seq_printf(s, " (Overruns: %lu)\n", + trace->overrun); if (!ret) return TRACE_TYPE_PARTIAL_LINE; + } + return TRACE_TYPE_HANDLED; +} + +enum print_line_t +print_graph_function(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; - return TRACE_TYPE_HANDLED; + switch (entry->type) { + case 
TRACE_GRAPH_ENT: { + struct ftrace_graph_ent_entry *field; + trace_assign_type(field, entry); + return print_graph_entry(&field->graph_ent, s, entry); + } + case TRACE_GRAPH_RET: { + struct ftrace_graph_ret_entry *field; + trace_assign_type(field, entry); + return print_graph_return(&field->ret, s, entry); + } + default: + return TRACE_TYPE_UNHANDLED; } - return TRACE_TYPE_UNHANDLED; } static struct tracer graph_trace __read_mostly = { diff --git a/kernel/trace/trace_functions_return.c b/kernel/trace/trace_functions_return.c deleted file mode 100644 index e00d64509c9c..000000000000 --- a/kernel/trace/trace_functions_return.c +++ /dev/null @@ -1,98 +0,0 @@ -/* - * - * Function return tracer. - * Copyright (c) 2008 Frederic Weisbecker - * Mostly borrowed from function tracer which - * is Copyright (c) Steven Rostedt - * - */ -#include -#include -#include -#include - -#include "trace.h" - - -#define TRACE_RETURN_PRINT_OVERRUN 0x1 -static struct tracer_opt trace_opts[] = { - /* Display overruns or not */ - { TRACER_OPT(overrun, TRACE_RETURN_PRINT_OVERRUN) }, - { } /* Empty entry */ -}; - -static struct tracer_flags tracer_flags = { - .val = 0, /* Don't display overruns by default */ - .opts = trace_opts -}; - - -static int return_trace_init(struct trace_array *tr) -{ - int cpu; - for_each_online_cpu(cpu) - tracing_reset(tr, cpu); - - return register_ftrace_return(&trace_function_return); -} - -static void return_trace_reset(struct trace_array *tr) -{ - unregister_ftrace_return(); -} - - -enum print_line_t -print_return_function(struct trace_iterator *iter) -{ - struct trace_seq *s = &iter->seq; - struct trace_entry *entry = iter->ent; - struct ftrace_ret_entry *field; - int ret; - - if (entry->type == TRACE_FN_RET) { - trace_assign_type(field, entry); - ret = trace_seq_printf(s, "%pF -> ", (void *)field->parent_ip); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = seq_print_ip_sym(s, field->ip, - trace_flags & TRACE_ITER_SYM_MASK); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_seq_printf(s, " (%llu ns)", - field->rettime - field->calltime); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - if (tracer_flags.val & TRACE_RETURN_PRINT_OVERRUN) { - ret = trace_seq_printf(s, " (Overruns: %lu)", - field->overrun); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - - ret = trace_seq_printf(s, "\n"); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; - } - return TRACE_TYPE_UNHANDLED; -} - -static struct tracer return_trace __read_mostly = { - .name = "return", - .init = return_trace_init, - .reset = return_trace_reset, - .print_line = print_return_function, - .flags = &tracer_flags, -}; - -static __init int init_return_trace(void) -{ - return register_tracer(&return_trace); -} - -device_initcall(init_return_trace); -- cgit v1.2.3 From df4fc31558dd2a3a30292ddb3a64c2a5befcec73 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 26 Nov 2008 00:16:23 -0500 Subject: ftrace: add function tracing to single thread Impact: feature to function trace a single thread This patch adds the ability to function trace a single thread. The file /debugfs/tracing/set_ftrace_pid contains the pid to trace. Any positive integer is a valid pid. Writing any negative number to this file disables pid tracing, and the function tracer goes back to tracing all threads. This feature works with both static and dynamic function tracing.
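The core mechanism is a thin filter wrapper installed in front of the trace function. Condensed from the ftrace_pid_func added in the diff below, with explanatory comments added:

static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip)
{
	/* only the selected thread reaches the real tracer */
	if (current->pid != ftrace_pid_trace)
		return;

	/* chain to the saved trace function */
	ftrace_pid_function(ip, parent_ip);
}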
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- Documentation/ftrace.txt | 79 ++++++++++++++++++ kernel/trace/ftrace.c | 209 +++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 262 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/Documentation/ftrace.txt b/Documentation/ftrace.txt index 35a78bc6651d..de05042f11b9 100644 --- a/Documentation/ftrace.txt +++ b/Documentation/ftrace.txt @@ -127,6 +127,8 @@ of ftrace. Here is a list of some of the key files: be traced. If a function exists in both set_ftrace_filter and set_ftrace_notrace, the function will _not_ be traced. + set_ftrace_pid: Have the function tracer only trace a single thread. + available_filter_functions: This lists the functions that ftrace has processed and can trace. These are the function names that you can pass to "set_ftrace_filter" or @@ -1073,6 +1075,83 @@ For simple one-time traces, the above is sufficient. For anything else, a search through /proc/mounts may be needed to find where the debugfs file-system is mounted. + +Single thread tracing +--------------------- + +By writing into /debug/tracing/set_ftrace_pid you can trace a +single thread. For example: + +# cat /debug/tracing/set_ftrace_pid +no pid +# echo 3111 > /debug/tracing/set_ftrace_pid +# cat /debug/tracing/set_ftrace_pid +3111 +# echo function > /debug/tracing/current_tracer +# cat /debug/tracing/trace | head + # tracer: function + # + # TASK-PID CPU# TIMESTAMP FUNCTION + # | | | | | + yum-updatesd-3111 [003] 1637.254676: finish_task_switch <-thread_return + yum-updatesd-3111 [003] 1637.254681: hrtimer_cancel <-schedule_hrtimeout_range + yum-updatesd-3111 [003] 1637.254682: hrtimer_try_to_cancel <-hrtimer_cancel + yum-updatesd-3111 [003] 1637.254683: lock_hrtimer_base <-hrtimer_try_to_cancel + yum-updatesd-3111 [003] 1637.254685: fget_light <-do_sys_poll + yum-updatesd-3111 [003] 1637.254686: pipe_poll <-do_sys_poll +# echo -1 > /debug/tracing/set_ftrace_pid +# cat /debug/tracing/trace |head + # tracer: function + # + # TASK-PID CPU# TIMESTAMP FUNCTION + # | | | | | + ##### CPU 3 buffer started #### + yum-updatesd-3111 [003] 1701.957688: free_poll_entry <-poll_freewait + yum-updatesd-3111 [003] 1701.957689: remove_wait_queue <-free_poll_entry + yum-updatesd-3111 [003] 1701.957691: fput <-free_poll_entry + yum-updatesd-3111 [003] 1701.957692: audit_syscall_exit <-sysret_audit + yum-updatesd-3111 [003] 1701.957693: path_put <-audit_syscall_exit + +If you want to trace a program as it executes, you could use +something like this simple program: + +#include +#include +#include +#include +#include +#include + +int main (int argc, char **argv) +{ + if (argc < 2) + exit(-1); + + if (fork() > 0) { + int fd, ffd; + char line[64]; + int s; + + ffd = open("/debug/tracing/current_tracer", O_WRONLY); + if (ffd < 0) + exit(-1); + write(ffd, "nop", 3); + + fd = open("/debug/tracing/set_ftrace_pid", O_WRONLY); + s = sprintf(line, "%d\n", getpid()); + write(fd, line, s); + + write(ffd, "function", 8); + + close(fd); + close(ffd); + + execvp(argv[1], argv+1); + } + + return 0; +} + dynamic ftrace -------------- diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7e2d3b91692d..00d98c65fad0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -47,6 +47,9 @@ int ftrace_enabled __read_mostly; static int last_ftrace_enabled; +/* ftrace_pid_trace >= 0 will only trace threads with this pid */ +static int ftrace_pid_trace = -1; + /* Quick disabling of function tracer.
*/ int function_trace_stop; @@ -61,6 +64,7 @@ static int ftrace_disabled __read_mostly; static DEFINE_SPINLOCK(ftrace_lock); static DEFINE_MUTEX(ftrace_sysctl_lock); +static DEFINE_MUTEX(ftrace_start_lock); static struct ftrace_ops ftrace_list_end __read_mostly = { @@ -70,6 +74,7 @@ static struct ftrace_ops ftrace_list_end __read_mostly = static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; +ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) { @@ -86,6 +91,21 @@ static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) }; } +static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip) +{ + if (current->pid != ftrace_pid_trace) + return; + + ftrace_pid_function(ip, parent_ip); +} + +static void set_ftrace_pid_function(ftrace_func_t func) +{ + /* do not set ftrace_pid_function to itself! */ + if (func != ftrace_pid_func) + ftrace_pid_function = func; +} + /** * clear_ftrace_function - reset the ftrace function * @@ -96,6 +116,7 @@ void clear_ftrace_function(void) { ftrace_trace_function = ftrace_stub; __ftrace_trace_function = ftrace_stub; + ftrace_pid_function = ftrace_stub; } #ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST @@ -128,20 +149,26 @@ static int __register_ftrace_function(struct ftrace_ops *ops) ftrace_list = ops; if (ftrace_enabled) { + ftrace_func_t func; + + if (ops->next == &ftrace_list_end) + func = ops->func; + else + func = ftrace_list_func; + + if (ftrace_pid_trace >= 0) { + set_ftrace_pid_function(func); + func = ftrace_pid_func; + } + /* * For one func, simply call it directly. * For more than one func, call the chain. 
*/ #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST - if (ops->next == &ftrace_list_end) - ftrace_trace_function = ops->func; - else - ftrace_trace_function = ftrace_list_func; + ftrace_trace_function = func; #else - if (ops->next == &ftrace_list_end) - __ftrace_trace_function = ops->func; - else - __ftrace_trace_function = ftrace_list_func; + __ftrace_trace_function = func; ftrace_trace_function = ftrace_test_stop_func; #endif } @@ -182,8 +209,19 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) if (ftrace_enabled) { /* If we only have one func left, then call that directly */ - if (ftrace_list->next == &ftrace_list_end) - ftrace_trace_function = ftrace_list->func; + if (ftrace_list->next == &ftrace_list_end) { + ftrace_func_t func = ftrace_list->func; + + if (ftrace_pid_trace >= 0) { + set_ftrace_pid_function(func); + func = ftrace_pid_func; + } +#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST + ftrace_trace_function = func; +#else + __ftrace_trace_function = func; +#endif + } } out: @@ -192,6 +230,38 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) return ret; } +static void ftrace_update_pid_func(void) +{ + ftrace_func_t func; + + /* should not be called from interrupt context */ + spin_lock(&ftrace_lock); + + if (ftrace_trace_function == ftrace_stub) + goto out; + + func = ftrace_trace_function; + + if (ftrace_pid_trace >= 0) { + set_ftrace_pid_function(func); + func = ftrace_pid_func; + } else { + if (func != ftrace_pid_func) + goto out; + + set_ftrace_pid_function(func); + } + +#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST + ftrace_trace_function = func; +#else + __ftrace_trace_function = func; +#endif + + out: + spin_unlock(&ftrace_lock); +} + #ifdef CONFIG_DYNAMIC_FTRACE #ifndef CONFIG_FTRACE_MCOUNT_RECORD # error Dynamic ftrace depends on MCOUNT_RECORD @@ -545,7 +615,19 @@ static void ftrace_run_update_code(int command) static ftrace_func_t saved_ftrace_func; static int ftrace_start_up; -static DEFINE_MUTEX(ftrace_start_lock); + +static void ftrace_startup_enable(int command) +{ + if (saved_ftrace_func != ftrace_trace_function) { + saved_ftrace_func = ftrace_trace_function; + command |= FTRACE_UPDATE_TRACE_FUNC; + } + + if (!command || !ftrace_enabled) + return; + + ftrace_run_update_code(command); +} static void ftrace_startup(void) { @@ -558,16 +640,8 @@ static void ftrace_startup(void) ftrace_start_up++; command |= FTRACE_ENABLE_CALLS; - if (saved_ftrace_func != ftrace_trace_function) { - saved_ftrace_func = ftrace_trace_function; - command |= FTRACE_UPDATE_TRACE_FUNC; - } - - if (!command || !ftrace_enabled) - goto out; + ftrace_startup_enable(command); - ftrace_run_update_code(command); - out: mutex_unlock(&ftrace_start_lock); } @@ -1262,13 +1336,10 @@ static struct file_operations ftrace_notrace_fops = { .release = ftrace_notrace_release, }; -static __init int ftrace_init_debugfs(void) +static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { - struct dentry *d_tracer; struct dentry *entry; - d_tracer = tracing_init_dentry(); - entry = debugfs_create_file("available_filter_functions", 0444, d_tracer, NULL, &ftrace_avail_fops); if (!entry) @@ -1295,8 +1366,6 @@ static __init int ftrace_init_debugfs(void) return 0; } -fs_initcall(ftrace_init_debugfs); - static int ftrace_convert_nops(struct module *mod, unsigned long *start, unsigned long *end) @@ -1382,12 +1451,100 @@ static int __init ftrace_nodyn_init(void) } device_initcall(ftrace_nodyn_init); +static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } 
+static inline void ftrace_startup_enable(int command) { } # define ftrace_startup() do { } while (0) # define ftrace_shutdown() do { } while (0) # define ftrace_startup_sysctl() do { } while (0) # define ftrace_shutdown_sysctl() do { } while (0) #endif /* CONFIG_DYNAMIC_FTRACE */ +static ssize_t +ftrace_pid_read(struct file *file, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + int r; + + if (ftrace_pid_trace >= 0) + r = sprintf(buf, "%u\n", ftrace_pid_trace); + else + r = sprintf(buf, "no pid\n"); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +ftrace_pid_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + long val; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = strict_strtol(buf, 10, &val); + if (ret < 0) + return ret; + + mutex_lock(&ftrace_start_lock); + if (val < 0) { + /* disable pid tracing */ + if (ftrace_pid_trace < 0) + goto out; + ftrace_pid_trace = -1; + + } else { + + if (ftrace_pid_trace == val) + goto out; + + ftrace_pid_trace = val; + } + + /* update the function call */ + ftrace_update_pid_func(); + ftrace_startup_enable(0); + + out: + mutex_unlock(&ftrace_start_lock); + + return cnt; +} + +static struct file_operations ftrace_pid_fops = { + .read = ftrace_pid_read, + .write = ftrace_pid_write, +}; + +static __init int ftrace_init_debugfs(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + if (!d_tracer) + return 0; + + ftrace_init_dyn_debugfs(d_tracer); + + entry = debugfs_create_file("set_ftrace_pid", 0644, d_tracer, + NULL, &ftrace_pid_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'set_ftrace_pid' entry\n"); + return 0; +} + +fs_initcall(ftrace_init_debugfs); + /** * ftrace_kill - kill ftrace * -- cgit v1.2.3 From 5a45cfe1c64862e8cd3b0d79d7c4ba71c3118915 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 26 Nov 2008 00:16:24 -0500 Subject: ftrace: use code patching for ftrace graph tracer Impact: more efficient code for ftrace graph tracer This patch uses dynamic patching, when available, to patch the function graph code into the kernel. It will ease the way toward letting both function tracing and function graph tracing run together.
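The patching itself boils down to rewriting the rel32 operand of a 5-byte near jmp. A minimal sketch of the offset arithmetic (MCOUNT_INSN_SIZE is 5 on x86; the two helpers are illustrative, not part of the patch):

#include <stdint.h>

#define MCOUNT_INSN_SIZE 5	/* 0xe9 opcode byte plus 32-bit offset */

/* A near jmp "e9 xx xx xx xx" transfers to ip + 5 + rel32, so the
 * operand for a given target is measured from the end of the insn. */
static int32_t jmp_rel32(unsigned long ip, unsigned long target)
{
	return (int32_t)(target - (ip + MCOUNT_INSN_SIZE));
}

static void encode_jmp(unsigned char code[MCOUNT_INSN_SIZE],
		       unsigned long ip, unsigned long target)
{
	code[0] = 0xe9;				/* near jmp opcode */
	*(int32_t *)(&code[1]) = jmp_rel32(ip, target);
}

This is the same old_offset/new_offset computation that ftrace_mod_jmp performs below before handing the bytes to do_ftrace_mod_code().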
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 5 +++++ arch/x86/kernel/ftrace.c | 48 ++++++++++++++++++++++++++++++++++++++++++++-- include/linux/ftrace.h | 5 +++++ kernel/trace/ftrace.c | 35 ++++++++++++++++----------------- 4 files changed, 72 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 7def9fd5c1e6..958af86186c4 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1174,6 +1174,11 @@ ftrace_call: popl %edx popl %ecx popl %eax +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +.globl ftrace_graph_call +ftrace_graph_call: + jmp ftrace_stub +#endif .globl ftrace_stub ftrace_stub: diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 26b2d92d48b3..7ef914e6a2f6 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -111,7 +111,6 @@ static void ftrace_mod_code(void) */ mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode, MCOUNT_INSN_SIZE); - } void ftrace_nmi_enter(void) @@ -325,7 +324,51 @@ int __init ftrace_dyn_arch_init(void *data) #ifdef CONFIG_FUNCTION_GRAPH_TRACER -#ifndef CONFIG_DYNAMIC_FTRACE +#ifdef CONFIG_DYNAMIC_FTRACE +extern void ftrace_graph_call(void); + +static int ftrace_mod_jmp(unsigned long ip, + int old_offset, int new_offset) +{ + unsigned char code[MCOUNT_INSN_SIZE]; + + if (probe_kernel_read(code, (void *)ip, MCOUNT_INSN_SIZE)) + return -EFAULT; + + if (code[0] != 0xe9 || old_offset != *(int *)(&code[1])) + return -EINVAL; + + *(int *)(&code[1]) = new_offset; + + if (do_ftrace_mod_code(ip, &code)) + return -EPERM; + + return 0; +} + +int ftrace_enable_ftrace_graph_caller(void) +{ + unsigned long ip = (unsigned long)(&ftrace_graph_call); + int old_offset, new_offset; + + old_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE); + new_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE); + + return ftrace_mod_jmp(ip, old_offset, new_offset); +} + +int ftrace_disable_ftrace_graph_caller(void) +{ + unsigned long ip = (unsigned long)(&ftrace_graph_call); + int old_offset, new_offset; + + old_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE); + new_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE); + + return ftrace_mod_jmp(ip, old_offset, new_offset); +} + +#else /* CONFIG_DYNAMIC_FTRACE */ /* * These functions are picked from those used on @@ -343,6 +386,7 @@ void ftrace_nmi_exit(void) { atomic_dec(&in_nmi); } + #endif /* !CONFIG_DYNAMIC_FTRACE */ /* Add a function return address to the trace stack on thread info.*/ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index fc2d54987198..f9792c0d73f6 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -117,6 +117,11 @@ extern void ftrace_call(void); extern void mcount_call(void); #ifdef CONFIG_FUNCTION_GRAPH_TRACER extern void ftrace_graph_caller(void); +extern int ftrace_enable_ftrace_graph_caller(void); +extern int ftrace_disable_ftrace_graph_caller(void); +#else +static inline int ftrace_enable_ftrace_graph_caller(void) { return 0; } +static inline int ftrace_disable_ftrace_graph_caller(void) { return 0; } #endif /** diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 00d98c65fad0..5f7c8642d58b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -281,6 +281,8 @@ enum { FTRACE_UPDATE_TRACE_FUNC = (1 << 2), FTRACE_ENABLE_MCOUNT = (1 << 3), FTRACE_DISABLE_MCOUNT = (1 << 4), + FTRACE_START_FUNC_RET = (1 << 5), + 
FTRACE_STOP_FUNC_RET = (1 << 6), }; static int ftrace_filtered; @@ -465,14 +467,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) unsigned long ip, fl; unsigned long ftrace_addr; -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - if (ftrace_tracing_type == FTRACE_TYPE_ENTER) - ftrace_addr = (unsigned long)ftrace_caller; - else - ftrace_addr = (unsigned long)ftrace_graph_caller; -#else ftrace_addr = (unsigned long)ftrace_caller; -#endif ip = rec->ip; @@ -605,6 +600,11 @@ static int __ftrace_modify_code(void *data) if (*command & FTRACE_UPDATE_TRACE_FUNC) ftrace_update_ftrace_func(ftrace_trace_function); + if (*command & FTRACE_START_FUNC_RET) + ftrace_enable_ftrace_graph_caller(); + else if (*command & FTRACE_STOP_FUNC_RET) + ftrace_disable_ftrace_graph_caller(); + return 0; } @@ -629,10 +629,8 @@ static void ftrace_startup_enable(int command) ftrace_run_update_code(command); } -static void ftrace_startup(void) +static void ftrace_startup(int command) { - int command = 0; - if (unlikely(ftrace_disabled)) return; @@ -645,10 +643,8 @@ static void ftrace_startup(void) mutex_unlock(&ftrace_start_lock); } -static void ftrace_shutdown(void) +static void ftrace_shutdown(int command) { - int command = 0; - if (unlikely(ftrace_disabled)) return; @@ -1453,8 +1449,9 @@ device_initcall(ftrace_nodyn_init); static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } static inline void ftrace_startup_enable(int command) { } -# define ftrace_startup() do { } while (0) -# define ftrace_shutdown() do { } while (0) +/* Keep as macros so we do not need to define the commands */ +# define ftrace_startup(command) do { } while (0) +# define ftrace_shutdown(command) do { } while (0) # define ftrace_startup_sysctl() do { } while (0) # define ftrace_shutdown_sysctl() do { } while (0) #endif /* CONFIG_DYNAMIC_FTRACE */ @@ -1585,7 +1582,7 @@ int register_ftrace_function(struct ftrace_ops *ops) } ret = __register_ftrace_function(ops); - ftrace_startup(); + ftrace_startup(0); out: mutex_unlock(&ftrace_sysctl_lock); @@ -1604,7 +1601,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops) mutex_lock(&ftrace_sysctl_lock); ret = __unregister_ftrace_function(ops); - ftrace_shutdown(); + ftrace_shutdown(0); mutex_unlock(&ftrace_sysctl_lock); return ret; @@ -1751,7 +1748,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, ftrace_tracing_type = FTRACE_TYPE_RETURN; ftrace_graph_return = retfunc; ftrace_graph_entry = entryfunc; - ftrace_startup(); + ftrace_startup(FTRACE_START_FUNC_RET); out: mutex_unlock(&ftrace_sysctl_lock); @@ -1765,7 +1762,7 @@ void unregister_ftrace_graph(void) atomic_dec(&ftrace_graph_active); ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = (trace_func_graph_ent_t)ftrace_stub; - ftrace_shutdown(); + ftrace_shutdown(FTRACE_STOP_FUNC_RET); /* Restore normal tracing type */ ftrace_tracing_type = FTRACE_TYPE_ENTER; -- cgit v1.2.3 From e53a6319cca69111c1643dc9f18f4465d7f1cbf0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 26 Nov 2008 00:16:25 -0500 Subject: ftrace: let function tracing and function return run together Impact: feature This patch enables function tracing and function return to run together. I've tested this by enabling the stack tracer and return tracer, where both the function entry and function return are used together with dynamic ftrace. 
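What makes this coexistence possible is that ftrace_startup()/ftrace_shutdown() now take a command bitmask, so each caller states exactly which patching steps it needs. A minimal standalone sketch of that dispatch pattern (the names mirror the kernel's, but this is an illustration, not kernel code):

    #include <stdio.h>

    enum {
            FTRACE_ENABLE_CALLS   = (1 << 0),
            FTRACE_START_FUNC_RET = (1 << 5),
            FTRACE_STOP_FUNC_RET  = (1 << 6),
    };

    /* Each bit requests one independent patching step. */
    static void modify_code(int command)
    {
            if (command & FTRACE_ENABLE_CALLS)
                    printf("patch mcount call sites\n");
            if (command & FTRACE_START_FUNC_RET)
                    printf("enable the graph caller jmp\n");
            else if (command & FTRACE_STOP_FUNC_RET)
                    printf("disable the graph caller jmp\n");
    }

    int main(void)
    {
            /* function tracing and graph tracing can now start independently: */
            modify_code(FTRACE_ENABLE_CALLS);
            modify_code(FTRACE_START_FUNC_RET);
            return 0;
    }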
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5f7c8642d58b..cbf8b09f63a5 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -53,9 +53,6 @@ static int ftrace_pid_trace = -1; /* Quick disabling of function tracer. */ int function_trace_stop; -/* By default, current tracing type is normal tracing. */ -enum ftrace_tracing_type_t ftrace_tracing_type = FTRACE_TYPE_ENTER; - /* * ftrace_disabled is set when an anomaly is discovered. * ftrace_disabled is much stronger than ftrace_enabled. @@ -1576,15 +1573,9 @@ int register_ftrace_function(struct ftrace_ops *ops) mutex_lock(&ftrace_sysctl_lock); - if (ftrace_tracing_type == FTRACE_TYPE_RETURN) { - ret = -EBUSY; - goto out; - } - ret = __register_ftrace_function(ops); ftrace_startup(0); -out: mutex_unlock(&ftrace_sysctl_lock); return ret; } @@ -1731,23 +1722,16 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, mutex_lock(&ftrace_sysctl_lock); - /* - * Don't launch return tracing if normal function - * tracing is already running. - */ - if (ftrace_trace_function != ftrace_stub) { - ret = -EBUSY; - goto out; - } atomic_inc(&ftrace_graph_active); ret = start_graph_tracing(); if (ret) { atomic_dec(&ftrace_graph_active); goto out; } - ftrace_tracing_type = FTRACE_TYPE_RETURN; + ftrace_graph_return = retfunc; ftrace_graph_entry = entryfunc; + ftrace_startup(FTRACE_START_FUNC_RET); out: @@ -1763,8 +1747,6 @@ void unregister_ftrace_graph(void) ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = (trace_func_graph_ent_t)ftrace_stub; ftrace_shutdown(FTRACE_STOP_FUNC_RET); - /* Restore normal tracing type */ - ftrace_tracing_type = FTRACE_TYPE_ENTER; mutex_unlock(&ftrace_sysctl_lock); } -- cgit v1.2.3 From 660c7f9be96321fc80026d76411bd15e6f418a72 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 26 Nov 2008 00:16:26 -0500 Subject: ftrace: add thread comm to function graph tracer Impact: enhancement to function graph tracer Export the trace_find_cmdline so the function graph tracer can use it to print the comms of the threads. 
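trace_find_cmdline() works because the tracer keeps a bounded pid-to-comm map, recorded as tasks schedule; a pid that is missing from the map prints as the "<...>" placeholder. A simplified standalone sketch of such a cache (illustrative only; the real structure and its update path differ):

    #include <stdio.h>
    #include <string.h>

    #define MAX_CMDLINES 128
    #define COMM_LEN 16

    static int cached_pid[MAX_CMDLINES];
    static char cached_comm[MAX_CMDLINES][COMM_LEN];

    /* Record the comm for a pid; collisions simply overwrite the slot. */
    static void save_cmdline(int pid, const char *comm)
    {
            int slot = pid % MAX_CMDLINES;

            cached_pid[slot] = pid;
            strncpy(cached_comm[slot], comm, COMM_LEN - 1);
            cached_comm[slot][COMM_LEN - 1] = '\0';
    }

    static const char *find_cmdline(int pid)
    {
            int slot = pid % MAX_CMDLINES;

            if (cached_pid[slot] == pid)
                    return cached_comm[slot];
            return "<...>"; /* same fallback the tracer prints */
    }

    int main(void)
    {
            save_cmdline(3899, "sshd");
            printf("thread %s-%d\n", find_cmdline(3899), 3899);
            return 0;
    }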
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 2 +- kernel/trace/trace.h | 1 + kernel/trace/trace_functions_graph.c | 21 ++++++++++++++++----- 3 files changed, 18 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9d5f7c94f251..5811e0a5f732 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -804,7 +804,7 @@ static void trace_save_cmdline(struct task_struct *tsk) spin_unlock(&trace_cmdline_lock); } -static char *trace_find_cmdline(int pid) +char *trace_find_cmdline(int pid) { char *cmdline = "<...>"; unsigned map; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index ffe1bb1eb620..7adacf349ef7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -450,6 +450,7 @@ struct tracer_switch_ops { struct tracer_switch_ops *next; }; +char *trace_find_cmdline(int pid); #endif /* CONFIG_CONTEXT_SWITCH_TRACER */ #ifdef CONFIG_DYNAMIC_FTRACE diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index b6f0cc2a00cb..bbb81e7b6c40 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -32,29 +32,40 @@ static pid_t last_pid = -1; static int graph_trace_init(struct trace_array *tr) { - int cpu; + int cpu, ret; + for_each_online_cpu(cpu) tracing_reset(tr, cpu); - return register_ftrace_graph(&trace_graph_return, + ret = register_ftrace_graph(&trace_graph_return, &trace_graph_entry); + if (ret) + return ret; + tracing_start_cmdline_record(); + + return 0; } static void graph_trace_reset(struct trace_array *tr) { - unregister_ftrace_graph(); + tracing_stop_cmdline_record(); + unregister_ftrace_graph(); } /* If the pid changed since the last trace, output this event */ static int verif_pid(struct trace_seq *s, pid_t pid) { + char *comm; + if (last_pid != -1 && last_pid == pid) return 1; last_pid = pid; - return trace_seq_printf(s, "\n------------8<---------- thread %d" + comm = trace_find_cmdline(pid); + + return trace_seq_printf(s, "\n------------8<---------- thread %s-%d" " ------------8<----------\n\n", - pid); + comm, pid); } static enum print_line_t -- cgit v1.2.3 From 437f24fb897d409a9978eb71ecfaf279dcd94acd Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 26 Nov 2008 00:16:27 -0500 Subject: ftrace: add cpu annotation for function graph tracer Impact: enhancement for function graph tracer When run on a SMP box, the function graph tracer is confusing because it shows the different CPUS as changes in the trace. This patch adds the annotation of 'CPU[###]' where ### is a three digit number. The output will look similar to this: CPU[001] dput() { CPU[000] } 726 CPU[001] } 487 CPU[000] do_softirq() { CPU[001] } 2221 CPU[000] __do_softirq() { CPU[000] __local_bh_disable() { CPU[001] unroll_tree_refs() { CPU[000] } 569 CPU[001] } 501 CPU[000] rcu_process_callbacks() { CPU[001] kfree() { What makes this nice is that now you can grep the file and produce readable format for a particular CPU. 
# cat /debug/tracing/trace > /tmp/trace # grep '^CPU\[000\]' /tmp/trace > /tmp/trace0 # grep '^CPU\[001\]' /tmp/trace > /tmp/trace1 Will give you: # head /tmp/trace0 CPU[000] ------------8<---------- thread sshd-3899 ------------8<---------- CPU[000] inotify_dentry_parent_queue_event() { CPU[000] } 2531 CPU[000] inotify_inode_queue_event() { CPU[000] } 505 CPU[000] } 69626 CPU[000] } 73089 CPU[000] audit_syscall_exit() { CPU[000] path_put() { CPU[000] dput() { # head /tmp/trace1 CPU[001] ------------8<---------- thread pcscd-3446 ------------8<---------- CPU[001] } 4186 CPU[001] dput() { CPU[001] } 543 CPU[001] vfs_permission() { CPU[001] inode_permission() { CPU[001] shmem_permission() { CPU[001] generic_permission() { CPU[001] } 501 CPU[001] } 2205 Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_functions_graph.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index bbb81e7b6c40..d31d695174aa 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -28,7 +28,7 @@ static struct tracer_flags tracer_flags = { }; /* pid on the last trace processed */ -static pid_t last_pid = -1; +static pid_t last_pid[NR_CPUS] = { [0 ... NR_CPUS-1] = -1 }; static int graph_trace_init(struct trace_array *tr) { @@ -53,29 +53,34 @@ static void graph_trace_reset(struct trace_array *tr) } /* If the pid changed since the last trace, output this event */ -static int verif_pid(struct trace_seq *s, pid_t pid) +static int verif_pid(struct trace_seq *s, pid_t pid, int cpu) { char *comm; - if (last_pid != -1 && last_pid == pid) + if (last_pid[cpu] != -1 && last_pid[cpu] == pid) return 1; - last_pid = pid; + last_pid[cpu] = pid; comm = trace_find_cmdline(pid); - return trace_seq_printf(s, "\n------------8<---------- thread %s-%d" + return trace_seq_printf(s, "\nCPU[%03d]" + " ------------8<---------- thread %s-%d" " ------------8<----------\n\n", - comm, pid); + cpu, comm, pid); } static enum print_line_t print_graph_entry(struct ftrace_graph_ent *call, struct trace_seq *s, - struct trace_entry *ent) + struct trace_entry *ent, int cpu) { int i; int ret; - if (!verif_pid(s, ent->pid)) + if (!verif_pid(s, ent->pid, cpu)) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, "CPU[%03d] ", cpu); + if (!ret) return TRACE_TYPE_PARTIAL_LINE; for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { @@ -96,12 +101,16 @@ print_graph_entry(struct ftrace_graph_ent *call, struct trace_seq *s, static enum print_line_t print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, - struct trace_entry *ent) + struct trace_entry *ent, int cpu) { int i; int ret; - if (!verif_pid(s, ent->pid)) + if (!verif_pid(s, ent->pid, cpu)) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, "CPU[%03d] ", cpu); + if (!ret) return TRACE_TYPE_PARTIAL_LINE; for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { @@ -137,12 +146,13 @@ print_graph_function(struct trace_iterator *iter) case TRACE_GRAPH_ENT: { struct ftrace_graph_ent_entry *field; trace_assign_type(field, entry); - return print_graph_entry(&field->graph_ent, s, entry); + return print_graph_entry(&field->graph_ent, s, entry, + iter->cpu); } case TRACE_GRAPH_RET: { struct ftrace_graph_ret_entry *field; trace_assign_type(field, entry); - return print_graph_return(&field->ret, s, entry); + return print_graph_return(&field->ret, s, entry, 
iter->cpu); } default: return TRACE_TYPE_UNHANDLED; -- cgit v1.2.3 From f3f47a6768a29448866da4422b6f6bee485c947f Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 23 Nov 2008 16:49:58 -0800 Subject: tracing: add "power-tracer": C/P state tracer to help power optimization Impact: new "power-tracer" ftrace plugin This patch adds a C/P-state ftrace plugin that will generate detailed statistics about the C/P-states that are being used, so that we can look at detailed decisions that the C/P-state code is making, rather than the too high level "average" that we have today. An example way of using this is: mount -t debugfs none /sys/kernel/debug echo cstate > /sys/kernel/debug/tracing/current_tracer echo 1 > /sys/kernel/debug/tracing/tracing_enabled sleep 1 echo 0 > /sys/kernel/debug/tracing/tracing_enabled cat /sys/kernel/debug/tracing/trace | perl scripts/trace/cstate.pl > out.svg Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 4 + arch/x86/kernel/process.c | 16 +++ include/linux/ftrace.h | 29 +++++ kernel/trace/Kconfig | 11 ++ kernel/trace/Makefile | 1 + kernel/trace/trace.h | 7 ++ kernel/trace/trace_power.c | 179 +++++++++++++++++++++++++++++ scripts/trace/power.pl | 108 +++++++++++++++++ 8 files changed, 355 insertions(+) create mode 100644 kernel/trace/trace_power.c create mode 100644 scripts/trace/power.pl (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 8e48c5d4467d..88ea02dcb622 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -391,6 +392,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, unsigned int next_perf_state = 0; /* Index into perf table */ unsigned int i; int result = 0; + struct power_trace it; dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu); @@ -427,6 +429,8 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, } } + trace_power_mark(&it, POWER_PSTATE, next_perf_state); + switch (data->cpu_feature) { case SYSTEM_INTEL_MSR_CAPABLE: cmd.type = SYSTEM_INTEL_MSR_CAPABLE; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index c622772744d8..c27af49a4ede 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -7,6 +7,7 @@ #include #include #include +#include #include unsigned long idle_halt; @@ -100,6 +101,9 @@ static inline int hlt_use_halt(void) void default_idle(void) { if (hlt_use_halt()) { + struct power_trace it; + + trace_power_start(&it, POWER_CSTATE, 1); current_thread_info()->status &= ~TS_POLLING; /* * TS_POLLING-cleared state must be visible before we @@ -112,6 +116,7 @@ void default_idle(void) else local_irq_enable(); current_thread_info()->status |= TS_POLLING; + trace_power_end(&it); } else { local_irq_enable(); /* loop is done by the caller */ @@ -154,24 +159,31 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); */ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { + struct power_trace it; + + trace_power_start(&it, POWER_CSTATE, (ax>>4)+1); if (!need_resched()) { __monitor((void *)&current_thread_info()->flags, 0, 0); smp_mb(); if (!need_resched()) __mwait(ax, cx); } + trace_power_end(&it); } /* Default MONITOR/MWAIT with no hints, used for default C1 state */ static void mwait_idle(void) { + struct power_trace it; if (!need_resched()) { + trace_power_start(&it, POWER_CSTATE, 1); __monitor((void
*)&current_thread_info()->flags, 0, 0); smp_mb(); if (!need_resched()) __sti_mwait(0, 0); else local_irq_enable(); + trace_power_end(&it); } else local_irq_enable(); } @@ -183,9 +195,13 @@ static void mwait_idle(void) */ static void poll_idle(void) { + struct power_trace it; + + trace_power_start(&it, POWER_CSTATE, 0); local_irq_enable(); while (!need_resched()) cpu_relax(); + trace_power_end(&it); } /* diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 7854d87b97b2..0df288666201 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -311,6 +311,35 @@ ftrace_init_module(struct module *mod, unsigned long *start, unsigned long *end) { } #endif +enum { + POWER_NONE = 0, + POWER_CSTATE = 1, + POWER_PSTATE = 2, +}; + +struct power_trace { +#ifdef CONFIG_POWER_TRACER + ktime_t stamp; + ktime_t end; + int type; + int state; +#endif +}; + +#ifdef CONFIG_POWER_TRACER +extern void trace_power_start(struct power_trace *it, unsigned int type, + unsigned int state); +extern void trace_power_mark(struct power_trace *it, unsigned int type, + unsigned int state); +extern void trace_power_end(struct power_trace *it); +#else +static inline void trace_power_start(struct power_trace *it, unsigned int type, + unsigned int state) { } +static inline void trace_power_mark(struct power_trace *it, unsigned int type, + unsigned int state) { } +static inline void trace_power_end(struct power_trace *it) { } +#endif + /* * Structure that defines a return function trace. diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 620feadff67a..d151aab48ed6 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -217,6 +217,17 @@ config BRANCH_TRACER Say N if unsure. +config POWER_TRACER + bool "Trace power consumption behavior" + depends on DEBUG_KERNEL + depends on X86 + select TRACING + help + This tracer helps developers to analyze and optimize the kernels + power management decisions, specifically the C-state and P-state + behavior. + + config STACK_TRACER bool "Trace max stack" depends on HAVE_FUNCTION_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index cef4bcb4e822..acaa06553eca 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -32,5 +32,6 @@ obj-$(CONFIG_BOOT_TRACER) += trace_boot.o obj-$(CONFIG_FUNCTION_RET_TRACER) += trace_functions_return.o obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o obj-$(CONFIG_BTS_TRACER) += trace_bts.o +obj-$(CONFIG_POWER_TRACER) += trace_power.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3abd645e8af2..4c453778a6ab 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -28,6 +28,7 @@ enum trace_type { TRACE_FN_RET, TRACE_USER_STACK, TRACE_BTS, + TRACE_POWER, __TRACE_LAST_TYPE }; @@ -160,6 +161,11 @@ struct bts_entry { unsigned long to; }; +struct trace_power { + struct trace_entry ent; + struct power_trace state_data; +}; + /* * trace_flag_type is an enumeration that holds different * states when a trace occurs.
These are: @@ -266,6 +272,7 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \ IF_ASSIGN(var, ent, struct ftrace_ret_entry, TRACE_FN_RET);\ IF_ASSIGN(var, ent, struct bts_entry, TRACE_BTS);\ + IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \ __ftrace_bad_type(); \ } while (0) diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c new file mode 100644 index 000000000000..a7172a352f62 --- /dev/null +++ b/kernel/trace/trace_power.c @@ -0,0 +1,179 @@ +/* + * ring buffer based C-state tracer + * + * Arjan van de Ven + * Copyright (C) 2008 Intel Corporation + * + * Much is borrowed from trace_boot.c which is + * Copyright (C) 2008 Frederic Weisbecker + * + */ + +#include +#include +#include +#include +#include + +#include "trace.h" + +static struct trace_array *power_trace; +static int __read_mostly trace_power_enabled; + + +static void start_power_trace(struct trace_array *tr) +{ + trace_power_enabled = 1; +} + +static void stop_power_trace(struct trace_array *tr) +{ + trace_power_enabled = 0; +} + + +static int power_trace_init(struct trace_array *tr) +{ + int cpu; + power_trace = tr; + + trace_power_enabled = 1; + + for_each_cpu_mask(cpu, cpu_possible_map) + tracing_reset(tr, cpu); + return 0; +} + +static enum print_line_t power_print_line(struct trace_iterator *iter) +{ + int ret = 0; + struct trace_entry *entry = iter->ent; + struct trace_power *field ; + struct power_trace *it; + struct trace_seq *s = &iter->seq; + struct timespec stamp; + struct timespec duration; + + trace_assign_type(field, entry); + it = &field->state_data; + stamp = ktime_to_timespec(it->stamp); + duration = ktime_to_timespec(ktime_sub(it->end, it->stamp)); + + if (entry->type == TRACE_POWER) { + if (it->type == POWER_CSTATE) + ret = trace_seq_printf(s, "[%5ld.%09ld] CSTATE: Going to C%i on cpu %i for %ld.%09ld\n", + stamp.tv_sec, + stamp.tv_nsec, + it->state, iter->cpu, + duration.tv_sec, + duration.tv_nsec); + if (it->type == POWER_PSTATE) + ret = trace_seq_printf(s, "[%5ld.%09ld] PSTATE: Going to P%i on cpu %i\n", + stamp.tv_sec, + stamp.tv_nsec, + it->state, iter->cpu); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; + } + return TRACE_TYPE_UNHANDLED; +} + +static struct tracer power_tracer __read_mostly = +{ + .name = "power", + .init = power_trace_init, + .start = start_power_trace, + .stop = stop_power_trace, + .reset = stop_power_trace, + .print_line = power_print_line, +}; + +static int init_power_trace(void) +{ + return register_tracer(&power_tracer); +} +device_initcall(init_power_trace); + +void trace_power_start(struct power_trace *it, unsigned int type, + unsigned int level) +{ + if (!trace_power_enabled) + return; + + memset(it, 0, sizeof(struct power_trace)); + it->state = level; + it->type = type; + it->stamp = ktime_get(); +} +EXPORT_SYMBOL_GPL(trace_power_start); + + +void trace_power_end(struct power_trace *it) +{ + struct ring_buffer_event *event; + struct trace_power *entry; + struct trace_array_cpu *data; + unsigned long irq_flags; + struct trace_array *tr = power_trace; + + if (!trace_power_enabled) + return; + + preempt_disable(); + it->end = ktime_get(); + data = tr->data[smp_processor_id()]; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + goto out; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, 0); + entry->ent.type = TRACE_POWER; + entry->state_data = *it; + ring_buffer_unlock_commit(tr->buffer, event, 
irq_flags); + + trace_wake_up(); + + out: + preempt_enable(); +} +EXPORT_SYMBOL_GPL(trace_power_end); + +void trace_power_mark(struct power_trace *it, unsigned int type, + unsigned int level) +{ + struct ring_buffer_event *event; + struct trace_power *entry; + struct trace_array_cpu *data; + unsigned long irq_flags; + struct trace_array *tr = power_trace; + + if (!trace_power_enabled) + return; + + memset(it, 0, sizeof(struct power_trace)); + it->state = level; + it->type = type; + it->stamp = ktime_get(); + preempt_disable(); + it->end = it->stamp; + data = tr->data[smp_processor_id()]; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + goto out; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, 0); + entry->ent.type = TRACE_POWER; + entry->state_data = *it; + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + + trace_wake_up(); + + out: + preempt_enable(); +} +EXPORT_SYMBOL_GPL(trace_power_mark); diff --git a/scripts/trace/power.pl b/scripts/trace/power.pl new file mode 100644 index 000000000000..4f729b3501e0 --- /dev/null +++ b/scripts/trace/power.pl @@ -0,0 +1,108 @@ +#!/usr/bin/perl + +# Copyright 2008, Intel Corporation +# +# This file is part of the Linux kernel +# +# This program file is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; version 2 of the License. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program in a file named COPYING; if not, write to the +# Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, +# Boston, MA 02110-1301 USA +# +# Authors: +# Arjan van de Ven + + +# +# This script turns a cstate ftrace output into a SVG graphic that shows +# historic C-state information +# +# +# cat /sys/kernel/debug/tracing/trace | perl power.pl > out.svg +# + +my @styles; +my $base = 0; + +my @pstate_last; +my @pstate_level; + +$styles[0] = "fill:rgb(0,0,255);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; +$styles[1] = "fill:rgb(0,255,0);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; +$styles[2] = "fill:rgb(255,0,20);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; +$styles[3] = "fill:rgb(255,255,20);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; +$styles[4] = "fill:rgb(255,0,255);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; +$styles[5] = "fill:rgb(0,255,255);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; +$styles[6] = "fill:rgb(0,128,255);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; +$styles[7] = "fill:rgb(0,255,128);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; +$styles[8] = "fill:rgb(0,25,20);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; + + +print " \n"; +print "\n"; + +my $scale = 30000.0; +while (<>) { + my $line = $_; + if ($line =~ /([0-9\.]+)\] CSTATE: Going to C([0-9]) on cpu ([0-9]+) for ([0-9\.]+)/) { + if ($base == 0) { + $base = $1; + } + my $time = $1 - $base; + $time = $time * $scale; + my $C = $2; + my $cpu = $3; + my $y = 400 * $cpu; + my $duration = $4 * $scale; + my $msec = int($4 * 100000)/100.0; + my $height = $C * 20; + $style = $styles[$C]; + + $y = $y + 140 - $height; + + $x2 = $time + 4; + $y2 = $y + 4; + + 
+ print "\n"; + print "C$C $msec\n"; + } + if ($line =~ /([0-9\.]+)\] PSTATE: Going to P([0-9]) on cpu ([0-9]+)/) { + my $time = $1 - $base; + my $state = $2; + my $cpu = $3; + + if (defined($pstate_last[$cpu])) { + my $from = $pstate_last[$cpu]; + my $oldstate = $pstate_state[$cpu]; + my $duration = ($time-$from) * $scale; + + $from = $from * $scale; + my $to = $from + $duration; + my $height = 140 - ($oldstate * (140/8)); + + my $y = 400 * $cpu + 200 + $height; + my $y2 = $y+4; + my $style = $styles[8]; + + print "\n"; + print "P$oldstate (cpu $cpu)\n"; + }; + + $pstate_last[$cpu] = $time; + $pstate_state[$cpu] = $state; + } +} + + +print "\n"; -- cgit v1.2.3 From 83a8df618eb04bd2819a758f3b409b1449862434 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 27 Nov 2008 01:46:33 +0100 Subject: tracing/function-graph-tracer: enhancements for the trace output Impact: enhance the output of the graph-tracer This patch applies some ideas of Ingo Molnar and Steven Rostedt. * Output leaf functions in one line with parenthesis, semicolon and duration output. * Add a second column (after cpu) for an overhead sign. if duration > 100 us, "!" if duration > 10 us, "+" else " " * Print output in us with remaining nanosec: u.n * Print duration on the right end, following the indentation of the functions. Use also visual clues: "-" on entry call (no duration to output) and "+" on return (duration output). The name of the tracer has been fixed as well: function-branch becomes function_branch. Here is an example of the new output: CPU[000] dequeue_entity() { - CPU[000] update_curr() { - CPU[000] update_min_vruntime(); + 0.512 us CPU[000] } + 1.504 us CPU[000] clear_buddies(); + 0.481 us CPU[000] update_min_vruntime(); + 0.504 us CPU[000] } + 4.557 us CPU[000] hrtick_update() { - CPU[000] hrtick_start_fair(); + 0.489 us CPU[000] } + 1.443 us CPU[000] + } + 14.655 us CPU[000] + } + 15.678 us CPU[000] + } + 16.686 us CPU[000] msecs_to_jiffies(); + 0.481 us CPU[000] put_prev_task_fair(); + 0.504 us CPU[000] pick_next_task_fair(); + 0.482 us CPU[000] pick_next_task_rt(); + 0.504 us CPU[000] pick_next_task_fair(); + 0.481 us CPU[000] pick_next_task_idle(); + 0.489 us CPU[000] _spin_trylock(); + 0.655 us CPU[000] _spin_unlock(); + 0.609 us CPU[000] ------------8<---------- thread bash-2794 ------------8<---------- CPU[000] finish_task_switch() { - CPU[000] _spin_unlock_irq(); + 0.722 us CPU[000] } + 2.369 us CPU[000] ! } + 501972.605 us CPU[000] ! 
} + 501973.763 us CPU[000] copy_from_read_buf() { - CPU[000] _spin_lock_irqsave(); + 0.670 us CPU[000] _spin_unlock_irqrestore(); + 0.699 us CPU[000] copy_to_user() { - CPU[000] might_fault() { - CPU[000] __might_sleep(); + 0.503 us CPU[000] } + 1.632 us CPU[000] __copy_to_user_ll(); + 0.542 us CPU[000] } + 3.858 us CPU[000] tty_audit_add_data() { - CPU[000] _spin_lock_irq(); + 0.609 us CPU[000] _spin_unlock_irq(); + 0.624 us CPU[000] } + 3.196 us CPU[000] _spin_lock_irqsave(); + 0.624 us CPU[000] _spin_unlock_irqrestore(); + 0.625 us CPU[000] + } + 13.611 us CPU[000] copy_from_read_buf() { - CPU[000] _spin_lock_irqsave(); + 0.624 us CPU[000] _spin_unlock_irqrestore(); + 0.616 us CPU[000] } + 2.820 us CPU[000] Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace_functions_graph.c | 168 +++++++++++++++++++++++++++++++++-- 1 file changed, 159 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index d31d695174aa..b958d60377b3 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -14,6 +14,10 @@ #include "trace.h" #define TRACE_GRAPH_INDENT 2 +/* Spaces between function call and time duration */ +#define TRACE_GRAPH_TIMESPACE_ENTRY " " +/* Spaces between function call and closing braces */ +#define TRACE_GRAPH_TIMESPACE_RET " " #define TRACE_GRAPH_PRINT_OVERRUN 0x1 static struct tracer_opt trace_opts[] = { @@ -63,26 +67,130 @@ static int verif_pid(struct trace_seq *s, pid_t pid, int cpu) last_pid[cpu] = pid; comm = trace_find_cmdline(pid); - return trace_seq_printf(s, "\nCPU[%03d]" + return trace_seq_printf(s, "\nCPU[%03d] " " ------------8<---------- thread %s-%d" " ------------8<----------\n\n", cpu, comm, pid); } +static bool +trace_branch_is_leaf(struct trace_iterator *iter, + struct ftrace_graph_ent_entry *curr) +{ + struct ring_buffer_iter *ring_iter; + struct ring_buffer_event *event; + struct ftrace_graph_ret_entry *next; + + ring_iter = iter->buffer_iter[iter->cpu]; + + if (!ring_iter) + return false; + + event = ring_buffer_iter_peek(iter->buffer_iter[iter->cpu], NULL); + + if (!event) + return false; + + next = ring_buffer_event_data(event); + + if (next->ent.type != TRACE_GRAPH_RET) + return false; + + if (curr->ent.pid != next->ent.pid || + curr->graph_ent.func != next->ret.func) + return false; + + return true; +} + + +static inline int +print_graph_duration(unsigned long long duration, struct trace_seq *s) +{ + unsigned long nsecs_rem = do_div(duration, 1000); + return trace_seq_printf(s, "+ %llu.%lu us\n", duration, nsecs_rem); +} + +/* Signal a overhead of time execution to the output */ +static int +print_graph_overhead(unsigned long long duration, struct trace_seq *s) +{ + /* Duration exceeded 100 msecs */ + if (duration > 100000ULL) + return trace_seq_printf(s, "! 
"); + + /* Duration exceeded 10 msecs */ + if (duration > 10000ULL) + return trace_seq_printf(s, "+ "); + + return trace_seq_printf(s, " "); +} + +/* Case of a leaf function on its call entry */ static enum print_line_t -print_graph_entry(struct ftrace_graph_ent *call, struct trace_seq *s, - struct trace_entry *ent, int cpu) +print_graph_entry_leaf(struct trace_iterator *iter, + struct ftrace_graph_ent_entry *entry, struct trace_seq *s) { + struct ftrace_graph_ret_entry *ret_entry; + struct ftrace_graph_ret *graph_ret; + struct ring_buffer_event *event; + struct ftrace_graph_ent *call; + unsigned long long duration; int i; int ret; - if (!verif_pid(s, ent->pid, cpu)) + event = ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); + ret_entry = ring_buffer_event_data(event); + graph_ret = &ret_entry->ret; + call = &entry->graph_ent; + duration = graph_ret->rettime - graph_ret->calltime; + + /* Overhead */ + ret = print_graph_overhead(duration, s); + if (!ret) return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, "CPU[%03d] ", cpu); + /* Function */ + for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + ret = seq_print_ip_sym(s, call->func, 0); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, "();"); if (!ret) return TRACE_TYPE_PARTIAL_LINE; + /* Duration */ + ret = trace_seq_printf(s, TRACE_GRAPH_TIMESPACE_ENTRY); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + ret = print_graph_duration(duration, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t +print_graph_entry_nested(struct ftrace_graph_ent_entry *entry, + struct trace_seq *s) +{ + int i; + int ret; + struct ftrace_graph_ent *call = &entry->graph_ent; + + /* No overhead */ + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Function */ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { ret = trace_seq_printf(s, " "); if (!ret) @@ -93,26 +201,62 @@ print_graph_entry(struct ftrace_graph_ent *call, struct trace_seq *s, if (!ret) return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, "() {\n"); + ret = trace_seq_printf(s, "() {"); if (!ret) return TRACE_TYPE_PARTIAL_LINE; + + /* No duration to print at this state */ + ret = trace_seq_printf(s, TRACE_GRAPH_TIMESPACE_ENTRY "-\n"); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; } +static enum print_line_t +print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, + struct trace_iterator *iter, int cpu) +{ + int ret; + struct trace_entry *ent = iter->ent; + + if (!verif_pid(s, ent->pid, cpu)) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, "CPU[%03d] ", cpu); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + if (trace_branch_is_leaf(iter, field)) + return print_graph_entry_leaf(iter, field, s); + else + return print_graph_entry_nested(field, s); + +} + static enum print_line_t print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, struct trace_entry *ent, int cpu) { int i; int ret; + unsigned long long duration = trace->rettime - trace->calltime; + /* Pid */ if (!verif_pid(s, ent->pid, cpu)) return TRACE_TYPE_PARTIAL_LINE; + /* Cpu */ ret = trace_seq_printf(s, "CPU[%03d] ", cpu); if (!ret) return TRACE_TYPE_PARTIAL_LINE; + /* Overhead */ + ret = print_graph_overhead(duration, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Closing brace */ for (i = 0; i < trace->depth * 
TRACE_GRAPH_INDENT; i++) { ret = trace_seq_printf(s, " "); if (!ret) @@ -123,10 +267,16 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, if (!ret) return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, "%llu\n", trace->rettime - trace->calltime); + /* Duration */ + ret = trace_seq_printf(s, TRACE_GRAPH_TIMESPACE_RET); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + ret = print_graph_duration(duration, s); if (!ret) return TRACE_TYPE_PARTIAL_LINE; + /* Overrun */ if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { ret = trace_seq_printf(s, " (Overruns: %lu)\n", trace->overrun); @@ -146,7 +296,7 @@ print_graph_function(struct trace_iterator *iter) case TRACE_GRAPH_ENT: { struct ftrace_graph_ent_entry *field; trace_assign_type(field, entry); - return print_graph_entry(&field->graph_ent, s, entry, + return print_graph_entry(field, s, iter, iter->cpu); } case TRACE_GRAPH_RET: { @@ -160,7 +310,7 @@ print_graph_function(struct trace_iterator *iter) } static struct tracer graph_trace __read_mostly = { - .name = "function-graph", + .name = "function_graph", .init = graph_trace_init, .reset = graph_trace_reset, .print_line = print_graph_function, -- cgit v1.2.3 From 1a056155edd458eb93ef383fa8e5741d7e7c6360 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 28 Nov 2008 00:42:46 +0100 Subject: tracing/function-graph-tracer: adjustments of the trace informations Impact: increase the visual qualities of the call-graph-tracer output This patch applies various trace output formatting changes: - CPU is now a decimal number, followed by a parenthesis. - Overhead is now on the second column (gives a good visibility) - Cost is now on the third column, can't exceed 9999.99 us. It is followed by a virtual line based on a "|" character. - Functions calls are now the last column on the right. This way, we haven't dynamic column (which flow is harder to follow) on its right. - CPU and Overhead have their own option flag. They are default-on but you can disable them easily: echo nofuncgraph-cpu > trace_options echo nofuncgraph-overhead > trace_options TODO: _ Refactoring of the thread switch output. _ Give a default-off option to output the thread and its pid on each row. _ Provide headers _ .... Here is an example of the new trace style: 0) | mutex_unlock() { 0) 0.639 us | __mutex_unlock_slowpath(); 0) 1.607 us | } 0) | remove_wait_queue() { 0) 0.616 us | _spin_lock_irqsave(); 0) 0.616 us | _spin_unlock_irqrestore(); 0) 2.779 us | } 0) 0.495 us | n_tty_set_room(); 0) ! 9999.999 us | } 0) | tty_ldisc_deref() { 0) 0.615 us | _spin_lock_irqsave(); 0) 0.616 us | _spin_unlock_irqrestore(); 0) 2.793 us | } 0) | current_fs_time() { 0) 0.488 us | current_kernel_time(); 0) 0.495 us | timespec_trunc(); 0) 2.486 us | } 0) ! 9999.999 us | } 0) ! 9999.999 us | } 0) ! 9999.999 us | } 0) | sys_read() { 0) 0.796 us | fget_light(); 0) | vfs_read() { 0) | rw_verify_area() { 0) | security_file_permission() { 0) 0.488 us | cap_file_permission(); 0) 1.720 us | } 0) 3. 
4 us | } 0) | tty_read() { 0) 0.488 us | tty_paranoia_check(); 0) | tty_ldisc_ref_wait() { 0) | tty_ldisc_try() { 0) 0.615 us | _spin_lock_irqsave(); 0) 0.615 us | _spin_unlock_irqrestore(); 0) 5.436 us | } 0) 6.427 us | } Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_functions_graph.c | 142 +++++++++++++++++++++++------------ 1 file changed, 93 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index b958d60377b3..596a3ee43053 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -14,20 +14,25 @@ #include "trace.h" #define TRACE_GRAPH_INDENT 2 -/* Spaces between function call and time duration */ -#define TRACE_GRAPH_TIMESPACE_ENTRY " " -/* Spaces between function call and closing braces */ -#define TRACE_GRAPH_TIMESPACE_RET " " +/* Flag options */ #define TRACE_GRAPH_PRINT_OVERRUN 0x1 +#define TRACE_GRAPH_PRINT_CPU 0x2 +#define TRACE_GRAPH_PRINT_OVERHEAD 0x4 + static struct tracer_opt trace_opts[] = { - /* Display overruns or not */ - { TRACER_OPT(overrun, TRACE_GRAPH_PRINT_OVERRUN) }, + /* Display overruns ? */ + { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) }, + /* Display CPU ? */ + { TRACER_OPT(funcgraph-cpu, TRACE_GRAPH_PRINT_CPU) }, + /* Display Overhead ? */ + { TRACER_OPT(funcgraph-overhead, TRACE_GRAPH_PRINT_OVERHEAD) }, { } /* Empty entry */ }; static struct tracer_flags tracer_flags = { - .val = 0, /* Don't display overruns by default */ + /* Don't display overruns by default */ + .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD, .opts = trace_opts }; @@ -56,6 +61,36 @@ static void graph_trace_reset(struct trace_array *tr) unregister_ftrace_graph(); } +static inline int log10_cpu(int nb) +{ + if (nb / 100) + return 3; + if (nb / 10) + return 2; + return 1; +} + +static enum print_line_t +print_graph_cpu(struct trace_seq *s, int cpu) +{ + int i; + int ret; + int log10_this = log10_cpu(cpu); + int log10_all = log10_cpu(cpus_weight_nr(cpu_online_map)); + + + for (i = 0; i < log10_all - log10_this; i++) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + ret = trace_seq_printf(s, "%d) ", cpu); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; +} + + /* If the pid changed since the last trace, output this event */ static int verif_pid(struct trace_seq *s, pid_t pid, int cpu) { @@ -67,8 +102,7 @@ static int verif_pid(struct trace_seq *s, pid_t pid, int cpu) last_pid[cpu] = pid; comm = trace_find_cmdline(pid); - return trace_seq_printf(s, "\nCPU[%03d] " - " ------------8<---------- thread %s-%d" + return trace_seq_printf(s, "\n------------8<---------- thread %s-%d" " ------------8<----------\n\n", cpu, comm, pid); } @@ -86,7 +120,7 @@ trace_branch_is_leaf(struct trace_iterator *iter, if (!ring_iter) return false; - event = ring_buffer_iter_peek(iter->buffer_iter[iter->cpu], NULL); + event = ring_buffer_iter_peek(ring_iter, NULL); if (!event) return false; @@ -108,7 +142,7 @@ static inline int print_graph_duration(unsigned long long duration, struct trace_seq *s) { unsigned long nsecs_rem = do_div(duration, 1000); - return trace_seq_printf(s, "+ %llu.%lu us\n", duration, nsecs_rem); + return trace_seq_printf(s, "%4llu.%3lu us | ", duration, nsecs_rem); } /* Signal a overhead of time execution to the output */ @@ -136,8 +170,8 @@ print_graph_entry_leaf(struct trace_iterator *iter, struct ring_buffer_event 
*event; struct ftrace_graph_ent *call; unsigned long long duration; - int i; int ret; + int i; event = ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); ret_entry = ring_buffer_event_data(event); @@ -145,8 +179,19 @@ print_graph_entry_leaf(struct trace_iterator *iter, call = &entry->graph_ent; duration = graph_ret->rettime - graph_ret->calltime; + /* Must not exceed 8 characters: 9999.999 us */ + if (duration > 10000000ULL) + duration = 9999999ULL; + /* Overhead */ - ret = print_graph_overhead(duration, s); + if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { + ret = print_graph_overhead(duration, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* Duration */ + ret = print_graph_duration(duration, s); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -161,16 +206,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, if (!ret) return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, "();"); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Duration */ - ret = trace_seq_printf(s, TRACE_GRAPH_TIMESPACE_ENTRY); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = print_graph_duration(duration, s); + ret = trace_seq_printf(s, "();\n"); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -186,9 +222,14 @@ print_graph_entry_nested(struct ftrace_graph_ent_entry *entry, struct ftrace_graph_ent *call = &entry->graph_ent; /* No overhead */ - ret = trace_seq_printf(s, " "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* No time */ + ret = trace_seq_printf(s, " | "); /* Function */ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { @@ -201,12 +242,7 @@ print_graph_entry_nested(struct ftrace_graph_ent_entry *entry, if (!ret) return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, "() {"); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* No duration to print at this state */ - ret = trace_seq_printf(s, TRACE_GRAPH_TIMESPACE_ENTRY "-\n"); + ret = trace_seq_printf(s, "() {\n"); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -220,12 +256,16 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, int ret; struct trace_entry *ent = iter->ent; + /* Pid */ if (!verif_pid(s, ent->pid, cpu)) return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, "CPU[%03d] ", cpu); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + /* Cpu */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { + ret = print_graph_cpu(s, cpu); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } if (trace_branch_is_leaf(iter, field)) return print_graph_entry_leaf(iter, field, s); @@ -242,17 +282,30 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, int ret; unsigned long long duration = trace->rettime - trace->calltime; + /* Must not exceed 8 characters: xxxx.yyy us */ + if (duration > 10000000ULL) + duration = 9999999ULL; + /* Pid */ if (!verif_pid(s, ent->pid, cpu)) return TRACE_TYPE_PARTIAL_LINE; /* Cpu */ - ret = trace_seq_printf(s, "CPU[%03d] ", cpu); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { + ret = print_graph_cpu(s, cpu); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } /* Overhead */ - ret = print_graph_overhead(duration, s); + if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { + ret = print_graph_overhead(duration, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* Duration */ + ret = print_graph_duration(duration, s); if (!ret) return 
TRACE_TYPE_PARTIAL_LINE; @@ -263,16 +316,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, return TRACE_TYPE_PARTIAL_LINE; } - ret = trace_seq_printf(s, "} "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Duration */ - ret = trace_seq_printf(s, TRACE_GRAPH_TIMESPACE_RET); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = print_graph_duration(duration, s); + ret = trace_seq_printf(s, "}\n"); if (!ret) return TRACE_TYPE_PARTIAL_LINE; -- cgit v1.2.3 From d51090b34602a20984ab0312ef04e47069c0aec6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 28 Nov 2008 09:55:16 +0100 Subject: tracing/function-graph-tracer: more output tweaks Impact: prettify the output some more Before: 0) | sys_read() { 0) 0.796 us | fget_light(); 0) | vfs_read() { 0) | rw_verify_area() { 0) | security_file_permission() { ------------8<---------- thread sshd-1755 ------------8<---------- After: 0) | sys_read() { 0) 0.796 us | fget_light(); 0) | vfs_read() { 0) | rw_verify_area() { 0) | security_file_permission() { ------------------------------------------ | 1) migration/0--1 => sshd-1755 ------------------------------------------ Signed-off-by: Ingo Molnar --- kernel/trace/trace_functions_graph.c | 45 ++++++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 596a3ee43053..894b50bca313 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -79,6 +79,19 @@ print_graph_cpu(struct trace_seq *s, int cpu) int log10_all = log10_cpu(cpus_weight_nr(cpu_online_map)); + /* + * Start with a space character - to make it stand out + * to the right a bit when trace output is pasted into + * email: + */ + ret = trace_seq_printf(s, " "); + + /* + * Tricky - we space the CPU field according to the max + * number of online CPUs. 
On a 2-cpu system it would take + * a maximum of 1 digit - on a 128 cpu system it would + * take up to 3 digits: + */ for (i = 0; i < log10_all - log10_this; i++) { ret = trace_seq_printf(s, " "); if (!ret) @@ -86,7 +99,8 @@ print_graph_cpu(struct trace_seq *s, int cpu) } ret = trace_seq_printf(s, "%d) ", cpu); if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; } @@ -94,17 +108,34 @@ print_graph_cpu(struct trace_seq *s, int cpu) /* If the pid changed since the last trace, output this event */ static int verif_pid(struct trace_seq *s, pid_t pid, int cpu) { - char *comm; + char *comm, *prev_comm; + pid_t prev_pid; + int ret; if (last_pid[cpu] != -1 && last_pid[cpu] == pid) return 1; + prev_pid = last_pid[cpu]; last_pid[cpu] = pid; + comm = trace_find_cmdline(pid); + prev_comm = trace_find_cmdline(prev_pid); - return trace_seq_printf(s, "\n------------8<---------- thread %s-%d" - " ------------8<----------\n\n", - cpu, comm, pid); +/* + * Context-switch trace line: + + ------------------------------------------ + | 1) migration/0--1 => sshd-1755 + ------------------------------------------ + + */ + ret = trace_seq_printf(s, + " ------------------------------------------\n"); + ret += trace_seq_printf(s, " | %d) %s-%d => %s-%d\n", + cpu, prev_comm, prev_pid, comm, pid); + ret += trace_seq_printf(s, + " ------------------------------------------\n\n"); + return ret; } static bool @@ -142,7 +173,7 @@ static inline int print_graph_duration(unsigned long long duration, struct trace_seq *s) { unsigned long nsecs_rem = do_div(duration, 1000); - return trace_seq_printf(s, "%4llu.%3lu us | ", duration, nsecs_rem); + return trace_seq_printf(s, "%4llu.%3lu us | ", duration, nsecs_rem); } /* Signal a overhead of time execution to the output */ @@ -229,7 +260,7 @@ print_graph_entry_nested(struct ftrace_graph_ent_entry *entry, } /* No time */ - ret = trace_seq_printf(s, " | "); + ret = trace_seq_printf(s, " | "); /* Function */ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { -- cgit v1.2.3 From c7425acb42fff1e723b05fbf4ea11e9a455d95dc Mon Sep 17 00:00:00 2001 From: Török Edwin Date: Fri, 28 Nov 2008 11:17:56 +0200 Subject: tracing, alpha: fix build: add missing #ifdef CONFIG_STACKTRACE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are architectures that still have no stacktrace support. Signed-off-by: Török Edwin Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5811e0a5f732..91887a280ab9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -983,6 +983,7 @@ static void ftrace_trace_userstack(struct trace_array *tr, struct trace_array_cpu *data, unsigned long flags, int pc) { +#ifdef CONFIG_STACKTRACE struct ring_buffer_event *event; struct userstack_entry *entry; struct stack_trace trace; @@ -1008,6 +1009,7 @@ static void ftrace_trace_userstack(struct trace_array *tr, save_stack_trace_user(&trace); ring_buffer_unlock_commit(tr->buffer, event, irq_flags); +#endif } void __trace_userstack(struct trace_array *tr, -- cgit v1.2.3 From 50cdaf08a8ec1d7f43987705da7aff7cf949708f Mon Sep 17 00:00:00 2001 From: Liming Wang Date: Fri, 28 Nov 2008 12:13:21 +0800 Subject: ftrace: improve seq_operation of ftrace Impact: make ftrace position computing more sane First remove useless ->pos field. Then we needn't check seq_printf in .show like other place. 
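The fix relies on the seq_file iteration contract: ->start() must return the element for the current *pos, and ->next() must advance *pos, so the position can always be recomputed rather than cached in the iterator. A standalone sketch of that contract (signatures simplified; not the kernel API):

    #include <stdio.h>

    static int items[] = { 10, 20, 30 };
    #define NITEMS (int)(sizeof(items) / sizeof(items[0]))

    /* ->start(): return the element at *pos, or NULL at the end. */
    static int *t_start(long *pos)
    {
            return (*pos < NITEMS) ? &items[*pos] : NULL;
    }

    /* ->next(): advance *pos, then return the new element. */
    static int *t_next(long *pos)
    {
            (*pos)++;
            return (*pos < NITEMS) ? &items[*pos] : NULL;
    }

    int main(void)
    {
            long pos = 0;
            int *v;

            for (v = t_start(&pos); v; v = t_next(&pos))
                    printf("%d\n", *v);
            return 0;
    }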
Signed-off-by: Liming Wang Reviewed-by: Bruce Ashfield Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index cbf8b09f63a5..08b536a2614e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -786,7 +786,6 @@ enum { #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ struct ftrace_iterator { - loff_t pos; struct ftrace_page *pg; unsigned idx; unsigned flags; @@ -811,6 +810,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos) iter->pg = iter->pg->next; iter->idx = 0; goto retry; + } else { + iter->idx = -1; } } else { rec = &iter->pg->records[iter->idx++]; @@ -833,8 +834,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos) } spin_unlock(&ftrace_lock); - iter->pos = *pos; - return rec; } @@ -842,13 +841,15 @@ static void *t_start(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; void *p = NULL; - loff_t l = -1; - if (*pos > iter->pos) - *pos = iter->pos; + if (*pos > 0) { + if (iter->idx < 0) + return p; + (*pos)--; + iter->idx--; + } - l = *pos; - p = t_next(m, p, &l); + p = t_next(m, p, pos); return p; } @@ -859,21 +860,15 @@ static void t_stop(struct seq_file *m, void *p) static int t_show(struct seq_file *m, void *v) { - struct ftrace_iterator *iter = m->private; struct dyn_ftrace *rec = v; char str[KSYM_SYMBOL_LEN]; - int ret = 0; if (!rec) return 0; kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); - ret = seq_printf(m, "%s\n", str); - if (ret < 0) { - iter->pos--; - iter->idx--; - } + seq_printf(m, "%s\n", str); return 0; } @@ -899,7 +894,6 @@ ftrace_avail_open(struct inode *inode, struct file *file) return -ENOMEM; iter->pg = ftrace_pages_start; - iter->pos = 0; ret = seq_open(file, &show_ftrace_seq_ops); if (!ret) { @@ -986,7 +980,6 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable) if (file->f_mode & FMODE_READ) { iter->pg = ftrace_pages_start; - iter->pos = 0; iter->flags = enable ? FTRACE_ITER_FILTER : FTRACE_ITER_NOTRACE; -- cgit v1.2.3 From 70574a996fc7a70c5586eb56bd92a544eccf18b6 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 28 Nov 2008 22:08:00 +0300 Subject: sched: move double_unlock_balance() higher Move double_lock_balance()/double_unlock_balance() higher to fix the following with gcc-3.4.6: CC kernel/sched.o In file included from kernel/sched.c:1605: kernel/sched_rt.c: In function `find_lock_lowest_rq': kernel/sched_rt.c:914: sorry, unimplemented: inlining failed in call to 'double_unlock_balance': function body not available kernel/sched_rt.c:1077: sorry, unimplemented: called from here make[2]: *** [kernel/sched.o] Error 1 Signed-off-by: Alexey Dobriyan Signed-off-by: Ingo Molnar --- kernel/sched.c | 67 +++++++++++++++++++++++++++---------------------------- kernel/sched_rt.c | 4 ---- 2 files changed, 33 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 3d1ee429219b..6a99703e0eb0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1581,6 +1581,39 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) #endif +/* + * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 
+ */ +static int double_lock_balance(struct rq *this_rq, struct rq *busiest) + __releases(this_rq->lock) + __acquires(busiest->lock) + __acquires(this_rq->lock) +{ + int ret = 0; + + if (unlikely(!irqs_disabled())) { + /* printk() doesn't work good under rq->lock */ + spin_unlock(&this_rq->lock); + BUG_ON(1); + } + if (unlikely(!spin_trylock(&busiest->lock))) { + if (busiest < this_rq) { + spin_unlock(&this_rq->lock); + spin_lock(&busiest->lock); + spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); + ret = 1; + } else + spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); + } + return ret; +} + +static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) + __releases(busiest->lock) +{ + spin_unlock(&busiest->lock); + lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); +} #endif #ifdef CONFIG_FAIR_GROUP_SCHED @@ -2780,40 +2813,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) __release(rq2->lock); } -/* - * double_lock_balance - lock the busiest runqueue, this_rq is locked already. - */ -static int double_lock_balance(struct rq *this_rq, struct rq *busiest) - __releases(this_rq->lock) - __acquires(busiest->lock) - __acquires(this_rq->lock) -{ - int ret = 0; - - if (unlikely(!irqs_disabled())) { - /* printk() doesn't work good under rq->lock */ - spin_unlock(&this_rq->lock); - BUG_ON(1); - } - if (unlikely(!spin_trylock(&busiest->lock))) { - if (busiest < this_rq) { - spin_unlock(&this_rq->lock); - spin_lock(&busiest->lock); - spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); - ret = 1; - } else - spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); - } - return ret; -} - -static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) - __releases(busiest->lock) -{ - spin_unlock(&busiest->lock); - lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); -} - /* * If dest_cpu is allowed for this process, migrate the task to it. * This is accomplished by forcing the cpu_allowed mask to only diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 2bdd44423599..587a16e2a8f5 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -909,10 +909,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) /* Only try algorithms three times */ #define RT_MAX_TRIES 3 -static int double_lock_balance(struct rq *this_rq, struct rq *busiest); -static inline void double_unlock_balance(struct rq *this_rq, - struct rq *busiest); - static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) -- cgit v1.2.3 From 65c6dc6adbe7ee0acf207445243400a68c77af15 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 29 Nov 2008 04:12:46 +0100 Subject: tracing/branch-tracer: include missing irqflags.h Impact: fix build error on branch tracer This should fix a build error reported on alpha in linux-next: CC kernel/trace/trace_branch.o kernel/trace/trace_branch.c: In function 'probe_likely_condition': kernel/trace/trace_branch.c:44: error: implicit declaration of function 'raw_local_irq_save' kernel/trace/trace_branch.c:76: error: implicit declaration of function 'raw_local_irq_restore' Unfortunately, I can't test it since I don't have any Alpha build environment. 
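For context, linux/irqflags.h is where raw_local_irq_save()/raw_local_irq_restore() are declared; the tracer saves the current interrupt state and later restores exactly that state instead of blindly re-enabling. A toy user-space analogue of that save/restore pairing (purely illustrative, with a plain variable standing in for the CPU interrupt flag):

    #include <stdio.h>

    static int irq_enabled = 1; /* stand-in for the CPU interrupt flag */

    static unsigned long my_irq_save(void)
    {
            unsigned long flags = irq_enabled;

            irq_enabled = 0; /* enter the critical section with "irqs" off */
            return flags;
    }

    static void my_irq_restore(unsigned long flags)
    {
            irq_enabled = flags; /* restore the saved state, whatever it was */
    }

    int main(void)
    {
            unsigned long flags = my_irq_save();

            printf("in critical section, irqs %s\n", irq_enabled ? "on" : "off");
            my_irq_restore(flags);
            printf("after restore, irqs %s\n", irq_enabled ? "on" : "off");
            return 0;
    }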
Reported-by: Alexey Dobriyan Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace_branch.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 877ee88e6a74..bc972753568d 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -6,6 +6,7 @@ #include #include #include +#include <linux/irqflags.h> #include #include #include -- cgit v1.2.3 From 6c415b9234a8c71f290e5d4fddc467f103f32719 Mon Sep 17 00:00:00 2001 From: Arun R Bharadwaj Date: Mon, 1 Dec 2008 20:49:05 +0530 Subject: sched: add uid information to sched_debug for CONFIG_USER_SCHED Impact: extend information in /proc/sched_debug This patch adds uid information to sched_debug for CONFIG_USER_SCHED. Signed-off-by: Arun R Bharadwaj Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched.c | 10 ++++++++++ kernel/sched_debug.c | 6 +++++- kernel/user.c | 2 ++ 4 files changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 7a69c4d224ee..d8733f07d80b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2218,6 +2218,7 @@ extern void normalize_rt_tasks(void); extern struct task_group init_task_group; #ifdef CONFIG_USER_SCHED extern struct task_group root_task_group; +extern void set_tg_uid(struct user_struct *user); #endif extern struct task_group *sched_create_group(struct task_group *parent); diff --git a/kernel/sched.c b/kernel/sched.c index 6a99703e0eb0..4c7388ef5be7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -261,6 +261,10 @@ struct task_group { struct cgroup_subsys_state css; #endif +#ifdef CONFIG_USER_SCHED + uid_t uid; +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED /* schedulable entities of this group on each cpu */ struct sched_entity **se; @@ -286,6 +290,12 @@ struct task_group { #ifdef CONFIG_USER_SCHED +/* Helper function to pass uid information to create_sched_user() */ +void set_tg_uid(struct user_struct *user) +{ + user->tg->uid = user->uid; +} + /* * Root task group.
* Every UID task group (including init_task_group aka UID-0) will diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index baf2f17af462..4293cfa9681d 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -160,10 +160,14 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cgroup_path(tg->css.cgroup, path, sizeof(path)); SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); +#elif defined(CONFIG_USER_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) + { + uid_t uid = cfs_rq->tg->uid; + SEQ_printf(m, "\ncfs_rq[%d] for UID: %u\n", cpu, uid); + } #else SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); #endif - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", SPLIT_NS(cfs_rq->exec_clock)); diff --git a/kernel/user.c b/kernel/user.c index 39d6159fae43..cec2224bc9f5 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -101,6 +101,8 @@ static int sched_create_user(struct user_struct *up) if (IS_ERR(up->tg)) rc = -ENOMEM; + set_tg_uid(up); + return rc; } -- cgit v1.2.3 From 66eafebc1086014709dc38f52ddcb3d67d9b346c Mon Sep 17 00:00:00 2001 From: Liming Wang Date: Tue, 2 Dec 2008 10:33:08 +0800 Subject: function trace: fix a bug of single thread function trace Impact: fix "no output from tracer" bug caused by ftrace_update_pid_func() When disabling single-thread function tracing via "echo -1 > set_ftrace_pid", the normal function tracer has to be restored to the original function, otherwise the normal function tracer will not work correctly. Without this commit, something like the following happens: $ ps |grep 850 850 root 2556 S -/bin/sh $ echo 850 > /debug/tracing/set_ftrace_pid $ echo function > /debug/tracing/current_tracer $ echo 1 > /debug/tracing/tracing_enabled $ sleep 1 $ echo 0 > /debug/tracing/tracing_enabled $ cat /debug/tracing/trace_pipe |wc -l 59704 $ echo -1 > /debug/tracing/set_ftrace_pid $ echo 1 > /debug/tracing/tracing_enabled $ sleep 1 $ echo 0 > /debug/tracing/tracing_enabled $ more /debug/tracing/trace_pipe <====== nothing is output now! it should output trace records. Signed-off-by: Liming Wang Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 08b536a2614e..6d89ab46c6e3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -243,10 +243,8 @@ static void ftrace_update_pid_func(void) set_ftrace_pid_function(func); func = ftrace_pid_func; } else { - if (func != ftrace_pid_func) - goto out; - - set_ftrace_pid_function(func); + if (func == ftrace_pid_func) + func = ftrace_pid_function; } #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST -- cgit v1.2.3 From 48d68b20d00865035b8b65e69af343d0f53fac9d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 2 Dec 2008 00:20:39 +0100 Subject: tracing/function-graph-tracer: support for x86-64 Impact: extend and enable the function graph tracer to 64-bit x86 This patch implements support for the function graph tracer on x86-64. Both static and dynamic tracing are supported. This introduces some small CPP-conditional asm in arch/x86/kernel/ftrace.c. I wanted to use probe_kernel_read/write to make the return address saving/patching code more generic, but that causes tracing recursion. It would perhaps be useful to implement notrace versions of these functions for other arch ports. Note that arch/x86/process_64.c is not traced, as on X86-32.
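Roughly, the return-address hooking that this patch wires up for x86-64 boils down to the following C sketch (illustrative only: hook_return_sketch() and push_return_trace_sketch() are hypothetical names, while return_to_handler and prepare_ftrace_return() are the real symbols in the patch below):

	/* Called from the mcount stub with the address of the stack slot
	 * that holds the traced function's return address. */
	static void hook_return_sketch(unsigned long *parent,
				       unsigned long self_addr)
	{
		unsigned long old = *parent;	/* original return address */

		/* remember where the function must really return to */
		if (push_return_trace_sketch(old, self_addr) < 0)
			return;		/* depth exceeded: leave it alone */

		/* divert the return through the tracer's trampoline */
		*parent = (unsigned long)&return_to_handler;
	}

The real prepare_ftrace_return() must perform the read and write of *parent fault-safely, which is why it uses inline asm with an exception-table fixup instead of plain C dereferences.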
I first thought __switch_to() was responsible for crashes during tracing because I believed the current task was changed inside it, but that's actually not the case (well, the task is changed, but not the "current" pointer). So I will have to investigate which functions are harmful here, in order to enable tracing of the other functions inside (but there is no issue at this time, as long as process_64.c stays out of the -pg flags). A small possible race condition is also fixed in this patch. When the tracer allocates a return stack dynamically, the current depth is initialized only after the allocation. An interrupt could occur at that point and, after seeing that the return stack is allocated, the tracer could try to trace it with a random, uninitialized depth. This is a preventive fix, even though I have not hit problems with it. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Tim Bird Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- arch/x86/kernel/Makefile | 1 + arch/x86/kernel/entry_64.S | 74 ++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/ftrace.c | 11 ++++++- kernel/trace/ftrace.c | 4 ++- 5 files changed, 89 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0842b1127684..45c86fb94132 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -29,7 +29,7 @@ config X86 select HAVE_FTRACE_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_TRACER - select HAVE_FUNCTION_GRAPH_TRACER if X86_32 + select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) select HAVE_ARCH_KGDB if !X86_VOYAGER diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 64939a0c3986..d274425fb076 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -17,6 +17,7 @@ endif ifdef CONFIG_FUNCTION_GRAPH_TRACER # Don't trace __switch_to() but let it for function tracer CFLAGS_REMOVE_process_32.o = -pg +CFLAGS_REMOVE_process_64.o = -pg endif # diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 08aa6b10933c..2aa0526ac30e 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -98,6 +98,12 @@ ftrace_call: movq (%rsp), %rax addq $0x38, %rsp +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +.globl ftrace_graph_call +ftrace_graph_call: + jmp ftrace_stub +#endif + .globl ftrace_stub ftrace_stub: retq @@ -110,6 +116,12 @@ ENTRY(mcount) cmpq $ftrace_stub, ftrace_trace_function jnz trace + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + cmpq $ftrace_stub, ftrace_graph_return + jnz ftrace_graph_caller +#endif + .globl ftrace_stub ftrace_stub: retq @@ -145,6 +157,68 @@ END(mcount) #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* CONFIG_FUNCTION_TRACER */ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +ENTRY(ftrace_graph_caller) + cmpl $0, function_trace_stop + jne ftrace_stub + + subq $0x38, %rsp + movq %rax, (%rsp) + movq %rcx, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rsi, 24(%rsp) + movq %rdi, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + + leaq 8(%rbp), %rdi + movq 0x38(%rsp), %rsi + + call prepare_ftrace_return + + movq 48(%rsp), %r9 + movq 40(%rsp), %r8 + movq 32(%rsp), %rdi + movq 24(%rsp), %rsi + movq 16(%rsp), %rdx + movq 8(%rsp), %rcx + movq (%rsp), %rax + addq $0x38, %rsp + retq +END(ftrace_graph_caller) + + +.globl return_to_handler +return_to_handler: + subq $80, %rsp + + movq %rax, (%rsp) + movq %rcx, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rsi, 24(%rsp) + movq %rdi, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + movq %r10, 56(%rsp) +
movq %r11, 64(%rsp) + + call ftrace_return_to_handler + + movq %rax, 72(%rsp) + movq 64(%rsp), %r11 + movq 56(%rsp), %r10 + movq 48(%rsp), %r9 + movq 40(%rsp), %r8 + movq 32(%rsp), %rdi + movq 24(%rsp), %rsi + movq 16(%rsp), %rdx + movq 8(%rsp), %rcx + movq (%rsp), %rax + addq $72, %rsp + retq +#endif + + #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args #endif diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 7ef914e6a2f6..58832478b94e 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -467,8 +467,13 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) * ignore such a protection. */ asm volatile( +#ifdef CONFIG_X86_64 + "1: movq (%[parent_old]), %[old]\n" + "2: movq %[return_hooker], (%[parent_replaced])\n" +#else "1: movl (%[parent_old]), %[old]\n" "2: movl %[return_hooker], (%[parent_replaced])\n" +#endif " movl $0, %[faulted]\n" ".section .fixup, \"ax\"\n" @@ -476,8 +481,13 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) ".previous\n" ".section __ex_table, \"a\"\n" +#ifdef CONFIG_X86_64 + " .quad 1b, 3b\n" + " .quad 2b, 3b\n" +#else " .long 1b, 3b\n" " .long 2b, 3b\n" +#endif ".previous\n" : [parent_replaced] "=r" (parent), [old] "=r" (old), @@ -509,5 +519,4 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) ftrace_graph_entry(&trace); } - #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 08b536a2614e..f72499627525 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1673,8 +1673,10 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) } if (t->ret_stack == NULL) { - t->ret_stack = ret_stack_list[start++]; t->curr_ret_stack = -1; + /* Make sure IRQs see the -1 first: */ + barrier(); + t->ret_stack = ret_stack_list[start++]; atomic_set(&t->trace_overrun, 0); } } while_each_thread(g, t); -- cgit v1.2.3 From a5e25883a445dce94a087ca479b21a5959cd5c18 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Dec 2008 15:34:05 -0500 Subject: ftrace: replace raw_local_irq_save with local_irq_save Impact: fix for lockdep and ftrace The raw_local_irq_save/restore confuses lockdep. This patch converts them to the local_irq_save/restore variants. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 1 + kernel/trace/trace.c | 12 ++++++------ kernel/trace/trace_branch.c | 4 ++-- kernel/trace/trace_stack.c | 8 ++++---- 4 files changed, 13 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 46a404173db2..74b1878b8bb8 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -25,6 +25,7 @@ * Thanks to Arjan van de Ven for coming up with the initial idea of * mapping lock dependencies runtime. 
*/ +#define DISABLE_BRANCH_PROFILING #include #include #include diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 91887a280ab9..380de630ebce 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1209,7 +1209,7 @@ void trace_graph_entry(struct ftrace_graph_ent *trace) int cpu; int pc; - raw_local_irq_save(flags); + local_irq_save(flags); cpu = raw_smp_processor_id(); data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); @@ -1218,7 +1218,7 @@ void trace_graph_entry(struct ftrace_graph_ent *trace) __trace_graph_entry(tr, data, trace, flags, pc); } atomic_dec(&data->disabled); - raw_local_irq_restore(flags); + local_irq_restore(flags); } void trace_graph_return(struct ftrace_graph_ret *trace) @@ -1230,7 +1230,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace) int cpu; int pc; - raw_local_irq_save(flags); + local_irq_save(flags); cpu = raw_smp_processor_id(); data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); @@ -1239,7 +1239,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace) __trace_graph_return(tr, data, trace, flags, pc); } atomic_dec(&data->disabled); - raw_local_irq_restore(flags); + local_irq_restore(flags); } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ @@ -2645,7 +2645,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, if (err) goto err_unlock; - raw_local_irq_disable(); + local_irq_disable(); __raw_spin_lock(&ftrace_max_lock); for_each_tracing_cpu(cpu) { /* @@ -2662,7 +2662,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, } } __raw_spin_unlock(&ftrace_max_lock); - raw_local_irq_enable(); + local_irq_enable(); tracing_cpumask = tracing_cpumask_new; diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index bc972753568d..6c00feb3bac7 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -42,7 +42,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) if (unlikely(!tr)) return; - raw_local_irq_save(flags); + local_irq_save(flags); cpu = raw_smp_processor_id(); if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) goto out; @@ -74,7 +74,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) out: atomic_dec(&tr->data[cpu]->disabled); - raw_local_irq_restore(flags); + local_irq_restore(flags); } static inline diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index fde3be15c642..06a16115be0f 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -48,7 +48,7 @@ static inline void check_stack(void) if (!object_is_on_stack(&this_size)) return; - raw_local_irq_save(flags); + local_irq_save(flags); __raw_spin_lock(&max_stack_lock); /* a race could have already updated it */ @@ -96,7 +96,7 @@ static inline void check_stack(void) out: __raw_spin_unlock(&max_stack_lock); - raw_local_irq_restore(flags); + local_irq_restore(flags); } static void @@ -162,11 +162,11 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, if (ret < 0) return ret; - raw_local_irq_save(flags); + local_irq_save(flags); __raw_spin_lock(&max_stack_lock); *ptr = val; __raw_spin_unlock(&max_stack_lock); - raw_local_irq_restore(flags); + local_irq_restore(flags); return count; } -- cgit v1.2.3 From abc9b56d66fbd4d93302ef4bf6fa726e1b8255f9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Dec 2008 15:34:06 -0500 Subject: ring-buffer: move some metadata into buffer page Impact: get ready for splice changes This patch moves the commit and timestamp into the beginning of 
each data page of the buffer. This change will allow the page to be moved to another location (disk, network, etc) and still have information in the page to be able to read it. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 63 ++++++++++++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index e206951603c1..8619c5345889 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -195,20 +195,24 @@ void *ring_buffer_event_data(struct ring_buffer_event *event) #define TS_MASK ((1ULL << TS_SHIFT) - 1) #define TS_DELTA_TEST (~TS_MASK) -/* - * This hack stolen from mm/slob.c. - * We can store per page timing information in the page frame of the page. - * Thanks to Peter Zijlstra for suggesting this idea. - */ -struct buffer_page { +struct buffer_data_page { u64 time_stamp; /* page time stamp */ - local_t write; /* index for next write */ local_t commit; /* write commited index */ + unsigned char data[]; /* data of buffer page */ +}; + +struct buffer_page { + local_t write; /* index for next write */ unsigned read; /* index for next read */ struct list_head list; /* list of free pages */ - void *page; /* Actual data page */ + struct buffer_data_page *page; /* Actual data page */ }; +static void rb_init_page(struct buffer_data_page *page) +{ + local_set(&page->commit, 0); +} + /* * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing * this issue out. @@ -230,7 +234,7 @@ static inline int test_time_stamp(u64 delta) return 0; } -#define BUF_PAGE_SIZE PAGE_SIZE +#define BUF_PAGE_SIZE (PAGE_SIZE - sizeof(struct buffer_data_page)) /* * head_page == tail_page && head == tail then buffer is empty. 
@@ -333,6 +337,7 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, if (!addr) goto free_pages; page->page = (void *)addr; + rb_init_page(page->page); } list_splice(&pages, head); @@ -378,6 +383,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) if (!addr) goto fail_free_reader; page->page = (void *)addr; + rb_init_page(page->page); INIT_LIST_HEAD(&cpu_buffer->reader_page->list); @@ -647,6 +653,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) if (!addr) goto free_pages; page->page = (void *)addr; + rb_init_page(page->page); } } @@ -682,7 +689,7 @@ static inline int rb_null_event(struct ring_buffer_event *event) static inline void *__rb_page_index(struct buffer_page *page, unsigned index) { - return page->page + index; + return page->page->data + index; } static inline struct ring_buffer_event * @@ -712,7 +719,7 @@ static inline unsigned rb_page_write(struct buffer_page *bpage) static inline unsigned rb_page_commit(struct buffer_page *bpage) { - return local_read(&bpage->commit); + return local_read(&bpage->page->commit); } /* Size is determined by what has been commited */ @@ -804,14 +811,15 @@ rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer, if (RB_WARN_ON(cpu_buffer, cpu_buffer->commit_page == cpu_buffer->tail_page)) return; - cpu_buffer->commit_page->commit = + cpu_buffer->commit_page->page->commit = cpu_buffer->commit_page->write; rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); - cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp; + cpu_buffer->write_stamp = + cpu_buffer->commit_page->page->time_stamp; } /* Now set the commit to the event's index */ - local_set(&cpu_buffer->commit_page->commit, index); + local_set(&cpu_buffer->commit_page->page->commit, index); } static inline void @@ -826,16 +834,17 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) * assign the commit to the tail. 
*/ while (cpu_buffer->commit_page != cpu_buffer->tail_page) { - cpu_buffer->commit_page->commit = + cpu_buffer->commit_page->page->commit = cpu_buffer->commit_page->write; rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); - cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp; + cpu_buffer->write_stamp = + cpu_buffer->commit_page->page->time_stamp; /* add barrier to keep gcc from optimizing too much */ barrier(); } while (rb_commit_index(cpu_buffer) != rb_page_write(cpu_buffer->commit_page)) { - cpu_buffer->commit_page->commit = + cpu_buffer->commit_page->page->commit = cpu_buffer->commit_page->write; barrier(); } @@ -843,7 +852,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) { - cpu_buffer->read_stamp = cpu_buffer->reader_page->time_stamp; + cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp; cpu_buffer->reader_page->read = 0; } @@ -862,7 +871,7 @@ static inline void rb_inc_iter(struct ring_buffer_iter *iter) else rb_inc_page(cpu_buffer, &iter->head_page); - iter->read_stamp = iter->head_page->time_stamp; + iter->read_stamp = iter->head_page->page->time_stamp; iter->head = 0; } @@ -998,12 +1007,12 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, */ if (tail_page == cpu_buffer->tail_page) { local_set(&next_page->write, 0); - local_set(&next_page->commit, 0); + local_set(&next_page->page->commit, 0); cpu_buffer->tail_page = next_page; /* reread the time stamp */ *ts = ring_buffer_time_stamp(cpu_buffer->cpu); - cpu_buffer->tail_page->time_stamp = *ts; + cpu_buffer->tail_page->page->time_stamp = *ts; } /* @@ -1048,7 +1057,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, * this page's time stamp. */ if (!tail && rb_is_commit(cpu_buffer, event)) - cpu_buffer->commit_page->time_stamp = *ts; + cpu_buffer->commit_page->page->time_stamp = *ts; return event; @@ -1099,7 +1108,7 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, event->time_delta = *delta & TS_MASK; event->array[0] = *delta >> TS_SHIFT; } else { - cpu_buffer->commit_page->time_stamp = *ts; + cpu_buffer->commit_page->page->time_stamp = *ts; event->time_delta = 0; event->array[0] = 0; } @@ -1552,7 +1561,7 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) if (iter->head) iter->read_stamp = cpu_buffer->read_stamp; else - iter->read_stamp = iter->head_page->time_stamp; + iter->read_stamp = iter->head_page->page->time_stamp; } /** @@ -1696,7 +1705,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->reader_page->list.prev = reader->list.prev; local_set(&cpu_buffer->reader_page->write, 0); - local_set(&cpu_buffer->reader_page->commit, 0); + local_set(&cpu_buffer->reader_page->page->commit, 0); /* Make the reader page now replace the head */ reader->list.prev->next = &cpu_buffer->reader_page->list; @@ -2088,7 +2097,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->head_page = list_entry(cpu_buffer->pages.next, struct buffer_page, list); local_set(&cpu_buffer->head_page->write, 0); - local_set(&cpu_buffer->head_page->commit, 0); + local_set(&cpu_buffer->head_page->page->commit, 0); cpu_buffer->head_page->read = 0; @@ -2097,7 +2106,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) INIT_LIST_HEAD(&cpu_buffer->reader_page->list); local_set(&cpu_buffer->reader_page->write, 0); - local_set(&cpu_buffer->reader_page->commit, 0); + local_set(&cpu_buffer->reader_page->page->commit, 0); cpu_buffer->reader_page->read = 0; cpu_buffer->overrun 
= 0; -- cgit v1.2.3 From 8789a9e7df6bf9b93739c4c7d4e380725bc9e936 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Dec 2008 15:34:07 -0500 Subject: ring-buffer: read page interface Impact: new API to ring buffer This patch adds a new interface into the ring buffer that allows a page to be read from the ring buffer on a given CPU. For every page read, one must also be given to allow for a "swap" of the pages. rpage = ring_buffer_alloc_read_page(buffer); if (!rpage) goto err; ret = ring_buffer_read_page(buffer, &rpage, cpu, full); if (!ret) goto empty; process_page(rpage); ring_buffer_free_read_page(rpage); The caller of these functions must handle any waits that are needed to wait for new data. The ring_buffer_read_page will simply return 0 if there is no data, or if "full" is set and the writer is still on the current page. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/ring_buffer.h | 5 ++ kernel/trace/ring_buffer.c | 166 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 171 insertions(+) (limited to 'kernel') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 3bb87a753fa3..1a350a847edd 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -124,6 +124,11 @@ void tracing_on(void); void tracing_off(void); void tracing_off_permanent(void); +void *ring_buffer_alloc_read_page(struct ring_buffer *buffer); +void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data); +int ring_buffer_read_page(struct ring_buffer *buffer, + void **data_page, int cpu, int full); + enum ring_buffer_flags { RB_FL_OVERWRITE = 1 << 0, }; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 8619c5345889..50b74d3a5c32 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -687,6 +687,12 @@ static inline int rb_null_event(struct ring_buffer_event *event) return event->type == RINGBUF_TYPE_PADDING; } +static inline void * +__rb_data_page_index(struct buffer_data_page *page, unsigned index) +{ + return page->data + index; +} + static inline void *__rb_page_index(struct buffer_page *page, unsigned index) { return page->page->data + index; @@ -2232,6 +2238,166 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, return 0; } +static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_data_page *page) +{ + struct ring_buffer_event *event; + unsigned long head; + + __raw_spin_lock(&cpu_buffer->lock); + for (head = 0; head < local_read(&page->commit); + head += rb_event_length(event)) { + + event = __rb_data_page_index(page, head); + if (RB_WARN_ON(cpu_buffer, rb_null_event(event))) + return; + /* Only count data entries */ + if (event->type != RINGBUF_TYPE_DATA) + continue; + cpu_buffer->entries--; + } + __raw_spin_unlock(&cpu_buffer->lock); +} + +/** + * ring_buffer_alloc_read_page - allocate a page to read from buffer + * @buffer: the buffer to allocate for. + * + * This function is used in conjunction with ring_buffer_read_page. + * When reading a full page from the ring buffer, these functions + * can be used to speed up the process. The calling function should + * allocate a few pages first with this function. Then when it + * needs to get pages from the ring buffer, it passes the result + * of this function into ring_buffer_read_page, which will swap + * the page that was allocated, with the read page of the buffer. + * + * Returns: + * The page allocated, or NULL on error. 
+ */ +void *ring_buffer_alloc_read_page(struct ring_buffer *buffer) +{ + unsigned long addr; + struct buffer_data_page *page; + + addr = __get_free_page(GFP_KERNEL); + if (!addr) + return NULL; + + page = (void *)addr; + + return page; +} + +/** + * ring_buffer_free_read_page - free an allocated read page + * @buffer: the buffer the page was allocate for + * @data: the page to free + * + * Free a page allocated from ring_buffer_alloc_read_page. + */ +void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) +{ + free_page((unsigned long)data); +} + +/** + * ring_buffer_read_page - extract a page from the ring buffer + * @buffer: buffer to extract from + * @data_page: the page to use allocated from ring_buffer_alloc_read_page + * @cpu: the cpu of the buffer to extract + * @full: should the extraction only happen when the page is full. + * + * This function will pull out a page from the ring buffer and consume it. + * @data_page must be the address of the variable that was returned + * from ring_buffer_alloc_read_page. This is because the page might be used + * to swap with a page in the ring buffer. + * + * for example: + * rpage = ring_buffer_alloc_page(buffer); + * if (!rpage) + * return error; + * ret = ring_buffer_read_page(buffer, &rpage, cpu, 0); + * if (ret) + * process_page(rpage); + * + * When @full is set, the function will not return true unless + * the writer is off the reader page. + * + * Note: it is up to the calling functions to handle sleeps and wakeups. + * The ring buffer can be used anywhere in the kernel and can not + * blindly call wake_up. The layer that uses the ring buffer must be + * responsible for that. + * + * Returns: + * 1 if data has been transferred + * 0 if no data has been transferred. + */ +int ring_buffer_read_page(struct ring_buffer *buffer, + void **data_page, int cpu, int full) +{ + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + struct ring_buffer_event *event; + struct buffer_data_page *page; + unsigned long flags; + int ret = 0; + + if (!data_page) + return 0; + + page = *data_page; + if (!page) + return 0; + + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + + /* + * rb_buffer_peek will get the next ring buffer if + * the current reader page is empty. + */ + event = rb_buffer_peek(buffer, cpu, NULL); + if (!event) + goto out; + + /* check for data */ + if (!local_read(&cpu_buffer->reader_page->page->commit)) + goto out; + /* + * If the writer is already off of the read page, then simply + * switch the read page with the given page. Otherwise + * we need to copy the data from the reader to the writer. 
+ */ + if (cpu_buffer->reader_page == cpu_buffer->commit_page) { + unsigned int read = cpu_buffer->reader_page->read; + + if (full) + goto out; + /* The writer is still on the reader page, we must copy */ + page = cpu_buffer->reader_page->page; + memcpy(page->data, + cpu_buffer->reader_page->page->data + read, + local_read(&page->commit) - read); + + /* consume what was read */ + cpu_buffer->reader_page += read; + + } else { + /* swap the pages */ + rb_init_page(page); + page = cpu_buffer->reader_page->page; + cpu_buffer->reader_page->page = *data_page; + cpu_buffer->reader_page->read = 0; + *data_page = page; + } + ret = 1; + + /* update the entry counter */ + rb_remove_entries(cpu_buffer, page); + out: + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + return ret; +} + static ssize_t rb_simple_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) -- cgit v1.2.3 From 14a866c567e040ccf6240d68b083dd1dbbde63e6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Dec 2008 23:50:02 -0500 Subject: ftrace: add ftrace_graph_stop() Impact: new ftrace_graph_stop function While developing more features of function graph, I hit a bug that caused the WARN_ON to trigger in the prepare_ftrace_return function. Well, it was hard for me to find out that was happening because the bug would not print, it would just cause a hard lockup or reboot. The reason is that it is not safe to call printk from this function. Looking further, I also found that it calls unregister_ftrace_graph, which grabs a mutex and calls kstop machine. This would definitely lock the box up if it were to trigger. This patch adds a fast and safe ftrace_graph_stop() which will stop the function tracer. Then it is safe to call the WARN ON. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 10 ++++++---- include/linux/ftrace.h | 2 ++ kernel/trace/ftrace.c | 5 +++++ 3 files changed, 13 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 1a5b8f8cb3cc..adba8e9a427c 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -484,14 +484,16 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) : "memory" ); - if (WARN_ON(faulted)) { - unregister_ftrace_graph(); + if (unlikely(faulted)) { + ftrace_graph_stop(); + WARN_ON(1); return; } - if (WARN_ON(!__kernel_text_address(old))) { - unregister_ftrace_graph(); + if (unlikely(!__kernel_text_address(old))) { + ftrace_graph_stop(); *parent = old; + WARN_ON(1); return; } diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index afba918c623c..58ca1c3a3f4d 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -376,6 +376,8 @@ typedef void (*trace_func_graph_ent_t)(struct ftrace_graph_ent *); /* entry */ extern int register_ftrace_graph(trace_func_graph_ret_t retfunc, trace_func_graph_ent_t entryfunc); +extern void ftrace_graph_stop(void); + /* The current handlers in use */ extern trace_func_graph_ret_t ftrace_graph_return; extern trace_func_graph_ent_t ftrace_graph_entry; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2e78628443e8..a44af05ae2d0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1769,5 +1769,10 @@ void ftrace_graph_exit_task(struct task_struct *t) kfree(ret_stack); } + +void ftrace_graph_stop(void) +{ + ftrace_stop(); +} #endif -- cgit v1.2.3 From 044fa782ebb9472cf5253e95d9a625fd4c0bdd99 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Dec 2008 
23:50:03 -0500 Subject: ring-buffer: change "page" variable names to "bpage" Impact: clean up Andrew Morton pointed out that the kernel convention of a variable named page should be of type page struct. The ring buffer uses a variable named "page" for a pointer to something else. This patch converts those to be called "bpage" (as in "buffer page"). Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 130 ++++++++++++++++++++++----------------------- 1 file changed, 65 insertions(+), 65 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 50b74d3a5c32..7f69cfeaadf7 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -208,9 +208,9 @@ struct buffer_page { struct buffer_data_page *page; /* Actual data page */ }; -static void rb_init_page(struct buffer_data_page *page) +static void rb_init_page(struct buffer_data_page *bpage) { - local_set(&page->commit, 0); + local_set(&bpage->commit, 0); } /* @@ -298,19 +298,19 @@ struct ring_buffer_iter { static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) { struct list_head *head = &cpu_buffer->pages; - struct buffer_page *page, *tmp; + struct buffer_page *bpage, *tmp; if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) return -1; if (RB_WARN_ON(cpu_buffer, head->prev->next != head)) return -1; - list_for_each_entry_safe(page, tmp, head, list) { + list_for_each_entry_safe(bpage, tmp, head, list) { if (RB_WARN_ON(cpu_buffer, - page->list.next->prev != &page->list)) + bpage->list.next->prev != &bpage->list)) return -1; if (RB_WARN_ON(cpu_buffer, - page->list.prev->next != &page->list)) + bpage->list.prev->next != &bpage->list)) return -1; } @@ -321,23 +321,23 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) { struct list_head *head = &cpu_buffer->pages; - struct buffer_page *page, *tmp; + struct buffer_page *bpage, *tmp; unsigned long addr; LIST_HEAD(pages); unsigned i; for (i = 0; i < nr_pages; i++) { - page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()), + bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); - if (!page) + if (!bpage) goto free_pages; - list_add(&page->list, &pages); + list_add(&bpage->list, &pages); addr = __get_free_page(GFP_KERNEL); if (!addr) goto free_pages; - page->page = (void *)addr; - rb_init_page(page->page); + bpage->page = (void *)addr; + rb_init_page(bpage->page); } list_splice(&pages, head); @@ -347,9 +347,9 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, return 0; free_pages: - list_for_each_entry_safe(page, tmp, &pages, list) { - list_del_init(&page->list); - free_buffer_page(page); + list_for_each_entry_safe(bpage, tmp, &pages, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); } return -ENOMEM; } @@ -358,7 +358,7 @@ static struct ring_buffer_per_cpu * rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - struct buffer_page *page; + struct buffer_page *bpage; unsigned long addr; int ret; @@ -373,17 +373,17 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; INIT_LIST_HEAD(&cpu_buffer->pages); - page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()), + bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); - if (!page) + if (!bpage) goto fail_free_buffer; - cpu_buffer->reader_page = page; + 
cpu_buffer->reader_page = bpage; addr = __get_free_page(GFP_KERNEL); if (!addr) goto fail_free_reader; - page->page = (void *)addr; - rb_init_page(page->page); + bpage->page = (void *)addr; + rb_init_page(bpage->page); INIT_LIST_HEAD(&cpu_buffer->reader_page->list); @@ -408,14 +408,14 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) { struct list_head *head = &cpu_buffer->pages; - struct buffer_page *page, *tmp; + struct buffer_page *bpage, *tmp; list_del_init(&cpu_buffer->reader_page->list); free_buffer_page(cpu_buffer->reader_page); - list_for_each_entry_safe(page, tmp, head, list) { - list_del_init(&page->list); - free_buffer_page(page); + list_for_each_entry_safe(bpage, tmp, head, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); } kfree(cpu_buffer); } @@ -512,7 +512,7 @@ static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); static void rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) { - struct buffer_page *page; + struct buffer_page *bpage; struct list_head *p; unsigned i; @@ -523,9 +523,9 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) return; p = cpu_buffer->pages.next; - page = list_entry(p, struct buffer_page, list); - list_del_init(&page->list); - free_buffer_page(page); + bpage = list_entry(p, struct buffer_page, list); + list_del_init(&bpage->list); + free_buffer_page(bpage); } if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) return; @@ -542,7 +542,7 @@ static void rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, struct list_head *pages, unsigned nr_pages) { - struct buffer_page *page; + struct buffer_page *bpage; struct list_head *p; unsigned i; @@ -553,9 +553,9 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, if (RB_WARN_ON(cpu_buffer, list_empty(pages))) return; p = pages->next; - page = list_entry(p, struct buffer_page, list); - list_del_init(&page->list); - list_add_tail(&page->list, &cpu_buffer->pages); + bpage = list_entry(p, struct buffer_page, list); + list_del_init(&bpage->list); + list_add_tail(&bpage->list, &cpu_buffer->pages); } rb_reset_cpu(cpu_buffer); @@ -582,7 +582,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) { struct ring_buffer_per_cpu *cpu_buffer; unsigned nr_pages, rm_pages, new_pages; - struct buffer_page *page, *tmp; + struct buffer_page *bpage, *tmp; unsigned long buffer_size; unsigned long addr; LIST_HEAD(pages); @@ -643,17 +643,17 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) for_each_buffer_cpu(buffer, cpu) { for (i = 0; i < new_pages; i++) { - page = kzalloc_node(ALIGN(sizeof(*page), + bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); - if (!page) + if (!bpage) goto free_pages; - list_add(&page->list, &pages); + list_add(&bpage->list, &pages); addr = __get_free_page(GFP_KERNEL); if (!addr) goto free_pages; - page->page = (void *)addr; - rb_init_page(page->page); + bpage->page = (void *)addr; + rb_init_page(bpage->page); } } @@ -674,9 +674,9 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) return size; free_pages: - list_for_each_entry_safe(page, tmp, &pages, list) { - list_del_init(&page->list); - free_buffer_page(page); + list_for_each_entry_safe(bpage, tmp, &pages, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); } mutex_unlock(&buffer->mutex); return 
-ENOMEM; @@ -688,14 +688,14 @@ static inline int rb_null_event(struct ring_buffer_event *event) } static inline void * -__rb_data_page_index(struct buffer_data_page *page, unsigned index) +__rb_data_page_index(struct buffer_data_page *bpage, unsigned index) { - return page->data + index; + return bpage->data + index; } -static inline void *__rb_page_index(struct buffer_page *page, unsigned index) +static inline void *__rb_page_index(struct buffer_page *bpage, unsigned index) { - return page->page->data + index; + return bpage->page->data + index; } static inline struct ring_buffer_event * @@ -771,14 +771,14 @@ static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer) } static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, - struct buffer_page **page) + struct buffer_page **bpage) { - struct list_head *p = (*page)->list.next; + struct list_head *p = (*bpage)->list.next; if (p == &cpu_buffer->pages) p = p->next; - *page = list_entry(p, struct buffer_page, list); + *bpage = list_entry(p, struct buffer_page, list); } static inline unsigned @@ -2239,16 +2239,16 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, } static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer, - struct buffer_data_page *page) + struct buffer_data_page *bpage) { struct ring_buffer_event *event; unsigned long head; __raw_spin_lock(&cpu_buffer->lock); - for (head = 0; head < local_read(&page->commit); + for (head = 0; head < local_read(&bpage->commit); head += rb_event_length(event)) { - event = __rb_data_page_index(page, head); + event = __rb_data_page_index(bpage, head); if (RB_WARN_ON(cpu_buffer, rb_null_event(event))) return; /* Only count data entries */ @@ -2277,15 +2277,15 @@ static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer, void *ring_buffer_alloc_read_page(struct ring_buffer *buffer) { unsigned long addr; - struct buffer_data_page *page; + struct buffer_data_page *bpage; addr = __get_free_page(GFP_KERNEL); if (!addr) return NULL; - page = (void *)addr; + bpage = (void *)addr; - return page; + return bpage; } /** @@ -2337,15 +2337,15 @@ int ring_buffer_read_page(struct ring_buffer *buffer, { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct ring_buffer_event *event; - struct buffer_data_page *page; + struct buffer_data_page *bpage; unsigned long flags; int ret = 0; if (!data_page) return 0; - page = *data_page; - if (!page) + bpage = *data_page; + if (!bpage) return 0; spin_lock_irqsave(&cpu_buffer->reader_lock, flags); @@ -2372,26 +2372,26 @@ int ring_buffer_read_page(struct ring_buffer *buffer, if (full) goto out; /* The writer is still on the reader page, we must copy */ - page = cpu_buffer->reader_page->page; - memcpy(page->data, + bpage = cpu_buffer->reader_page->page; + memcpy(bpage->data, cpu_buffer->reader_page->page->data + read, - local_read(&page->commit) - read); + local_read(&bpage->commit) - read); /* consume what was read */ cpu_buffer->reader_page += read; } else { /* swap the pages */ - rb_init_page(page); - page = cpu_buffer->reader_page->page; + rb_init_page(bpage); + bpage = cpu_buffer->reader_page->page; cpu_buffer->reader_page->page = *data_page; cpu_buffer->reader_page->read = 0; - *data_page = page; + *data_page = bpage; } ret = 1; /* update the entry counter */ - rb_remove_entries(cpu_buffer, page); + rb_remove_entries(cpu_buffer, bpage); out: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); -- cgit v1.2.3 From e49dc19c6a19ea112fcb94b7c62ec62cdd5c08aa Mon Sep 17 00:00:00 2001 From: Steven 
Rostedt Date: Tue, 2 Dec 2008 23:50:05 -0500 Subject: ftrace: function graph return for function entry Impact: feature, let entry function decide to trace or not This patch lets the graph tracer entry function decide if the tracing should be done at the end as well. This requires all function graph entry functions to return 1 if the call should be traced, or 0 if the return should not be traced. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 3 +++ arch/x86/kernel/entry_64.S | 3 +++ arch/x86/kernel/ftrace.c | 7 ++++++- include/linux/ftrace.h | 2 +- kernel/trace/ftrace.c | 10 +++++++--- kernel/trace/trace.c | 4 +++- kernel/trace/trace.h | 2 +- 7 files changed, 24 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 826682abed1d..43ceb3f454bf 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1196,6 +1196,9 @@ ENTRY(mcount) #ifdef CONFIG_FUNCTION_GRAPH_TRACER cmpl $ftrace_stub, ftrace_graph_return jnz ftrace_graph_caller + + cmpl $ftrace_graph_entry_stub, ftrace_graph_entry + jnz ftrace_graph_caller #endif .globl ftrace_stub ftrace_stub: diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 9060ba6497e2..54e0bbdccb99 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -120,6 +120,9 @@ ENTRY(mcount) #ifdef CONFIG_FUNCTION_GRAPH_TRACER cmpq $ftrace_stub, ftrace_graph_return jnz ftrace_graph_caller + + cmpq $ftrace_graph_entry_stub, ftrace_graph_entry + jnz ftrace_graph_caller #endif .globl ftrace_stub diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index adba8e9a427c..d278ad2ebda2 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -425,6 +425,7 @@ static void pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) trace->calltime = current->ret_stack[index].calltime; trace->overrun = atomic_read(&current->trace_overrun); trace->depth = index; + barrier(); current->curr_ret_stack--; } @@ -506,7 +507,11 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) } trace.func = self_addr; - ftrace_graph_entry(&trace); + /* Only trace if the calling function expects to */ + if (!ftrace_graph_entry(&trace)) { + current->curr_ret_stack--; + *parent = old; + } } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 58ca1c3a3f4d..469ceb3e85ba 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -371,7 +371,7 @@ struct ftrace_graph_ret { #define FTRACE_RETSTACK_ALLOC_SIZE 32 /* Type of the callback handlers for tracing function graph*/ typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *); /* return */ -typedef void (*trace_func_graph_ent_t)(struct ftrace_graph_ent *); /* entry */ +typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *); /* entry */ extern int register_ftrace_graph(trace_func_graph_ret_t retfunc, trace_func_graph_ent_t entryfunc); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a44af05ae2d0..65b9e863056b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1636,11 +1636,15 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, static atomic_t ftrace_graph_active; +int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) +{ + return 0; +} + /* The callbacks that hook a function */ trace_func_graph_ret_t ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; -trace_func_graph_ent_t ftrace_graph_entry = -
(trace_func_graph_ent_t)ftrace_stub; +trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub; /* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) @@ -1738,7 +1742,7 @@ void unregister_ftrace_graph(void) atomic_dec(&ftrace_graph_active); ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; - ftrace_graph_entry = (trace_func_graph_ent_t)ftrace_stub; + ftrace_graph_entry = ftrace_graph_entry_stub; ftrace_shutdown(FTRACE_STOP_FUNC_RET); mutex_unlock(&ftrace_sysctl_lock); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 380de630ebce..8b6409a62b54 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1200,7 +1200,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) } #ifdef CONFIG_FUNCTION_GRAPH_TRACER -void trace_graph_entry(struct ftrace_graph_ent *trace) +int trace_graph_entry(struct ftrace_graph_ent *trace) { struct trace_array *tr = &global_trace; struct trace_array_cpu *data; @@ -1219,6 +1219,8 @@ void trace_graph_entry(struct ftrace_graph_ent *trace) } atomic_dec(&data->disabled); local_irq_restore(flags); + + return 1; } void trace_graph_return(struct ftrace_graph_ret *trace) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f96f4e787ff3..0565ae9a2210 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -412,7 +412,7 @@ void trace_function(struct trace_array *tr, unsigned long flags, int pc); void trace_graph_return(struct ftrace_graph_ret *trace); -void trace_graph_entry(struct ftrace_graph_ent *trace); +int trace_graph_entry(struct ftrace_graph_ent *trace); void trace_bts(struct trace_array *tr, unsigned long from, unsigned long to); -- cgit v1.2.3 From 11e84acc400921743cc8d488e4a265cd98a655c7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 3 Dec 2008 02:30:37 +0100 Subject: tracing/function-graph-tracer: display unified style cmdline and pid Impact: extend function-graph output: let one know which thread called a function This patch implements a helper function to print the cmdline/pid pair. Its output is provided during task switching, and on each row if the new "funcgraph-proc" default-off option is set through the trace_options file. The output is center-aligned and never exceeds 14 characters. The cmdline is truncated to 7 characters. Note that if the pid exceeds 6 characters, the column will overflow (but that situation is abnormal). Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace_functions_graph.c | 113 ++++++++++++++++++++++++++++++----- 1 file changed, 99 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 894b50bca313..23e19d2dbd06 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -19,6 +19,7 @@ #define TRACE_GRAPH_PRINT_OVERRUN 0x1 #define TRACE_GRAPH_PRINT_CPU 0x2 #define TRACE_GRAPH_PRINT_OVERHEAD 0x4 +#define TRACE_GRAPH_PRINT_PROC 0x8 static struct tracer_opt trace_opts[] = { /* Display overruns ? */ @@ -27,11 +28,13 @@ static struct tracer_opt trace_opts[] = { { TRACER_OPT(funcgraph-cpu, TRACE_GRAPH_PRINT_CPU) }, /* Display Overhead ?
*/ { TRACER_OPT(funcgraph-overhead, TRACE_GRAPH_PRINT_OVERHEAD) }, + /* Display proc name/pid */ + { TRACER_OPT(funcgraph-proc, TRACE_GRAPH_PRINT_PROC) }, { } /* Empty entry */ }; static struct tracer_flags tracer_flags = { - /* Don't display overruns by default */ + /* Don't display overruns and proc by default */ .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD, .opts = trace_opts }; @@ -104,23 +107,63 @@ print_graph_cpu(struct trace_seq *s, int cpu) return TRACE_TYPE_HANDLED; } +#define TRACE_GRAPH_PROCINFO_LENGTH 14 + +static enum print_line_t +print_graph_proc(struct trace_seq *s, pid_t pid) +{ + int i; + int ret; + int len; + char comm[8]; + int spaces = 0; + /* sign + log10(MAX_INT) + '\0' */ + char pid_str[11]; + + strncpy(comm, trace_find_cmdline(pid), 7); + comm[7] = '\0'; + sprintf(pid_str, "%d", pid); + + /* 1 stands for the "-" character */ + len = strlen(comm) + strlen(pid_str) + 1; + + if (len < TRACE_GRAPH_PROCINFO_LENGTH) + spaces = TRACE_GRAPH_PROCINFO_LENGTH - len; + + /* First spaces to align center */ + for (i = 0; i < spaces / 2; i++) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + ret = trace_seq_printf(s, "%s-%s", comm, pid_str); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Last spaces to align center */ + for (i = 0; i < spaces - (spaces / 2); i++) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + return TRACE_TYPE_HANDLED; +} + /* If the pid changed since the last trace, output this event */ -static int verif_pid(struct trace_seq *s, pid_t pid, int cpu) +static enum print_line_t +verif_pid(struct trace_seq *s, pid_t pid, int cpu) { - char *comm, *prev_comm; pid_t prev_pid; int ret; if (last_pid[cpu] != -1 && last_pid[cpu] == pid) - return 1; + return TRACE_TYPE_HANDLED; prev_pid = last_pid[cpu]; last_pid[cpu] = pid; - comm = trace_find_cmdline(pid); - prev_comm = trace_find_cmdline(prev_pid); - /* * Context-switch trace line: @@ -130,11 +173,31 @@ static int verif_pid(struct trace_seq *s, pid_t pid, int cpu) */ ret = trace_seq_printf(s, - " ------------------------------------------\n"); - ret += trace_seq_printf(s, " | %d) %s-%d => %s-%d\n", - cpu, prev_comm, prev_pid, comm, pid); - ret += trace_seq_printf(s, - " ------------------------------------------\n\n"); + "\n ------------------------------------------\n |"); + if (!ret) + TRACE_TYPE_PARTIAL_LINE; + + ret = print_graph_cpu(s, cpu); + if (ret == TRACE_TYPE_PARTIAL_LINE) + TRACE_TYPE_PARTIAL_LINE; + + ret = print_graph_proc(s, prev_pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, " => "); + if (!ret) + TRACE_TYPE_PARTIAL_LINE; + + ret = print_graph_proc(s, pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, + "\n ------------------------------------------\n\n"); + if (!ret) + TRACE_TYPE_PARTIAL_LINE; + return ret; } @@ -288,12 +351,23 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, struct trace_entry *ent = iter->ent; /* Pid */ - if (!verif_pid(s, ent->pid, cpu)) + if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; /* Cpu */ if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { ret = print_graph_cpu(s, cpu); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* Proc */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { + ret = print_graph_proc(s, ent->pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return 
TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, " | "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } @@ -318,12 +392,23 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, duration = 9999999ULL; /* Pid */ - if (!verif_pid(s, ent->pid, cpu)) + if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; /* Cpu */ if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { ret = print_graph_cpu(s, cpu); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* Proc */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { + ret = print_graph_proc(s, ent->pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, " | "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } -- cgit v1.2.3 From 166d3c7994d79ab3f78f420607283361ff5cce79 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 3 Dec 2008 02:32:12 +0100 Subject: tracing/function-graph-tracer: improve duration output Impact: better trace output of duration for long calls The old duration output was capped at 9999.999 us so that it would fit the column, and the nanosecs were always 3 digits. As Ingo suggested, it's better to print the whole elapsed time in microseconds and shift the nanosecs precision as needed to fit a maximum of 7 digits. And when the usecs need more digits, the case should be rare and important enough to justify breaking the column alignment a bit to show it. So, depending on the duration value, we now have these patterns: u.nnn us uu.nnn us uuu.nnn us uuuu.nnn us uuuuu.nn us uuuuuu.n us uuuuuuuu..... us Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace_functions_graph.c | 55 ++++++++++++++++++++++++++++-------- 1 file changed, 43 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 23e19d2dbd06..c66578f2fdc2 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -232,11 +232,50 @@ trace_branch_is_leaf(struct trace_iterator *iter, } -static inline int +static enum print_line_t print_graph_duration(unsigned long long duration, struct trace_seq *s) { unsigned long nsecs_rem = do_div(duration, 1000); - return trace_seq_printf(s, "%4llu.%3lu us | ", duration, nsecs_rem); + /* log10(ULONG_MAX) + '\0' */ + char msecs_str[21]; + char nsecs_str[5]; + int ret, len; + int i; + + sprintf(msecs_str, "%lu", (unsigned long) duration); + + /* Print msecs */ + ret = trace_seq_printf(s, msecs_str); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + len = strlen(msecs_str); + + /* Print nsecs (we don't want to exceed 7 numbers) */ + if (len < 7) { + snprintf(nsecs_str, 8 - len, "%03lu", nsecs_rem); + ret = trace_seq_printf(s, ".%s", nsecs_str); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + len += strlen(nsecs_str); + } + + ret = trace_seq_printf(s, " us "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Print remaining spaces to fit the row's width */ + for (i = len; i < 7; i++) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + ret = trace_seq_printf(s, "| "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; + } /* Signal a overhead of time execution to the output */ @@ -273,10 +312,6 @@ print_graph_entry_leaf(struct trace_iterator *iter, call = &entry->graph_ent; duration = graph_ret->rettime - graph_ret->calltime; - /* Must not exceed 8 characters: 9999.999 us */ - if (duration > 10000000ULL) - duration =
9999999ULL; - /* Overhead */ if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { ret = print_graph_overhead(duration, s); @@ -286,7 +321,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, /* Duration */ ret = print_graph_duration(duration, s); - if (!ret) + if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; /* Function */ @@ -387,10 +422,6 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, int ret; unsigned long long duration = trace->rettime - trace->calltime; - /* Must not exceed 8 characters: xxxx.yyy us */ - if (duration > 10000000ULL) - duration = 9999999ULL; - /* Pid */ if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; @@ -422,7 +453,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, /* Duration */ ret = print_graph_duration(duration, s); - if (!ret) + if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; /* Closing brace */ -- cgit v1.2.3 From 201955463a5c1a70d3f70d1598b27e4c2c402642 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Tue, 2 Dec 2008 22:55:38 +0100 Subject: check_hung_task(): unsigned sysctl_hung_task_warnings cannot be less than 0 Impact: fix warnings-limit cutoff check for debug feature An unsigned sysctl_hung_task_warnings can never be less than 0, so test it for zero instead. Signed-off-by: Roel Kluin Signed-off-by: Ingo Molnar --- kernel/softlockup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 3953e4aed733..dc0b3be6b7d5 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -188,7 +188,7 @@ static void check_hung_task(struct task_struct *t, unsigned long now) if ((long)(now - t->last_switch_timestamp) < sysctl_hung_task_timeout_secs) return; - if (sysctl_hung_task_warnings < 0) + if (!sysctl_hung_task_warnings) return; sysctl_hung_task_warnings--; -- cgit v1.2.3 From 764f3b95131a7ce5c992e3d00caf590fcada2f7b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 3 Dec 2008 10:33:58 +0100 Subject: tracing/function-graph-tracer: enabled by default CONFIG_FUNCTION_GRAPH_TRACER already depends on FUNCTION_TRACER (which turns it non-default), so making it default-n is pointless. So enable it by default - it's a nice extension of the function tracer. Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8b6b673b4d6c..bde6f03512d5 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -67,6 +67,7 @@ config FUNCTION_GRAPH_TRACER bool "Kernel Function Graph Tracer" depends on HAVE_FUNCTION_GRAPH_TRACER depends on FUNCTION_TRACER + default y help Enable the kernel to trace a function at both its return and its entry. -- cgit v1.2.3 From 0a37119d963e876ca86912497346ec50dea2541b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 3 Dec 2008 11:04:50 -0500 Subject: trace: fix output of stack trace Impact: fix to output of stack trace If a function is not found in the stack of the stack tracer, the number printed is quite strange. This fixes the algorithm to handle missing functions better. 
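To make the fix concrete, here is a minimal userspace model of the corrected search loop; the recorded trace and raw stack contents below are made up purely for illustration, and this is not the kernel code itself:

    #include <stdio.h>

    #define N_ENTRIES 4

    /* Toy stand-ins for stack_dump_trace[] and the raw stack dump. */
    static unsigned long trace[N_ENTRIES] = { 0x30, 0x10, 0x40, 0x50 };
    static unsigned long stack[] = { 0x30, 0x99, 0x10, 0x40, 0x77, 0x50 };

    int main(void)
    {
            unsigned long *top = stack + sizeof(stack) / sizeof(stack[0]);
            unsigned long *start = stack;
            int i = 0;

            while (i < N_ENTRIES) {
                    int found = 0;
                    unsigned long *p;

                    for (p = start; p < top && i < N_ENTRIES; p++) {
                            if (*p == trace[i]) {
                                    printf("entry %d found, depth %ld\n",
                                           i, (long)(top - p));
                                    i++;
                                    found = 1;
                                    start = p + 1; /* resume after the hit */
                            }
                    }
                    /* The fix: only skip an entry when it was truly missing,
                     * instead of unconditionally advancing the index. */
                    if (!found)
                            i++;
            }
            return 0;
    }

With a function missing from the stack (say trace[1] never appears), the old unconditional increment walked past valid entries and produced nonsense sizes; the found flag keeps the index and the stack cursor in step.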
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_stack.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 06a16115be0f..0b863f2cbc8e 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -78,6 +78,7 @@ static inline void check_stack(void) * on a new max, so it is far from a fast path. */ while (i < max_stack_trace.nr_entries) { + int found = 0; stack_dump_index[i] = this_size; p = start; @@ -86,12 +87,14 @@ static inline void check_stack(void) if (*p == stack_dump_trace[i]) { this_size = stack_dump_index[i++] = (top - p) * sizeof(unsigned long); + found = 1; /* Start the search from here */ start = p + 1; } } - i++; + if (!found) + i++; } out: -- cgit v1.2.3 From e8e1abe92fd7ea9d823a3aaf81d10e2cba593b6b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 3 Dec 2008 11:04:51 -0500 Subject: ftrace: fix race in function graph during fork Impact: graph tracer race/crash fix There is a nasty race in the startup of a new process running the function graph tracer. In fork.c: total_forks++; spin_unlock(&current->sighand->siglock); write_unlock_irq(&tasklist_lock); ftrace_graph_init_task(p); proc_fork_connector(p); cgroup_post_fork(p); return p; The new task is free to run as soon as the tasklist_lock is released. This happens before ftrace_graph_init_task() is called. If the task does run it will be using the same ret_stack and curr_ret_stack as the parent. This will cause crashes that are difficult to debug. This patch moves ftrace_graph_init_task() to just after the alloc_pid code. This fixes the above race. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/fork.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 5f82a999c032..7407ab319875 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1137,6 +1137,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, } } + ftrace_graph_init_task(p); + p->pid = pid_nr(pid); p->tgid = p->pid; if (clone_flags & CLONE_THREAD) @@ -1145,7 +1147,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (current->nsproxy != p->nsproxy) { retval = ns_cgroup_clone(p, pid); if (retval) - goto bad_fork_free_pid; + goto bad_fork_free_graph; } p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; @@ -1238,7 +1240,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, spin_unlock(&current->sighand->siglock); write_unlock_irq(&tasklist_lock); retval = -ERESTARTNOINTR; - goto bad_fork_free_pid; + goto bad_fork_free_graph; } if (clone_flags & CLONE_THREAD) { @@ -1271,11 +1273,12 @@ static struct task_struct *copy_process(unsigned long clone_flags, total_forks++; spin_unlock(&current->sighand->siglock); write_unlock_irq(&tasklist_lock); - ftrace_graph_init_task(p); proc_fork_connector(p); cgroup_post_fork(p); return p; +bad_fork_free_graph: + ftrace_graph_exit_task(p); bad_fork_free_pid: if (pid != &init_struct_pid) free_pid(pid); -- cgit v1.2.3 From 6c9bacb41c10ba84ff68f238e234d96f35fb64f7 Mon Sep 17 00:00:00 2001 From: john stultz Date: Mon, 1 Dec 2008 18:34:41 -0800 Subject: time: catch xtime_nsec underflows and fix them Impact: fix time warp bug Alex Shi, along with Yanmin Zhang, have been noticing occasional time inconsistencies recently. Through their great diagnosis, they found that the xtime_nsec value used in update_wall_time was occasionally going negative. 
After looking through the code for a while, I realized we have the possibility for an underflow when three conditions are met in update_wall_time(): 1) We have accumulated a second's worth of nanoseconds, so we increment xtime.tv_sec and appropriately decrement xtime_nsec. (This doesn't cause xtime_nsec to go negative, but it can cause it to be small). 2) The remaining offset value is large, but just slightly less than cycle_interval. 3) clocksource_adjust() is speeding up the clock, causing a corrective amount (compensating for the increase in the multiplier being multiplied against the unaccumulated offset value) to be subtracted from xtime_nsec. This can cause xtime_nsec to underflow. Unfortunately, since we notify the NTP subsystem via second_overflow() whenever we accumulate a full second, and this affects the error accumulation that has already occurred, we cannot simply revert the accumulated second from xtime nor move the second accumulation to after the clocksource_adjust call without a change in behavior. This leaves us with (at least) two options: 1) Simply return from clocksource_adjust() without making a change if we notice the adjustment would cause xtime_nsec to go negative. This would work, but I'm concerned that if a large adjustment was needed (due to the error being large), it may be possible to get stuck with an ever increasing error that becomes too large to correct (since it may always force xtime_nsec negative). This may just be paranoia on my part. 2) Catch xtime_nsec if it is negative, then add back the amount by which it is negative to both xtime_nsec and the error. This second method is consistent with how we've handled earlier rounding issues, and also has the benefit that the error being added is always in the opposite direction and always equal to or smaller than the correction being applied. So the risk of a corner case where things get out of control is lessened. This patch fixes bug 11970, as tested by Yanmin Zhang http://bugzilla.kernel.org/show_bug.cgi?id=11970 Reported-by: alex.shi@intel.com Signed-off-by: John Stultz Acked-by: "Zhang, Yanmin" Tested-by: "Zhang, Yanmin" Signed-off-by: Ingo Molnar --- kernel/time/timekeeping.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e7acfb482a68..fa05e88aa76f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -518,6 +518,28 @@ void update_wall_time(void) /* correct the clock when NTP error is too big */ clocksource_adjust(offset); + /* + * Since in the loop above, we accumulate any amount of time + * in xtime_nsec over a second into xtime.tv_sec, its possible for + * xtime_nsec to be fairly small after the loop. Further, if we're + * slightly speeding the clocksource up in clocksource_adjust(), + * its possible the required corrective factor to xtime_nsec could + * cause it to underflow. + * + * Now, we cannot simply roll the accumulated second back, since + * the NTP subsystem has been notified via second_overflow. So + * instead we push xtime_nsec forward by the amount we underflowed, + * and add that amount into the error. + * + * We'll correct this error next time through this function, when + * xtime_nsec is not as small. 
+ */ + if (unlikely((s64)clock->xtime_nsec < 0)) { + s64 neg = -(s64)clock->xtime_nsec; + clock->xtime_nsec = 0; + clock->error += neg << (NTP_SCALE_SHIFT - clock->shift); + } + /* store full nanoseconds into xtime after rounding it up and * add the remainder to the error difference. */ -- cgit v1.2.3 From ea4e2bc4d9f7370e57a343ccb5e7c0ad3222ec3c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 3 Dec 2008 15:36:57 -0500 Subject: ftrace: graph of a single function This patch adds the file: /debugfs/tracing/set_graph_function which can be used along with the function graph tracer. When this file is empty, the function graph tracer will act as usual. When the file has a function in it, the function graph tracer will only trace that function. For example: # echo blk_unplug > /debugfs/tracing/set_graph_function # cat /debugfs/tracing/trace [...] ------------------------------------------ | 2) make-19003 => kjournald-2219 ------------------------------------------ 2) | blk_unplug() { 2) | dm_unplug_all() { 2) | dm_get_table() { 2) 1.381 us | _read_lock(); 2) 0.911 us | dm_table_get(); 2) 1. 76 us | _read_unlock(); 2) + 12.912 us | } 2) | dm_table_unplug_all() { 2) | blk_unplug() { 2) 0.778 us | generic_unplug_device(); 2) 2.409 us | } 2) 5.992 us | } 2) 0.813 us | dm_table_put(); 2) + 29. 90 us | } 2) + 34.532 us | } You can add up to 32 functions into this file. Currently we limit it to 32, but this may change with later improvements. To add another function, use the append '>>': # echo sys_read >> /debugfs/tracing/set_graph_function # cat /debugfs/tracing/set_graph_function blk_unplug sys_read Using the '>' will clear out the function and write anew: # echo sys_write > /debug/tracing/set_graph_function # cat /debug/tracing/set_graph_function sys_write Note, if you have function graph running while doing this, the small time between clearing it and updating it will cause the graph to record all functions. This should not be an issue because after it sets the filter, only those functions will be recorded from then on. If you need to only record a particular function then set this file first before starting the function graph tracer. In the future this side effect may be corrected. The set_graph_function file is similar to the set_ftrace_filter but it does not take wild cards nor does it allow for more than one function to be set with a single write. There is no technical reason why this is the case, I just do not have the time yet to implement that. Note, dynamic ftrace must be enabled for this to appear because it uses the dynamic ftrace records to match the name to the mcount call sites. 
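Conceptually, the hot-path side of this file is just a membership test over a small fixed array, as ftrace_graph_addr() in a later patch shows. A self-contained userspace sketch of that idea, with invented addresses and without any of the locking or kallsyms lookups the kernel needs:

    #include <stdio.h>

    #define GRAPH_MAX_FUNCS 32 /* mirrors the current limit of 32 */

    static unsigned long graph_funcs[GRAPH_MAX_FUNCS];
    static int graph_count;

    /* An empty filter traces everything; otherwise only listed
     * addresses are allowed through. */
    static int graph_addr_wanted(unsigned long addr)
    {
            int i;

            if (!graph_count)
                    return 1;
            for (i = 0; i < graph_count; i++)
                    if (graph_funcs[i] == addr)
                            return 1;
            return 0;
    }

    int main(void)
    {
            graph_funcs[graph_count++] = 0xc0de; /* e.g. blk_unplug */

            printf("%d\n", graph_addr_wanted(0xc0de)); /* 1: traced */
            printf("%d\n", graph_addr_wanted(0xbeef)); /* 0: skipped */
            return 0;
    }

A linear scan is fine here because the array is tiny and the common case (empty filter) returns immediately.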
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 46 ++++++++++ include/linux/sched.h | 4 + kernel/trace/ftrace.c | 227 +++++++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace.c | 8 ++ kernel/trace/trace.h | 30 ++++++- 5 files changed, 314 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 469ceb3e85ba..b295d3106bfe 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -7,6 +7,7 @@ #include #include #include +#include #ifdef CONFIG_FUNCTION_TRACER @@ -391,4 +392,49 @@ static inline void ftrace_graph_init_task(struct task_struct *t) { } static inline void ftrace_graph_exit_task(struct task_struct *t) { } #endif +#ifdef CONFIG_TRACING +#include + +/* flags for current->trace */ +enum { + TSK_TRACE_FL_TRACE_BIT = 0, + TSK_TRACE_FL_GRAPH_BIT = 1, +}; +enum { + TSK_TRACE_FL_TRACE = 1 << TSK_TRACE_FL_TRACE_BIT, + TSK_TRACE_FL_GRAPH = 1 << TSK_TRACE_FL_GRAPH_BIT, +}; + +static inline void set_tsk_trace_trace(struct task_struct *tsk) +{ + set_bit(TSK_TRACE_FL_TRACE_BIT, &tsk->trace); +} + +static inline void clear_tsk_trace_trace(struct task_struct *tsk) +{ + clear_bit(TSK_TRACE_FL_TRACE_BIT, &tsk->trace); +} + +static inline int test_tsk_trace_trace(struct task_struct *tsk) +{ + return tsk->trace & TSK_TRACE_FL_TRACE; +} + +static inline void set_tsk_trace_graph(struct task_struct *tsk) +{ + set_bit(TSK_TRACE_FL_GRAPH_BIT, &tsk->trace); +} + +static inline void clear_tsk_trace_graph(struct task_struct *tsk) +{ + clear_bit(TSK_TRACE_FL_GRAPH_BIT, &tsk->trace); +} + +static inline int test_tsk_trace_graph(struct task_struct *tsk) +{ + return tsk->trace & TSK_TRACE_FL_GRAPH; +} + +#endif /* CONFIG_TRACING */ + #endif /* _LINUX_FTRACE_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 2d0a93c31228..4c152e0acc9e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1380,6 +1380,10 @@ struct task_struct { */ atomic_t trace_overrun; #endif +#ifdef CONFIG_TRACING + /* state flags for use by tracers */ + unsigned long trace; +#endif }; /* diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 65b9e863056b..b17a30350f06 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1320,6 +1320,224 @@ static struct file_operations ftrace_notrace_fops = { .release = ftrace_notrace_release, }; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + +static DEFINE_MUTEX(graph_lock); + +int ftrace_graph_count; +unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; + +static void * +g_next(struct seq_file *m, void *v, loff_t *pos) +{ + unsigned long *array = m->private; + int index = *pos; + + (*pos)++; + + if (index >= ftrace_graph_count) + return NULL; + + return &array[index]; +} + +static void *g_start(struct seq_file *m, loff_t *pos) +{ + void *p = NULL; + + mutex_lock(&graph_lock); + + p = g_next(m, p, pos); + + return p; +} + +static void g_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&graph_lock); +} + +static int g_show(struct seq_file *m, void *v) +{ + unsigned long *ptr = v; + char str[KSYM_SYMBOL_LEN]; + + if (!ptr) + return 0; + + kallsyms_lookup(*ptr, NULL, NULL, NULL, str); + + seq_printf(m, "%s\n", str); + + return 0; +} + +static struct seq_operations ftrace_graph_seq_ops = { + .start = g_start, + .next = g_next, + .stop = g_stop, + .show = g_show, +}; + +static int +ftrace_graph_open(struct inode *inode, struct file *file) +{ + int ret = 0; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + 
mutex_lock(&graph_lock); + if ((file->f_mode & FMODE_WRITE) && + !(file->f_flags & O_APPEND)) { + ftrace_graph_count = 0; + memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); + } + + if (file->f_mode & FMODE_READ) { + ret = seq_open(file, &ftrace_graph_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = ftrace_graph_funcs; + } + } else + file->private_data = ftrace_graph_funcs; + mutex_unlock(&graph_lock); + + return ret; +} + +static ssize_t +ftrace_graph_read(struct file *file, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + if (file->f_mode & FMODE_READ) + return seq_read(file, ubuf, cnt, ppos); + else + return -EPERM; +} + +static int +ftrace_set_func(unsigned long *array, int idx, char *buffer) +{ + char str[KSYM_SYMBOL_LEN]; + struct dyn_ftrace *rec; + struct ftrace_page *pg; + int found = 0; + int i; + + if (ftrace_disabled) + return -ENODEV; + + /* should not be called from interrupt context */ + spin_lock(&ftrace_lock); + + for (pg = ftrace_pages_start; pg; pg = pg->next) { + for (i = 0; i < pg->index; i++) { + rec = &pg->records[i]; + + if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) + continue; + + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + if (strcmp(str, buffer) == 0) { + found = 1; + array[idx] = rec->ip; + break; + } + } + } + spin_unlock(&ftrace_lock); + + return found ? 0 : -EINVAL; +} + +static ssize_t +ftrace_graph_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned char buffer[FTRACE_BUFF_MAX+1]; + unsigned long *array; + size_t read = 0; + ssize_t ret; + int index = 0; + char ch; + + if (!cnt || cnt < 0) + return 0; + + mutex_lock(&graph_lock); + + if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) { + ret = -EBUSY; + goto out; + } + + if (file->f_mode & FMODE_READ) { + struct seq_file *m = file->private_data; + array = m->private; + } else + array = file->private_data; + + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + + /* skip white space */ + while (cnt && isspace(ch)) { + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + } + + if (isspace(ch)) { + *ppos += read; + ret = read; + goto out; + } + + while (cnt && !isspace(ch)) { + if (index < FTRACE_BUFF_MAX) + buffer[index++] = ch; + else { + ret = -EINVAL; + goto out; + } + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + } + buffer[index] = 0; + + /* we allow only one at a time */ + ret = ftrace_set_func(array, ftrace_graph_count, buffer); + if (ret) + goto out; + + ftrace_graph_count++; + + file->f_pos += read; + + ret = read; + out: + mutex_unlock(&graph_lock); + + return ret; +} + +static const struct file_operations ftrace_graph_fops = { + .open = ftrace_graph_open, + .read = ftrace_graph_read, + .write = ftrace_graph_write, +}; +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { struct dentry *entry; @@ -1347,6 +1565,15 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) pr_warning("Could not create debugfs " "'set_ftrace_notrace' entry\n"); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + entry = debugfs_create_file("set_graph_function", 0444, d_tracer, + NULL, + &ftrace_graph_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'set_graph_function' entry\n"); +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + return 0; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8b6409a62b54..710b39acd81b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ 
-1209,6 +1209,9 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) int cpu; int pc; + if (!ftrace_graph_addr(trace->func)) + return 0; + local_irq_save(flags); cpu = raw_smp_processor_id(); data = tr->data[cpu]; @@ -1217,6 +1220,9 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) pc = preempt_count(); __trace_graph_entry(tr, data, trace, flags, pc); } + /* Only do the atomic if it is not already set */ + if (!test_tsk_trace_graph(current)) + set_tsk_trace_graph(current); atomic_dec(&data->disabled); local_irq_restore(flags); @@ -1240,6 +1246,8 @@ void trace_graph_return(struct ftrace_graph_ret *trace) pc = preempt_count(); __trace_graph_return(tr, data, trace, flags, pc); } + if (!trace->depth) + clear_tsk_trace_graph(current); atomic_dec(&data->disabled); local_irq_restore(flags); } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 0565ae9a2210..41f026bfc9ed 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -505,13 +505,41 @@ extern unsigned long trace_flags; /* Standard output formatting function used for function return traces */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER extern enum print_line_t print_graph_function(struct trace_iterator *iter); + +#ifdef CONFIG_DYNAMIC_FTRACE +/* TODO: make this variable */ +#define FTRACE_GRAPH_MAX_FUNCS 32 +extern int ftrace_graph_count; +extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; + +static inline int ftrace_graph_addr(unsigned long addr) +{ + int i; + + if (!ftrace_graph_count || test_tsk_trace_graph(current)) + return 1; + + for (i = 0; i < ftrace_graph_count; i++) { + if (addr == ftrace_graph_funcs[i]) + return 1; + } + + return 0; +} #else +static inline int ftrace_trace_addr(unsigned long addr) +{ + return 1 +} +#endif /* CONFIG_DYNAMIC_FTRACE */ + +#else /* CONFIG_FUNCTION_GRAPH_TRACER */ static inline enum print_line_t print_graph_function(struct trace_iterator *iter) { return TRACE_TYPE_UNHANDLED; } -#endif +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ /* * trace_iterator_flags is an enumeration that defines bit -- cgit v1.2.3 From 0ef8cde56ab92ab3f65221246dc1622c6b5068b3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 3 Dec 2008 15:36:58 -0500 Subject: ftrace: use task struct trace flag to filter on pid Impact: clean up Use the new task struct trace flags to determine if a process should be traced or not. Note: this moves the searching of the pid to the slow path of setting the pid field. This needs to be converted to the pid name space. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b17a30350f06..c5049f54a275 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -47,7 +47,7 @@ int ftrace_enabled __read_mostly; static int last_ftrace_enabled; -/* ftrace_pid_trace >= 0 will only trace threads with this pid */ +/* set when tracing only a pid */ static int ftrace_pid_trace = -1; /* Quick disabling of function tracer. 
*/ @@ -90,7 +90,7 @@ static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip) { - if (current->pid != ftrace_pid_trace) + if (!test_tsk_trace_trace(current)) return; ftrace_pid_function(ip, parent_ip); @@ -1714,11 +1714,33 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf, ftrace_pid_trace = -1; } else { + struct task_struct *p; + int found = 0; if (ftrace_pid_trace == val) goto out; - ftrace_pid_trace = val; + /* + * Find the task that matches this pid. + * TODO: use pid namespaces instead. + */ + rcu_read_lock(); + for_each_process(p) { + if (p->pid == val) { + found = 1; + set_tsk_trace_trace(p); + } else if (test_tsk_trace_trace(p)) + clear_tsk_trace_trace(p); + } + rcu_read_unlock(); + + if (found) + ftrace_pid_trace = val; + else { + if (ftrace_pid_trace < 0) + goto out; + ftrace_pid_trace = -1; + } } /* update the function call */ -- cgit v1.2.3 From 804a685162a7080386714166776f57255a75238e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 3 Dec 2008 15:36:59 -0500 Subject: ftrace: trace single pid for function graph tracer Impact: New feature This patch makes the changes to set_ftrace_pid apply to the function graph tracer. # echo $$ > /debugfs/tracing/set_ftrace_pid # echo function_graph > /debugfs/tracing/current_tracer Will cause only the current task to be traced. Note, the trace flags are also inherited by child processes, so the children of the shell will also be traced. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 2 +- kernel/trace/trace.c | 3 +++ kernel/trace/trace.h | 10 ++++++++++ 3 files changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c5049f54a275..57592a9dd630 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -48,7 +48,7 @@ int ftrace_enabled __read_mostly; static int last_ftrace_enabled; /* set when tracing only a pid */ -static int ftrace_pid_trace = -1; +int ftrace_pid_trace = -1; /* Quick disabling of function tracer. */ int function_trace_stop; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 710b39acd81b..1bd9574404e5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1209,6 +1209,9 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) int cpu; int pc; + if (!ftrace_trace_task(current)) + return 0; + if (!ftrace_graph_addr(trace->func)) return 0; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 41f026bfc9ed..95fff37ed970 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -541,6 +541,16 @@ print_graph_function(struct trace_iterator *iter) } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ +extern int ftrace_pid_trace; + +static inline int ftrace_trace_task(struct task_struct *task) +{ + if (ftrace_pid_trace < 0) + return 1; + + return test_tsk_trace_trace(task); +} + /* * trace_iterator_flags is an enumeration that defines bit * positions into trace_flags that controls the output. -- cgit v1.2.3 From 978f3a45d9499c7a447ca7615455cefb63d44165 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 4 Dec 2008 00:26:40 -0500 Subject: ftrace: use struct pid Impact: clean up, extend PID filtering to PID namespaces Eric Biederman suggested using the struct pid for filtering on pids in the kernel. This patch is based off of a demonstration of an implementation that Eric sent me in an email. 
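At its core the conversion swaps a raw number for a reference-counted handle, so a recycled pid can no longer fool the filter. A rough userspace sketch of the get/put pattern, with invented types that merely imitate the kernel's get_pid()/put_pid() semantics:

    #include <stdio.h>
    #include <stdlib.h>

    /* Toy stand-in for the kernel's refcounted struct pid. */
    struct pid_handle {
            int nr;
            int refcount;
    };

    static struct pid_handle *get_handle(struct pid_handle *p)
    {
            if (p)
                    p->refcount++;
            return p;
    }

    static void put_handle(struct pid_handle *p)
    {
            if (p && --p->refcount == 0) {
                    printf("pid %d released\n", p->nr);
                    free(p);
            }
    }

    int main(void)
    {
            struct pid_handle *filter = calloc(1, sizeof(*filter));

            if (!filter)
                    return 1;
            filter->nr = 42;
            filter->refcount = 1; /* reference held by the filter */

            get_handle(filter); /* another user takes a reference */
            put_handle(filter); /* ...and drops it: still alive */
            put_handle(filter); /* filter cleared: freed for real */
            return 0;
    }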
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 76 +++++++++++++++++++++++++++++---------------------- kernel/trace/trace.h | 4 +-- 2 files changed, 46 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 57592a9dd630..10b1d7c1b1dd 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -48,7 +48,7 @@ int ftrace_enabled __read_mostly; static int last_ftrace_enabled; /* set when tracing only a pid */ -int ftrace_pid_trace = -1; +struct pid *ftrace_pid_trace; /* Quick disabling of function tracer. */ int function_trace_stop; @@ -153,7 +153,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops) else func = ftrace_list_func; - if (ftrace_pid_trace >= 0) { + if (ftrace_pid_trace) { set_ftrace_pid_function(func); func = ftrace_pid_func; } @@ -209,7 +209,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) if (ftrace_list->next == &ftrace_list_end) { ftrace_func_t func = ftrace_list->func; - if (ftrace_pid_trace >= 0) { + if (ftrace_pid_trace) { set_ftrace_pid_function(func); func = ftrace_pid_func; } @@ -239,7 +239,7 @@ static void ftrace_update_pid_func(void) func = ftrace_trace_function; - if (ftrace_pid_trace >= 0) { + if (ftrace_pid_trace) { set_ftrace_pid_function(func); func = ftrace_pid_func; } else { @@ -1678,18 +1678,40 @@ ftrace_pid_read(struct file *file, char __user *ubuf, char buf[64]; int r; - if (ftrace_pid_trace >= 0) - r = sprintf(buf, "%u\n", ftrace_pid_trace); + if (ftrace_pid_trace) + r = sprintf(buf, "%u\n", pid_nr(ftrace_pid_trace)); else r = sprintf(buf, "no pid\n"); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } +static void clear_ftrace_pid_task(struct pid **pid) +{ + struct task_struct *p; + + do_each_pid_task(*pid, PIDTYPE_PID, p) { + clear_tsk_trace_trace(p); + } while_each_pid_task(*pid, PIDTYPE_PID, p); + put_pid(*pid); + + *pid = NULL; +} + +static void set_ftrace_pid_task(struct pid *pid) +{ + struct task_struct *p; + + do_each_pid_task(pid, PIDTYPE_PID, p) { + set_tsk_trace_trace(p); + } while_each_pid_task(pid, PIDTYPE_PID, p); +} + static ssize_t ftrace_pid_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { + struct pid *pid; char buf[64]; long val; int ret; @@ -1707,40 +1729,30 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf, return ret; mutex_lock(&ftrace_start_lock); - if (ret < 0) { + if (val < 0) { /* disable pid tracing */ - if (ftrace_pid_trace < 0) + if (!ftrace_pid_trace) goto out; - ftrace_pid_trace = -1; + + clear_ftrace_pid_task(&ftrace_pid_trace); } else { - struct task_struct *p; - int found = 0; + pid = find_get_pid(val); - if (ftrace_pid_trace == val) + if (pid == ftrace_pid_trace) { + put_pid(pid); goto out; - - /* - * Find the task that matches this pid. - * TODO: use pid namespaces instead. 
- */ - rcu_read_lock(); - for_each_process(p) { - if (p->pid == val) { - found = 1; - set_tsk_trace_trace(p); - } else if (test_tsk_trace_trace(p)) - clear_tsk_trace_trace(p); - } - rcu_read_unlock(); - - if (found) - ftrace_pid_trace = val; - else { - if (ftrace_pid_trace < 0) - goto out; - ftrace_pid_trace = -1; - } + if (ftrace_pid_trace) + clear_ftrace_pid_task(&ftrace_pid_trace); + + if (!pid) + goto out; + + ftrace_pid_trace = pid; + + set_ftrace_pid_task(ftrace_pid_trace); } /* update the function call */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 95fff37ed970..8b81b4d727bd 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -541,11 +541,11 @@ print_graph_function(struct trace_iterator *iter) } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -extern int ftrace_pid_trace; +extern struct pid *ftrace_pid_trace; static inline int ftrace_trace_task(struct task_struct *task) { - if (ftrace_pid_trace < 0) + if (ftrace_pid_trace) return 1; return test_tsk_trace_trace(task); -- cgit v1.2.3 From e32d89569128e76bdf84867be0928902ca9f7555 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 4 Dec 2008 00:26:41 -0500 Subject: ftrace: add ability to only trace swapper tasks Impact: new feature This patch lets the swapper tasks of all CPUs be filtered by the set_ftrace_pid file. If '0' is echoed into this file, then all the idle tasks (aka swapper) are flagged to be traced. This affects all CPU idle tasks. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 74 +++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 63 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 10b1d7c1b1dd..eb57dc1ea097 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -49,6 +49,7 @@ static int last_ftrace_enabled; /* set when tracing only a pid */ struct pid *ftrace_pid_trace; +static struct pid * const ftrace_swapper_pid = (struct pid *)1; /* Quick disabling of function tracer. 
*/ int function_trace_stop; @@ -1678,7 +1679,9 @@ ftrace_pid_read(struct file *file, char __user *ubuf, char buf[64]; int r; - if (ftrace_pid_trace) + if (ftrace_pid_trace == ftrace_swapper_pid) + r = sprintf(buf, "swapper tasks\n"); + else if (ftrace_pid_trace) r = sprintf(buf, "%u\n", pid_nr(ftrace_pid_trace)); else r = sprintf(buf, "no pid\n"); @@ -1686,19 +1689,43 @@ ftrace_pid_read(struct file *file, char __user *ubuf, return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } -static void clear_ftrace_pid_task(struct pid **pid) +static void clear_ftrace_swapper(void) { struct task_struct *p; + int cpu; - do_each_pid_task(*pid, PIDTYPE_PID, p) { + get_online_cpus(); + for_each_online_cpu(cpu) { + p = idle_task(cpu); clear_tsk_trace_trace(p); - } while_each_pid_task(*pid, PIDTYPE_PID, p); - put_pid(*pid); + } + put_online_cpus(); +} - *pid = NULL; +static void set_ftrace_swapper(void) +{ + struct task_struct *p; + int cpu; + + get_online_cpus(); + for_each_online_cpu(cpu) { + p = idle_task(cpu); + set_tsk_trace_trace(p); + } + put_online_cpus(); } -static void set_ftrace_pid_task(struct pid *pid) +static void clear_ftrace_pid(struct pid *pid) +{ + struct task_struct *p; + + do_each_pid_task(pid, PIDTYPE_PID, p) { + clear_tsk_trace_trace(p); + } while_each_pid_task(pid, PIDTYPE_PID, p); + put_pid(pid); +} + +static void set_ftrace_pid(struct pid *pid) { struct task_struct *p; @@ -1707,6 +1734,24 @@ static void set_ftrace_pid_task(struct pid *pid) } while_each_pid_task(pid, PIDTYPE_PID, p); } +static void clear_ftrace_pid_task(struct pid **pid) +{ + if (*pid == ftrace_swapper_pid) + clear_ftrace_swapper(); + else + clear_ftrace_pid(*pid); + + *pid = NULL; +} + +static void set_ftrace_pid_task(struct pid *pid) +{ + if (pid == ftrace_swapper_pid) + set_ftrace_swapper(); + else + set_ftrace_pid(pid); +} + static ssize_t ftrace_pid_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) @@ -1737,11 +1782,18 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf, clear_ftrace_pid_task(&ftrace_pid_trace); } else { - pid = find_get_pid(val); + /* swapper task is special */ + if (!val) { + pid = ftrace_swapper_pid; + if (pid == ftrace_pid_trace) + goto out; + } else { + pid = find_get_pid(val); - if (pid == ftrace_pid_trace) { - put_pid(pid); - goto out; + if (pid == ftrace_pid_trace) { + put_pid(pid); + goto out; + } } if (ftrace_pid_trace) -- cgit v1.2.3 From 6b2539302bee8e88c99e3c7d80c16a04dbe5e2ad Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 4 Dec 2008 09:18:28 +0100 Subject: tracing: fix typo and missing inline function Impact: fix build bugs Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8b81b4d727bd..b4b7b735184d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -529,7 +529,11 @@ static inline int ftrace_graph_addr(unsigned long addr) #else static inline int ftrace_trace_addr(unsigned long addr) { - return 1 + return 1; +} +static inline int ftrace_graph_addr(unsigned long addr) +{ + return 1; } #endif /* CONFIG_DYNAMIC_FTRACE */ -- cgit v1.2.3 From faec2ec505d397e9426754722b6e80d519c4938f Mon Sep 17 00:00:00 2001 From: Liming Wang Date: Thu, 4 Dec 2008 14:24:49 +0800 Subject: ftrace: avoid duplicated function when writing set_graph_function Impact: fix a bug in function filter setting When writing a function to set_graph_function, we should check whether it already exists in set_graph_function to 
avoid adding it twice. Signed-off-by: Liming Wang Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index eb57dc1ea097..d2b156538162 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1425,7 +1425,7 @@ ftrace_set_func(unsigned long *array, int idx, char *buffer) struct dyn_ftrace *rec; struct ftrace_page *pg; int found = 0; - int i; + int i, j; if (ftrace_disabled) return -ENODEV; @@ -1443,7 +1443,13 @@ ftrace_set_func(unsigned long *array, int idx, char *buffer) kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); if (strcmp(str, buffer) == 0) { found = 1; - array[idx] = rec->ip; + for (j = 0; j < idx; j++) + if (array[j] == rec->ip) { + found = 0; + break; + } + if (found) + array[idx] = rec->ip; break; } } -- cgit v1.2.3 From 1fd8f2a3f9a91b287a876cef830b21baafc8a799 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 3 Dec 2008 23:45:11 +0100 Subject: tracing/function-graph-tracer: handle ftrace_printk entries Handle the TRACE_PRINT entries from the function graph tracer and output them as a C comment just below the function that called it, as if it were a comment inside that function. Example with an ftrace_printk inside the might_sleep() function: void __might_sleep(char *file, int line) { static unsigned long prev_jiffy; /* ratelimiting */ ftrace_printk("Hi I'm a comment in might_sleep() :-)"); A chunk of a resulting trace: 0) | _reiserfs_free_block() { 0) | reiserfs_read_bitmap_block() { 0) | __bread() { 0) | __getblk() { 0) | __find_get_block() { 0) 0.698 us | mark_page_accessed(); 0) 2.267 us | } 0) | __might_sleep() { 0) | /* Hi I'm a comment in might_sleep() :-) */ 0) 1.321 us | } 0) 5.872 us | } 0) 7.313 us | } 0) 8.718 us | } And this patch brings two minor fixes: - The newline after a switch-out task had disappeared - The "|" sign just before the cpu number on task-switch has been deleted. 0) 0.616 us | pick_next_task_rt(); 0) 1.457 us | _spin_trylock(); 0) 0.653 us | _spin_unlock(); 0) 0.728 us | _spin_trylock(); 0) 0.631 us | _spin_unlock(); 0) 0.729 us | native_load_sp0(); 0) 0.593 us | native_load_tls(); ------------------------------------------ 0) cat-2834 => migrati-3 ------------------------------------------ 0) | finish_task_switch() { 0) 0.841 us | _spin_unlock_irq(); 0) 0.616 us | post_schedule_rt(); 0) 3.882 us | } Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 28 +++++++++++--- kernel/trace/trace.h | 4 +- kernel/trace/trace_functions_graph.c | 72 +++++++++++++++++++++++++++++++++++- kernel/trace/trace_mmiotrace.c | 2 +- 4 files changed, 97 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8b6409a62b54..1ca74c0cee6a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3335,7 +3335,7 @@ static int mark_printk(const char *fmt, ...) 
int ret; va_list args; va_start(args, fmt); - ret = trace_vprintk(0, fmt, args); + ret = trace_vprintk(0, -1, fmt, args); va_end(args); return ret; } @@ -3564,9 +3564,16 @@ static __init int tracer_init_debugfs(void) return 0; } -int trace_vprintk(unsigned long ip, const char *fmt, va_list args) +int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) { - static DEFINE_SPINLOCK(trace_buf_lock); + /* + * Raw Spinlock because a normal spinlock would be traced here + * and append an irrelevant couple spin_lock_irqsave/ + * spin_unlock_irqrestore traced by ftrace around this + * TRACE_PRINTK trace. + */ + static raw_spinlock_t trace_buf_lock = + (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; static char trace_buf[TRACE_BUF_SIZE]; struct ring_buffer_event *event; @@ -3587,7 +3594,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) if (unlikely(atomic_read(&data->disabled))) goto out; - spin_lock_irqsave(&trace_buf_lock, flags); + local_irq_save(flags); + __raw_spin_lock(&trace_buf_lock); len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); len = min(len, TRACE_BUF_SIZE-1); @@ -3601,13 +3609,15 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) tracing_generic_entry_update(&entry->ent, flags, pc); entry->ent.type = TRACE_PRINT; entry->ip = ip; + entry->depth = depth; memcpy(&entry->buf, trace_buf, len); entry->buf[len] = 0; ring_buffer_unlock_commit(tr->buffer, event, irq_flags); out_unlock: - spin_unlock_irqrestore(&trace_buf_lock, flags); + __raw_spin_unlock(&trace_buf_lock); + local_irq_restore(flags); out: preempt_enable_notrace(); @@ -3625,7 +3635,13 @@ int __ftrace_printk(unsigned long ip, const char *fmt, ...) return 0; va_start(ap, fmt); - ret = trace_vprintk(ip, fmt, ap); + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + ret = trace_vprintk(ip, current->curr_ret_stack, fmt, ap); +#else + ret = trace_vprintk(ip, -1, fmt, ap); +#endif + va_end(ap); return ret; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 0565ae9a2210..fce98898205a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -117,6 +117,7 @@ struct userstack_entry { struct print_entry { struct trace_entry ent; unsigned long ip; + int depth; char buf[]; }; @@ -498,7 +499,8 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt); extern long ns2usecs(cycle_t nsec); -extern int trace_vprintk(unsigned long ip, const char *fmt, va_list args); +extern int +trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args); extern unsigned long trace_flags; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index c66578f2fdc2..32b7fb9a19df 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -173,7 +173,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu) */ ret = trace_seq_printf(s, - "\n ------------------------------------------\n |"); + " ------------------------------------------\n"); if (!ret) TRACE_TYPE_PARTIAL_LINE; @@ -477,6 +477,71 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, return TRACE_TYPE_HANDLED; } +static enum print_line_t +print_graph_comment(struct print_entry *trace, struct trace_seq *s, + struct trace_entry *ent, struct trace_iterator *iter) +{ + int i; + int ret; + + /* Pid */ + if (verif_pid(s, ent->pid, iter->cpu) == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + + /* Cpu */ + if (tracer_flags.val & 
TRACE_GRAPH_PRINT_CPU) { + ret = print_graph_cpu(s, iter->cpu); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* Proc */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { + ret = print_graph_proc(s, ent->pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, " | "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* No overhead */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* No time */ + ret = trace_seq_printf(s, " | "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Indentation */ + if (trace->depth > 0) + for (i = 0; i < (trace->depth + 1) * TRACE_GRAPH_INDENT; i++) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* The comment */ + ret = trace_seq_printf(s, "/* %s", trace->buf); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + if (ent->flags & TRACE_FLAG_CONT) + trace_seq_print_cont(s, iter); + + ret = trace_seq_printf(s, " */\n"); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + + enum print_line_t print_graph_function(struct trace_iterator *iter) { @@ -495,6 +560,11 @@ print_graph_function(struct trace_iterator *iter) trace_assign_type(field, entry); return print_graph_return(&field->ret, s, entry, iter->cpu); } + case TRACE_PRINT: { + struct print_entry *field; + trace_assign_type(field, entry); + return print_graph_comment(field, s, entry, iter); + } default: return TRACE_TYPE_UNHANDLED; } diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 2a98a206acc2..2fb6da6523b3 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -366,5 +366,5 @@ void mmio_trace_mapping(struct mmiotrace_map *map) int mmio_trace_printk(const char *fmt, va_list args) { - return trace_vprintk(0, fmt, args); + return trace_vprintk(0, -1, fmt, args); } -- cgit v1.2.3 From 50c396d38c1a7f0c693579ec88cb4be3c0b0645e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 30 Nov 2008 01:47:12 -0500 Subject: [PATCH] kill obsolete temporary comment in swsusp_close() it had been put there to mark the call of blkdev_put() that needed the proper argument propagated to it; a later patch in the same series had done just that. Signed-off-by: Al Viro --- kernel/power/swap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index b7713b53d07a..6da14358537c 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -633,7 +633,7 @@ void swsusp_close(fmode_t mode) return; } - blkdev_put(resume_bdev, mode); /* move up */ + blkdev_put(resume_bdev, mode); } static int swsusp_header_init(void) -- cgit v1.2.3 From ff32504fdc56407654584ef187b20022c94a3486 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 4 Dec 2008 23:47:35 +0100 Subject: tracing/ftrace: don't insert TRACE_PRINT during selftests Impact: fix false results in tracer selftests After setting an ftrace_printk somewhere in the kernel, I saw the Function tracer selftest failing. When a selftest occurs, the ring buffer is inspected to see if some entries were inserted. But a concurrent insertion such as ftrace_printk could occur at the same time and give false positive or negative results. This patch prevents TRACE_PRINT entries from being inserted during selftests. 
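The shape of the fix is easy to model in userspace: raise a flag around the selftest and have the printk-style path refuse to insert entries while it is set (the names below are illustrative, not the kernel's):

    #include <stdio.h>

    static int selftest_running; /* modeled on tracing_selftest_running */

    /* Stand-in for trace_vprintk(): drop entries while a selftest
     * is busy counting what is in the ring buffer. */
    static int emit_trace_entry(const char *msg)
    {
            if (selftest_running)
                    return 0;
            printf("entry: %s\n", msg);
            return 1;
    }

    int main(void)
    {
            emit_trace_entry("before selftest"); /* recorded */

            selftest_running = 1;
            emit_trace_entry("during selftest"); /* silently dropped */
            selftest_running = 0;

            emit_trace_entry("after selftest"); /* recorded again */
            return 0;
    }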
Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ea38652d631c..5dca6ef1fbeb 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -44,6 +44,14 @@ unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; unsigned long __read_mostly tracing_thresh; +/* We need to change this state when a selftest is running. + * A selftest will lurk into the ring-buffer to count the + * entries inserted during the selftest although some concurrent + * insertions into the ring-buffer such as ftrace_printk could occurred + * at the same time, giving false positive or negative results. + */ +static atomic_t tracing_selftest_running = ATOMIC_INIT(0); + /* For tracers that don't implement custom flags */ static struct tracer_opt dummy_tracer_opt[] = { { } @@ -589,6 +597,8 @@ int register_tracer(struct tracer *type) struct tracer *saved_tracer = current_trace; struct trace_array *tr = &global_trace; int i; + + atomic_set(&tracing_selftest_running, 1); /* * Run a selftest on this tracer. * Here we reset the trace buffer, and set the current @@ -603,6 +613,7 @@ int register_tracer(struct tracer *type) /* the test is responsible for initializing and enabling */ pr_info("Testing tracer %s: ", type->name); ret = type->selftest(type, tr); + atomic_set(&tracing_selftest_running, 0); /* the test is responsible for resetting too */ current_trace = saved_tracer; if (ret) { @@ -3594,7 +3605,7 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) unsigned long flags, irq_flags; int cpu, len = 0, size, pc; - if (tracing_disabled) + if (tracing_disabled || atomic_read(&tracing_selftest_running)) return 0; pc = preempt_count(); -- cgit v1.2.3 From 77d683f3e0258d522c5506e7b5fd05c9411184d9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 4 Dec 2008 23:49:47 +0100 Subject: tracing/ftrace: fix the check of ftrace_trace_task Impact: fix default empty traces on function-graph-tracer The current ftrace_trace_task() checks whether ftrace_pid_trace is allocated and returns 1 if it is. If it is NULL, it checks the pid tracing flag bit for the current task (which is not set by default). So by default, a task is not traced. Actually, all tasks should be traced by default, and filtered by pid only when ftrace_pid_trace is allocated. The appropriate condition is to return 1 when no pid filter is allocated. Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index a71bbe0a3631..5ac697065a48 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -551,7 +551,7 @@ extern struct pid *ftrace_pid_trace; static inline int ftrace_trace_task(struct task_struct *task) { - if (ftrace_pid_trace) + if (!ftrace_pid_trace) return 1; return test_tsk_trace_trace(task); -- cgit v1.2.3 From 21a8c466f99063eeb8567318b4e305eda9015408 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 4 Dec 2008 23:51:23 +0100 Subject: tracing/ftrace: provide the macro task_curr_ret_stack() Impact: cleanup As suggested by Steven Rostedt, this patch provides a new macro task_curr_ret_stack() to move the cpp CONFIG conditional into the linux/ftrace.h headers. 
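The idiom itself is worth seeing in miniature: one #ifdef in the header provides both variants of the accessor, and every caller stays free of conditionals. In this sketch, CONFIG_FUNCTION_GRAPH_TRACER is just a plain -D compile-time switch standing in for the real kconfig machinery:

    #include <stdio.h>

    struct task {
            int curr_ret_stack; /* only meaningful with the graph tracer */
    };

    #ifdef CONFIG_FUNCTION_GRAPH_TRACER
    static inline int task_curr_ret_stack(const struct task *t)
    {
            return t->curr_ret_stack;
    }
    #else
    /* Without the tracer, report "no depth" so callers need no #ifdef. */
    static inline int task_curr_ret_stack(const struct task *t)
    {
            (void)t;
            return -1;
    }
    #endif

    int main(void)
    {
            struct task t = { .curr_ret_stack = 3 };

            /* Identical call site in both configurations. */
            printf("depth: %d\n", task_curr_ret_stack(&t));
            return 0;
    }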
Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 11 +++++++++++ kernel/trace/trace.c | 8 +------- 2 files changed, 12 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index b295d3106bfe..b9b4d0a22d10 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -8,6 +8,7 @@ #include #include #include +#include #ifdef CONFIG_FUNCTION_TRACER @@ -387,9 +388,19 @@ extern void unregister_ftrace_graph(void); extern void ftrace_graph_init_task(struct task_struct *t); extern void ftrace_graph_exit_task(struct task_struct *t); + +static inline int task_curr_ret_stack(struct task_struct *t) +{ + return t->curr_ret_stack; +} #else static inline void ftrace_graph_init_task(struct task_struct *t) { } static inline void ftrace_graph_exit_task(struct task_struct *t) { } + +static inline int task_curr_ret_stack(struct task_struct *tsk) +{ + return -1; +} #endif #ifdef CONFIG_TRACING diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5dca6ef1fbeb..7a93c663e52a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3657,13 +3657,7 @@ int __ftrace_printk(unsigned long ip, const char *fmt, ...) return 0; va_start(ap, fmt); - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - ret = trace_vprintk(ip, current->curr_ret_stack, fmt, ap); -#else - ret = trace_vprintk(ip, -1, fmt, ap); -#endif - + ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); va_end(ap); return ret; } -- cgit v1.2.3 From 21bbecdaaef3a6acc19905ab88c0587817318870 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 4 Dec 2008 23:30:56 -0500 Subject: ftrace: use init_struct_pid as swapper pid Impact: clean up Using (struct pid *)-1 as the pointer for ftrace_swapper_pid is a little confusing for others. This patch uses the address of the actual init pid structure instead. This change is only for clarity. It does not affect the code itself. Hopefully soon the swapper tasks will all have their own pid structure and then we can clean up the code a bit more. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d2b156538162..2971fe48f55e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -49,7 +49,7 @@ static int last_ftrace_enabled; /* set when tracing only a pid */ struct pid *ftrace_pid_trace; -static struct pid * const ftrace_swapper_pid = (struct pid *)1; +static struct pid * const ftrace_swapper_pid = &init_struct_pid; /* Quick disabling of function tracer. */ int function_trace_stop; -- cgit v1.2.3 From c37bbb0fdcc01610fd55604eb6927210a1d20044 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Wed, 3 Dec 2008 13:17:06 -0600 Subject: user namespaces: let user_ns be cloned with fairsched (These two patches are in the next-unacked branch of git://git.kernel.org/pub/scm/linux/kernel/git/sergeh/userns-2.6. If they get some ACKs, then I hope to feed this into security-next. After these two, I think we're ready to tackle userns+capabilities) Fairsched creates a per-uid directory under /sys/kernel/uids/. So when you clone(CLONE_NEWUSER), it tries to create /sys/kernel/uids/0, which already exists, and you get back -ENOMEM. This was supposed to be fixed by sysfs tagging, but that was postponed (ok, rejected until sysfs locking is fixed). 
So, just as with network namespaces, we just don't create those directories for user namespaces other than the init. Signed-off-by: Serge E. Hallyn Signed-off-by: James Morris --- kernel/user.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index 97202cb29adc..6c924bc48c08 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -246,6 +246,8 @@ static int uids_user_create(struct user_struct *up) int error; memset(kobj, 0, sizeof(struct kobject)); + if (up->user_ns != &init_user_ns) + return 0; kobj->kset = uids_kset; error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid); if (error) { @@ -281,6 +283,8 @@ static void remove_user_sysfs_dir(struct work_struct *w) unsigned long flags; int remove_user = 0; + if (up->user_ns != &init_user_ns) + return; /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del() * atomic. */ -- cgit v1.2.3 From 7657d90497f98426af17f0ac633a9b335bb7a8fb Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Wed, 3 Dec 2008 13:17:33 -0600 Subject: user namespaces: require cap_set{ug}id for CLONE_NEWUSER While ideally CLONE_NEWUSER will eventually require no privilege, the required permission checks are currently not there. As a result, CLONE_NEWUSER has the same effect as a setuid(0)+setgroups(1,"0"). While we already require CAP_SYS_ADMIN, requiring CAP_SETUID and CAP_SETGID seems appropriate. Signed-off-by: Serge E. Hallyn Acked-by: "Eric W. Biederman" Signed-off-by: James Morris --- kernel/fork.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 1dd89451fae4..e3a85b33107e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1344,7 +1344,8 @@ long do_fork(unsigned long clone_flags, /* hopefully this check will go away when userns support is * complete */ - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) || + !capable(CAP_SETGID)) return -EPERM; } -- cgit v1.2.3 From decbec3838d10ecd7aabdb4c0e05aac0e5f5dc0c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 8 Dec 2008 01:56:06 +0100 Subject: tracing/function-graph-tracer: implement a print_headers function Impact: provide trace headers to explain a bit the output This patch implements the print_headers callback for the function graph tracer. These headers are output according to the current trace options. 
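The header printing is a straightforward walk over the option bits, one conditional column at a time. A userspace approximation of the first header line (the flag values and column text here are invented for the example):

    #include <stdio.h>

    #define OPT_CPU      (1 << 0)
    #define OPT_PROC     (1 << 1)
    #define OPT_OVERHEAD (1 << 2)

    /* Emit a column heading only for the options that are active,
     * mirroring the shape of print_graph_headers() below. */
    static void print_headers(unsigned long flags)
    {
            printf("# ");
            if (flags & OPT_CPU)
                    printf("CPU ");
            if (flags & OPT_PROC)
                    printf("TASK/PID ");
            if (flags & OPT_OVERHEAD)
                    printf("OVERHEAD/");
            printf("DURATION  FUNCTION CALLS\n");
    }

    int main(void)
    {
            print_headers(OPT_CPU | OPT_OVERHEAD);
            print_headers(OPT_CPU | OPT_PROC | OPT_OVERHEAD);
            return 0;
    }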
Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_functions_graph.c | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 32b7fb9a19df..af60eef4cbcc 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -570,11 +570,36 @@ print_graph_function(struct trace_iterator *iter) } } +static void print_graph_headers(struct seq_file *s) +{ + /* 1st line */ + seq_printf(s, "# "); + if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) + seq_printf(s, "CPU "); + if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) + seq_printf(s, "TASK/PID "); + if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) + seq_printf(s, "OVERHEAD/"); + seq_printf(s, "DURATION FUNCTION CALLS\n"); + + /* 2nd line */ + seq_printf(s, "# "); + if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) + seq_printf(s, "| "); + if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) + seq_printf(s, "| | "); + if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { + seq_printf(s, "| "); + seq_printf(s, "| | | | |\n"); + } else + seq_printf(s, " | | | | |\n"); +} static struct tracer graph_trace __read_mostly = { - .name = "function_graph", - .init = graph_trace_init, - .reset = graph_trace_reset, - .print_line = print_graph_function, + .name = "function_graph", + .init = graph_trace_init, + .reset = graph_trace_reset, + .print_line = print_graph_function, + .print_header = print_graph_headers, .flags = &tracer_flags, }; -- cgit v1.2.3 From 5436499e6098759c2340f8b906ea52f993dc4efb Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Sun, 7 Dec 2008 18:47:37 -0800 Subject: sched: fix sd_parent_degenerate on non-numa smp machine Impact: optimize the sched domains tree some more The addition of the SD_SERIALIZE flag to SD_NODE_INIT prevented the top-level dummy numa sched_domain from being properly degenerated on a non-numa smp machine. The reason is that in sd_parent_degenerate(), it found that the child and parent do not have common sched_domain flags due to SD_SERIALIZE. However, for a non-numa smp box, the top level is a dummy with a single sched_group. Filter out SD_SERIALIZE on a non-numa machine to properly degenerate the top-level node sched_domain. This will cut back some of the sd domain walks in the load balancer code. Signed-off-by: Ken Chen Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 152828239ef0..74498c840f93 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6768,6 +6768,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) SD_BALANCE_EXEC | SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES); + if (nr_node_ids == 1) + pflags &= ~SD_SERIALIZE; } if (~cflags & pflags) return 0; -- cgit v1.2.3 From 8b96f0119818964e4944fd1c423bf6770027d3ac Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 6 Dec 2008 03:40:00 +0100 Subject: tracing/function-graph-tracer: introduce __notrace_funcgraph to filter special functions Impact: trace more functions When the function graph tracer is configured, three more files are excluded from tracing, just to keep only four functions from being traced. And this impacts the normal function tracer too. arch/x86/kernel/process_64/32.c: I had crashes when I let these files be traced. 
After some debugging, I saw that the "current" task pointer was changed inside __switch_to(), i.e. "write_pda(pcurrent, next_p);" inside process_64.c. Since the tracer stores the original return address of the function inside current, we had crashes. Only __switch_to() has to be excluded from tracing. kernel/module.c and kernel/extable.c: Because of a function used internally by the function graph tracer: __kernel_text_address(). To let the other functions inside these files be traced, this patch introduces the __notrace_funcgraph function prefix, which expands to notrace if the function graph tracer is configured and to nothing if not. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 6 ------ arch/x86/kernel/process_32.c | 4 +++- arch/x86/kernel/process_64.c | 4 +++- include/linux/ftrace.h | 11 +++++++++++ kernel/Makefile | 4 ---- kernel/extable.c | 5 +++-- kernel/module.c | 2 +- 7 files changed, 21 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index a3049da61985..1cad9318d217 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -14,12 +14,6 @@ CFLAGS_REMOVE_paravirt-spinlocks.o = -pg CFLAGS_REMOVE_ftrace.o = -pg endif -ifdef CONFIG_FUNCTION_GRAPH_TRACER -# Don't trace __switch_to() but let it for function tracer -CFLAGS_REMOVE_process_32.o = -pg -CFLAGS_REMOVE_process_64.o = -pg -endif - # # vsyscalls (which work on the user stack) should have # no stack-protector checks: diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 0a1302fe6d45..24c2276aa453 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -548,7 +549,8 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, * the task-switch, and shows up in ret_from_fork in entry.S, * for example. */ -struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) +__notrace_funcgraph struct task_struct * +__switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread, *next = &next_p->thread; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index c958120fb1b6..fbb321d53d34 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -551,8 +552,9 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, * - could test fs/gs bitsliced * * Kprobes not supported here. Set the probe on schedule instead. + * Function graph tracer not supported too. */ -struct task_struct * +__notrace_funcgraph struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread; diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index b9b4d0a22d10..449fa8e9e34f 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -369,6 +369,14 @@ struct ftrace_graph_ret { }; #ifdef CONFIG_FUNCTION_GRAPH_TRACER + +/* + * Sometimes we don't want to trace a function with the function + * graph tracer but we want them to keep traced by the usual function + * tracer if the function graph tracer is not configured.
+ */ +#define __notrace_funcgraph notrace + #define FTRACE_RETFUNC_DEPTH 50 #define FTRACE_RETSTACK_ALLOC_SIZE 32 /* Type of the callback handlers for tracing function graph*/ @@ -394,6 +402,9 @@ static inline int task_curr_ret_stack(struct task_struct *t) return t->curr_ret_stack; } #else + +#define __notrace_funcgraph + static inline void ftrace_graph_init_task(struct task_struct *t) { } static inline void ftrace_graph_exit_task(struct task_struct *t) { } diff --git a/kernel/Makefile b/kernel/Makefile index 703cf3b7389c..19fad003b19d 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -21,10 +21,6 @@ CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_sched_clock.o = -pg CFLAGS_REMOVE_sched.o = -pg endif -ifdef CONFIG_FUNCTION_GRAPH_TRACER -CFLAGS_REMOVE_extable.o = -pg # For __kernel_text_address() -CFLAGS_REMOVE_module.o = -pg # For __module_text_address() -endif obj-$(CONFIG_FREEZER) += freezer.o obj-$(CONFIG_PROFILING) += profile.o diff --git a/kernel/extable.c b/kernel/extable.c index a26cb2e17023..feb0317cf09a 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -17,6 +17,7 @@ */ #include #include +#include #include #include @@ -40,7 +41,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr) return e; } -int core_kernel_text(unsigned long addr) +__notrace_funcgraph int core_kernel_text(unsigned long addr) { if (addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) @@ -53,7 +54,7 @@ int core_kernel_text(unsigned long addr) return 0; } -int __kernel_text_address(unsigned long addr) +__notrace_funcgraph int __kernel_text_address(unsigned long addr) { if (core_kernel_text(addr)) return 1; diff --git a/kernel/module.c b/kernel/module.c index 89bcf7c1327d..dd2a54155b54 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2704,7 +2704,7 @@ int is_module_address(unsigned long addr) /* Is this a valid kernel address? */ -struct module *__module_text_address(unsigned long addr) +__notrace_funcgraph struct module *__module_text_address(unsigned long addr) { struct module *mod; -- cgit v1.2.3 From 8e1b82e0866befaa0b2920be296c6e4c3fc7f422 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 6 Dec 2008 03:41:33 +0100 Subject: tracing/function-graph-tracer: turn tracing_selftest_running into an int Impact: cleanup Apply some suggestions of Steven Rostedt: _turn tracing_selftest_running into a simple int (no need of an atomic_t) _set it __read_mostly _fix a comment style Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7a93c663e52a..33549537f30f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -44,13 +44,14 @@ unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; unsigned long __read_mostly tracing_thresh; -/* We need to change this state when a selftest is running. +/* + * We need to change this state when a selftest is running. * A selftest will lurk into the ring-buffer to count the * entries inserted during the selftest although some concurrent * insertions into the ring-buffer such as ftrace_printk could occurred * at the same time, giving false positive or negative results. 
*/ -static atomic_t tracing_selftest_running = ATOMIC_INIT(0); +static bool __read_mostly tracing_selftest_running; /* For tracers that don't implement custom flags */ static struct tracer_opt dummy_tracer_opt[] = { @@ -574,6 +575,8 @@ int register_tracer(struct tracer *type) unlock_kernel(); mutex_lock(&trace_types_lock); + tracing_selftest_running = true; + for (t = trace_types; t; t = t->next) { if (strcmp(type->name, t->name) == 0) { /* already found */ @@ -598,7 +601,6 @@ int register_tracer(struct tracer *type) struct trace_array *tr = &global_trace; int i; - atomic_set(&tracing_selftest_running, 1); /* * Run a selftest on this tracer. * Here we reset the trace buffer, and set the current @@ -613,7 +615,6 @@ int register_tracer(struct tracer *type) /* the test is responsible for initializing and enabling */ pr_info("Testing tracer %s: ", type->name); ret = type->selftest(type, tr); - atomic_set(&tracing_selftest_running, 0); /* the test is responsible for resetting too */ current_trace = saved_tracer; if (ret) { @@ -635,6 +636,7 @@ int register_tracer(struct tracer *type) max_tracer_type_len = len; out: + tracing_selftest_running = false; mutex_unlock(&trace_types_lock); lock_kernel(); @@ -3605,7 +3607,7 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) unsigned long flags, irq_flags; int cpu, len = 0, size, pc; - if (tracing_disabled || atomic_read(&tracing_selftest_running)) + if (tracing_disabled || tracing_selftest_running) return 0; pc = preempt_count(); -- cgit v1.2.3 From 380c4b1411ccd6885f92b2c8ceb08433a720f44e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 6 Dec 2008 03:43:41 +0100 Subject: tracing/function-graph-tracer: append the tracing_graph_flag Impact: Provide a way to pause the function graph tracer As suggested by Steven Rostedt, the previous patch, which prevented spinlock functions from being traced, shouldn't have used a raw_spinlock to do it. It's much better to stay covered by lockdep with a normal spinlock, so this patch adds a new flag for each task that allows the function graph tracer to be paused. We can also send an ftrace_printk without worrying about the irrelevant spinlock being traced during insertion.
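As an illustrative sketch (mirroring the trace_vprintk() change in this very patch, not a new API), a code path that must not have its own locking appear in the graph trace brackets the critical section like this:

	pause_graph_tracing();		/* atomic_inc(&current->tracing_graph_pause) */
	spin_lock_irqsave(&trace_buf_lock, irq_flags);
	/* ... fill the trace buffer; the lock calls above are skipped by the graph tracer ... */
	spin_unlock_irqrestore(&trace_buf_lock, irq_flags);
	unpause_graph_tracing();	/* atomic_dec(&current->tracing_graph_pause) */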
Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 5 ++++- include/linux/ftrace.h | 13 +++++++++++++ include/linux/sched.h | 2 ++ kernel/trace/ftrace.c | 2 ++ kernel/trace/trace.c | 18 +++++------------- 5 files changed, 26 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index f98c4076a170..1b43086b097a 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -476,7 +476,10 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) &return_to_handler; /* Nmi's are currently unsupported */ - if (atomic_read(&in_nmi)) + if (unlikely(atomic_read(&in_nmi))) + return; + + if (unlikely(atomic_read(¤t->tracing_graph_pause))) return; /* diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 449fa8e9e34f..11cac81eed08 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -401,6 +401,16 @@ static inline int task_curr_ret_stack(struct task_struct *t) { return t->curr_ret_stack; } + +static inline void pause_graph_tracing(void) +{ + atomic_inc(¤t->tracing_graph_pause); +} + +static inline void unpause_graph_tracing(void) +{ + atomic_dec(¤t->tracing_graph_pause); +} #else #define __notrace_funcgraph @@ -412,6 +422,9 @@ static inline int task_curr_ret_stack(struct task_struct *tsk) { return -1; } + +static inline void pause_graph_tracing(void) { } +static inline void unpause_graph_tracing(void) { } #endif #ifdef CONFIG_TRACING diff --git a/include/linux/sched.h b/include/linux/sched.h index 4c152e0acc9e..4b81fc5f7731 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1379,6 +1379,8 @@ struct task_struct { * because of depth overrun. */ atomic_t trace_overrun; + /* Pause for the tracing */ + atomic_t tracing_graph_pause; #endif #ifdef CONFIG_TRACING /* state flags for use by tracers */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2971fe48f55e..a12f80efceaa 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1998,6 +1998,7 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) /* Make sure IRQs see the -1 first: */ barrier(); t->ret_stack = ret_stack_list[start++]; + atomic_set(&t->tracing_graph_pause, 0); atomic_set(&t->trace_overrun, 0); } } while_each_thread(g, t); @@ -2077,6 +2078,7 @@ void ftrace_graph_init_task(struct task_struct *t) if (!t->ret_stack) return; t->curr_ret_stack = -1; + atomic_set(&t->tracing_graph_pause, 0); atomic_set(&t->trace_overrun, 0); } else t->ret_stack = NULL; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 33549537f30f..0b8659bd5ad2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3590,14 +3590,7 @@ static __init int tracer_init_debugfs(void) int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) { - /* - * Raw Spinlock because a normal spinlock would be traced here - * and append an irrelevant couple spin_lock_irqsave/ - * spin_unlock_irqrestore traced by ftrace around this - * TRACE_PRINTK trace. 
- */ - static raw_spinlock_t trace_buf_lock = - (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + static DEFINE_SPINLOCK(trace_buf_lock); static char trace_buf[TRACE_BUF_SIZE]; struct ring_buffer_event *event; @@ -3618,8 +3611,8 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) if (unlikely(atomic_read(&data->disabled))) goto out; - local_irq_save(flags); - __raw_spin_lock(&trace_buf_lock); + pause_graph_tracing(); + spin_lock_irqsave(&trace_buf_lock, irq_flags); len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); len = min(len, TRACE_BUF_SIZE-1); @@ -3640,9 +3633,8 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) ring_buffer_unlock_commit(tr->buffer, event, irq_flags); out_unlock: - __raw_spin_unlock(&trace_buf_lock); - local_irq_restore(flags); - + spin_unlock_irqrestore(&trace_buf_lock, irq_flags); + unpause_graph_tracing(); out: preempt_enable_notrace(); -- cgit v1.2.3 From efbe027e95dc13ac343b6130948418d7ead7ddf1 Mon Sep 17 00:00:00 2001 From: Vaidyanathan Srinivasan Date: Mon, 8 Dec 2008 20:52:49 +0530 Subject: sched: idle_balance() does not call load_balance_newidle() Impact: fix SD_BALANCE_NEWIDLE and broaden its use load_balance_newidle() does not get called if SD_BALANCE_NEWIDLE is set at a higher-level domain (3-CPU) but not in the low-level domain (2-MC). pulled_task is initialised to -1 and checked for non-zero, which is always true if the lowest-level sched_domain does not have the SD_BALANCE_NEWIDLE flag set. Signed-off-by: Vaidyanathan Srinivasan Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 74498c840f93..bb9c6384d077 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3685,7 +3685,7 @@ out_balanced: static void idle_balance(int this_cpu, struct rq *this_rq) { struct sched_domain *sd; - int pulled_task = -1; + int pulled_task = 0; unsigned long next_balance = jiffies + HZ; cpumask_t tmpmask; -- cgit v1.2.3 From e726f5f91effd8944c76475a2688093a03ba0d10 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Dec 2008 16:55:53 +0100 Subject: tracing/function-graph-tracer: fix 'flags' variable mismatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This warning: kernel/trace/trace.c: In function ‘trace_vprintk’: kernel/trace/trace.c:3626: warning: ‘flags’ may be used uninitialized in this function shows some confusion about irq_flags / flags use here. We already have irq_flags so remove the extra flags variable.
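For illustration, the pattern gcc is warning about here, condensed from trace_vprintk() before this fix, is:

	unsigned long flags, irq_flags;

	spin_lock_irqsave(&trace_buf_lock, irq_flags);
	/* ... */
	tracing_generic_entry_update(&entry->ent, flags, pc);	/* 'flags' is never assigned */

Passing irq_flags instead both silences the warning and records the interrupt state that was actually saved.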
Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0b8659bd5ad2..8ebe0070c47a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3596,9 +3596,9 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) struct ring_buffer_event *event; struct trace_array *tr = &global_trace; struct trace_array_cpu *data; - struct print_entry *entry; - unsigned long flags, irq_flags; int cpu, len = 0, size, pc; + struct print_entry *entry; + unsigned long irq_flags; if (tracing_disabled || tracing_selftest_running) return 0; @@ -3623,7 +3623,7 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) if (!event) goto out_unlock; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, flags, pc); + tracing_generic_entry_update(&entry->ent, irq_flags, pc); entry->ent.type = TRACE_PRINT; entry->ip = ip; entry->depth = depth; -- cgit v1.2.3 From 94d6a5f7341ebaff53d4e41cc81fab37f0d9fbed Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Mon, 8 Dec 2008 15:52:21 -0600 Subject: user namespaces: document CFS behavior Documented the currently bogus state of support for CFS user groups with user namespaces. In particular, all users in a user namespace should be children of the user which created the user namespace. This is yet to be implemented. Signed-off-by: Serge E. Hallyn Acked-by: Dhaval Giani Signed-off-by: Serge E. Hallyn Signed-off-by: James Morris --- Documentation/scheduler/sched-design-CFS.txt | 21 +++++++++++++++++++++ kernel/user.c | 8 +++++++- 2 files changed, 28 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/Documentation/scheduler/sched-design-CFS.txt b/Documentation/scheduler/sched-design-CFS.txt index eb471c7a905e..8398ca4ff4ed 100644 --- a/Documentation/scheduler/sched-design-CFS.txt +++ b/Documentation/scheduler/sched-design-CFS.txt @@ -273,3 +273,24 @@ task groups and modify their CPU share using the "cgroups" pseudo filesystem. # #Launch gmplayer (or your favourite movie player) # echo > multimedia/tasks + +8. Implementation note: user namespaces + +User namespaces are intended to be hierarchical. But they are currently +only partially implemented. Each of those has ramifications for CFS. + +First, since user namespaces are hierarchical, the /sys/kernel/uids +presentation is inadequate. Eventually we will likely want to use sysfs +tagging to provide private views of /sys/kernel/uids within each user +namespace. + +Second, the hierarchical nature is intended to support completely +unprivileged use of user namespaces. So if using user groups, then +we want the users in a user namespace to be children of the user +who created it. + +That is currently unimplemented. So instead, every user in a new +user namespace will receive 1024 shares just like any user in the +initial user namespace. Note that at the moment creation of a new +user namespace requires each of CAP_SYS_ADMIN, CAP_SETUID, and +CAP_SETGID. diff --git a/kernel/user.c b/kernel/user.c index 6c924bc48c08..6608a3d8ca61 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -239,7 +239,13 @@ static struct kobj_type uids_ktype = { .release = uids_release, }; -/* create /sys/kernel/uids//cpu_share file for this user */ +/* + * Create /sys/kernel/uids//cpu_share file for this user + * We do not create this file for users in a user namespace (until + * sysfs tagging is implemented). 
+ * + * See Documentation/scheduler/sched-design-CFS.txt for ramifications. + */ static int uids_user_create(struct user_struct *up) { struct kobject *kobj = &up->kobj; -- cgit v1.2.3 From a3f07114e3359fb98683069ae397220e8992a24a Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Wed, 5 Nov 2008 12:47:09 -0500 Subject: [PATCH] Audit: make audit=0 actually turn off audit Currently audit=0 on the kernel command line does absolutely nothing. Audit always loads and always uses its resources such as creating the kernel netlink socket. This patch causes audit=0 to actually disable audit. Audit will use no resources and starting the userspace auditd daemon will not cause the kernel audit system to activate. Signed-off-by: Eric Paris Signed-off-by: Al Viro --- kernel/audit.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 4414e93d8750..d8646c23b427 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -61,8 +61,11 @@ #include "audit.h" -/* No auditing will take place until audit_initialized != 0. +/* No auditing will take place until audit_initialized == AUDIT_INITIALIZED. * (Initialization happens after skb_init is called.) */ +#define AUDIT_DISABLED -1 +#define AUDIT_UNINITIALIZED 0 +#define AUDIT_INITIALIZED 1 static int audit_initialized; #define AUDIT_OFF 0 @@ -965,6 +968,9 @@ static int __init audit_init(void) { int i; + if (audit_initialized == AUDIT_DISABLED) + return 0; + printk(KERN_INFO "audit: initializing netlink socket (%s)\n", audit_default ? "enabled" : "disabled"); audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0, @@ -976,7 +982,7 @@ static int __init audit_init(void) skb_queue_head_init(&audit_skb_queue); skb_queue_head_init(&audit_skb_hold_queue); - audit_initialized = 1; + audit_initialized = AUDIT_INITIALIZED; audit_enabled = audit_default; audit_ever_enabled |= !!audit_default; @@ -999,13 +1005,21 @@ __initcall(audit_init); static int __init audit_enable(char *str) { audit_default = !!simple_strtol(str, NULL, 0); - printk(KERN_INFO "audit: %s%s\n", - audit_default ? "enabled" : "disabled", - audit_initialized ? "" : " (after initialization)"); - if (audit_initialized) { + if (!audit_default) + audit_initialized = AUDIT_DISABLED; + + printk(KERN_INFO "audit: %s", audit_default ? 
"enabled" : "disabled"); + + if (audit_initialized == AUDIT_INITIALIZED) { audit_enabled = audit_default; audit_ever_enabled |= !!audit_default; + } else if (audit_initialized == AUDIT_UNINITIALIZED) { + printk(" (after initialization)"); + } else { + printk(" (until reboot)"); } + printk("\n"); + return 1; } @@ -1146,7 +1160,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int reserve; unsigned long timeout_start = jiffies; - if (!audit_initialized) + if (audit_initialized != AUDIT_INITIALIZED) return NULL; if (unlikely(audit_filter_type(type))) -- cgit v1.2.3 From a64e64944f4b8ce3288519555dbaa0232414b8ac Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 12 Nov 2008 18:37:41 -0500 Subject: [PATCH] return records for fork() both to child and parent Signed-off-by: Al Viro --- include/linux/audit.h | 2 ++ kernel/auditsc.c | 17 +++++++++++++++++ kernel/fork.c | 1 + 3 files changed, 20 insertions(+) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 6272a395d43c..1b2a6a5c1876 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -391,6 +391,7 @@ extern int audit_classify_arch(int arch); #ifdef CONFIG_AUDITSYSCALL /* These are defined in auditsc.c */ /* Public API */ +extern void audit_finish_fork(struct task_struct *child); extern int audit_alloc(struct task_struct *task); extern void audit_free(struct task_struct *task); extern void audit_syscall_entry(int arch, @@ -504,6 +505,7 @@ static inline int audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) extern int audit_n_rules; extern int audit_signals; #else +#define audit_finish_fork(t) #define audit_alloc(t) ({ 0; }) #define audit_free(t) do { ; } while (0) #define audit_syscall_entry(ta,a,b,c,d,e) do { ; } while (0) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index cf5bc2f5f9c3..de8468050afa 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1548,6 +1548,23 @@ void audit_syscall_entry(int arch, int major, context->ppid = 0; } +void audit_finish_fork(struct task_struct *child) +{ + struct audit_context *ctx = current->audit_context; + struct audit_context *p = child->audit_context; + if (!p || !ctx || !ctx->auditable) + return; + p->arch = ctx->arch; + p->major = ctx->major; + memcpy(p->argv, ctx->argv, sizeof(ctx->argv)); + p->ctime = ctx->ctime; + p->dummy = ctx->dummy; + p->auditable = ctx->auditable; + p->in_syscall = ctx->in_syscall; + p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL); + p->ppid = current->pid; +} + /** * audit_syscall_exit - deallocate audit context after a system call * @tsk: task being audited diff --git a/kernel/fork.c b/kernel/fork.c index 2a372a0e206f..8d6a7dd9282b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1398,6 +1398,7 @@ long do_fork(unsigned long clone_flags, init_completion(&vfork); } + audit_finish_fork(p); tracehook_report_clone(trace, regs, clone_flags, nr, p); /* -- cgit v1.2.3 From 7f0ed77d241b60f70136f15b8eef30a3de1fa249 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 1 Dec 2008 14:16:06 -0800 Subject: [patch 1/1] audit: remove excess kernel-doc Delete excess kernel-doc notation in kernel/auditsc.c: Warning(linux-2.6.27-git10//kernel/auditsc.c:1481): Excess function parameter or struct member 'tsk' description in 'audit_syscall_entry' Warning(linux-2.6.27-git10//kernel/auditsc.c:1564): Excess function parameter or struct member 'tsk' description in 'audit_syscall_exit' Signed-off-by: Randy Dunlap Cc: Al Viro Cc: Eric Paris Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- 
kernel/auditsc.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index de8468050afa..0a13d6895494 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1459,7 +1459,6 @@ void audit_free(struct task_struct *tsk) /** * audit_syscall_entry - fill in an audit record at syscall entry - * @tsk: task being audited * @arch: architecture type * @major: major syscall type (function) * @a1: additional syscall register 1 @@ -1567,7 +1566,6 @@ void audit_finish_fork(struct task_struct *child) /** * audit_syscall_exit - deallocate audit context after a system call - * @tsk: task being audited * @valid: success/failure flag * @return_code: syscall return value * -- cgit v1.2.3 From 48887e63d6e057543067327da6b091297f7fe645 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 6 Dec 2008 01:05:50 -0500 Subject: [PATCH] fix broken timestamps in AVC generated by kernel threads Timestamp in audit_context is valid only if ->in_syscall is set. Signed-off-by: Al Viro --- include/linux/audit.h | 4 ++-- kernel/audit.c | 4 +--- kernel/auditsc.c | 5 ++++- 3 files changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 1b2a6a5c1876..8f0672d13eb1 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -435,7 +435,7 @@ static inline void audit_ptrace(struct task_struct *t) /* Private API (for audit.c only) */ extern unsigned int audit_serial(void); -extern void auditsc_get_stamp(struct audit_context *ctx, +extern int auditsc_get_stamp(struct audit_context *ctx, struct timespec *t, unsigned int *serial); extern int audit_set_loginuid(struct task_struct *task, uid_t loginuid); #define audit_get_loginuid(t) ((t)->loginuid) @@ -518,7 +518,7 @@ extern int audit_signals; #define audit_inode(n,d) do { ; } while (0) #define audit_inode_child(d,i,p) do { ; } while (0) #define audit_core_dumps(i) do { ; } while (0) -#define auditsc_get_stamp(c,t,s) do { BUG(); } while (0) +#define auditsc_get_stamp(c,t,s) (0) #define audit_get_loginuid(t) (-1) #define audit_get_sessionid(t) (-1) #define audit_log_task_context(b) do { ; } while (0) diff --git a/kernel/audit.c b/kernel/audit.c index d8646c23b427..ce6d8ea3131e 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1121,9 +1121,7 @@ unsigned int audit_serial(void) static inline void audit_get_stamp(struct audit_context *ctx, struct timespec *t, unsigned int *serial) { - if (ctx) - auditsc_get_stamp(ctx, t, serial); - else { + if (!ctx || !auditsc_get_stamp(ctx, t, serial)) { *t = CURRENT_TIME; *serial = audit_serial(); } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 0a13d6895494..2a3f0afc4d2a 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1957,15 +1957,18 @@ EXPORT_SYMBOL_GPL(__audit_inode_child); * * Also sets the context as auditable. 
*/ -void auditsc_get_stamp(struct audit_context *ctx, +int auditsc_get_stamp(struct audit_context *ctx, struct timespec *t, unsigned int *serial) { + if (!ctx->in_syscall) + return 0; if (!ctx->serial) ctx->serial = audit_serial(); t->tv_sec = ctx->ctime.tv_sec; t->tv_nsec = ctx->ctime.tv_nsec; *serial = ctx->serial; ctx->auditable = 1; + return 1; } /* global counter which is incremented every time something logs in */ -- cgit v1.2.3 From 9a2bd244e18ffbb96c8b783210fda4eded7c7e6f Mon Sep 17 00:00:00 2001 From: Brian King Date: Tue, 9 Dec 2008 08:47:00 -0600 Subject: sched: CPU remove deadlock fix Impact: fix possible deadlock in CPU hot-remove path This patch fixes a possible deadlock scenario in the CPU remove path. migration_call grabs rq->lock, then wakes up everything on rq->migration_queue with the lock held. Then one of the tasks on the migration queue ends up calling tg_shares_up which then also tries to acquire the same rq->lock. [c000000058eab2e0] c000000000502078 ._spin_lock_irqsave+0x98/0xf0 [c000000058eab370] c00000000008011c .tg_shares_up+0x10c/0x20c [c000000058eab430] c00000000007867c .walk_tg_tree+0xc4/0xfc [c000000058eab4d0] c0000000000840c8 .try_to_wake_up+0xb0/0x3c4 [c000000058eab590] c0000000000799a0 .__wake_up_common+0x6c/0xe0 [c000000058eab640] c00000000007ada4 .complete+0x54/0x80 [c000000058eab6e0] c000000000509fa8 .migration_call+0x5fc/0x6f8 [c000000058eab7c0] c000000000504074 .notifier_call_chain+0x68/0xe0 [c000000058eab860] c000000000506568 ._cpu_down+0x2b0/0x3f4 [c000000058eaba60] c000000000506750 .cpu_down+0xa4/0x108 [c000000058eabb10] c000000000507e54 .store_online+0x44/0xa8 [c000000058eabba0] c000000000396260 .sysdev_store+0x3c/0x50 [c000000058eabc10] c0000000001a39b8 .sysfs_write_file+0x124/0x18c [c000000058eabcd0] c00000000013061c .vfs_write+0xd0/0x1bc [c000000058eabd70] c0000000001308a4 .sys_write+0x68/0x114 [c000000058eabe30] c0000000000086b4 syscall_exit+0x0/0x40 Signed-off-by: Brian King Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index b7480fb5c3dc..e4bb1dd7b308 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6587,7 +6587,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) req = list_entry(rq->migration_queue.next, struct migration_req, list); list_del_init(&req->list); + spin_unlock_irq(&rq->lock); complete(&req->done); + spin_lock_irq(&rq->lock); } spin_unlock_irq(&rq->lock); break; -- cgit v1.2.3 From fbb5b7ae4b442f1923513dc6165a66c7a7f29073 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 9 Dec 2008 13:14:10 -0800 Subject: relayfs: fix infinite loop with splice() Running kmemtraced, which uses splice() on relayfs, causes a hard lock on x86-64 SMP. As described by Tom Zanussi: It looks like you hit the same problem as described here: commit 8191ecd1d14c6914c660dfa007154860a7908857 splice: fix infinite loop in generic_file_splice_read() relay uses the same loop but it never got noticed or fixed. 
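As a condensed sketch of the fix below (the actor call is paraphrased from relay_file_splice_read()): the old loop only stopped on a zero return from the sub-buffer actor if some data had already been spliced or if SPLICE_F_NONBLOCK was set, so a blocking reader that had spliced nothing yet could spin forever. The fix breaks out unconditionally:

	ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret);
	if (ret < 0)
		break;
	else if (!ret) {
		if (flags & SPLICE_F_NONBLOCK)
			ret = -EAGAIN;
		break;	/* always stop on a zero return */
	}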
Cc: Mathieu Desnoyers Tested-by: Pekka Enberg Signed-off-by: Tom Zanussi Signed-off-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/relay.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 32b0befdcb6a..09ac2008f77b 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1317,12 +1317,9 @@ static ssize_t relay_file_splice_read(struct file *in, if (ret < 0) break; else if (!ret) { - if (spliced) - break; - if (flags & SPLICE_F_NONBLOCK) { + if (flags & SPLICE_F_NONBLOCK) ret = -EAGAIN; - break; - } + break; } *ppos += ret; -- cgit v1.2.3 From 9c24624727f6d6c460e45762a408ca5f5b9b8ef2 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 9 Dec 2008 13:14:27 -0800 Subject: KSYM_SYMBOL_LEN fixes Miles Lane tailing /sys files hit a BUG which Pekka Enberg has tracked to my commit 966c8c12dc9e77f931e2281ba25d2f0244b06949 ("sprint_symbol(): use less stack"), which exposed a bug in slub's list_locations() - kallsyms_lookup() writes a 0 to namebuf[KSYM_NAME_LEN-1], but that was beyond the end of page provided. The 100 slop which list_locations() allows at end of page looks roughly enough for all the other stuff it might print after the symbol before it checks again: break out KSYM_SYMBOL_LEN earlier than before. Latencytop and ftrace are using KSYM_NAME_LEN buffers where they need KSYM_SYMBOL_LEN buffers, and vmallocinfo a 2*KSYM_NAME_LEN buffer where it wants a KSYM_SYMBOL_LEN buffer: fix those before anyone copies them. [akpm@linux-foundation.org: ftrace.h needs module.h] Signed-off-by: Hugh Dickins Cc: Christoph Lameter Cc: Miles Lane Acked-by: Pekka Enberg Acked-by: Steven Rostedt Acked-by: Frederic Weisbecker Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 2 +- include/linux/ftrace.h | 3 ++- kernel/latencytop.c | 2 +- mm/slub.c | 2 +- mm/vmalloc.c | 2 +- 5 files changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/fs/proc/base.c b/fs/proc/base.c index 486cf3fe7139..d4677603c889 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -371,7 +371,7 @@ static int lstats_show_proc(struct seq_file *m, void *v) task->latency_record[i].time, task->latency_record[i].max); for (q = 0; q < LT_BACKTRACEDEPTH; q++) { - char sym[KSYM_NAME_LEN]; + char sym[KSYM_SYMBOL_LEN]; char *c; if (!task->latency_record[i].backtrace[q]) break; diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 703eb53cfa2b..9c5bc6be2b09 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #ifdef CONFIG_FUNCTION_TRACER @@ -231,7 +232,7 @@ ftrace_init_module(unsigned long *start, unsigned long *end) { } struct boot_trace { pid_t caller; - char func[KSYM_NAME_LEN]; + char func[KSYM_SYMBOL_LEN]; int result; unsigned long long duration; /* usecs */ ktime_t calltime; diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 5e7b45c56923..449db466bdbc 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -191,7 +191,7 @@ static int lstats_show(struct seq_file *m, void *v) latency_record[i].time, latency_record[i].max); for (q = 0; q < LT_BACKTRACEDEPTH; q++) { - char sym[KSYM_NAME_LEN]; + char sym[KSYM_SYMBOL_LEN]; char *c; if (!latency_record[i].backtrace[q]) break; diff --git a/mm/slub.c b/mm/slub.c index 749588a50a5a..a2cd47d89e0a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3597,7 +3597,7 @@ static int list_locations(struct kmem_cache *s, char *buf, for (i = 0; i < 
t.count; i++) { struct location *l = &t.loc[i]; - if (len > PAGE_SIZE - 100) + if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100) break; len += sprintf(buf + len, "%7ld ", l->count); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index f3f6e0758562..1ddb77ba3995 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1717,7 +1717,7 @@ static int s_show(struct seq_file *m, void *p) v->addr, v->addr + v->size, v->size); if (v->caller) { - char buff[2 * KSYM_NAME_LEN]; + char buff[KSYM_SYMBOL_LEN]; seq_putc(m, ' '); sprint_symbol(buff, (unsigned long)v->caller); -- cgit v1.2.3 From b88ed20594db2c685555b68c52b693b75738b2f5 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 10 Dec 2008 20:48:52 +0000 Subject: fix mapping_writably_mapped() Lee Schermerhorn noticed yesterday that I broke the mapping_writably_mapped test in 2.6.7! Bad bad bug, good good find. The i_mmap_writable count must be incremented for VM_SHARED (just as i_writecount is for VM_DENYWRITE, but while holding the i_mmap_lock) when dup_mmap() copies the vma for fork: it has its own more optimal version of __vma_link_file(), and I missed this out. So the count was later going down to 0 (dangerous) when one end unmapped, then wrapping negative (inefficient) when the other end unmapped. The only impact on x86 would have been that setting a mandatory lock on a file which has at some time been opened O_RDWR and mapped MAP_SHARED (but not necessarily PROT_WRITE) across a fork, might fail with -EAGAIN when it should succeed, or succeed when it should fail. But those architectures which rely on flush_dcache_page() to flush userspace modifications back into the page before the kernel reads it, may in some cases have skipped the flush after such a fork - though any repetitive test will soon wrap the count negative, in which case it will flush_dcache_page() unnecessarily. Fix would be a two-liner, but mapping variable added, and comment moved. Reported-by: Lee Schermerhorn Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds --- kernel/fork.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 8d6a7dd9282b..495da2e9a8b4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -315,17 +315,20 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) file = tmp->vm_file; if (file) { struct inode *inode = file->f_path.dentry->d_inode; + struct address_space *mapping = file->f_mapping; + get_file(file); if (tmp->vm_flags & VM_DENYWRITE) atomic_dec(&inode->i_writecount); - - /* insert tmp into the share list, just after mpnt */ - spin_lock(&file->f_mapping->i_mmap_lock); + spin_lock(&mapping->i_mmap_lock); + if (tmp->vm_flags & VM_SHARED) + mapping->i_mmap_writable++; tmp->vm_truncate_count = mpnt->vm_truncate_count; - flush_dcache_mmap_lock(file->f_mapping); + flush_dcache_mmap_lock(mapping); + /* insert tmp into the share list, just after mpnt */ vma_prio_tree_add(tmp, mpnt); - flush_dcache_mmap_unlock(file->f_mapping); - spin_unlock(&file->f_mapping->i_mmap_lock); + flush_dcache_mmap_unlock(mapping); + spin_unlock(&mapping->i_mmap_lock); } /* -- cgit v1.2.3 From a93751cab71d63126687551823ed3e70cd85854a Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Thu, 11 Dec 2008 13:53:26 +0100 Subject: x86, bts, ftrace: adapt the hw-branch-tracer to the ds.c interface Impact: restructure code, cleanup Remove BTS bits from the hw-branch-tracer (renamed from bts-tracer) and use the ds interface. 
Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 4 +- kernel/trace/Makefile | 2 +- kernel/trace/trace.c | 2 +- kernel/trace/trace.h | 14 +- kernel/trace/trace_bts.c | 276 --------------------------------------- kernel/trace/trace_hw_branches.c | 205 +++++++++++++++++++++++++++++ 6 files changed, 215 insertions(+), 288 deletions(-) delete mode 100644 kernel/trace/trace_bts.c create mode 100644 kernel/trace/trace_hw_branches.c (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index bde6f03512d5..d8bae6f4219e 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -251,9 +251,9 @@ config STACK_TRACER Say N if unsure. -config BTS_TRACER +config HW_BRANCH_TRACER depends on HAVE_HW_BRANCH_TRACER - bool "Trace branches" + bool "Trace hw branches" select TRACING help This tracer records all branches on the system in a circular diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 62dc561b6676..349d5a93653f 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -31,7 +31,7 @@ obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o obj-$(CONFIG_BOOT_TRACER) += trace_boot.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o -obj-$(CONFIG_BTS_TRACER) += trace_bts.o +obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o obj-$(CONFIG_POWER_TRACER) += trace_power.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8ebe0070c47a..639344a4d3a2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2425,7 +2425,7 @@ __tracing_open(struct inode *inode, struct file *file, int *ret) /* Notify the tracer early; before we stop tracing. */ if (iter->trace && iter->trace->open) - iter->trace->open(iter); + iter->trace->open(iter); /* Annotate start of buffers if we had overruns */ if (ring_buffer_overruns(iter->tr->buffer)) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5ac697065a48..f07c246dd73d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -28,7 +28,7 @@ enum trace_type { TRACE_GRAPH_RET, TRACE_GRAPH_ENT, TRACE_USER_STACK, - TRACE_BTS, + TRACE_HW_BRANCHES, TRACE_POWER, __TRACE_LAST_TYPE @@ -159,10 +159,10 @@ struct trace_branch { char correct; }; -struct bts_entry { +struct hw_branch_entry { struct trace_entry ent; - unsigned long from; - unsigned long to; + u64 from; + u64 to; }; struct trace_power { @@ -278,7 +278,7 @@ extern void __ftrace_bad_type(void); TRACE_GRAPH_ENT); \ IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ TRACE_GRAPH_RET); \ - IF_ASSIGN(var, ent, struct bts_entry, TRACE_BTS);\ + IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\ IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \ __ftrace_bad_type(); \ } while (0) @@ -414,9 +414,7 @@ void trace_function(struct trace_array *tr, void trace_graph_return(struct ftrace_graph_ret *trace); int trace_graph_entry(struct ftrace_graph_ent *trace); -void trace_bts(struct trace_array *tr, - unsigned long from, - unsigned long to); +void trace_hw_branch(struct trace_array *tr, u64 from, u64 to); void tracing_start_cmdline_record(void); void tracing_stop_cmdline_record(void); diff --git a/kernel/trace/trace_bts.c b/kernel/trace/trace_bts.c deleted file mode 100644 index 23b76e4690ef..000000000000 --- a/kernel/trace/trace_bts.c +++ /dev/null @@ -1,276 +0,0 @@ -/* - * BTS tracer - * - * Copyright (C) 2008 Markus Metzger - * - */ - -#include -#include -#include -#include -#include - -#include - -#include 
"trace.h" - - -#define SIZEOF_BTS (1 << 13) - -static DEFINE_PER_CPU(struct bts_tracer *, tracer); -static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer); - -#define this_tracer per_cpu(tracer, smp_processor_id()) -#define this_buffer per_cpu(buffer, smp_processor_id()) - - -/* - * Information to interpret a BTS record. - * This will go into an in-kernel BTS interface. - */ -static unsigned char sizeof_field; -static unsigned long debugctl_mask; - -#define sizeof_bts (3 * sizeof_field) - -static void bts_trace_cpuinit(struct cpuinfo_x86 *c) -{ - switch (c->x86) { - case 0x6: - switch (c->x86_model) { - case 0x0 ... 0xC: - break; - case 0xD: - case 0xE: /* Pentium M */ - sizeof_field = sizeof(long); - debugctl_mask = (1<<6)|(1<<7); - break; - default: - sizeof_field = 8; - debugctl_mask = (1<<6)|(1<<7); - break; - } - break; - case 0xF: - switch (c->x86_model) { - case 0x0: - case 0x1: - case 0x2: /* Netburst */ - sizeof_field = sizeof(long); - debugctl_mask = (1<<2)|(1<<3); - break; - default: - /* sorry, don't know about them */ - break; - } - break; - default: - /* sorry, don't know about them */ - break; - } -} - -static inline void bts_enable(void) -{ - unsigned long debugctl; - - rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); - wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl | debugctl_mask); -} - -static inline void bts_disable(void) -{ - unsigned long debugctl; - - rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); - wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl & ~debugctl_mask); -} - -static void bts_trace_reset(struct trace_array *tr) -{ - int cpu; - - tr->time_start = ftrace_now(tr->cpu); - - for_each_online_cpu(cpu) - tracing_reset(tr, cpu); -} - -static void bts_trace_start_cpu(void *arg) -{ - this_tracer = - ds_request_bts(/* task = */ NULL, this_buffer, SIZEOF_BTS, - /* ovfl = */ NULL, /* th = */ (size_t)-1); - if (IS_ERR(this_tracer)) { - this_tracer = NULL; - return; - } - - bts_enable(); -} - -static void bts_trace_start(struct trace_array *tr) -{ - int cpu; - - bts_trace_reset(tr); - - for_each_cpu_mask(cpu, cpu_possible_map) - smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1); -} - -static void bts_trace_stop_cpu(void *arg) -{ - if (this_tracer) { - bts_disable(); - - ds_release_bts(this_tracer); - this_tracer = NULL; - } -} - -static void bts_trace_stop(struct trace_array *tr) -{ - int cpu; - - for_each_cpu_mask(cpu, cpu_possible_map) - smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1); -} - -static int bts_trace_init(struct trace_array *tr) -{ - bts_trace_cpuinit(&boot_cpu_data); - bts_trace_reset(tr); - bts_trace_start(tr); - - return 0; -} - -static void bts_trace_print_header(struct seq_file *m) -{ -#ifdef __i386__ - seq_puts(m, "# CPU# FROM TO FUNCTION\n"); - seq_puts(m, "# | | | |\n"); -#else - seq_puts(m, - "# CPU# FROM TO FUNCTION\n"); - seq_puts(m, - "# | | | |\n"); -#endif -} - -static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) -{ - struct trace_entry *entry = iter->ent; - struct trace_seq *seq = &iter->seq; - struct bts_entry *it; - - trace_assign_type(it, entry); - - if (entry->type == TRACE_BTS) { - int ret; -#ifdef CONFIG_KALLSYMS - char function[KSYM_SYMBOL_LEN]; - sprint_symbol(function, it->from); -#else - char *function = ""; -#endif - - ret = trace_seq_printf(seq, "%4d 0x%lx -> 0x%lx [%s]\n", - entry->cpu, it->from, it->to, function); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE;; - return TRACE_TYPE_HANDLED; - } - return TRACE_TYPE_UNHANDLED; -} - -void trace_bts(struct trace_array *tr, unsigned long from, unsigned long to) -{ - struct 
ring_buffer_event *event; - struct bts_entry *entry; - unsigned long irq; - - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq); - if (!event) - return; - entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, from); - entry->ent.type = TRACE_BTS; - entry->ent.cpu = smp_processor_id(); - entry->from = from; - entry->to = to; - ring_buffer_unlock_commit(tr->buffer, event, irq); -} - -static void trace_bts_at(struct trace_array *tr, size_t index) -{ - const void *raw = NULL; - unsigned long from, to; - int err; - - err = ds_access_bts(this_tracer, index, &raw); - if (err < 0) - return; - - from = *(const unsigned long *)raw; - to = *(const unsigned long *)((const char *)raw + sizeof_field); - - trace_bts(tr, from, to); -} - -static void trace_bts_cpu(void *arg) -{ - struct trace_array *tr = (struct trace_array *) arg; - size_t index = 0, end = 0, i; - int err; - - if (!this_tracer) - return; - - bts_disable(); - - err = ds_get_bts_index(this_tracer, &index); - if (err < 0) - goto out; - - err = ds_get_bts_end(this_tracer, &end); - if (err < 0) - goto out; - - for (i = index; i < end; i++) - trace_bts_at(tr, i); - - for (i = 0; i < index; i++) - trace_bts_at(tr, i); - -out: - bts_enable(); -} - -static void trace_bts_prepare(struct trace_iterator *iter) -{ - int cpu; - - for_each_cpu_mask(cpu, cpu_possible_map) - smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1); -} - -struct tracer bts_tracer __read_mostly = -{ - .name = "bts", - .init = bts_trace_init, - .reset = bts_trace_stop, - .print_header = bts_trace_print_header, - .print_line = bts_trace_print_line, - .start = bts_trace_start, - .stop = bts_trace_stop, - .open = trace_bts_prepare -}; - -__init static int init_bts_trace(void) -{ - return register_tracer(&bts_tracer); -} -device_initcall(init_bts_trace); diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c new file mode 100644 index 000000000000..ee29e012aa97 --- /dev/null +++ b/kernel/trace/trace_hw_branches.c @@ -0,0 +1,205 @@ +/* + * h/w branch tracer for x86 based on bts + * + * Copyright (C) 2008 Markus Metzger + * + */ + +#include +#include +#include +#include +#include + +#include + +#include "trace.h" + + +#define SIZEOF_BTS (1 << 13) + +static DEFINE_PER_CPU(struct bts_tracer *, tracer); +static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer); + +#define this_tracer per_cpu(tracer, smp_processor_id()) +#define this_buffer per_cpu(buffer, smp_processor_id()) + + +static void bts_trace_reset(struct trace_array *tr) +{ + int cpu; + + tr->time_start = ftrace_now(tr->cpu); + + for_each_online_cpu(cpu) + tracing_reset(tr, cpu); +} + +static void bts_trace_start_cpu(void *arg) +{ + if (this_tracer) + ds_release_bts(this_tracer); + + this_tracer = + ds_request_bts(/* task = */ NULL, this_buffer, SIZEOF_BTS, + /* ovfl = */ NULL, /* th = */ (size_t)-1, + BTS_KERNEL); + if (IS_ERR(this_tracer)) { + this_tracer = NULL; + return; + } +} + +static void bts_trace_start(struct trace_array *tr) +{ + int cpu; + + bts_trace_reset(tr); + + for_each_cpu_mask(cpu, cpu_possible_map) + smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1); +} + +static void bts_trace_stop_cpu(void *arg) +{ + if (this_tracer) { + ds_release_bts(this_tracer); + this_tracer = NULL; + } +} + +static void bts_trace_stop(struct trace_array *tr) +{ + int cpu; + + for_each_cpu_mask(cpu, cpu_possible_map) + smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1); +} + +static int bts_trace_init(struct trace_array *tr) +{ + 
bts_trace_reset(tr); + bts_trace_start(tr); + + return 0; +} + +static void bts_trace_print_header(struct seq_file *m) +{ + seq_puts(m, + "# CPU# FROM TO FUNCTION\n"); + seq_puts(m, + "# | | | |\n"); +} + +static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + struct trace_seq *seq = &iter->seq; + struct hw_branch_entry *it; + + trace_assign_type(it, entry); + + if (entry->type == TRACE_HW_BRANCHES) { + if (trace_seq_printf(seq, "%4d ", entry->cpu) && + trace_seq_printf(seq, "0x%016llx -> 0x%016llx ", + it->from, it->to) && + (!it->from || + seq_print_ip_sym(seq, it->from, /* sym_flags = */ 0)) && + trace_seq_printf(seq, "\n")) + return TRACE_TYPE_HANDLED; + return TRACE_TYPE_PARTIAL_LINE;; + } + return TRACE_TYPE_UNHANDLED; +} + +void trace_hw_branch(struct trace_array *tr, u64 from, u64 to) +{ + struct ring_buffer_event *event; + struct hw_branch_entry *entry; + unsigned long irq; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, from); + entry->ent.type = TRACE_HW_BRANCHES; + entry->ent.cpu = smp_processor_id(); + entry->from = from; + entry->to = to; + ring_buffer_unlock_commit(tr->buffer, event, irq); +} + +static void trace_bts_at(struct trace_array *tr, + const struct bts_trace *trace, void *at) +{ + struct bts_struct bts; + int err = 0; + + WARN_ON_ONCE(!trace->read); + if (!trace->read) + return; + + err = trace->read(this_tracer, at, &bts); + if (err < 0) + return; + + switch (bts.qualifier) { + case BTS_BRANCH: + trace_hw_branch(tr, bts.variant.lbr.from, bts.variant.lbr.to); + break; + } +} + +static void trace_bts_cpu(void *arg) +{ + struct trace_array *tr = (struct trace_array *) arg; + const struct bts_trace *trace; + unsigned char *at; + + if (!this_tracer) + return; + + ds_suspend_bts(this_tracer); + trace = ds_read_bts(this_tracer); + if (!trace) + goto out; + + for (at = trace->ds.top; (void *)at < trace->ds.end; + at += trace->ds.size) + trace_bts_at(tr, trace, at); + + for (at = trace->ds.begin; (void *)at < trace->ds.top; + at += trace->ds.size) + trace_bts_at(tr, trace, at); + +out: + ds_resume_bts(this_tracer); +} + +static void trace_bts_prepare(struct trace_iterator *iter) +{ + int cpu; + + for_each_cpu_mask(cpu, cpu_possible_map) + smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1); +} + +struct tracer bts_tracer __read_mostly = +{ + .name = "hw-branch-tracer", + .init = bts_trace_init, + .reset = bts_trace_stop, + .print_header = bts_trace_print_header, + .print_line = bts_trace_print_line, + .start = bts_trace_start, + .stop = bts_trace_stop, + .open = trace_bts_prepare +}; + +__init static int init_bts_trace(void) +{ + return register_tracer(&bts_tracer); +} +device_initcall(init_bts_trace); -- cgit v1.2.3 From f8b755ac8e0cc3f330269e4c4504514f987167a2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 9 Dec 2008 23:55:25 +0100 Subject: tracing/function-graph-tracer: Output arrows signal on hardirq call/return Impact: make the hardirq calls more obvious in the output When a hardirq is triggered inside the code flow, the output now shows two arrows that indicate the entry and return of the hardirq.
0) | bit_waitqueue() { 0) 0.880 us | __phys_addr(); 0) 2.699 us | } 0) | __wake_up_bit() { 0) ==========> | smp_apic_timer_interrupt() { 0) 0.797 us | native_apic_mem_write(); 0) 0.715 us | exit_idle(); 0) | irq_enter() { 0) 0.722 us | idle_cpu(); 0) 5.519 us | } 0) | hrtimer_interrupt() { 0) | ktime_get() { 0) | ktime_get_ts() { 0) 0.805 us | getnstimeofday(); [...] 0) ! 108.528 us | } 0) | irq_exit() { 0) | do_softirq() { 0) | __do_softirq() { 0) 0.895 us | __local_bh_disable(); 0) | run_timer_softirq() { 0) 0.827 us | hrtimer_run_pending(); 0) 1.226 us | _spin_lock_irq(); 0) | _spin_unlock_irq() { 0) 6.550 us | } 0) 0.924 us | _local_bh_enable(); 0) + 12.129 us | } 0) + 13.911 us | } 0) 0.707 us | idle_cpu(); 0) + 17.009 us | } 0) ! 137.419 us | } 0) <========== | 0) 1.045 us | } 0) ! 148.908 us | } 0) ! 151.022 us | } 0) ! 153.022 us | } 0) 0.963 us | journal_mark_dirty(); 0) 0.925 us | __brelse(); Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace_functions_graph.c | 66 +++++++++++++++++++++++++++++++++--- 1 file changed, 62 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index af60eef4cbcc..4bf39fcae97a 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -231,6 +231,49 @@ trace_branch_is_leaf(struct trace_iterator *iter, return true; } +static enum print_line_t +print_graph_irq(struct trace_seq *s, unsigned long addr, + enum trace_type type, int cpu, pid_t pid) +{ + int ret; + + if (addr < (unsigned long)__irqentry_text_start || + addr >= (unsigned long)__irqentry_text_end) + return TRACE_TYPE_UNHANDLED; + + if (type == TRACE_GRAPH_ENT) { + ret = trace_seq_printf(s, "==========> | "); + } else { + /* Cpu */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { + ret = print_graph_cpu(s, cpu); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } + /* Proc */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { + ret = print_graph_proc(s, pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, " | "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* No overhead */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + ret = trace_seq_printf(s, "<========== |\n"); + } + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; +} static enum print_line_t print_graph_duration(unsigned long long duration, struct trace_seq *s) @@ -344,7 +387,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, static enum print_line_t print_graph_entry_nested(struct ftrace_graph_ent_entry *entry, - struct trace_seq *s) + struct trace_seq *s, pid_t pid, int cpu) { int i; int ret; @@ -357,8 +400,18 @@ print_graph_entry_nested(struct ftrace_graph_ent_entry *entry, return TRACE_TYPE_PARTIAL_LINE; } - /* No time */ - ret = trace_seq_printf(s, " | "); + /* Interrupt */ + ret = print_graph_irq(s, call->func, TRACE_GRAPH_ENT, cpu, pid); + if (ret == TRACE_TYPE_UNHANDLED) { + /* No time */ + ret = trace_seq_printf(s, " | "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } else { + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } + /* Function */ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { @@ -410,7 +463,7 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, if (trace_branch_is_leaf(iter, field)) 
return print_graph_entry_leaf(iter, field, s); else - return print_graph_entry_nested(field, s); + return print_graph_entry_nested(field, s, iter->ent->pid, cpu); } @@ -474,6 +527,11 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, if (!ret) return TRACE_TYPE_PARTIAL_LINE; } + + ret = print_graph_irq(s, trace->func, TRACE_GRAPH_RET, cpu, ent->pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; } -- cgit v1.2.3 From cbc34ed1ac36690f75fd272e19e7b4fc29aae5a2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Dec 2008 08:08:22 +0100 Subject: sched: fix tracepoints in scheduler The trace point only caught one of the many places where a task changes cpu; put it in the right place so we get all of them. Change the signature while we're at it. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/trace/sched.h | 4 ++-- kernel/sched.c | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/trace/sched.h b/include/trace/sched.h index 9b2854abf7e2..f4549d506b16 100644 --- a/include/trace/sched.h +++ b/include/trace/sched.h @@ -30,8 +30,8 @@ DECLARE_TRACE(sched_switch, TPARGS(rq, prev, next)); DECLARE_TRACE(sched_migrate_task, - TPPROTO(struct rq *rq, struct task_struct *p, int dest_cpu), - TPARGS(rq, p, dest_cpu)); + TPPROTO(struct task_struct *p, int orig_cpu, int dest_cpu), + TPARGS(p, orig_cpu, dest_cpu)); DECLARE_TRACE(sched_process_free, TPPROTO(struct task_struct *p), diff --git a/kernel/sched.c b/kernel/sched.c index 7729c4bbc8ba..d377097572f9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1851,6 +1851,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) clock_offset = old_rq->clock - new_rq->clock; + trace_sched_migrate_task(p, task_cpu(p), new_cpu); + #ifdef CONFIG_SCHEDSTATS if (p->se.wait_start) p->se.wait_start -= clock_offset; @@ -2868,7 +2870,6 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu) || unlikely(!cpu_active(dest_cpu))) goto out; - trace_sched_migrate_task(rq, p, dest_cpu); /* force the process onto the specified CPU */ if (migrate_task(p, dest_cpu, &req)) { /* Need to wait for migration thread (might exit: take ref). */ -- cgit v1.2.3 From ee79d1bdb6a10499e53f80b1e8d14110215178ba Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 9 Dec 2008 18:49:50 +0100 Subject: sched: let arch_update_cpu_topology indicate if topology changed Change arch_update_cpu_topology so it returns 1 if the cpu topology changed and 0 if it didn't change. This will be useful for the next patch which adds a call to this function in partition_sched_domains.
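As an illustrative sketch only (rebuild_core_maps() is a hypothetical arch helper, not part of this patch), an architecture with dynamic cpu topology would now implement something like:

int arch_update_cpu_topology(void)
{
	/* hypothetical helper: refresh this arch's cpu core maps */
	if (!rebuild_core_maps())
		return 0;	/* topology unchanged */
	return 1;	/* topology changed: sched domains must be rebuilt */
}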
Signed-off-by: Heiko Carstens Signed-off-by: Ingo Molnar --- arch/s390/kernel/topology.c | 5 +++-- include/linux/topology.h | 2 +- kernel/sched.c | 8 +++++++- 3 files changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index a947899dcba1..bf96f1b5c6ec 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -212,7 +212,7 @@ static void update_cpu_core_map(void) cpu_core_map[cpu] = cpu_coregroup_map(cpu); } -void arch_update_cpu_topology(void) +int arch_update_cpu_topology(void) { struct tl_info *info = tl_info; struct sys_device *sysdev; @@ -221,7 +221,7 @@ int arch_update_cpu_topology(void) if (!machine_has_topology) { update_cpu_core_map(); topology_update_polarization_simple(); - return; + return 0; } stsi(info, 15, 1, 2); tl_to_cores(info); @@ -230,6 +230,7 @@ void arch_update_cpu_topology(void) sysdev = get_cpu_sysdev(cpu); kobject_uevent(&sysdev->kobj, KOBJ_CHANGE); } + return 1; } static void topology_work_fn(struct work_struct *work) diff --git a/include/linux/topology.h b/include/linux/topology.h index 117f1b7405cf..0c5b5ac36d8e 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -49,7 +49,7 @@ for_each_online_node(node) \ if (nr_cpus_node(node)) -void arch_update_cpu_topology(void); +int arch_update_cpu_topology(void); /* Conform to ACPI 2.0 SLIT distance definitions */ #define LOCAL_DISTANCE 10 diff --git a/kernel/sched.c b/kernel/sched.c index ef212da928e8..fcfbbd9dbd60 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7675,8 +7675,14 @@ static struct sched_domain_attr *dattr_cur; */ static cpumask_t fallback_doms; -void __attribute__((weak)) arch_update_cpu_topology(void) +/* + * arch_update_cpu_topology lets virtualized architectures update the + * cpu core maps. It is supposed to return 1 if the topology changed + * or 0 if it stayed the same. + */ +int __attribute__((weak)) arch_update_cpu_topology(void) { + return 0; } /* -- cgit v1.2.3 From d65bd5ecb2bd166cea4952a59b7e16cc3ad6ef6c Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 9 Dec 2008 18:49:51 +0100 Subject: sched: add missing arch_update_cpu_topology() call arch_reinit_sched_domains() used to call arch_update_cpu_topology() via arch_init_sched_domains(). This call got lost with e761b7725234276a802322549cee5255305a0930 ("cpu hotplug, sched: Introduce cpu_active_map and redo sched domain managment (take 2)"). So we might end up with outdated and missing cpus in the cpu core maps (architecture used to call arch_reinit_sched_domains if cpu topology changed). This adds a call to arch_update_cpu_topology in partition_sched_domains which gets called whenever scheduling domains get updated, which is what is supposed to happen when cpu topology changes. Signed-off-by: Heiko Carstens Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index fcfbbd9dbd60..ad7b93be5691 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7774,17 +7774,21 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, struct sched_domain_attr *dattr_new) { int i, j, n; + int new_topology; mutex_lock(&sched_domains_mutex); /* always unregister in case we don't destroy any domains */ unregister_sched_domain_sysctl(); + /* Let architecture update cpu core mappings. */ + new_topology = arch_update_cpu_topology(); + n = doms_new ?
ndoms_new : 0; /* Destroy deleted domains */ for (i = 0; i < ndoms_cur; i++) { - for (j = 0; j < n; j++) { + for (j = 0; j < n && !new_topology; j++) { if (cpus_equal(doms_cur[i], doms_new[j]) && dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; @@ -7804,7 +7808,7 @@ match1: /* Build new domains */ for (i = 0; i < ndoms_new; i++) { - for (j = 0; j < ndoms_cur; j++) { + for (j = 0; j < ndoms_cur && !new_topology; j++) { if (cpus_equal(doms_new[i], doms_cur[j]) && dattrs_equal(dattr_new, i, dattr_cur, j)) goto match2; -- cgit v1.2.3 From ca7e716c7833aeaeb8fedd6d004c5f5d5e14d325 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 14 Dec 2008 15:46:01 -0800 Subject: Revert "sched_clock: prevent scd->clock from moving backwards" This reverts commit 5b7dba4ff834259a5623e03a565748704a8fe449, which caused a regression in hibernate, reported by and bisected by Fabio Comolli. This revert fixes http://bugzilla.kernel.org/show_bug.cgi?id=12155 http://bugzilla.kernel.org/show_bug.cgi?id=12149 Bisected-by: Fabio Comolli Requested-by: Rafael J. Wysocki Acked-by: Dave Kleikamp Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/sched_clock.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 81787248b60f..e8ab096ddfe3 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -118,13 +118,13 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now) /* * scd->clock = clamp(scd->tick_gtod + delta, - * max(scd->tick_gtod, scd->clock), - * max(scd->clock, scd->tick_gtod + TICK_NSEC)); + * max(scd->tick_gtod, scd->clock), + * scd->tick_gtod + TICK_NSEC); */ clock = scd->tick_gtod + delta; min_clock = wrap_max(scd->tick_gtod, scd->clock); - max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC); + max_clock = scd->tick_gtod + TICK_NSEC; clock = wrap_max(clock, min_clock); clock = wrap_min(clock, max_clock); -- cgit v1.2.3 From 307257cf475aac25db30b669987f13d90c934e3a Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Mon, 15 Dec 2008 13:54:22 -0800 Subject: cgroups: fix a race between rmdir and remount When a cgroup is removed, it's unlinked from its parent's children list, but not actually freed until the last dentry on it is released (at which point cgrp->root->number_of_cgroups is decremented). Currently rebind_subsystems checks for the top cgroup's child list being empty in order to rebind subsystems into or out of a hierarchy - this can result in the set of subsystems bound to a hierarchy changing while a removed-but-not-freed cgroup still exists. The simplest fix for this is to forbid remounts that change the set of subsystems on a hierarchy that has removed-but-not-freed cgroups. This bug can be reproduced via: mkdir /mnt/cg mount -t cgroup -o ns,freezer cgroup /mnt/cg mkdir /mnt/cg/foo sleep 1h < /mnt/cg/foo & rmdir /mnt/cg/foo mount -t cgroup -o remount,ns,devices,freezer cgroup /mnt/cg kill $! The above will cause an oops in -mm only, not in mainline, but the bug can cause a memory leak in mainline (and even an oops) Signed-off-by: Paul Menage Reviewed-by: Li Zefan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index fe00b3b983a8..8185a0f09594 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -702,7 +702,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, * any child cgroups exist.
This is theoretically supportable * but involves complex error handling, so it's being left until * later */ - if (!list_empty(&cgrp->children)) + if (root->number_of_cgroups > 1) return -EBUSY; /* Process each subsystem */ -- cgit v1.2.3 From 03e89e4574a680af15f59329b061f35d9813aff4 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 16 Dec 2008 08:45:30 +0100 Subject: sched: fix wakeup preemption clock Impact: sharpen the wakeup-granularity to always be against current scheduler time It was possible to do the preemption check against an old time stamp. Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- kernel/sched_fair.c | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index ad7b93be5691..88215066efae 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2266,6 +2266,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) smp_wmb(); rq = task_rq_lock(p, &flags); + update_rq_clock(rq); old_state = p->state; if (!(old_state & state)) goto out; @@ -2323,7 +2324,6 @@ out_activate: schedstat_inc(p, se.nr_wakeups_local); else schedstat_inc(p, se.nr_wakeups_remote); - update_rq_clock(rq); activate_task(rq, p, 1); success = 1; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 98345e45b059..928cd74cff0d 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1345,12 +1345,11 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) { struct task_struct *curr = rq->curr; struct sched_entity *se = &curr->se, *pse = &p->se; + struct cfs_rq *cfs_rq = task_cfs_rq(curr); - if (unlikely(rt_prio(p->prio))) { - struct cfs_rq *cfs_rq = task_cfs_rq(curr); + update_curr(cfs_rq); - update_rq_clock(rq); - update_curr(cfs_rq); + if (unlikely(rt_prio(p->prio))) { resched_task(curr); return; } -- cgit v1.2.3 From 34f28ecd0f4bdc733c681294d02d9fab5880591b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 16 Dec 2008 08:45:31 +0100 Subject: sched: optimize update_curr() Impact: micro-optimization Skip the hard work when there is none. Signed-off-by: Peter Zijlstra Acked-by: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 928cd74cff0d..5ad4440f0fc4 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -492,6 +492,8 @@ static void update_curr(struct cfs_rq *cfs_rq) * overflow on 32 bits): */ delta_exec = (unsigned long)(now - curr->exec_start); + if (!delta_exec) + return; __update_curr(cfs_rq, curr, delta_exec); curr->exec_start = now; -- cgit v1.2.3 From 720f54988e17b33f3f477010b3a68ee872d20d5a Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Mon, 15 Dec 2008 22:02:01 -0800 Subject: sched, cpuacct: refactoring cpuusage_read / cpuusage_write Impact: micro-optimize the code on 64-bit architectures In the thread regarding 'export percpu cpuacct cgroup stats' http://lkml.org/lkml/2008/12/7/13 akpm pointed out that the current cpuacct code is inefficient. This patch refactors the following: * make cpu_rq locking only on 32-bit * change iterator to each_present_cpu instead of each_possible_cpu to make it hotplug friendly. It's a bit of code churn, but I was rewarded with a 160-byte code size saving on x86-64 and zero code size change on i386.
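The locking split that the two helpers introduced below isolate generalizes to any 64-bit counter shared with 32-bit readers. A hedged, generic sketch of the pattern (the counter64 type and its names are illustrative, not kernel API):

    #include <linux/spinlock.h>
    #include <linux/types.h>

    struct counter64 {
        u64 value;
        spinlock_t lock;	/* held by writers; readers need it on 32-bit only */
    };

    static u64 counter64_read(struct counter64 *c)
    {
        u64 v;

    #ifndef CONFIG_64BIT
        /* a 32-bit CPU cannot load a u64 atomically: read under the lock */
        spin_lock_irq(&c->lock);
        v = c->value;
        spin_unlock_irq(&c->lock);
    #else
        /* an aligned 64-bit load is atomic on 64-bit architectures */
        v = c->value;
    #endif
        return v;
    }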
Signed-off-by: Ken Chen Cc: Paul Menage Cc: Li Zefan Signed-off-by: Ingo Molnar --- kernel/sched.c | 56 +++++++++++++++++++++++++++++++++----------------------- 1 file changed, 39 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 88215066efae..41b7e2d524d6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9275,6 +9275,41 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) kfree(ca); } +static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) +{ + u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + u64 data; + +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit read safe on 32-bit platforms. + */ + spin_lock_irq(&cpu_rq(cpu)->lock); + data = *cpuusage; + spin_unlock_irq(&cpu_rq(cpu)->lock); +#else + data = *cpuusage; +#endif + + return data; +} + +static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) +{ + u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit write safe on 32-bit platforms. + */ + spin_lock_irq(&cpu_rq(cpu)->lock); + *cpuusage = val; + spin_unlock_irq(&cpu_rq(cpu)->lock); +#else + *cpuusage = val; +#endif +} + /* return total cpu usage (in nanoseconds) of a group */ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) { @@ -9282,17 +9317,8 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) u64 totalcpuusage = 0; int i; - for_each_possible_cpu(i) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, i); - - /* - * Take rq->lock to make 64-bit addition safe on 32-bit - * platforms. - */ - spin_lock_irq(&cpu_rq(i)->lock); - totalcpuusage += *cpuusage; - spin_unlock_irq(&cpu_rq(i)->lock); - } + for_each_present_cpu(i) + totalcpuusage += cpuacct_cpuusage_read(ca, i); return totalcpuusage; } @@ -9309,13 +9335,9 @@ static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, goto out; } - for_each_possible_cpu(i) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, i); + for_each_present_cpu(i) + cpuacct_cpuusage_write(ca, i, 0); - spin_lock_irq(&cpu_rq(i)->lock); - *cpuusage = 0; - spin_unlock_irq(&cpu_rq(i)->lock); - } out: return err; } -- cgit v1.2.3 From e9515c3c9feecd74174c2998add0db51e02abb8d Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Mon, 15 Dec 2008 22:04:15 -0800 Subject: sched, cpuacct: export percpu cpuacct cgroup stats This patch exports per-cpu CPU cycle usage for a given cpuacct cgroup. There is a need for a user space monitor daemon to track group CPU usage on a per-cpu basis. It is also useful for monitoring CFS load balancer behavior by tracking per-CPU group usage.
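For consumers, the new file is a single line of space-separated nanosecond counters, one per present CPU. A hypothetical user-space reader might look like this (the mount point and group path are assumptions, not something the patch defines):

    #include <stdio.h>

    int main(void)
    {
        /* assumes a cpuacct hierarchy mounted at /cgroups */
        FILE *f = fopen("/cgroups/mygroup/cpuacct.usage_percpu", "r");
        unsigned long long ns;
        int cpu = 0;

        if (!f) {
            perror("fopen");
            return 1;
        }
        /* one nanosecond counter per present CPU, space separated */
        while (fscanf(f, "%llu", &ns) == 1)
            printf("cpu%d: %llu ns\n", cpu++, ns);
        fclose(f);
        return 0;
    }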
Signed-off-by: Ken Chen Reviewed-by: Li Zefan Reviewed-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 41b7e2d524d6..f53e2b8ef521 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9342,12 +9342,32 @@ out: return err; } +static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, + struct seq_file *m) +{ + struct cpuacct *ca = cgroup_ca(cgroup); + u64 percpu; + int i; + + for_each_present_cpu(i) { + percpu = cpuacct_cpuusage_read(ca, i); + seq_printf(m, "%llu ", (unsigned long long) percpu); + } + seq_printf(m, "\n"); + return 0; +} + static struct cftype files[] = { { .name = "usage", .read_u64 = cpuusage_read, .write_u64 = cpuusage_write, }, + { + .name = "usage_percpu", + .read_seq_string = cpuacct_percpu_seq_read, + }, + }; static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) -- cgit v1.2.3 From 80f40ee4a07530cc3acbc239a9299ec47025825b Mon Sep 17 00:00:00 2001 From: Bharata B Rao Date: Mon, 15 Dec 2008 11:56:48 +0530 Subject: sched: use RCU variant of list traversal in for_each_leaf_rt_rq() Impact: fix a potential rare crash for_each_leaf_rt_rq() walks an RCU protected list (rq->leaf_rt_rq_list), but doesn't use list_for_each_entry_rcu(). Fix this. Signed-off-by: Bharata B Rao Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index d9ba9d5f99d6..7bdf84c85ccd 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -77,7 +77,7 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq) } #define for_each_leaf_rt_rq(rt_rq, rq) \ - list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) + list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) { -- cgit v1.2.3 From 2c2d7329d8afa9efa3ec24e19a53e7be9d14f242 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 16 Dec 2008 22:08:58 +0100 Subject: tracing/ftrace: use preempt_enable_no_resched_notrace in ring_buffer_time_stamp() Impact: prevent a trace recursion After some tests with the function graph tracer under x86-32, I saw some recursions caused by ring_buffer_time_stamp() that calls preempt_enable_no_notrace() which calls preempt_schedule() which is traced itself. This patch re-enables preemption without rescheduling. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7f69cfeaadf7..eab81f918f6a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -107,7 +107,7 @@ u64 ring_buffer_time_stamp(int cpu) preempt_disable_notrace(); /* shift to debug/test normalization and TIME_EXTENTS */ time = sched_clock() << DEBUG_SHIFT; - preempt_enable_notrace(); + preempt_enable_no_resched_notrace(); return time; } -- cgit v1.2.3 From 66896a85cf2890b6bbbc4c9ccdcd296600ffbf89 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 13 Dec 2008 20:18:13 +0100 Subject: tracing/ftrace: add the printk-msg-only option Impact: display ftrace_printk messages "as is" By default, ftrace_printk() messages find their output with some other information like the pid, caller, ...
Sometimes a developer just wants to have the ftrace_printk left "as is", without other information. This is done by providing a default-off option called printk-msg-only. To enable it, just do `echo printk-msg-only > /debugfs/tracing/trace_options` Before the patch: <...>-2739 [000] 145.692153: __might_sleep: I'm an ftrace_printk msg in __might_sleep <...>-2739 [000] 145.692155: __might_sleep: I'm another ftrace_printk msg in __might_sleep After the patch and the printk-msg-only option enabled: I'm an ftrace_printk msg in __might_sleep I'm another ftrace_printk msg in __might_sleep Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 25 +++++++++++++++++++++++++ kernel/trace/trace.h | 3 ++- 2 files changed, 27 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 639344a4d3a2..1a3d6b329782 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -287,6 +287,7 @@ static const char *trace_options[] = { "annotate", "userstacktrace", "sym-userobj", + "printk-msg-only", NULL }; @@ -2265,6 +2266,25 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) return TRACE_TYPE_HANDLED; } +static enum print_line_t print_printk_msg_only(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + struct print_entry *field; + int ret; + + trace_assign_type(field, entry); + + ret = trace_seq_printf(s, field->buf); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + if (entry->flags & TRACE_FLAG_CONT) + trace_seq_print_cont(s, iter); + + return TRACE_TYPE_HANDLED; +} + static enum print_line_t print_bin_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; @@ -2345,6 +2365,11 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter) return ret; } + if (iter->ent->type == TRACE_PRINT && + trace_flags & TRACE_ITER_PRINTK && + trace_flags & TRACE_ITER_PRINTK_MSGONLY) + return print_printk_msg_only(iter); + if (trace_flags & TRACE_ITER_BIN) return print_bin_fmt(iter); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f07c246dd73d..fc75dce7a664 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -578,7 +578,8 @@ enum trace_iterator_flags { TRACE_ITER_BRANCH = 0x1000, TRACE_ITER_ANNOTATE = 0x2000, TRACE_ITER_USERSTACKTRACE = 0x4000, - TRACE_ITER_SYM_USEROBJ = 0x8000 + TRACE_ITER_SYM_USEROBJ = 0x8000, + TRACE_ITER_PRINTK_MSGONLY = 0x10000 }; /* -- cgit v1.2.3 From f38f1d2aa5a3520cf05da7cd6bd12fe2b0c509b7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 16 Dec 2008 23:06:40 -0500 Subject: trace: add a way to enable or disable the stack tracer Impact: enhancement to stack tracer The stack tracer is currently either on when configured in or off when it is not. It cannot be disabled when it is configured on (besides disabling the function tracer that it uses). This patch adds a way to enable or disable the stack tracer at run time. It defaults off on bootup, but a kernel parameter 'stacktrace' has been added to enable it on bootup. A new sysctl, "kernel.stack_tracer_enabled", has been added to let the user enable or disable the stack tracer at run time.
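The handler added below (stack_trace_sysctl() in the trace_stack.c hunk) follows a reusable toggle pattern; distilled into a hedged sketch with illustrative names, where feature_start()/feature_stop() stand in for register_ftrace_function()/unregister_ftrace_function():

    /*
     * Sketch only: let proc_dointvec() update the flag, then act on a
     * real 0<->1 transition, serialized by a mutex. The signature
     * matches the 2008-era proc handler interface used in the patch.
     */
    static DEFINE_MUTEX(toggle_mutex);
    static int feature_enabled, last_feature_enabled;

    static int feature_sysctl(struct ctl_table *table, int write,
                              struct file *file, void __user *buffer,
                              size_t *lenp, loff_t *ppos)
    {
        int ret;

        mutex_lock(&toggle_mutex);
        ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
        if (!ret && write && feature_enabled != last_feature_enabled) {
            last_feature_enabled = feature_enabled;
            if (feature_enabled)
                feature_start();	/* hypothetical start hook */
            else
                feature_stop();		/* hypothetical stop hook */
        }
        mutex_unlock(&toggle_mutex);
        return ret;
    }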
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- Documentation/kernel-parameters.txt | 4 +++ include/linux/ftrace.h | 8 ++++++ kernel/sysctl.c | 10 +++++++ kernel/trace/Kconfig | 13 +++++++--- kernel/trace/trace_stack.c | 52 ++++++++++++++++++++++++++++++++++--- 5 files changed, 79 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 2919a2e91938..edab81c13182 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -89,6 +89,7 @@ parameter is applicable: SPARC Sparc architecture is enabled. SWSUSP Software suspend (hibernation) is enabled. SUSPEND System suspend states are enabled. + FTRACE Function tracing enabled. TS Appropriate touchscreen support is enabled. USB USB support is enabled. USBHID USB Human Interface Device support is enabled. @@ -2173,6 +2174,9 @@ and is between 256 and 4096 characters. It is defined in the file st= [HW,SCSI] SCSI tape parameters (buffers, etc.) See Documentation/scsi/st.txt. + stacktrace [FTRACE] + Enabled the stack tracer on boot up. + sti= [PARISC,HW] Format: Set the STI (builtin display/keyboard on the HP-PARISC diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 44020f31bd81..6b0db53caa7d 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -86,6 +86,14 @@ static inline void ftrace_stop(void) { } static inline void ftrace_start(void) { } #endif /* CONFIG_FUNCTION_TRACER */ +#ifdef CONFIG_STACK_TRACER +extern int stack_tracer_enabled; +int +stack_trace_sysctl(struct ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *lenp, + loff_t *ppos); +#endif + #ifdef CONFIG_DYNAMIC_FTRACE /* asm/ftrace.h must be defined for archs supporting dynamic ftrace */ #include diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c83f566e940a..6ac501a2dcc6 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -487,6 +487,16 @@ static struct ctl_table kern_table[] = { .proc_handler = &ftrace_enable_sysctl, }, #endif +#ifdef CONFIG_STACK_TRACER + { + .ctl_name = CTL_UNNUMBERED, + .procname = "stack_tracer_enabled", + .data = &stack_tracer_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &stack_trace_sysctl, + }, +#endif #ifdef CONFIG_TRACING { .ctl_name = CTL_UNNUMBERED, diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d8bae6f4219e..e2a4ff6fc3a6 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -244,10 +244,15 @@ config STACK_TRACER This tracer works by hooking into every function call that the kernel executes, and keeping a maximum stack depth value and - stack-trace saved. Because this logic has to execute in every - kernel function, all the time, this option can slow down the - kernel measurably and is generally intended for kernel - developers only. + stack-trace saved. If this is configured with DYNAMIC_FTRACE + then it will not have any overhead while the stack tracer + is disabled. + + To enable the stack tracer on bootup, pass in 'stacktrace' + on the kernel command line. + + The stack tracer can also be enabled or disabled via the + sysctl kernel.stack_tracer_enabled Say N if unsure. 
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 0b863f2cbc8e..4842c969c785 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include "trace.h" @@ -31,6 +32,10 @@ static raw_spinlock_t max_stack_lock = static int stack_trace_disabled __read_mostly; static DEFINE_PER_CPU(int, trace_active); +static DEFINE_MUTEX(stack_sysctl_mutex); + +int stack_tracer_enabled; +static int last_stack_tracer_enabled; static inline void check_stack(void) { @@ -174,7 +179,7 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, return count; } -static struct file_operations stack_max_size_fops = { +static const struct file_operations stack_max_size_fops = { .open = tracing_open_generic, .read = stack_max_size_read, .write = stack_max_size_write, @@ -272,7 +277,7 @@ static int t_show(struct seq_file *m, void *v) return 0; } -static struct seq_operations stack_trace_seq_ops = { +static const struct seq_operations stack_trace_seq_ops = { .start = t_start, .next = t_next, .stop = t_stop, @@ -288,12 +293,48 @@ static int stack_trace_open(struct inode *inode, struct file *file) return ret; } -static struct file_operations stack_trace_fops = { +static const struct file_operations stack_trace_fops = { .open = stack_trace_open, .read = seq_read, .llseek = seq_lseek, }; +int +stack_trace_sysctl(struct ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + mutex_lock(&stack_sysctl_mutex); + + ret = proc_dointvec(table, write, file, buffer, lenp, ppos); + + if (ret || !write || + (last_stack_tracer_enabled == stack_tracer_enabled)) + goto out; + + last_stack_tracer_enabled = stack_tracer_enabled; + + if (stack_tracer_enabled) + register_ftrace_function(&trace_ops); + else + unregister_ftrace_function(&trace_ops); + + out: + mutex_unlock(&stack_sysctl_mutex); + return ret; +} + +static int start_stack_trace __initdata; + +static __init int enable_stacktrace(char *str) +{ + start_stack_trace = 1; + return 1; +} +__setup("stacktrace", enable_stacktrace); + static __init int stack_trace_init(void) { struct dentry *d_tracer; @@ -311,7 +352,10 @@ static __init int stack_trace_init(void) if (!entry) pr_warning("Could not create debugfs 'stack_trace' entry\n"); - register_ftrace_function(&trace_ops); + if (start_stack_trace) { + register_ftrace_function(&trace_ops); + stack_tracer_enabled = 1; + } return 0; } -- cgit v1.2.3 From e05a43b744fb9518cbf8539a7ef33164ac60a70f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 17 Dec 2008 09:43:00 -0500 Subject: trace: better use of stack_trace_enabled for boot up code Impact: clean up Andrew Morton suggested using the stack_tracer_enabled variable to decide whether or not to start stack tracing on bootup. This lets us remove the start_stack_trace variable.
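The resulting idiom, one variable serving as both the boot-time switch and the runtime sysctl state, looks like this in general form (a sketch with illustrative names, not the patch itself):

    #include <linux/init.h>

    int feature_enabled;			/* wired into the sysctl table */
    static int last_feature_enabled;	/* shadow used to detect toggles */

    static __init int enable_feature(char *str)
    {
        /* seed both copies so the first sysctl write is a real toggle */
        feature_enabled = 1;
        last_feature_enabled = 1;
        return 1;			/* parameter consumed */
    }
    __setup("feature", enable_feature);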
Reported-by: Andrew Morton Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_stack.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 4842c969c785..d0871bc0aca5 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -308,7 +308,7 @@ stack_trace_sysctl(struct ctl_table *table, int write, mutex_lock(&stack_sysctl_mutex); - ret = proc_dointvec(table, write, file, buffer, lenp, ppos); + ret = proc_dointvec(table, write, file, buffer, lenp, ppos); if (ret || !write || (last_stack_tracer_enabled == stack_tracer_enabled)) @@ -326,11 +326,10 @@ stack_trace_sysctl(struct ctl_table *table, int write, return ret; } -static int start_stack_trace __initdata; - static __init int enable_stacktrace(char *str) { - start_stack_trace = 1; + stack_tracer_enabled = 1; + last_stack_tracer_enabled = 1; return 1; } __setup("stacktrace", enable_stacktrace); @@ -352,10 +351,8 @@ static __init int stack_trace_init(void) if (!entry) pr_warning("Could not create debugfs 'stack_trace' entry\n"); - if (start_stack_trace) { + if (stack_tracer_enabled) register_ftrace_function(&trace_ops); - stack_tracer_enabled = 1; - } return 0; } -- cgit v1.2.3 From ea3a6d6d60b2504c573fe3415f6617e8310c0236 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 17 Dec 2008 15:05:36 -0500 Subject: ftrace: add not to regex on filtering functions Impact: enhancement Ingo Molnar has asked about a way to remove items from the filter lists. Currently, you can only add or replace items. The way items are added to the list is through opening one of the list files (set_ftrace_filter or set_ftrace_notrace) via append. If the file is opened for truncate, the list is cleared. echo spin_lock > /debug/tracing/set_ftrace_filter The above will replace the list with only spin_lock echo spin_lock >> /debug/tracing/set_ftrace_filter The above will add spin_lock to the list. Now this patch adds: echo '!spin_lock' >> /debug/tracing/set_ftrace_filter This will remove spin_lock from the list. The limited glob features of these lists also can be notted. echo '!spin_*' >> /debug/tracing/set_ftrace_filter This will remove all functions that start with 'spin_' Note: echo '!spin_*' > /debug/tracing/set_ftrace_filter will simply clear out the list (notice the '>' instead of '>>') Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a12f80efceaa..2f32969c09df 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1047,6 +1047,13 @@ ftrace_match(unsigned char *buff, int len, int enable) int type = MATCH_FULL; unsigned long flag = enable ? 
FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; unsigned i, match = 0, search_len = 0; + int not = 0; + + if (buff[0] == '!') { + not = 1; + buff++; + len--; + } for (i = 0; i < len; i++) { if (buff[i] == '*') { @@ -1100,8 +1107,12 @@ ftrace_match(unsigned char *buff, int len, int enable) matched = 1; break; } - if (matched) - rec->flags |= flag; + if (matched) { + if (not) + rec->flags &= ~flag; + else + rec->flags |= flag; + } } pg = pg->next; } -- cgit v1.2.3 From 3d9101e92529e1ff6014f95a69afc82f37b9b13a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 17 Dec 2008 22:34:13 +0100 Subject: trace: fix task state printout Impact: fix occasionally incorrect trace output The tracing code has interesting varieties of printing out task state. Unfortunately only one of the instances is correct as it copies the code from sched.c:sched_show_task(). The others are plain wrong as they treat the bitfield as an integer offset into the character array. Also the size check of the character array is wrong as it includes the trailing \0. Use a common state decoder inline which does the Right Thing. Signed-off-by: Thomas Gleixner Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d86e3252f300..803100518f11 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1301,6 +1301,13 @@ lat_print_timestamp(struct trace_seq *s, u64 abs_usecs, static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; +static int task_state_char(unsigned long state) +{ + int bit = state ? __ffs(state) + 1 : 0; + + return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?'; +} + /* * The message is supposed to contain an ending newline. * If the printing stops prematurely, try to add a newline of our own. @@ -1396,12 +1403,8 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) trace_assign_type(field, entry); - T = field->next_state < sizeof(state_to_char) ? - state_to_char[field->next_state] : 'X'; - - state = field->prev_state ? - __ffs(field->prev_state) + 1 : 0; - S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X'; + T = task_state_char(field->next_state); + S = task_state_char(field->prev_state); comm = trace_find_cmdline(field->next_pid); trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", field->prev_pid, @@ -1519,10 +1522,8 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) trace_assign_type(field, entry); - S = field->prev_state < sizeof(state_to_char) ? - state_to_char[field->prev_state] : 'X'; - T = field->next_state < sizeof(state_to_char) ? - state_to_char[field->next_state] : 'X'; + T = task_state_char(field->next_state); + S = task_state_char(field->prev_state); ret = trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c\n", field->prev_pid, field->prev_prio, @@ -1621,12 +1622,9 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) trace_assign_type(field, entry); - S = field->prev_state < sizeof(state_to_char) ? - state_to_char[field->prev_state] : 'X'; - T = field->next_state < sizeof(state_to_char) ? - state_to_char[field->next_state] : 'X'; - if (entry->type == TRACE_WAKE) - S = '+'; + T = task_state_char(field->next_state); + S = entry->type == TRACE_WAKE ?
'+' : task_state_char(field->prev_state); ret = trace_seq_printf(s, "%d %d %c %d %d %d %c\n", field->prev_pid, field->prev_prio, @@ -1712,12 +1710,9 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) trace_assign_type(field, entry); - S = field->prev_state < sizeof(state_to_char) ? - state_to_char[field->prev_state] : 'X'; - T = field->next_state < sizeof(state_to_char) ? - state_to_char[field->next_state] : 'X'; - if (entry->type == TRACE_WAKE) - S = '+'; + T = task_state_char(field->next_state); + S = entry->type == TRACE_WAKE ? '+' : + task_state_char(field->prev_state); SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio); SEQ_PUT_HEX_FIELD_RET(s, S); -- cgit v1.2.3 From 6d102bc68f3dd2ae0e305b09170b1751aa67baeb Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 17 Dec 2008 17:48:23 +0800 Subject: tracing/ring-buffer: remove unused ring_buffer size Impact: remove dead code struct ring_buffer.size is not set after ring_buffer is initialized or resized; it is always 0. We can use "buffer->pages * PAGE_SIZE" to get ring_buffer's size Signed-off-by: Lai Jiangshan Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index eab81f918f6a..bb6922a931b1 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -258,7 +258,6 @@ struct ring_buffer_per_cpu { }; struct ring_buffer { - unsigned long size; unsigned pages; unsigned flags; int cpus; @@ -2210,8 +2209,7 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, return -EINVAL; /* At least make sure the two buffers are somewhat the same */ - if (buffer_a->size != buffer_b->size || - buffer_a->pages != buffer_b->pages) + if (buffer_a->pages != buffer_b->pages) return -EINVAL; cpu_buffer_a = buffer_a->buffers[cpu]; -- cgit v1.2.3 From 9c2c48020ec0dd6ecd27e5a1298f73b40d85a595 Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Tue, 16 Dec 2008 23:41:22 -0800 Subject: schedstat: consolidate per-task cpu runtime stats Impact: simplify code When we turn on CONFIG_SCHEDSTATS, per-task cpu runtime is accumulated twice. Once in task->se.sum_exec_runtime and once in sched_info.cpu_time. These two stats are exactly the same. Given that task->se.sum_exec_runtime is always accumulated by the core scheduler, sched_info can reuse that data instead of duplicating the accounting.
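Since /proc/<pid>/schedstat keeps its three-field format (on-cpu time, runqueue wait time, timeslice count; see the fs/proc/base.c hunk below), a user-space consumer is unaffected by the consolidation. A hypothetical reader, assuming CONFIG_SCHEDSTATS is enabled:

    #include <stdio.h>

    int main(void)
    {
        unsigned long long cpu_time, run_delay;
        unsigned long pcount;
        FILE *f = fopen("/proc/self/schedstat", "r");

        if (!f)
            return 1;
        if (fscanf(f, "%llu %llu %lu",
                   &cpu_time, &run_delay, &pcount) != 3) {
            fclose(f);
            return 1;
        }
        printf("on-cpu %llu ns, waited %llu ns, %lu timeslices\n",
               cpu_time, run_delay, pcount);
        fclose(f);
        return 0;
    }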
Signed-off-by: Ken Chen Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- fs/proc/base.c | 2 +- include/linux/sched.h | 3 +-- kernel/delayacct.c | 2 +- kernel/sched.c | 2 ++ kernel/sched_stats.h | 5 ++--- 5 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/fs/proc/base.c b/fs/proc/base.c index d4677603c889..4d745bac768c 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -347,7 +347,7 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer) static int proc_pid_schedstat(struct task_struct *task, char *buffer) { return sprintf(buffer, "%llu %llu %lu\n", - task->sched_info.cpu_time, + task->se.sum_exec_runtime, task->sched_info.run_delay, task->sched_info.pcount); } diff --git a/include/linux/sched.h b/include/linux/sched.h index 8cccd6dc5d66..2d1e840ddd35 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -670,8 +670,7 @@ struct reclaim_state; struct sched_info { /* cumulative counters */ unsigned long pcount; /* # of times run on this cpu */ - unsigned long long cpu_time, /* time spent on the cpu */ - run_delay; /* time spent waiting on a runqueue */ + unsigned long long run_delay; /* time spent waiting on a runqueue */ /* timestamps */ unsigned long long last_arrival,/* when we last ran on a cpu */ diff --git a/kernel/delayacct.c b/kernel/delayacct.c index b3179dad71be..abb6e17505e2 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -127,7 +127,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) */ t1 = tsk->sched_info.pcount; t2 = tsk->sched_info.run_delay; - t3 = tsk->sched_info.cpu_time; + t3 = tsk->se.sum_exec_runtime; d->cpu_count += t1; diff --git a/kernel/sched.c b/kernel/sched.c index f53e2b8ef521..fd835fc320b8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -596,6 +596,8 @@ struct rq { #ifdef CONFIG_SCHEDSTATS /* latency stats */ struct sched_info rq_sched_info; + unsigned long long rq_cpu_time; + /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ /* sys_sched_yield() stats */ unsigned int yld_exp_empty; diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 7dbf72a2b02c..3b01098164c8 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -31,7 +31,7 @@ static int show_schedstat(struct seq_file *seq, void *v) rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count, rq->sched_switch, rq->sched_count, rq->sched_goidle, rq->ttwu_count, rq->ttwu_local, - rq->rq_sched_info.cpu_time, + rq->rq_cpu_time, rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); seq_printf(seq, "\n"); @@ -123,7 +123,7 @@ static inline void rq_sched_info_depart(struct rq *rq, unsigned long long delta) { if (rq) - rq->rq_sched_info.cpu_time += delta; + rq->rq_cpu_time += delta; } static inline void @@ -236,7 +236,6 @@ static inline void sched_info_depart(struct task_struct *t) unsigned long long delta = task_rq(t)->clock - t->sched_info.last_arrival; - t->sched_info.cpu_time += delta; rq_sched_info_depart(task_rq(t), delta); if (t->state == TASK_RUNNING) -- cgit v1.2.3 From 3bddb9a3246f6df5cf3b7655cb541ac10203bb71 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 19 Dec 2008 01:03:29 +0100 Subject: tracing: fix warning in kernel/trace/trace.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit this warning: kernel/trace/trace.c: In function ‘print_lat_fmt’: kernel/trace/trace.c:1826: warning: unused variable ‘state’ Triggers because 'state' has become unused - remove it. 
Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1a3d6b329782..49fc7201295e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1816,7 +1816,6 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) char *comm; int S, T; int i; - unsigned state; if (entry->type == TRACE_CONT) return TRACE_TYPE_HANDLED; -- cgit v1.2.3 From c71dd42db2c6f1637b92502a214587431c1a6ad2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 19 Dec 2008 01:09:51 +0100 Subject: tracing: fix warnings in kernel/trace/trace_sched_switch.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit these warnings: kernel/trace/trace_sched_switch.c: In function ‘tracing_sched_register’: kernel/trace/trace_sched_switch.c:96: warning: passing argument 1 of ‘register_trace_sched_wakeup_new’ from incompatible pointer type kernel/trace/trace_sched_switch.c:112: warning: passing argument 1 of ‘unregister_trace_sched_wakeup_new’ from incompatible pointer type kernel/trace/trace_sched_switch.c: In function ‘tracing_sched_unregister’: kernel/trace/trace_sched_switch.c:121: warning: passing argument 1 of ‘unregister_trace_sched_wakeup_new’ from incompatible pointer type Trigger because sched_wakeup_new tracepoints need the same trace signature as sched_wakeup - which was changed recently. Fix it. Signed-off-by: Ingo Molnar --- include/trace/sched.h | 4 ++-- kernel/sched.c | 2 +- kernel/trace/trace_sched_switch.c | 1 + 3 files changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/trace/sched.h b/include/trace/sched.h index f4549d506b16..bc4c9eadc6ba 100644 --- a/include/trace/sched.h +++ b/include/trace/sched.h @@ -21,8 +21,8 @@ DECLARE_TRACE(sched_wakeup, TPARGS(rq, p)); DECLARE_TRACE(sched_wakeup_new, - TPPROTO(struct rq *rq, struct task_struct *p), - TPARGS(rq, p)); + TPPROTO(struct rq *rq, struct task_struct *p, int success), + TPARGS(rq, p, success)); DECLARE_TRACE(sched_switch, TPPROTO(struct rq *rq, struct task_struct *prev, diff --git a/kernel/sched.c b/kernel/sched.c index d377097572f9..ac5a70a87d1e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2457,7 +2457,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) p->sched_class->task_new(rq, p); inc_nr_running(rq); } - trace_sched_wakeup_new(rq, p); + trace_sched_wakeup_new(rq, p, 1); check_preempt_curr(rq, p, 0); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 863390557b44..781d72ef873c 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -247,3 +247,4 @@ __init static int init_sched_switch_trace(void) return register_tracer(&sched_switch_trace); } device_initcall(init_sched_switch_trace); + -- cgit v1.2.3 From 213cc060797378059a28ebc5c539f3e9a80160bd Mon Sep 17 00:00:00 2001 From: Pekka J Enberg Date: Fri, 19 Dec 2008 12:08:39 +0200 Subject: ftrace: introduce tracing_reset_online_cpus() helper Impact: cleanup This patch factors out common code from multiple tracers into a tracing_reset_online_cpus() function and converts the tracers to use it. 
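After the conversion, a tracer's reset path collapses to a single call. A hedged sketch of what a converted tracer looks like (abridged; a real struct tracer carries more callbacks, as the hunks below show):

    static int example_trace_init(struct trace_array *tr)
    {
        /* replaces the per-tracer for_each_online_cpu() reset loop */
        tracing_reset_online_cpus(tr);
        return 0;
    }

    static struct tracer example_tracer __read_mostly = {
        .name	= "example",
        .init	= example_trace_init,
        /* the helper's signature lets it serve as .reset directly */
        .reset	= tracing_reset_online_cpus,
    };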
Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 10 ++++++++++ kernel/trace/trace.h | 1 + kernel/trace/trace_boot.c | 12 +----------- kernel/trace/trace_functions.c | 14 ++------------ kernel/trace/trace_hw_branches.c | 14 ++------------ kernel/trace/trace_mmiotrace.c | 6 +----- kernel/trace/trace_sched_switch.c | 14 ++------------ kernel/trace/trace_sysprof.c | 12 +----------- 8 files changed, 20 insertions(+), 63 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0eb6d48347f7..79db26e8216e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -679,6 +679,16 @@ void tracing_reset(struct trace_array *tr, int cpu) ftrace_enable_cpu(); } +void tracing_reset_online_cpus(struct trace_array *tr) +{ + int cpu; + + tr->time_start = ftrace_now(tr->cpu); + + for_each_online_cpu(cpu) + tracing_reset(tr, cpu); +} + #define SAVED_CMDLINES 128 static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index fc75dce7a664..cc7a4f864036 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -374,6 +374,7 @@ struct trace_iterator { int tracing_is_enabled(void); void trace_wake_up(void); void tracing_reset(struct trace_array *tr, int cpu); +void tracing_reset_online_cpus(struct trace_array *tr); int tracing_open_generic(struct inode *inode, struct file *filp); struct dentry *tracing_init_dentry(void); void init_tracer_sysprof_debugfs(struct dentry *d_tracer); diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index a4fa2c57e34e..3ccebde28482 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -37,16 +37,6 @@ void disable_boot_trace(void) tracing_stop_sched_switch_record(); } -static void reset_boot_trace(struct trace_array *tr) -{ - int cpu; - - tr->time_start = ftrace_now(tr->cpu); - - for_each_online_cpu(cpu) - tracing_reset(tr, cpu); -} - static int boot_trace_init(struct trace_array *tr) { int cpu; @@ -130,7 +120,7 @@ struct tracer boot_tracer __read_mostly = { .name = "initcall", .init = boot_trace_init, - .reset = reset_boot_trace, + .reset = tracing_reset_online_cpus, .print_line = initcall_print_line, }; diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index e74f6d0a3216..9236d7e25a16 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -16,20 +16,10 @@ #include "trace.h" -static void function_reset(struct trace_array *tr) -{ - int cpu; - - tr->time_start = ftrace_now(tr->cpu); - - for_each_online_cpu(cpu) - tracing_reset(tr, cpu); -} - static void start_function_trace(struct trace_array *tr) { tr->cpu = get_cpu(); - function_reset(tr); + tracing_reset_online_cpus(tr); put_cpu(); tracing_start_cmdline_record(); @@ -55,7 +45,7 @@ static void function_trace_reset(struct trace_array *tr) static void function_trace_start(struct trace_array *tr) { - function_reset(tr); + tracing_reset_online_cpus(tr); } static struct tracer function_trace __read_mostly = diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index ee29e012aa97..b6a3e20a49a9 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -25,16 +25,6 @@ static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer); #define this_buffer per_cpu(buffer, smp_processor_id()) -static void bts_trace_reset(struct trace_array *tr) -{ - int cpu; - - tr->time_start = ftrace_now(tr->cpu); - - 
for_each_online_cpu(cpu) - tracing_reset(tr, cpu); -} - static void bts_trace_start_cpu(void *arg) { if (this_tracer) @@ -54,7 +44,7 @@ static void bts_trace_start(struct trace_array *tr) { int cpu; - bts_trace_reset(tr); + tracing_reset_online_cpus(tr); for_each_cpu_mask(cpu, cpu_possible_map) smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1); @@ -78,7 +68,7 @@ static void bts_trace_stop(struct trace_array *tr) static int bts_trace_init(struct trace_array *tr) { - bts_trace_reset(tr); + tracing_reset_online_cpus(tr); bts_trace_start(tr); return 0; diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 2fb6da6523b3..fffcb069f1dc 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -22,14 +22,10 @@ static unsigned long prev_overruns; static void mmio_reset_data(struct trace_array *tr) { - int cpu; - overrun_detected = false; prev_overruns = 0; - tr->time_start = ftrace_now(tr->cpu); - for_each_online_cpu(cpu) - tracing_reset(tr, cpu); + tracing_reset_online_cpus(tr); } static int mmio_trace_init(struct trace_array *tr) diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 781d72ef873c..add2c1fdae92 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -72,16 +72,6 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee) local_irq_restore(flags); } -static void sched_switch_reset(struct trace_array *tr) -{ - int cpu; - - tr->time_start = ftrace_now(tr->cpu); - - for_each_online_cpu(cpu) - tracing_reset(tr, cpu); -} - static int tracing_sched_register(void) { int ret; @@ -197,7 +187,7 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr) static void start_sched_trace(struct trace_array *tr) { - sched_switch_reset(tr); + tracing_reset_online_cpus(tr); tracing_start_sched_switch_record(); } @@ -221,7 +211,7 @@ static void sched_switch_trace_reset(struct trace_array *tr) static void sched_switch_trace_start(struct trace_array *tr) { - sched_switch_reset(tr); + tracing_reset_online_cpus(tr); tracing_start_sched_switch(); } diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index 54960edb96d0..01becf1f19ff 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -234,20 +234,10 @@ static void stop_stack_timers(void) stop_stack_timer(cpu); } -static void stack_reset(struct trace_array *tr) -{ - int cpu; - - tr->time_start = ftrace_now(tr->cpu); - - for_each_online_cpu(cpu) - tracing_reset(tr, cpu); -} - static void start_stack_trace(struct trace_array *tr) { mutex_lock(&sample_timer_lock); - stack_reset(tr); + tracing_reset_online_cpus(tr); start_stack_timers(); tracer_enabled = 1; mutex_unlock(&sample_timer_lock); -- cgit v1.2.3 From 9bb482476c6c9d1ae033306440c51ceac93ea80c Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 16 Dec 2008 11:30:08 +0000 Subject: allow stripping of generated symbols under CONFIG_KALLSYMS_ALL Building upon parts of the module stripping patch, this patch introduces similar stripping for vmlinux when CONFIG_KALLSYMS_ALL=y. Using CONFIG_KALLSYMS_STRIP_GENERATED reduces the overhead of CONFIG_KALLSYMS_ALL from 245k/310k to 65k/80k for the (i386/x86-64) kernels I tested with. The patch also does away with the need to special case the kallsyms- internal symbols by making them available even in the first linking stage. While it is a generated file, the patch includes the changes to scripts/genksyms/keywords.c_shipped, as I'm unsure what the procedure here is. 
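As background for the kallsyms.c hunk (the weak attributes can go because every link pass now sees a real, if initially empty, generated symbol table), here is a minimal stand-alone demonstration of the weak-symbol mechanism the old scheme relied on; the names are illustrative and the snippet compiles with GCC:

    #include <stdio.h>

    /* weak placeholder, analogous to the old kallsyms declarations:
     * it satisfies the linker when no strong definition exists yet */
    int __attribute__((weak)) table_size(void)
    {
        return 0;	/* stand-in: empty table */
    }

    int main(void)
    {
        /* an object file providing a strong table_size() would
         * silently override the weak one; without it, the
         * fallback above is used */
        printf("table size: %d\n", table_size());
        return 0;
    }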
Signed-off-by: Jan Beulich Signed-off-by: Sam Ravnborg --- Makefile | 47 ++++++--- arch/x86/scripts/strip-symbols | 1 + init/Kconfig | 7 ++ kernel/kallsyms.c | 16 ++- scripts/genksyms/keywords.c_shipped | 189 ++++++++++++++++++------------------ scripts/genksyms/keywords.gperf | 2 + scripts/kallsyms.c | 21 ++-- 7 files changed, 155 insertions(+), 128 deletions(-) create mode 100644 arch/x86/scripts/strip-symbols (limited to 'kernel') diff --git a/Makefile b/Makefile index 5dd0ed3b12c6..b3d1c8f1f4ce 100644 --- a/Makefile +++ b/Makefile @@ -604,6 +604,9 @@ export INSTALL_PATH ?= /boot MODLIB = $(INSTALL_MOD_PATH)/lib/modules/$(KERNELRELEASE) export MODLIB +strip-symbols := $(srctree)/scripts/strip-symbols \ + $(wildcard $(srctree)/arch/$(ARCH)/scripts/strip-symbols) + # # INSTALL_MOD_STRIP, if defined, will cause modules to be stripped while # they get installed. If INSTALL_MOD_STRIP is '1', then the default @@ -611,8 +614,10 @@ export MODLIB # be used as the option(s) to the objcopy command. ifdef INSTALL_MOD_STRIP ifeq ($(INSTALL_MOD_STRIP),1) -mod_strip_cmd = $(OBJCOPY) --strip-debug --strip-symbols \ - $(srctree)/scripts/strip-symbols --wildcard +mod_strip_cmd = $(OBJCOPY) --strip-debug +ifeq ($(CONFIG_KALLSYMS_ALL),$(CONFIG_KALLSYMS_STRIP_GENERATED)) +mod_strip_cmd += --wildcard $(addprefix --strip-symbols ,$(strip-symbols)) +endif else mod_strip_cmd = $(OBJCOPY) $(INSTALL_MOD_STRIP) endif # INSTALL_MOD_STRIP=1 @@ -747,6 +752,7 @@ last_kallsyms := 2 endif kallsyms.o := .tmp_kallsyms$(last_kallsyms).o +kallsyms.h := $(wildcard include/config/kallsyms/*.h) $(wildcard include/config/kallsyms/*/*.h) define verify_kallsyms $(Q)$(if $($(quiet)cmd_sysmap), \ @@ -771,24 +777,41 @@ endef # Generate .S file with all kernel symbols quiet_cmd_kallsyms = KSYM $@ - cmd_kallsyms = $(NM) -n $< | $(KALLSYMS) \ - $(if $(CONFIG_KALLSYMS_ALL),--all-symbols) > $@ + cmd_kallsyms = { test $* -eq 0 || $(NM) -n $<; } \ + | $(KALLSYMS) $(if $(CONFIG_KALLSYMS_ALL),--all-symbols) >$@ -.tmp_kallsyms1.o .tmp_kallsyms2.o .tmp_kallsyms3.o: %.o: %.S scripts FORCE +quiet_cmd_kstrip = STRIP $@ + cmd_kstrip = $(OBJCOPY) --wildcard $(addprefix --strip$(if $(CONFIG_RELOCATABLE),-unneeded)-symbols ,$(filter %/scripts/strip-symbols,$^)) $< $@ + +$(foreach n,0 1 2 3,.tmp_kallsyms$(n).o): KBUILD_AFLAGS += -Wa,--strip-local-absolute +$(foreach n,0 1 2 3,.tmp_kallsyms$(n).o): %.o: %.S scripts FORCE $(call if_changed_dep,as_o_S) -.tmp_kallsyms%.S: .tmp_vmlinux% $(KALLSYMS) +ifeq ($(CONFIG_KALLSYMS_STRIP_GENERATED),y) +strip-ext := .stripped +endif + +.tmp_kallsyms%.S: .tmp_vmlinux%$(strip-ext) $(KALLSYMS) $(kallsyms.h) $(call cmd,kallsyms) +# make -jN seems to have problems with intermediate files, see bug #3330. 
+.SECONDARY: $(foreach n,1 2 3,.tmp_vmlinux$(n).stripped) +.tmp_vmlinux%.stripped: .tmp_vmlinux% $(strip-symbols) $(kallsyms.h) + $(call cmd,kstrip) + +ifneq ($(CONFIG_DEBUG_INFO),y) +.tmp_vmlinux%: LDFLAGS_vmlinux += -S +endif # .tmp_vmlinux1 must be complete except kallsyms, so update vmlinux version -.tmp_vmlinux1: $(vmlinux-lds) $(vmlinux-all) FORCE - $(call if_changed_rule,ksym_ld) +.tmp_vmlinux%: $(vmlinux-lds) $(vmlinux-all) FORCE + $(if $(filter 1,$*),$(call if_changed_rule,ksym_ld),$(call if_changed,vmlinux__)) -.tmp_vmlinux2: $(vmlinux-lds) $(vmlinux-all) .tmp_kallsyms1.o FORCE - $(call if_changed,vmlinux__) +.tmp_vmlinux0$(strip-ext): + $(Q)echo "placeholder" >$@ -.tmp_vmlinux3: $(vmlinux-lds) $(vmlinux-all) .tmp_kallsyms2.o FORCE - $(call if_changed,vmlinux__) +.tmp_vmlinux1: .tmp_kallsyms0.o +.tmp_vmlinux2: .tmp_kallsyms1.o +.tmp_vmlinux3: .tmp_kallsyms2.o # Needs to visit scripts/ before $(KALLSYMS) can be used. $(KALLSYMS): scripts ; diff --git a/arch/x86/scripts/strip-symbols b/arch/x86/scripts/strip-symbols new file mode 100644 index 000000000000..a2f1ccb827c7 --- /dev/null +++ b/arch/x86/scripts/strip-symbols @@ -0,0 +1 @@ +__cpu_vendor_dev_X86_VENDOR_* diff --git a/init/Kconfig b/init/Kconfig index f763762d544a..0f5af409fef1 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -588,6 +588,13 @@ config KALLSYMS_ALL Say N. +config KALLSYMS_STRIP_GENERATED + bool "Strip machine generated symbols from kallsyms" + depends on KALLSYMS_ALL + default y + help + Say N if you want kallsyms to retain even machine generated symbols. + config KALLSYMS_EXTRA_PASS bool "Do an extra kallsyms pass" depends on KALLSYMS diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 7b8b0f21a5b1..e694afa0eb8c 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -30,20 +30,19 @@ #define all_var 0 #endif -/* These will be re-linked against their real values during the second link stage */ -extern const unsigned long kallsyms_addresses[] __attribute__((weak)); -extern const u8 kallsyms_names[] __attribute__((weak)); +extern const unsigned long kallsyms_addresses[]; +extern const u8 kallsyms_names[]; /* tell the compiler that the count isn't in the small data section if the arch * has one (eg: FRV) */ extern const unsigned long kallsyms_num_syms -__attribute__((weak, section(".rodata"))); + __attribute__((__section__(".rodata"))); -extern const u8 kallsyms_token_table[] __attribute__((weak)); -extern const u16 kallsyms_token_index[] __attribute__((weak)); +extern const u8 kallsyms_token_table[]; +extern const u16 kallsyms_token_index[]; -extern const unsigned long kallsyms_markers[] __attribute__((weak)); +extern const unsigned long kallsyms_markers[]; static inline int is_kernel_inittext(unsigned long addr) { @@ -168,9 +167,6 @@ static unsigned long get_symbol_pos(unsigned long addr, unsigned long symbol_start = 0, symbol_end = 0; unsigned long i, low, high, mid; - /* This kernel should never had been booted. 
*/ - BUG_ON(!kallsyms_addresses); - /* do a binary search on the sorted kallsyms_addresses array */ low = 0; high = kallsyms_num_syms; diff --git a/scripts/genksyms/keywords.c_shipped b/scripts/genksyms/keywords.c_shipped index 971e0113ae7a..83484fe93ede 100644 --- a/scripts/genksyms/keywords.c_shipped +++ b/scripts/genksyms/keywords.c_shipped @@ -1,4 +1,4 @@ -/* ANSI-C code produced by gperf version 3.0.2 */ +/* ANSI-C code produced by gperf version 3.0.1 */ /* Command-line: gperf -L ANSI-C -a -C -E -g -H is_reserved_hash -k '1,3,$' -N is_reserved_word -p -t scripts/genksyms/keywords.gperf */ #if !((' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) \ @@ -32,7 +32,7 @@ #line 3 "scripts/genksyms/keywords.gperf" struct resword { const char *name; int token; }; -/* maximum key range = 62, duplicates = 0 */ +/* maximum key range = 64, duplicates = 0 */ #ifdef __GNUC__ __inline @@ -46,32 +46,32 @@ is_reserved_hash (register const char *str, register unsigned int len) { static const unsigned char asso_values[] = { - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 5, - 65, 65, 65, 65, 65, 65, 35, 65, 65, 65, - 0, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 0, 65, 0, 65, 5, - 20, 15, 10, 30, 65, 15, 65, 65, 20, 0, - 10, 35, 20, 65, 10, 5, 0, 10, 5, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65 + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 0, + 67, 67, 67, 67, 67, 67, 15, 67, 67, 67, + 0, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 0, 67, 0, 67, 5, + 25, 20, 15, 30, 67, 15, 67, 67, 10, 0, + 10, 40, 20, 67, 10, 5, 0, 10, 15, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67 }; return len + asso_values[(unsigned char)str[2]] + asso_values[(unsigned char)str[0]] + asso_values[(unsigned char)str[len - 1]]; } @@ -84,116 +84,119 @@ is_reserved_word (register const char *str, register unsigned int len) { enum { - TOTAL_KEYWORDS = 43, + TOTAL_KEYWORDS = 45, MIN_WORD_LENGTH = 3, MAX_WORD_LENGTH = 24, MIN_HASH_VALUE = 3, - MAX_HASH_VALUE = 64 + MAX_HASH_VALUE = 66 }; 
static const struct resword wordlist[] = { {""}, {""}, {""}, -#line 26 "scripts/genksyms/keywords.gperf" +#line 28 "scripts/genksyms/keywords.gperf" {"asm", ASM_KEYW}, {""}, -#line 8 "scripts/genksyms/keywords.gperf" +#line 10 "scripts/genksyms/keywords.gperf" {"__asm", ASM_KEYW}, {""}, -#line 9 "scripts/genksyms/keywords.gperf" +#line 11 "scripts/genksyms/keywords.gperf" {"__asm__", ASM_KEYW}, {""}, {""}, -#line 52 "scripts/genksyms/keywords.gperf" +#line 54 "scripts/genksyms/keywords.gperf" {"__typeof__", TYPEOF_KEYW}, {""}, -#line 12 "scripts/genksyms/keywords.gperf" +#line 14 "scripts/genksyms/keywords.gperf" {"__const", CONST_KEYW}, -#line 11 "scripts/genksyms/keywords.gperf" - {"__attribute__", ATTRIBUTE_KEYW}, #line 13 "scripts/genksyms/keywords.gperf" + {"__attribute__", ATTRIBUTE_KEYW}, +#line 15 "scripts/genksyms/keywords.gperf" {"__const__", CONST_KEYW}, -#line 18 "scripts/genksyms/keywords.gperf" +#line 20 "scripts/genksyms/keywords.gperf" {"__signed__", SIGNED_KEYW}, -#line 44 "scripts/genksyms/keywords.gperf" +#line 46 "scripts/genksyms/keywords.gperf" {"static", STATIC_KEYW}, -#line 20 "scripts/genksyms/keywords.gperf" - {"__volatile__", VOLATILE_KEYW}, -#line 39 "scripts/genksyms/keywords.gperf" + {""}, +#line 41 "scripts/genksyms/keywords.gperf" {"int", INT_KEYW}, -#line 32 "scripts/genksyms/keywords.gperf" +#line 34 "scripts/genksyms/keywords.gperf" {"char", CHAR_KEYW}, -#line 33 "scripts/genksyms/keywords.gperf" +#line 35 "scripts/genksyms/keywords.gperf" {"const", CONST_KEYW}, -#line 45 "scripts/genksyms/keywords.gperf" +#line 47 "scripts/genksyms/keywords.gperf" {"struct", STRUCT_KEYW}, -#line 24 "scripts/genksyms/keywords.gperf" +#line 26 "scripts/genksyms/keywords.gperf" {"__restrict__", RESTRICT_KEYW}, -#line 25 "scripts/genksyms/keywords.gperf" +#line 27 "scripts/genksyms/keywords.gperf" {"restrict", RESTRICT_KEYW}, -#line 23 "scripts/genksyms/keywords.gperf" - {"_restrict", RESTRICT_KEYW}, -#line 16 "scripts/genksyms/keywords.gperf" +#line 7 "scripts/genksyms/keywords.gperf" + {"EXPORT_SYMBOL_GPL_FUTURE", EXPORT_SYMBOL_KEYW}, +#line 18 "scripts/genksyms/keywords.gperf" {"__inline__", INLINE_KEYW}, -#line 10 "scripts/genksyms/keywords.gperf" - {"__attribute", ATTRIBUTE_KEYW}, {""}, -#line 14 "scripts/genksyms/keywords.gperf" +#line 22 "scripts/genksyms/keywords.gperf" + {"__volatile__", VOLATILE_KEYW}, +#line 5 "scripts/genksyms/keywords.gperf" + {"EXPORT_SYMBOL", EXPORT_SYMBOL_KEYW}, +#line 25 "scripts/genksyms/keywords.gperf" + {"_restrict", RESTRICT_KEYW}, + {""}, +#line 12 "scripts/genksyms/keywords.gperf" + {"__attribute", ATTRIBUTE_KEYW}, +#line 6 "scripts/genksyms/keywords.gperf" + {"EXPORT_SYMBOL_GPL", EXPORT_SYMBOL_KEYW}, +#line 16 "scripts/genksyms/keywords.gperf" {"__extension__", EXTENSION_KEYW}, -#line 35 "scripts/genksyms/keywords.gperf" +#line 37 "scripts/genksyms/keywords.gperf" {"enum", ENUM_KEYW}, -#line 19 "scripts/genksyms/keywords.gperf" - {"__volatile", VOLATILE_KEYW}, -#line 36 "scripts/genksyms/keywords.gperf" +#line 8 "scripts/genksyms/keywords.gperf" + {"EXPORT_UNUSED_SYMBOL", EXPORT_SYMBOL_KEYW}, +#line 38 "scripts/genksyms/keywords.gperf" {"extern", EXTERN_KEYW}, {""}, -#line 17 "scripts/genksyms/keywords.gperf" +#line 19 "scripts/genksyms/keywords.gperf" {"__signed", SIGNED_KEYW}, -#line 7 "scripts/genksyms/keywords.gperf" - {"EXPORT_SYMBOL_GPL_FUTURE", EXPORT_SYMBOL_KEYW}, - {""}, -#line 51 "scripts/genksyms/keywords.gperf" +#line 9 "scripts/genksyms/keywords.gperf" + {"EXPORT_UNUSED_SYMBOL_GPL", EXPORT_SYMBOL_KEYW}, +#line 49 
"scripts/genksyms/keywords.gperf" + {"union", UNION_KEYW}, +#line 53 "scripts/genksyms/keywords.gperf" {"typeof", TYPEOF_KEYW}, -#line 46 "scripts/genksyms/keywords.gperf" +#line 48 "scripts/genksyms/keywords.gperf" {"typedef", TYPEDEF_KEYW}, -#line 15 "scripts/genksyms/keywords.gperf" +#line 17 "scripts/genksyms/keywords.gperf" {"__inline", INLINE_KEYW}, -#line 31 "scripts/genksyms/keywords.gperf" +#line 33 "scripts/genksyms/keywords.gperf" {"auto", AUTO_KEYW}, -#line 47 "scripts/genksyms/keywords.gperf" - {"union", UNION_KEYW}, - {""}, {""}, -#line 48 "scripts/genksyms/keywords.gperf" - {"unsigned", UNSIGNED_KEYW}, -#line 49 "scripts/genksyms/keywords.gperf" - {"void", VOID_KEYW}, -#line 42 "scripts/genksyms/keywords.gperf" - {"short", SHORT_KEYW}, +#line 21 "scripts/genksyms/keywords.gperf" + {"__volatile", VOLATILE_KEYW}, {""}, {""}, #line 50 "scripts/genksyms/keywords.gperf" - {"volatile", VOLATILE_KEYW}, - {""}, -#line 37 "scripts/genksyms/keywords.gperf" - {"float", FLOAT_KEYW}, -#line 34 "scripts/genksyms/keywords.gperf" - {"double", DOUBLE_KEYW}, + {"unsigned", UNSIGNED_KEYW}, {""}, -#line 5 "scripts/genksyms/keywords.gperf" - {"EXPORT_SYMBOL", EXPORT_SYMBOL_KEYW}, - {""}, {""}, -#line 38 "scripts/genksyms/keywords.gperf" +#line 44 "scripts/genksyms/keywords.gperf" + {"short", SHORT_KEYW}, +#line 40 "scripts/genksyms/keywords.gperf" {"inline", INLINE_KEYW}, -#line 6 "scripts/genksyms/keywords.gperf" - {"EXPORT_SYMBOL_GPL", EXPORT_SYMBOL_KEYW}, -#line 41 "scripts/genksyms/keywords.gperf" - {"register", REGISTER_KEYW}, {""}, -#line 22 "scripts/genksyms/keywords.gperf" +#line 52 "scripts/genksyms/keywords.gperf" + {"volatile", VOLATILE_KEYW}, +#line 42 "scripts/genksyms/keywords.gperf" + {"long", LONG_KEYW}, +#line 24 "scripts/genksyms/keywords.gperf" {"_Bool", BOOL_KEYW}, -#line 43 "scripts/genksyms/keywords.gperf" - {"signed", SIGNED_KEYW}, {""}, {""}, -#line 40 "scripts/genksyms/keywords.gperf" - {"long", LONG_KEYW} +#line 43 "scripts/genksyms/keywords.gperf" + {"register", REGISTER_KEYW}, +#line 51 "scripts/genksyms/keywords.gperf" + {"void", VOID_KEYW}, +#line 39 "scripts/genksyms/keywords.gperf" + {"float", FLOAT_KEYW}, +#line 36 "scripts/genksyms/keywords.gperf" + {"double", DOUBLE_KEYW}, + {""}, {""}, {""}, {""}, +#line 45 "scripts/genksyms/keywords.gperf" + {"signed", SIGNED_KEYW} }; if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH) diff --git a/scripts/genksyms/keywords.gperf b/scripts/genksyms/keywords.gperf index 5ef3733225fb..8abe7ab8d88f 100644 --- a/scripts/genksyms/keywords.gperf +++ b/scripts/genksyms/keywords.gperf @@ -5,6 +5,8 @@ struct resword { const char *name; int token; } EXPORT_SYMBOL, EXPORT_SYMBOL_KEYW EXPORT_SYMBOL_GPL, EXPORT_SYMBOL_KEYW EXPORT_SYMBOL_GPL_FUTURE, EXPORT_SYMBOL_KEYW +EXPORT_UNUSED_SYMBOL, EXPORT_SYMBOL_KEYW +EXPORT_UNUSED_SYMBOL_GPL, EXPORT_SYMBOL_KEYW __asm, ASM_KEYW __asm__, ASM_KEYW __attribute, ATTRIBUTE_KEYW diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c index ad2434b26970..92758120a767 100644 --- a/scripts/kallsyms.c +++ b/scripts/kallsyms.c @@ -130,18 +130,9 @@ static int read_symbol(FILE *in, struct sym_entry *s) static int symbol_valid(struct sym_entry *s) { /* Symbols which vary between passes. Passes 1 and 2 must have - * identical symbol lists. The kallsyms_* symbols below are only added - * after pass 1, they would be included in pass 2 when --all-symbols is - * specified so exclude them to get a stable symbol list. + * identical symbol lists. 
*/ static char *special_symbols[] = { - "kallsyms_addresses", - "kallsyms_num_syms", - "kallsyms_names", - "kallsyms_markers", - "kallsyms_token_table", - "kallsyms_token_index", - /* Exclude linker generated symbols which vary between passes */ "_SDA_BASE_", /* ppc */ "_SDA2_BASE_", /* ppc */ @@ -173,7 +164,9 @@ static int symbol_valid(struct sym_entry *s) } /* Exclude symbols which vary between passes. */ - if (strstr((char *)s->sym + offset, "_compiled.")) + if (strstr((char *)s->sym + offset, "_compiled.") || + strncmp((char*)s->sym + offset, "__compound_literal.", 19) == 0 || + strncmp((char*)s->sym + offset, "__compound_literal$", 19) == 0) return 0; for (i = 0; special_symbols[i]; i++) @@ -550,8 +543,10 @@ int main(int argc, char **argv) usage(); read_map(stdin); - sort_symbols(); - optimize_token_table(); + if (table_cnt) { + sort_symbols(); + optimize_token_table(); + } write_src(); return 0; -- cgit v1.2.3 From bf53de907dfdaac178c92d774aae7370d7b97d20 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 19 Dec 2008 15:10:24 +0100 Subject: x86, bts: add fork and exit handling Impact: introduce new ptrace facility Add arch_ptrace_untrace() function that is called when the tracer detaches (either voluntarily or when the tracing task dies); ptrace_disable() is only called on a voluntary detach. Add ptrace_fork() and arch_ptrace_fork(). They are called when a traced task is forked. Clear DS and BTS related fields on fork. Release DS resources and reclaim memory in ptrace_untrace(). This releases resources already when the tracing task dies. We used to do that when the traced task dies. Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ds.h | 9 ++++++++ arch/x86/include/asm/ptrace.h | 7 ++++++ arch/x86/kernel/ds.c | 11 ++++++++++ arch/x86/kernel/process_32.c | 20 ++++++++--------- arch/x86/kernel/process_64.c | 20 ++++++++--------- arch/x86/kernel/ptrace.c | 50 ++++++++++++++++++++++++++++++++++--------- include/linux/ptrace.h | 22 +++++++++++++++++++ kernel/fork.c | 2 ++ kernel/ptrace.c | 12 +++++++++++ 9 files changed, 121 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h index ee0ea3a96c11..a8f672ba100c 100644 --- a/arch/x86/include/asm/ds.h +++ b/arch/x86/include/asm/ds.h @@ -252,12 +252,21 @@ extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *); */ extern void ds_switch_to(struct task_struct *prev, struct task_struct *next); +/* + * Task clone/init and cleanup work + */ +extern void ds_copy_thread(struct task_struct *tsk, struct task_struct *father); +extern void ds_exit_thread(struct task_struct *tsk); + #else /* CONFIG_X86_DS */ struct cpuinfo_x86; static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {} static inline void ds_switch_to(struct task_struct *prev, struct task_struct *next) {} +static inline void ds_copy_thread(struct task_struct *tsk, + struct task_struct *father) {} +static inline void ds_exit_thread(struct task_struct *tsk) {} #endif /* CONFIG_X86_DS */ #endif /* _ASM_X86_DS_H */ diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index fbf744215911..6d34d954c228 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -235,6 +235,13 @@ extern int do_get_thread_area(struct task_struct *p, int idx, extern int do_set_thread_area(struct task_struct *p, int idx, struct user_desc __user *info, int can_allocate); +extern void x86_ptrace_untrace(struct task_struct *); +extern void 
x86_ptrace_fork(struct task_struct *child, + unsigned long clone_flags); + +#define arch_ptrace_untrace(tsk) x86_ptrace_untrace(tsk) +#define arch_ptrace_fork(child, flags) x86_ptrace_fork(child, flags) + #endif /* __KERNEL__ */ #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 98d271e60e08..da91701a2348 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -1017,3 +1017,14 @@ void ds_switch_to(struct task_struct *prev, struct task_struct *next) update_debugctlmsr(next->thread.debugctlmsr); } + +void ds_copy_thread(struct task_struct *tsk, struct task_struct *father) +{ + clear_tsk_thread_flag(tsk, TIF_DS_AREA_MSR); + tsk->thread.ds_ctx = NULL; +} + +void ds_exit_thread(struct task_struct *tsk) +{ + WARN_ON(tsk->thread.ds_ctx); +} diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 605eff9a8ac0..3ba155d24884 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -60,6 +60,7 @@ #include #include #include +#include asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); @@ -251,17 +252,8 @@ void exit_thread(void) tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; put_cpu(); } -#ifdef CONFIG_X86_DS - /* Free any BTS tracers that have not been properly released. */ - if (unlikely(current->bts)) { - ds_release_bts(current->bts); - current->bts = NULL; - - kfree(current->bts_buffer); - current->bts_buffer = NULL; - current->bts_size = 0; - } -#endif /* CONFIG_X86_DS */ + + ds_exit_thread(current); } void flush_thread(void) @@ -343,6 +335,12 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, kfree(p->thread.io_bitmap_ptr); p->thread.io_bitmap_max = 0; } + + ds_copy_thread(p, current); + + clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); + p->thread.debugctlmsr = 0; + return err; } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 1cfd2a4bf853..416fb9282f4f 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -53,6 +53,7 @@ #include #include #include +#include asmlinkage extern void ret_from_fork(void); @@ -236,17 +237,8 @@ void exit_thread(void) t->io_bitmap_max = 0; put_cpu(); } -#ifdef CONFIG_X86_DS - /* Free any BTS tracers that have not been properly released. 
*/ - if (unlikely(current->bts)) { - ds_release_bts(current->bts); - current->bts = NULL; - - kfree(current->bts_buffer); - current->bts_buffer = NULL; - current->bts_size = 0; - } -#endif /* CONFIG_X86_DS */ + + ds_exit_thread(current); } void flush_thread(void) @@ -376,6 +368,12 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, if (err) goto out; } + + ds_copy_thread(p, me); + + clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); + p->thread.debugctlmsr = 0; + err = 0; out: if (err && p->thread.io_bitmap_ptr) { diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 45e9855da2d2..6ad2bb607650 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -769,8 +769,47 @@ static int ptrace_bts_size(struct task_struct *child) return (trace->ds.top - trace->ds.begin) / trace->ds.size; } + +static void ptrace_bts_fork(struct task_struct *tsk) +{ + tsk->bts = NULL; + tsk->bts_buffer = NULL; + tsk->bts_size = 0; + tsk->thread.bts_ovfl_signal = 0; +} + +static void ptrace_bts_untrace(struct task_struct *child) +{ + if (unlikely(child->bts)) { + ds_release_bts(child->bts); + child->bts = NULL; + + kfree(child->bts_buffer); + child->bts_buffer = NULL; + child->bts_size = 0; + } +} + +static void ptrace_bts_detach(struct task_struct *child) +{ + ptrace_bts_untrace(child); +} +#else +static inline void ptrace_bts_fork(struct task_struct *tsk) {} +static inline void ptrace_bts_detach(struct task_struct *child) {} +static inline void ptrace_bts_untrace(struct task_struct *child) {} #endif /* CONFIG_X86_PTRACE_BTS */ +void x86_ptrace_fork(struct task_struct *child, unsigned long clone_flags) +{ + ptrace_bts_fork(child); +} + +void x86_ptrace_untrace(struct task_struct *child) +{ + ptrace_bts_untrace(child); +} + /* * Called by kernel/ptrace.c when detaching.. * @@ -782,16 +821,7 @@ void ptrace_disable(struct task_struct *child) #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); #endif -#ifdef CONFIG_X86_PTRACE_BTS - if (child->bts) { - ds_release_bts(child->bts); - child->bts = NULL; - - kfree(child->bts_buffer); - child->bts_buffer = NULL; - child->bts_size = 0; - } -#endif /* CONFIG_X86_PTRACE_BTS */ + ptrace_bts_detach(child); } #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 22641d5d45df..98b93ca4db06 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -94,6 +94,7 @@ extern void ptrace_notify(int exit_code); extern void __ptrace_link(struct task_struct *child, struct task_struct *new_parent); extern void __ptrace_unlink(struct task_struct *child); +extern void ptrace_fork(struct task_struct *task, unsigned long clone_flags); #define PTRACE_MODE_READ 1 #define PTRACE_MODE_ATTACH 2 /* Returns 0 on success, -errno on denial. */ @@ -313,6 +314,27 @@ static inline void user_enable_block_step(struct task_struct *task) #define arch_ptrace_stop(code, info) do { } while (0) #endif +#ifndef arch_ptrace_untrace +/* + * Do machine-specific work before untracing child. + * + * This is called for a normal detach as well as from ptrace_exit() + * when the tracing task dies. + * + * Called with write_lock(&tasklist_lock) held. + */ +#define arch_ptrace_untrace(task) do { } while (0) +#endif + +#ifndef arch_ptrace_fork +/* + * Do machine-specific work to initialize a new task. + * + * This is called from copy_process(). 
+ */ +#define arch_ptrace_fork(child, clone_flags) do { } while (0) +#endif + extern int task_current_syscall(struct task_struct *target, long *callno, unsigned long args[6], unsigned int maxargs, unsigned long *sp, unsigned long *pc); diff --git a/kernel/fork.c b/kernel/fork.c index 7b93da72d4a2..65ce60adc8e8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1096,6 +1096,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif + if (unlikely(ptrace_reparented(current))) + ptrace_fork(p, clone_flags); /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p, clone_flags); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 4c8bcd7dd8e0..100a71cfdaba 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -25,6 +25,17 @@ #include #include + +/* + * Initialize a new task whose father had been ptraced. + * + * Called from copy_process(). + */ +void ptrace_fork(struct task_struct *child, unsigned long clone_flags) +{ + arch_ptrace_fork(child, clone_flags); +} + /* * ptrace a task: make the debugger its new parent and * move it to the ptrace list. @@ -72,6 +83,7 @@ void __ptrace_unlink(struct task_struct *child) child->parent = child->real_parent; list_del_init(&child->ptrace_entry); + arch_ptrace_untrace(child); if (task_is_traced(child)) ptrace_untrace(child); } -- cgit v1.2.3 From 3d44cc3e01ee1b40317f79ed54324e25c4f848df Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 20 Dec 2008 21:27:34 +0100 Subject: Null pointer deref with hrtimer_try_to_cancel() Impact: Prevent kernel crash with posix timer clockid CLOCK_MONOTONIC_RAW commit 2d42244ae71d6c7b0884b5664cf2eda30fb2ae68 (clocksource: introduce CLOCK_MONOTONIC_RAW) introduced a new clockid, which only allows reading out the raw, non-NTP-adjusted system time. The above commit did not prevent a posix timer from being created with that clockid. The timer_create() syscall succeeds and initializes the timer to a nonexistent hrtimer base. When the timer is deleted, either by timer_delete() or by the exit() cleanup, the kernel crashes. Prevent the creation of timers for CLOCK_MONOTONIC_RAW by setting the posix clock's timer_create function to no_timer_create(), which returns an error code. Reported-and-tested-by: Eric Sesterhenn Signed-off-by: Thomas Gleixner Acked-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/posix-timers.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 5e79c662294b..a140e44eebba 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -197,6 +197,11 @@ static int common_timer_create(struct k_itimer *new_timer) return 0; } +static int no_timer_create(struct k_itimer *new_timer) +{ + return -EOPNOTSUPP; +} + /* * Return nonzero if we know a priori this clockid_t value is bogus.
*/ @@ -248,6 +253,7 @@ static __init int init_posix_timers(void) .clock_getres = hrtimer_get_res, .clock_get = posix_get_monotonic_raw, .clock_set = do_posix_clock_nosettime, + .timer_create = no_timer_create, }; register_posix_clock(CLOCK_REALTIME, &clock_realtime); -- cgit v1.2.3 From a8ccf1d6f60e3e6ae63122e02378cd4d40dd4aac Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 23 Dec 2008 11:32:24 -0500 Subject: ring-buffer: fix dangling commit race Impact: fix stuck trace-buffers If an interrupt comes in during rb_set_commit_to_write() and pushes the tail page forward just at the right time, the commit updates will miss the adding of the interrupt data. This will cause the commit pointer to cease moving forward. Thanks to Jiaying Zhang for finding this race. Reported-by: Jiaying Zhang Signed-off-by: Steven Rostedt Cc: Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bb6922a931b1..d03f4f44a823 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -838,6 +838,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) * back to us). This allows us to do a simple loop to * assign the commit to the tail. */ + again: while (cpu_buffer->commit_page != cpu_buffer->tail_page) { cpu_buffer->commit_page->page->commit = cpu_buffer->commit_page->write; @@ -853,6 +854,17 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->commit_page->write; barrier(); } + + /* again, keep gcc from optimizing */ + barrier(); + + /* + * If an interrupt came in just after the first while loop + * and pushed the tail page forward, we will be left with + * a dangling commit that will never go forward. + */ + if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page)) + goto again; } static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) -- cgit v1.2.3 From 98db8df777438e16ad0f44a0fba05ebbdb73db8d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 23 Dec 2008 11:32:25 -0500 Subject: ring-buffer: prevent false positive warning Impact: eliminate false WARN_ON message If an interrupt goes off after the local variable tail_page is set and before the write index of that page is incremented, the interrupt could push the commit forward to the next page. Later a check is made to see if interrupts pushed the buffer around the entire ring buffer by comparing the next page to the last committed page. This can produce a false positive if the interrupt had pushed the commit page forward as stated above. Thanks to Jiaying Zhang for finding this race.
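The fix that follows therefore snapshots cpu_buffer->commit_page into a local variable before tail_page is read, with a compiler barrier between the two loads, and the wrap-around check then compares against that snapshot rather than re-reading the shared pointer. A distilled sketch of the idiom, using hypothetical minimal types in place of the real ring-buffer structures:

struct toy_page { int id; };
struct toy_cpu_buffer {
        struct toy_page *commit_page;   /* may be advanced by an interrupt */
        struct toy_page *tail_page;
};

#define toy_barrier() __asm__ __volatile__("" ::: "memory")

static int toy_wrapped_around(struct toy_cpu_buffer *b,
                              struct toy_page *next_page)
{
        struct toy_page *commit_page;

        commit_page = b->commit_page;   /* snapshot first */
        toy_barrier();                  /* keep the two loads ordered */
        /* ... b->tail_page is read and space reserved here ... */
        /* compare against the snapshot, not a fresh racy load */
        return next_page == commit_page;
}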
Reported-by: Jiaying Zhang Signed-off-by: Steven Rostedt Cc: Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index d03f4f44a823..76f34c0ef29c 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -962,12 +962,15 @@ static struct ring_buffer_event * __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, unsigned type, unsigned long length, u64 *ts) { - struct buffer_page *tail_page, *head_page, *reader_page; + struct buffer_page *tail_page, *head_page, *reader_page, *commit_page; unsigned long tail, write; struct ring_buffer *buffer = cpu_buffer->buffer; struct ring_buffer_event *event; unsigned long flags; + commit_page = cpu_buffer->commit_page; + /* we just need to protect against interrupts */ + barrier(); tail_page = cpu_buffer->tail_page; write = local_add_return(length, &tail_page->write); tail = write - length; @@ -993,7 +996,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, * it all the way around the buffer, bail, and warn * about it. */ - if (unlikely(next_page == cpu_buffer->commit_page)) { + if (unlikely(next_page == commit_page)) { WARN_ON_ONCE(1); goto out_unlock; } -- cgit v1.2.3 From e368d3a836797ddf193b1ec18c97407a791d2451 Mon Sep 17 00:00:00 2001 From: Sharyathi Nagesh Date: Tue, 23 Dec 2008 13:57:12 -0800 Subject: cgroups: suppress bogus warning messages Remove spurious warning messages that are thrown onto the console during cgroup operations. Signed-off-by: Alexey Dobriyan Signed-off-by: Sharyathi Nagesh Acked-by: Serge E. Hallyn Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8185a0f09594..a3415507bd0a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2934,9 +2934,6 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, again: root = subsys->root; if (root == &rootnode) { - printk(KERN_INFO - "Not cloning cgroup for unused subsystem %s\n", - subsys->name); mutex_unlock(&cgroup_mutex); return 0; } -- cgit v1.2.3 From 20ca9b3f4c6dfa0af8dd5b18a64df17eb994b54d Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 23 Dec 2008 13:57:14 -0800 Subject: cgroups: avoid accessing uninitialized data in failure path If cgroup_get_rootdir() failed, free_cg_links() will be called in the failure path, but tmp_cg_links hasn't been initialized at that time. I introduced this bug in the 2.6.27 merge window. 
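The canonical shape the fix below restores is a cleanup ladder whose labels release resources in reverse order of acquisition, so a failure path only ever touches state that was actually set up. A schematic sketch with hypothetical helpers (not the cgroup code itself):

extern int acquire_a(void), acquire_b(void), acquire_c(void);
extern void release_a(void), release_b(void);

static int toy_setup(void)
{
        int err;

        err = acquire_a();
        if (err)
                return err;             /* nothing to undo yet */
        err = acquire_b();
        if (err)
                goto undo_a;            /* b was never set up: skip it */
        err = acquire_c();
        if (err)
                goto undo_b;
        return 0;

undo_b:
        release_b();
undo_a:
        release_a();
        return err;
}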
Signed-off-by: Li Zefan Acked-by: Serge Hallyn Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a3415507bd0a..2606d0fb4e54 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1024,7 +1024,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, if (ret == -EBUSY) { mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); - goto drop_new_super; + goto free_cg_links; } /* EBUSY should be the only error here */ @@ -1073,10 +1073,11 @@ static int cgroup_get_sb(struct file_system_type *fs_type, return simple_set_mnt(mnt, sb); + free_cg_links: + free_cg_links(&tmp_cg_links); drop_new_super: up_write(&sb->s_umount); deactivate_super(sb); - free_cg_links(&tmp_cg_links); return ret; } -- cgit v1.2.3 From 468a15bb4cc61694495cc5ed7ffca29e87c79b69 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 16 Dec 2008 08:07:03 +0100 Subject: sched, trace: update trace_sched_wakeup() Impact: extend the wakeup tracepoint with information on whether the wakeup was real Add the information needed to distinguish 'real' wakeups from 'false' wakeups. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/trace/sched.h | 4 ++-- kernel/sched.c | 2 +- kernel/trace/trace_sched_switch.c | 2 +- kernel/trace/trace_sched_wakeup.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/trace/sched.h b/include/trace/sched.h index bc4c9eadc6ba..0d81098ee9fc 100644 --- a/include/trace/sched.h +++ b/include/trace/sched.h @@ -17,8 +17,8 @@ DECLARE_TRACE(sched_wait_task, TPARGS(rq, p)); DECLARE_TRACE(sched_wakeup, - TPPROTO(struct rq *rq, struct task_struct *p), - TPARGS(rq, p)); + TPPROTO(struct rq *rq, struct task_struct *p, int success), + TPARGS(rq, p, success)); DECLARE_TRACE(sched_wakeup_new, TPPROTO(struct rq *rq, struct task_struct *p, int success), diff --git a/kernel/sched.c b/kernel/sched.c index ceda5799466e..dcb39bc88f6c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2324,7 +2324,7 @@ out_activate: success = 1; out_running: - trace_sched_wakeup(rq, p); + trace_sched_wakeup(rq, p, success); check_preempt_curr(rq, p, sync); p->state = TASK_RUNNING; diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index add2c1fdae92..df175cb4564f 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -49,7 +49,7 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev, } static void -probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee) +probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success) { struct trace_array_cpu *data; unsigned long flags; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 0067b49746c1..43586b689e31 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -211,7 +211,7 @@ static void wakeup_reset(struct trace_array *tr) } static void -probe_wakeup(struct rq *rq, struct task_struct *p) +probe_wakeup(struct rq *rq, struct task_struct *p, int success) { int cpu = smp_processor_id(); unsigned long flags; -- cgit v1.2.3
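Because the tracepoint's prototype changed, every probe registered against it has to gain the new argument in the same patch, which is what the three kernel/trace updates above do. A standalone toy model of that contract, using a plain function pointer in place of the DECLARE_TRACE machinery (illustrative names throughout, not the kernel API):

#include <stdio.h>

struct toy_task { const char *comm; };

typedef void (*wakeup_probe_t)(struct toy_task *p, int success);

static wakeup_probe_t wakeup_probe;     /* NULL while tracing is off */

/* call site: mirrors trace_sched_wakeup(rq, p, success) */
static void toy_trace_sched_wakeup(struct toy_task *p, int success)
{
        if (wakeup_probe)
                wakeup_probe(p, success);
}

/* probe: must match the extended signature exactly */
static void probe_wakeup(struct toy_task *p, int success)
{
        if (!success)
                return;                 /* 'false' wakeup: already runnable */
        printf("woke %s\n", p->comm);
}

int main(void)
{
        struct toy_task t = { "worker" };

        wakeup_probe = probe_wakeup;
        toy_trace_sched_wakeup(&t, 0);  /* filtered as a false wakeup */
        toy_trace_sched_wakeup(&t, 1);  /* recorded */
        return 0;
}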