From 45ac1403f564f411c6a383a2448688ba8dd705a4 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 21 Jul 2015 12:44:02 +0300 Subject: perf: Add PERF_RECORD_SWITCH to indicate context switches There are already two events for context switches, namely the tracepoint sched:sched_switch and the software event context_switches. Unfortunately neither is suitable for use by non-privileged users for the purpose of synchronizing hardware trace data (e.g. Intel PT) to the context switch. Tracepoints are no good at all for non-privileged users because they need either CAP_SYS_ADMIN or /proc/sys/kernel/perf_event_paranoid <= -1. On the other hand, kernel software events need either CAP_SYS_ADMIN or /proc/sys/kernel/perf_event_paranoid <= 1. Many distributions now default perf_event_paranoid to 1, making context_switches a contender, except it has another problem (one also shared with sched:sched_switch): it happens before perf schedules events out instead of after perf schedules events in. Whereas a privileged user can see all the events anyway, a non-privileged user only sees events for their own processes; in other words, they see when their process was scheduled out, not when it was scheduled in. That presents two problems for using the event: 1. the information comes too late, so tools have to look ahead in the event stream to find out what the current state is; 2. if they are unlucky, tracing might have stopped before the context-switches event is recorded. This new PERF_RECORD_SWITCH event does not have those problems and it also has a couple of other small advantages. It is easier to use because it is an auxiliary event (like the mmap, comm and task events) which can be enabled by setting a single bit. It is smaller than sched:sched_switch and easier to parse. To make the event useful for privileged users as well, if the context is cpu-wide then the event record will be PERF_RECORD_SWITCH_CPU_WIDE, which is the same as PERF_RECORD_SWITCH except that it also provides the next or previous pid/tid.
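For readers wiring this up from userspace, a minimal, hedged sketch of a consumer follows (not part of the patch). It assumes the common trick of hanging the new bit off a software dummy event; the uapi half of this patch (not shown in this kernel/-limited view) adds attr.context_switch and the two record types, and that single bit is all the kernel side here requires.

	#include <linux/perf_event.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int open_switch_events(pid_t pid, int cpu)
	{
		struct perf_event_attr attr = {
			.size           = sizeof(attr),
			.type           = PERF_TYPE_SOFTWARE,
			.config         = PERF_COUNT_SW_DUMMY, /* carrier event; an assumption */
			.context_switch = 1,   /* the single bit described above */
			.sample_id_all  = 1,   /* id/time on the SWITCH records */
		};

		/* pid == -1 with cpu >= 0 is a cpu-wide context, so the kernel
		 * emits PERF_RECORD_SWITCH_CPU_WIDE (header + next/prev pid/tid);
		 * a per-task context gets the header-only PERF_RECORD_SWITCH. */
		return syscall(__NR_perf_event_open, &attr, pid, cpu, -1, 0);
	}

	/* Direction is carried in the header:
	 *   sched-out: header.misc & PERF_RECORD_MISC_SWITCH_OUT
	 *   sched-in:  that bit is clear                          */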
Signed-off-by: Adrian Hunter Acked-by: Peter Zijlstra (Intel) Tested-by: Jiri Olsa Cc: Andi Kleen Cc: Mathieu Poirier Cc: Pawel Moll Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1437471846-26995-2-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/core.c | 103 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index d3dae3419b99..ce21143c0d9e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -163,6 +163,7 @@ static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; static atomic_t nr_task_events __read_mostly; static atomic_t nr_freq_events __read_mostly; +static atomic_t nr_switch_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); @@ -2619,6 +2620,9 @@ static void perf_pmu_sched_task(struct task_struct *prev, local_irq_restore(flags); } +static void perf_event_switch(struct task_struct *task, + struct task_struct *next_prev, bool sched_in); + #define for_each_task_context_nr(ctxn) \ for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) @@ -2641,6 +2645,9 @@ void __perf_event_task_sched_out(struct task_struct *task, if (__this_cpu_read(perf_sched_cb_usages)) perf_pmu_sched_task(task, next, false); + if (atomic_read(&nr_switch_events)) + perf_event_switch(task, next, false); + for_each_task_context_nr(ctxn) perf_event_context_sched_out(task, ctxn, next); @@ -2831,6 +2838,9 @@ void __perf_event_task_sched_in(struct task_struct *prev, if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) perf_cgroup_sched_in(prev, task); + if (atomic_read(&nr_switch_events)) + perf_event_switch(task, prev, true); + if (__this_cpu_read(perf_sched_cb_usages)) perf_pmu_sched_task(prev, task, true); } @@ -3454,6 +3464,10 @@ static void unaccount_event(struct perf_event *event) atomic_dec(&nr_task_events); if (event->attr.freq) atomic_dec(&nr_freq_events); + if (event->attr.context_switch) { + static_key_slow_dec_deferred(&perf_sched_events); + atomic_dec(&nr_switch_events); + } if (is_cgroup_event(event)) static_key_slow_dec_deferred(&perf_sched_events); if (has_branch_stack(event)) @@ -5981,6 +5995,91 @@ void perf_log_lost_samples(struct perf_event *event, u64 lost) perf_output_end(&handle); } +/* + * context_switch tracking + */ + +struct perf_switch_event { + struct task_struct *task; + struct task_struct *next_prev; + + struct { + struct perf_event_header header; + u32 next_prev_pid; + u32 next_prev_tid; + } event_id; +}; + +static int perf_event_switch_match(struct perf_event *event) +{ + return event->attr.context_switch; +} + +static void perf_event_switch_output(struct perf_event *event, void *data) +{ + struct perf_switch_event *se = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + int ret; + + if (!perf_event_switch_match(event)) + return; + + /* Only CPU-wide events are allowed to see next/prev pid/tid */ + if (event->ctx->task) { + se->event_id.header.type = PERF_RECORD_SWITCH; + se->event_id.header.size = sizeof(se->event_id.header); + } else { + se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE; + se->event_id.header.size = sizeof(se->event_id); + se->event_id.next_prev_pid = + perf_event_pid(event, se->next_prev); + se->event_id.next_prev_tid = + perf_event_tid(event, se->next_prev); + } + + perf_event_header__init_id(&se->event_id.header, &sample, event); + + ret = perf_output_begin(&handle, event, 
se->event_id.header.size); + if (ret) + return; + + if (event->ctx->task) + perf_output_put(&handle, se->event_id.header); + else + perf_output_put(&handle, se->event_id); + + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + +static void perf_event_switch(struct task_struct *task, + struct task_struct *next_prev, bool sched_in) +{ + struct perf_switch_event switch_event; + + /* N.B. caller checks nr_switch_events != 0 */ + + switch_event = (struct perf_switch_event){ + .task = task, + .next_prev = next_prev, + .event_id = { + .header = { + /* .type */ + .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT, + /* .size */ + }, + /* .next_prev_pid */ + /* .next_prev_tid */ + }, + }; + + perf_event_aux(perf_event_switch_output, + &switch_event, + NULL); +} + /* * IRQ throttle logging */ @@ -7479,6 +7578,10 @@ static void account_event(struct perf_event *event) if (atomic_inc_return(&nr_freq_events) == 1) tick_nohz_full_kick_all(); } + if (event->attr.context_switch) { + atomic_inc(&nr_switch_events); + static_key_slow_inc(&perf_sched_events.key); + } if (has_branch_stack(event)) static_key_slow_inc(&perf_sched_events.key); if (is_cgroup_event(event)) -- cgit v1.2.3 From f231722a2b27ee99cbcd0c6bcf4c866612b78137 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:03 +0200 Subject: uprobes: Introduce get_uprobe() Cosmetic. Add the new trivial helper, get_uprobe(). It matches put_uprobe() we already have and we can simplify a couple of its users. Tested-by: Pratyush Anand Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Anton Arapov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134003.GA4736@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index cb346f26a22d..a9847b4ec1e7 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -366,6 +366,18 @@ set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long v return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)&auprobe->insn); } +static struct uprobe *get_uprobe(struct uprobe *uprobe) +{ + atomic_inc(&uprobe->ref); + return uprobe; +} + +static void put_uprobe(struct uprobe *uprobe) +{ + if (atomic_dec_and_test(&uprobe->ref)) + kfree(uprobe); +} + static int match_uprobe(struct uprobe *l, struct uprobe *r) { if (l->inode < r->inode) @@ -393,10 +405,8 @@ static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset) while (n) { uprobe = rb_entry(n, struct uprobe, rb_node); match = match_uprobe(&u, uprobe); - if (!match) { - atomic_inc(&uprobe->ref); - return uprobe; - } + if (!match) + return get_uprobe(uprobe); if (match < 0) n = n->rb_left; @@ -432,10 +442,8 @@ static struct uprobe *__insert_uprobe(struct uprobe *uprobe) parent = *p; u = rb_entry(parent, struct uprobe, rb_node); match = match_uprobe(uprobe, u); - if (!match) { - atomic_inc(&u->ref); - return u; - } + if (!match) + return get_uprobe(u); if (match < 0) p = &parent->rb_left; @@ -472,12 +480,6 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe) return u; } -static void put_uprobe(struct uprobe *uprobe) -{ - if (atomic_dec_and_test(&uprobe->ref)) - kfree(uprobe); -} - static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) { struct uprobe *uprobe, *cur_uprobe; @@ -1039,14 
+1041,14 @@ static void build_probe_list(struct inode *inode, if (u->inode != inode || u->offset < min) break; list_add(&u->pending_list, head); - atomic_inc(&u->ref); + get_uprobe(u); } for (t = n; (t = rb_next(t)); ) { u = rb_entry(t, struct uprobe, rb_node); if (u->inode != inode || u->offset > max) break; list_add(&u->pending_list, head); - atomic_inc(&u->ref); + get_uprobe(u); } } spin_unlock(&uprobes_treelock); @@ -1437,7 +1439,7 @@ static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask) return -ENOMEM; *n = *o; - atomic_inc(&n->uprobe->ref); + get_uprobe(n->uprobe); n->next = NULL; *p = n; @@ -1565,8 +1567,7 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) orig_ret_vaddr = utask->return_instances->orig_ret_vaddr; } - atomic_inc(&uprobe->ref); - ri->uprobe = uprobe; + ri->uprobe = get_uprobe(uprobe); ri->func = instruction_pointer(regs); ri->orig_ret_vaddr = orig_ret_vaddr; ri->chained = chained; -- cgit v1.2.3 From 2bb5e840e873f8778a41801141771f54f547fa65 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:06 +0200 Subject: uprobes: Introduce free_ret_instance() We can simplify uprobe_free_utask() and handle_uretprobe_chain() if we add a simple helper which does put_uprobe/kfree and returns the ->next return_instance. Tested-by: Pratyush Anand Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Anton Arapov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134006.GA4740@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index a9847b4ec1e7..d8c702fc836f 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1378,6 +1378,14 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs) return instruction_pointer(regs); } +static struct return_instance *free_ret_instance(struct return_instance *ri) +{ + struct return_instance *next = ri->next; + put_uprobe(ri->uprobe); + kfree(ri); + return next; +} + /* * Called with no locks held. * Called in context of a exiting or a exec-ing thread. 
@@ -1385,7 +1393,7 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs) void uprobe_free_utask(struct task_struct *t) { struct uprobe_task *utask = t->utask; - struct return_instance *ri, *tmp; + struct return_instance *ri; if (!utask) return; @@ -1394,13 +1402,8 @@ void uprobe_free_utask(struct task_struct *t) put_uprobe(utask->active_uprobe); ri = utask->return_instances; - while (ri) { - tmp = ri; - ri = ri->next; - - put_uprobe(tmp->uprobe); - kfree(tmp); - } + while (ri) + ri = free_ret_instance(ri); xol_free_insn_slot(t); kfree(utask); @@ -1770,7 +1773,7 @@ handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs) static bool handle_trampoline(struct pt_regs *regs) { struct uprobe_task *utask; - struct return_instance *ri, *tmp; + struct return_instance *ri; bool chained; utask = current->utask; @@ -1792,11 +1795,7 @@ static bool handle_trampoline(struct pt_regs *regs) handle_uretprobe_chain(ri, regs); chained = ri->chained; - put_uprobe(ri->uprobe); - - tmp = ri; - ri = ri->next; - kfree(tmp); + ri = free_ret_instance(ri); utask->depth--; if (!chained) -- cgit v1.2.3 From 0b5256c7f173258b19d98364adb57f707dda22f3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:08 +0200 Subject: uprobes: Send SIGILL if handle_trampoline() fails 1. It doesn't make sense to continue if handle_trampoline() fails, change handle_swbp() to always return after this call. 2. Turn pr_warn() into uprobe_warn(), and change handle_trampoline() to send SIGILL on failure. It is pointless to return to user mode with the corrupted instruction_pointer() which we can't restore. Tested-by: Pratyush Anand Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Anton Arapov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134008.GA4745@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d8c702fc836f..eabdc21366ee 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1770,7 +1770,7 @@ handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs) up_read(&uprobe->register_rwsem); } -static bool handle_trampoline(struct pt_regs *regs) +static void handle_trampoline(struct pt_regs *regs) { struct uprobe_task *utask; struct return_instance *ri; @@ -1778,11 +1778,11 @@ static bool handle_trampoline(struct pt_regs *regs) utask = current->utask; if (!utask) - return false; + goto sigill; ri = utask->return_instances; if (!ri) - return false; + goto sigill; /* * TODO: we should throw out return_instance's invalidated by @@ -1804,8 +1804,12 @@ static bool handle_trampoline(struct pt_regs *regs) } utask->return_instances = ri; + return; + + sigill: + uprobe_warn(current, "handle uretprobe, sending SIGILL."); + force_sig_info(SIGILL, SEND_SIG_FORCED, current); - return true; } bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs) @@ -1824,13 +1828,8 @@ static void handle_swbp(struct pt_regs *regs) int uninitialized_var(is_swbp); bp_vaddr = uprobe_get_swbp_addr(regs); - if (bp_vaddr == get_trampoline_vaddr()) { - if (handle_trampoline(regs)) - return; - - pr_warn("uprobe: unable to handle uretprobe pid/tgid=%d/%d\n", - current->pid, current->tgid); - } + if (bp_vaddr == get_trampoline_vaddr()) + return handle_trampoline(regs); uprobe = find_active_uprobe(bp_vaddr, &is_swbp); if 
(!uprobe) { -- cgit v1.2.3 From 6c58d0e4cc26ea8882928e64c0de9afed4fc37cb Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:10 +0200 Subject: uprobes: Change prepare_uretprobe() to use uprobe_warn() Turn the last pr_warn() in uprobes.c into uprobe_warn(). While at it: - s/kzalloc/kmalloc, we initialize every member of 'ri' - remove the pointless comment above the obvious code Tested-by: Pratyush Anand Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Anton Arapov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134010.GA4752@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index eabdc21366ee..4c941feae3a2 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1541,9 +1541,9 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) return; } - ri = kzalloc(sizeof(struct return_instance), GFP_KERNEL); + ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL); if (!ri) - goto fail; + return; trampoline_vaddr = get_trampoline_vaddr(); orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs); @@ -1561,8 +1561,7 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) * This situation is not possible. Likely we have an * attack from user-space. */ - pr_warn("uprobe: unable to set uretprobe pid/tgid=%d/%d\n", - current->pid, current->tgid); + uprobe_warn(current, "handle tail call"); goto fail; } @@ -1576,13 +1575,10 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) ri->chained = chained; utask->depth++; - - /* add instance to the stack */ ri->next = utask->return_instances; utask->return_instances = ri; return; - fail: kfree(ri); } -- cgit v1.2.3 From a83cfeb92132c279b20bbc8ed3cef833b0fe417e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:13 +0200 Subject: uprobes: Change handle_trampoline() to find the next chain beforehand No functional changes, preparation. Add the new helper, find_next_ret_chain(), which finds the first !chained entry and returns its ->next. Yes, it is suboptimal. We probably want to turn ->chained into ->start_of_this_chain pointer and avoid another loop. But this needs the boring changes in dup_utask(), so lets do this later. Change the main loop in handle_trampoline() to unwind the stack until ri is equal to the pointer returned by this new helper. 
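A worked illustration of the chain layout may help (hypothetical frames, not from the patch): an instance is "chained" when its function returns through the same hijacked slot as the instance below it, so the first !chained entry terminates the current chain.

	/*
	 * Hypothetical utask->return_instances stack:
	 *
	 *   ri_A (chained=true)  --.
	 *   ri_B (chained=true)    +-- one chain: reported together on
	 *   ri_C (chained=false) --'   a single trampoline hit
	 *   ri_D (chained=false)  -- start of the next chain
	 *   NULL
	 *
	 * find_next_ret_chain(ri_A) walks A, B, C (while ->chained) and
	 * returns ri_D, so handle_trampoline() unwinds exactly A..C.
	 */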
Tested-by: Pratyush Anand Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Anton Arapov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134013.GA4755@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4c941feae3a2..98e4d97b8c31 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1766,11 +1766,22 @@ handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs) up_read(&uprobe->register_rwsem); } +static struct return_instance *find_next_ret_chain(struct return_instance *ri) +{ + bool chained; + + do { + chained = ri->chained; + ri = ri->next; /* can't be NULL if chained */ + } while (chained); + + return ri; +} + static void handle_trampoline(struct pt_regs *regs) { struct uprobe_task *utask; - struct return_instance *ri; - bool chained; + struct return_instance *ri, *next; utask = current->utask; if (!utask) @@ -1780,24 +1791,18 @@ static void handle_trampoline(struct pt_regs *regs) if (!ri) goto sigill; + next = find_next_ret_chain(ri); /* * TODO: we should throw out return_instance's invalidated by * longjmp(), currently we assume that the probed function always * returns. */ instruction_pointer_set(regs, ri->orig_ret_vaddr); - - for (;;) { + do { handle_uretprobe_chain(ri, regs); - - chained = ri->chained; ri = free_ret_instance(ri); utask->depth--; - - if (!chained) - break; - BUG_ON(!ri); - } + } while (ri != next); utask->return_instances = ri; return; -- cgit v1.2.3 From 97da89767d398c1dfa1f34e5f312eb8ebb382f7f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:16 +0200 Subject: uprobes: Export 'struct return_instance', introduce arch_uretprobe_is_alive() Add the new "weak" helper, arch_uretprobe_is_alive(), used by the next patches. It should return true if this return_instance is still valid. The arch agnostic version just always returns true. The patch exports "struct return_instance" for the architectures which want to override this hook. We can also cleanup prepare_uretprobe() if we pass the new return_instance to arch_uretprobe_hijack_return_addr(). 
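The last paragraph only hints at that cleanup; a hedged sketch of the signature change it suggests (hypothetical, not implemented by this patch or the rest of this series):

	/* Hypothetical follow-up: pass the new struct so the arch helper
	 * can fill ri->stack and friends itself, instead of
	 * prepare_uretprobe() juggling the raw orig_ret_vaddr value. */
	extern unsigned long
	arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr,
					  struct return_instance *ri,
					  struct pt_regs *regs);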
Tested-by: Pratyush Anand Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Anton Arapov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134016.GA4762@redhat.com Signed-off-by: Ingo Molnar --- include/linux/uprobes.h | 10 ++++++++++ kernel/events/uprobes.c | 14 +++++--------- 2 files changed, 15 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 60beb5dc7977..50d2764d66a8 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -92,6 +92,15 @@ struct uprobe_task { unsigned int depth; }; +struct return_instance { + struct uprobe *uprobe; + unsigned long func; + unsigned long orig_ret_vaddr; /* original return address */ + bool chained; /* true, if instance is nested */ + + struct return_instance *next; /* keep as stack */ +}; + struct xol_area; struct uprobes_state { @@ -128,6 +137,7 @@ extern bool arch_uprobe_xol_was_trapped(struct task_struct *tsk); extern int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data); extern void arch_uprobe_abort_xol(struct arch_uprobe *aup, struct pt_regs *regs); extern unsigned long arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs); +extern bool arch_uretprobe_is_alive(struct return_instance *ret, struct pt_regs *regs); extern bool arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs); extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, void *src, unsigned long len); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 98e4d97b8c31..1c71b6242a7e 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -86,15 +86,6 @@ struct uprobe { struct arch_uprobe arch; }; -struct return_instance { - struct uprobe *uprobe; - unsigned long func; - unsigned long orig_ret_vaddr; /* original return address */ - bool chained; /* true, if instance is nested */ - - struct return_instance *next; /* keep as stack */ -}; - /* * Execute out of line area: anonymous executable mapping installed * by the probed task to execute the copy of the original instruction @@ -1818,6 +1809,11 @@ bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs) return false; } +bool __weak arch_uretprobe_is_alive(struct return_instance *ret, struct pt_regs *regs) +{ + return true; +} + /* * Run handler and ask thread to singlestep. * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. -- cgit v1.2.3 From 7b868e4802a86d867aad1be0471b5767d9c20e10 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:18 +0200 Subject: uprobes/x86: Reimplement arch_uretprobe_is_alive() Add the x86 specific version of arch_uretprobe_is_alive() helper. It returns true if the stack frame mangled by prepare_uretprobe() is still on stack. So if it returns false, we know that the probed function has already returned. We add the new return_instance->stack member and change the generic code to initialize it in prepare_uretprobe, but it should be equally useful for other architectures. TODO: this assumes that the probed application can't use multiple stacks (say sigaltstack). We will try to improve this logic later. 
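A short worked example of the check implemented below (made-up addresses; the x86 stack grows down):

	/*
	 * prepare_uretprobe() runs at function entry and records
	 *     ri->stack = user_stack_pointer(regs) = 0x7fff1000
	 *
	 * While the frame is live, sp only moves down:
	 *     regs->sp = 0x7fff0f80 -> 0x7fff0f80 <= 0x7fff1000: alive
	 *
	 * Once the function and its frame are gone, sp is above it:
	 *     regs->sp = 0x7fff1010 -> 0x7fff1010 <= 0x7fff1000 fails: dead
	 */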
Tested-by: Pratyush Anand Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Anton Arapov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134018.GA4766@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/uprobes.c | 5 +++++ include/linux/uprobes.h | 1 + kernel/events/uprobes.c | 1 + 3 files changed, 7 insertions(+) (limited to 'kernel') diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 66476244731e..58e9b842633f 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -985,3 +985,8 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs return -1; } + +bool arch_uretprobe_is_alive(struct return_instance *ret, struct pt_regs *regs) +{ + return regs->sp <= ret->stack; +} diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 50d2764d66a8..7ab6d2c8be49 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -95,6 +95,7 @@ struct uprobe_task { struct return_instance { struct uprobe *uprobe; unsigned long func; + unsigned long stack; /* stack pointer */ unsigned long orig_ret_vaddr; /* original return address */ bool chained; /* true, if instance is nested */ diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 1c71b6242a7e..c5f316e06dc0 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1562,6 +1562,7 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) ri->uprobe = get_uprobe(uprobe); ri->func = instruction_pointer(regs); + ri->stack = user_stack_pointer(regs); ri->orig_ret_vaddr = orig_ret_vaddr; ri->chained = chained; -- cgit v1.2.3 From 5eeb50de42fd3251845d03c556db012267c72b3f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:21 +0200 Subject: uprobes: Change handle_trampoline() to flush the frames invalidated by longjmp() Test-case: #include <stdio.h> #include <setjmp.h> jmp_buf jmp; void func_2(void) { longjmp(jmp, 1); } void func_1(void) { if (setjmp(jmp)) return; func_2(); printf("ERR!! I am running on the caller's stack\n"); } int main(void) { func_1(); return 0; } fails if you probe func_1() and func_2() because handle_trampoline() assumes that the probed function must return and hit the bp installed by prepare_uretprobe(). But in this case func_2() does not return, so when func_1() returns the kernel uses the no longer valid return_instance of func_2(). Change handle_trampoline() to unwind ->return_instances until we know that the next chain is alive or NULL; this ensures that the current chain is the last we need to report and free. Alternatively, every return_instance could use a unique trampoline_vaddr; in this case we could use it as a key. And this could solve the problem with sigaltstack() automatically. But this approach needs more changes, and it puts a "hard" limit on MAX_URETPROBE_DEPTH. Plus it cannot solve another problem partially fixed by the next patch. Note: this change has no effect on !x86, the arch-agnostic version of arch_uretprobe_is_alive() just returns "true". TODO: as documented by the previous change, arch_uretprobe_is_alive() can be fooled by sigaltstack/etc.
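To make the new control flow concrete, here is a hedged walk-through of handle_trampoline() (as changed below) on the state left by the test-case above; neither frame is chained, so each forms a one-entry chain:

	/*
	 * return_instances: ri_f2 (dead, skipped by longjmp) -> ri_f1 -> NULL
	 * The trampoline is hit when func_1() really returns.
	 *
	 * pass 1: next = ri_f1; arch_uretprobe_is_alive(ri_f1) is false
	 *         (func_1 has already popped its frame), so valid = false
	 *         and ri_f2 is freed *without* running its handlers.
	 * pass 2: ri = ri_f1, next = NULL, so valid = true: ri_f1 is
	 *         reported via handle_uretprobe_chain() and freed, and the
	 *         regs end up at ri_f1->orig_ret_vaddr as before.
	 */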
Tested-by: Pratyush Anand Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Anton Arapov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134021.GA4773@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index c5f316e06dc0..93d939c80cd9 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1774,6 +1774,7 @@ static void handle_trampoline(struct pt_regs *regs) { struct uprobe_task *utask; struct return_instance *ri, *next; + bool valid; utask = current->utask; if (!utask) @@ -1783,18 +1784,24 @@ static void handle_trampoline(struct pt_regs *regs) if (!ri) goto sigill; - next = find_next_ret_chain(ri); - /* - * TODO: we should throw out return_instance's invalidated by - * longjmp(), currently we assume that the probed function always - * returns. - */ - instruction_pointer_set(regs, ri->orig_ret_vaddr); do { - handle_uretprobe_chain(ri, regs); - ri = free_ret_instance(ri); - utask->depth--; - } while (ri != next); + /* + * We should throw out the frames invalidated by longjmp(). + * If this chain is valid, then the next one should be alive + * or NULL; the latter case means that nobody but ri->func + * could hit this trampoline on return. TODO: sigaltstack(). + */ + next = find_next_ret_chain(ri); + valid = !next || arch_uretprobe_is_alive(next, regs); + + instruction_pointer_set(regs, ri->orig_ret_vaddr); + do { + if (valid) + handle_uretprobe_chain(ri, regs); + ri = free_ret_instance(ri); + utask->depth--; + } while (ri != next); + } while (!valid); utask->return_instances = ri; return; -- cgit v1.2.3 From a5b7e1a89b820f2b9b23634ca4c59b555e8d9a0d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:23 +0200 Subject: uprobes: Change prepare_uretprobe() to (try to) flush the dead frames Change prepare_uretprobe() to flush the !arch_uretprobe_is_alive() return_instance's. This is not needed correctness-wise, but can help to avoid the failure caused by MAX_URETPROBE_DEPTH. Note: in this case arch_uretprobe_is_alive() can be false positive, the stack can grow after longjmp(). Unfortunately, the kernel can't 100% solve this problem, but see the next patch. 
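The failure being avoided is easy to picture (a hedged illustration; the value of MAX_URETPROBE_DEPTH, 64 in the kernel of this era, is an assumption not shown in these hunks):

	/*
	 * Each longjmp() out of a probed function leaves a dead
	 * return_instance behind, and utask->depth never comes back down:
	 *
	 *   enter probed func -> prepare_uretprobe(): depth = 1, 2, ... 64
	 *   longjmp()         -> frame dies, instance stays
	 *
	 * Once the limit is hit, prepare_uretprobe() refuses to arm the
	 * probe; flushing the !alive entries on the next call resets this.
	 */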
Tested-by: Pratyush Anand Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Anton Arapov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134023.GA4776@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 93d939c80cd9..7e61c8ca27e0 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1511,6 +1511,16 @@ static unsigned long get_trampoline_vaddr(void) return trampoline_vaddr; } +static void cleanup_return_instances(struct uprobe_task *utask, struct pt_regs *regs) +{ + struct return_instance *ri = utask->return_instances; + while (ri && !arch_uretprobe_is_alive(ri, regs)) { + ri = free_ret_instance(ri); + utask->depth--; + } + utask->return_instances = ri; +} + static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) { struct return_instance *ri; @@ -1541,6 +1551,9 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) if (orig_ret_vaddr == -1) goto fail; + /* drop the entries invalidated by longjmp() */ + cleanup_return_instances(utask, regs); + /* * We don't want to keep trampoline address in stack, rather keep the * original return address of first caller thru all the consequent -- cgit v1.2.3 From 86dcb702e74b8ab7d3b2d36984ef00671cea73b9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:26 +0200 Subject: uprobes: Add the "enum rp_check ctx" arg to arch_uretprobe_is_alive() arch/x86 doesn't care (so far), but as Pratyush Anand pointed out other architectures might want to know why arch_uretprobe_is_alive() was called and use different checks depending on the context. Add the new argument to distinguish the two callers.
Tested-by: Pratyush Anand Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Anton Arapov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134026.GA4779@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/uprobes.c | 3 ++- include/linux/uprobes.h | 7 ++++++- kernel/events/uprobes.c | 9 ++++++--- 3 files changed, 14 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 58e9b842633f..acf8b9010bbf 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -986,7 +986,8 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs return -1; } -bool arch_uretprobe_is_alive(struct return_instance *ret, struct pt_regs *regs) +bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, + struct pt_regs *regs) { return regs->sp <= ret->stack; } diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 7ab6d2c8be49..c0a540239ab6 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -102,6 +102,11 @@ struct return_instance { struct return_instance *next; /* keep as stack */ }; +enum rp_check { + RP_CHECK_CALL, + RP_CHECK_RET, +}; + struct xol_area; struct uprobes_state { @@ -138,7 +143,7 @@ extern bool arch_uprobe_xol_was_trapped(struct task_struct *tsk); extern int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data); extern void arch_uprobe_abort_xol(struct arch_uprobe *aup, struct pt_regs *regs); extern unsigned long arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs); -extern bool arch_uretprobe_is_alive(struct return_instance *ret, struct pt_regs *regs); +extern bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, struct pt_regs *regs); extern bool arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs); extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, void *src, unsigned long len); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7e61c8ca27e0..df5661a44e35 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1514,7 +1514,9 @@ static unsigned long get_trampoline_vaddr(void) static void cleanup_return_instances(struct uprobe_task *utask, struct pt_regs *regs) { struct return_instance *ri = utask->return_instances; - while (ri && !arch_uretprobe_is_alive(ri, regs)) { + enum rp_check ctx = RP_CHECK_CALL; + + while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) { ri = free_ret_instance(ri); utask->depth--; } @@ -1805,7 +1807,7 @@ static void handle_trampoline(struct pt_regs *regs) * could hit this trampoline on return. TODO: sigaltstack(). 
*/ next = find_next_ret_chain(ri); - valid = !next || arch_uretprobe_is_alive(next, regs); + valid = !next || arch_uretprobe_is_alive(next, RP_CHECK_RET, regs); instruction_pointer_set(regs, ri->orig_ret_vaddr); do { @@ -1830,7 +1832,8 @@ bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs) return false; } -bool __weak arch_uretprobe_is_alive(struct return_instance *ret, struct pt_regs *regs) +bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, + struct pt_regs *regs) { return true; } -- cgit v1.2.3 From db087ef69a2b155ae001665bf0b3806abde7ee34 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:28 +0200 Subject: uprobes/x86: Make arch_uretprobe_is_alive(RP_CHECK_CALL) more clever The previous change documents that cleanup_return_instances() can't always detect the dead frames; the stack can grow. But there is one special case which is imho worth fixing: arch_uretprobe_is_alive() can return true when the stack didn't actually grow, but the next "call" insn uses the already invalidated frame. Test-case: #include <stdio.h> #include <setjmp.h> jmp_buf jmp; int nr = 1024; void func_2(void) { if (--nr == 0) return; longjmp(jmp, 1); } void func_1(void) { setjmp(jmp); func_2(); } int main(void) { func_1(); return 0; } If you ret-probe func_1() and func_2(), prepare_uretprobe() hits the MAX_URETPROBE_DEPTH limit and the "return" from func_2() is not reported. When we know that the new call is not chained, we can do the more strict check. In this case "sp" points to the new ret-addr, so every frame which uses the same "sp" must be dead. The only complication is that arch_uretprobe_is_alive() needs to know whether it was chained or not, so we add the new RP_CHECK_CHAIN_CALL enum value and change prepare_uretprobe() to pass RP_CHECK_CALL only if !chained. Note: arch_uretprobe_is_alive() could also re-read *sp and check if this word is still trampoline_vaddr. This could obviously improve the logic, but I would like to avoid another copy_from_user(), especially in the case when we can't avoid the false "alive == T" positives.
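A worked example of why the stricter comparison is safe for a non-chained call (made-up addresses):

	/*
	 * "call" has just pushed the return address, so at the breakpoint
	 * regs->sp points at the brand-new ret-addr slot, say 0x7fff0ff8.
	 *
	 * Any older instance with ri->stack == 0x7fff0ff8 recorded its
	 * ret-addr in the very word the new call just overwrote, so that
	 * frame cannot still be live. Hence, per the hunks below:
	 *
	 *   RP_CHECK_CALL:                      regs->sp <  ri->stack
	 *   RP_CHECK_CHAIN_CALL, RP_CHECK_RET:  regs->sp <= ri->stack
	 */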
Tested-by: Pratyush Anand Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Anton Arapov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134028.GA4786@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/uprobes.c | 5 ++++- include/linux/uprobes.h | 1 + kernel/events/uprobes.c | 14 +++++++------- 3 files changed, 12 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index acf8b9010bbf..bf4db6eaec8f 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -989,5 +989,8 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, struct pt_regs *regs) { - return regs->sp <= ret->stack; + if (ctx == RP_CHECK_CALL) /* sp was just decremented by "call" insn */ + return regs->sp < ret->stack; + else + return regs->sp <= ret->stack; } diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index c0a540239ab6..0bdc72f36905 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -104,6 +104,7 @@ struct return_instance { enum rp_check { RP_CHECK_CALL, + RP_CHECK_CHAIN_CALL, RP_CHECK_RET, }; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index df5661a44e35..0f370ef57a02 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1511,10 +1511,11 @@ static unsigned long get_trampoline_vaddr(void) return trampoline_vaddr; } -static void cleanup_return_instances(struct uprobe_task *utask, struct pt_regs *regs) +static void cleanup_return_instances(struct uprobe_task *utask, bool chained, + struct pt_regs *regs) { struct return_instance *ri = utask->return_instances; - enum rp_check ctx = RP_CHECK_CALL; + enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL; while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) { ri = free_ret_instance(ri); @@ -1528,7 +1529,7 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) struct return_instance *ri; struct uprobe_task *utask; unsigned long orig_ret_vaddr, trampoline_vaddr; - bool chained = false; + bool chained; if (!get_xol_area()) return; @@ -1554,14 +1555,15 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) goto fail; /* drop the entries invalidated by longjmp() */ - cleanup_return_instances(utask, regs); + chained = (orig_ret_vaddr == trampoline_vaddr); + cleanup_return_instances(utask, chained, regs); /* * We don't want to keep trampoline address in stack, rather keep the * original return address of first caller thru all the consequent * instances. This also makes breakpoint unwrapping easier. */ - if (orig_ret_vaddr == trampoline_vaddr) { + if (chained) { if (!utask->return_instances) { /* * This situation is not possible. 
Likely we have an @@ -1570,8 +1572,6 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) uprobe_warn(current, "handle tail call"); goto fail; } - - chained = true; orig_ret_vaddr = utask->return_instances->orig_ret_vaddr; } -- cgit v1.2.3 From f58bea2fec63db72f8050ade709358257e9102ab Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:31 +0200 Subject: uprobes: Fix the usage of install_special_mapping() install_special_mapping(pages) expects that "pages" is the zero- terminated array while xol_add_vma() passes &area->page, this means that special_mapping_fault() can wrongly use the next member in xol_area (vaddr) as "struct page *". Fortunately, this area is not expandable so pgoff != 0 isn't possible (modulo bugs in special_mapping_vmops), but still this does not look good. Signed-off-by: Oleg Nesterov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Pratyush Anand Cc: Srikar Dronamraju Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134031.GA4789@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 0f370ef57a02..4b8ac5f13320 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -99,7 +99,7 @@ struct xol_area { wait_queue_head_t wq; /* if all slots are busy */ atomic_t slot_count; /* number of in-use slots */ unsigned long *bitmap; /* 0 = free slot */ - struct page *page; + struct page *pages[2]; /* * We keep the vma's vm_start rather than a pointer to the vma @@ -1142,7 +1142,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) } ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE, - VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, &area->page); + VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, area->pages); if (ret) goto fail; @@ -1168,21 +1168,22 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) if (!area->bitmap) goto free_area; - area->page = alloc_page(GFP_HIGHUSER); - if (!area->page) + area->pages[0] = alloc_page(GFP_HIGHUSER); + if (!area->pages[0]) goto free_bitmap; + area->pages[1] = NULL; area->vaddr = vaddr; init_waitqueue_head(&area->wq); /* Reserve the 1st slot for get_trampoline_vaddr() */ set_bit(0, area->bitmap); atomic_set(&area->slot_count, 1); - copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE); + copy_to_page(area->pages[0], 0, &insn, UPROBE_SWBP_INSN_SIZE); if (!xol_add_vma(mm, area)) return area; - __free_page(area->page); + __free_page(area->pages[0]); free_bitmap: kfree(area->bitmap); free_area: @@ -1220,7 +1221,7 @@ void uprobe_clear_state(struct mm_struct *mm) if (!area) return; - put_page(area->page); + put_page(area->pages[0]); kfree(area->bitmap); kfree(area); } @@ -1289,7 +1290,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe) if (unlikely(!xol_vaddr)) return 0; - arch_uprobe_copy_ixol(area->page, xol_vaddr, + arch_uprobe_copy_ixol(area->pages[0], xol_vaddr, &uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); return xol_vaddr; -- cgit v1.2.3 From 704bde3cc26a4cb34386c164107b59e09745a022 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:33 +0200 Subject: uprobes: Use vm_special_mapping to name the XOL vma Change xol_add_vma() to use _install_special_mapping(), this way we can name the vma installed by uprobes. Currently it looks like private anonymous mapping, this is confusing and complicates the debugging. 
With this change /proc/$pid/maps reports "[uprobes]". As a side effect this will cause core dumps to include the XOL vma and I think this is good; this can help to debug the problem if the app crashed because it was probed. Signed-off-by: Oleg Nesterov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Pratyush Anand Cc: Srikar Dronamraju Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134033.GA4796@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4b8ac5f13320..2d5b7bd337a7 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -96,17 +96,18 @@ struct uprobe { * allocated. */ struct xol_area { - wait_queue_head_t wq; /* if all slots are busy */ - atomic_t slot_count; /* number of in-use slots */ - unsigned long *bitmap; /* 0 = free slot */ - struct page *pages[2]; + wait_queue_head_t wq; /* if all slots are busy */ + atomic_t slot_count; /* number of in-use slots */ + unsigned long *bitmap; /* 0 = free slot */ + struct vm_special_mapping xol_mapping; + struct page *pages[2]; /* * We keep the vma's vm_start rather than a pointer to the vma * itself. The probed process or a naughty kernel module could make * the vma go away, and we must handle that reasonably gracefully. */ - unsigned long vaddr; /* Page(s) of instruction slots */ + unsigned long vaddr; /* Page(s) of instruction slots */ }; /* @@ -1125,11 +1126,14 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon /* Slot allocation for XOL */ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) { - int ret = -EALREADY; + struct vm_area_struct *vma; + int ret; down_write(&mm->mmap_sem); - if (mm->uprobes_state.xol_area) + if (mm->uprobes_state.xol_area) { + ret = -EALREADY; goto fail; + } if (!area->vaddr) { /* Try to map as high as possible, this is only a hint. */ @@ -1141,11 +1145,15 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) } } - ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE, - VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, area->pages); - if (ret) + vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE, + VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, + &area->xol_mapping); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); goto fail; + } + ret = 0; smp_wmb(); /* pairs with get_xol_area() */ mm->uprobes_state.xol_area = area; fail: @@ -1168,6 +1176,8 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) if (!area->bitmap) goto free_area; + area->xol_mapping.name = "[uprobes]"; + area->xol_mapping.pages = area->pages; area->pages[0] = alloc_page(GFP_HIGHUSER); if (!area->pages[0]) goto free_bitmap; -- cgit v1.2.3 From 2a742cedcf13572999436676cbe36c3a9b733b0f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:36 +0200 Subject: uprobes: Fix the waitqueue_active() check in xol_free_insn_slot() The xol_free_insn_slot()->waitqueue_active() check is buggy. We need mb() after we set the conditon for wait_event(), or xol_take_insn_slot() can miss the wakeup. 
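The missed wakeup being closed, as an interleaving sketch (schematic, simplified from xol_take_insn_slot()/xol_free_insn_slot()):

	/*
	 *   waiter                             waker
	 *   ------                             -----
	 *   prepare_to_wait(&area->wq);        clear_bit(slot, bitmap);
	 *   test_and_set_bit() -> still busy;  atomic_dec(&slot_count);
	 *   schedule();                        waitqueue_active(&area->wq)?
	 *
	 * Without a barrier, the waker's waitqueue_active() load can be
	 * reordered before its bitmap/count stores: it may see an empty
	 * queue (the waiter is only just enqueueing) and skip wake_up(),
	 * while the waiter still sees the slot as busy and sleeps forever.
	 * smp_mb__after_atomic() orders the stores against the load,
	 * pairing with the barrier implied by prepare_to_wait().
	 */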
Signed-off-by: Oleg Nesterov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Pratyush Anand Cc: Srikar Dronamraju Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134036.GA4799@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 2d5b7bd337a7..4e5e9798aa0c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1337,6 +1337,7 @@ static void xol_free_insn_slot(struct task_struct *tsk) clear_bit(slot_nr, area->bitmap); atomic_dec(&area->slot_count); + smp_mb__after_atomic(); /* pairs with prepare_to_wait() */ if (waitqueue_active(&area->wq)) wake_up(&area->wq); -- cgit v1.2.3 From e5779e8e12299f77c2421a707855d8d124171d85 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 30 Jul 2015 20:32:40 -0700 Subject: perf/x86/hw_breakpoints: Disallow kernel breakpoints unless kprobe-safe Code on the kprobe blacklist doesn't want unexpected int3 exceptions. It probably doesn't want unexpected debug exceptions either. Be safe: disallow breakpoints in nokprobes code. On non-CONFIG_KPROBES kernels, there is no kprobe blacklist. In that case, disallow kernel breakpoints entirely. It will be particularly important to keep hw breakpoints out of the entry and NMI code once we move debug exceptions off the IST stack. Signed-off-by: Andy Lutomirski Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Brian Gerst Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/e14b152af99640448d895e3c2a8c2d5ee19a1325.1438312874.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/hw_breakpoint.c | 15 +++++++++++++++ include/linux/kprobes.h | 2 ++ kernel/kprobes.c | 2 +- 3 files changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 7114ba220fd4..78f3e90c5659 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -243,6 +244,20 @@ static int arch_build_bp_info(struct perf_event *bp) info->type = X86_BREAKPOINT_RW; break; case HW_BREAKPOINT_X: + /* + * We don't allow kernel breakpoints in places that are not + * acceptable for kprobes. On non-kprobes kernels, we don't + * allow kernel breakpoints at all. + */ + if (bp->attr.bp_addr >= TASK_SIZE_MAX) { +#ifdef CONFIG_KPROBES + if (within_kprobe_blacklist(bp->attr.bp_addr)) + return -EINVAL; +#else + return -EINVAL; +#endif + } + info->type = X86_BREAKPOINT_EXECUTE; /* * x86 inst breakpoints need to have a specific undefined len. 
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 1ab54754a86d..8f6849084248 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -267,6 +267,8 @@ extern void show_registers(struct pt_regs *regs); extern void kprobes_inc_nmissed_count(struct kprobe *p); extern bool arch_within_kprobe_blacklist(unsigned long addr); +extern bool within_kprobe_blacklist(unsigned long addr); + struct kprobe_insn_cache { struct mutex mutex; void *(*alloc)(void); /* allocate insn page */ diff --git a/kernel/kprobes.c b/kernel/kprobes.c index c90e417bb963..d10ab6b9b5e0 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1332,7 +1332,7 @@ bool __weak arch_within_kprobe_blacklist(unsigned long addr) addr < (unsigned long)__kprobes_text_end; } -static bool within_kprobe_blacklist(unsigned long addr) +bool within_kprobe_blacklist(unsigned long addr) { struct kprobe_blacklist_entry *ent; -- cgit v1.2.3 From 9a6694cfa2390181dec936a17c0d9d21ef7b08d9 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Thu, 30 Jul 2015 16:48:24 +0300 Subject: perf/x86/intel/pt: Do not force sync packets on every schedule-in Currently, the PT driver zeroes out the status register every time before starting the event. However, all the writable bits are already taken care of in the pt_handle_status() function, except the new PacketByteCnt field, which in new versions of PT contains the number of packet bytes written since the last sync (PSB) packet. Zeroing it out before enabling PT forces a sync packet to be written. This means that, with the existing code, a sync packet (PSB and PSBEND, 18 bytes in total) will be generated every time a PT event is scheduled in. To avoid these unnecessary syncs and save a WRMSR in the fast path, this patch changes the default behavior to not clear the PacketByteCnt field, so that the sync packets will be generated with the period specified in the "psb_period" attribute config field. This has little impact on the trace data, as the other packets that are normally sent within PSB+ (between PSB and PSBEND) have their own generation scenarios which do not depend on the sync packets. One exception where we do need to force PSB like this is when tracing starts, so that the decoder has a clear sync point in the trace. For this purpose we already have the hw::itrace_started flag, which we are currently using to output PERF_RECORD_ITRACE_START. This patch moves the setting of itrace_started from the perf core to pmu::start, where it should still be 0 on the very first run.
Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: hpa@zytor.com Link: http://lkml.kernel.org/r/1438264104-16189-1-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_pt.c | 7 +++++-- kernel/events/core.c | 2 -- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c index 183de719628d..cc58ef8d9ada 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_pt.c +++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c @@ -191,6 +191,11 @@ static void pt_config(struct perf_event *event) { u64 reg; + if (!event->hw.itrace_started) { + event->hw.itrace_started = 1; + wrmsrl(MSR_IA32_RTIT_STATUS, 0); + } + reg = RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN; if (!event->attr.exclude_kernel) @@ -910,7 +915,6 @@ void intel_pt_interrupt(void) pt_config_buffer(buf->cur->table, buf->cur_idx, buf->output_off); - wrmsrl(MSR_IA32_RTIT_STATUS, 0); pt_config(event); } } @@ -934,7 +938,6 @@ static void pt_event_start(struct perf_event *event, int mode) pt_config_buffer(buf->cur->table, buf->cur_idx, buf->output_off); - wrmsrl(MSR_IA32_RTIT_STATUS, 0); pt_config(event); } diff --git a/kernel/events/core.c b/kernel/events/core.c index a9796c8ff7e0..bdea12924b11 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6139,8 +6139,6 @@ static void perf_log_itrace_start(struct perf_event *event) event->hw.itrace_started) return; - event->hw.itrace_started = 1; - rec.header.type = PERF_RECORD_ITRACE_START; rec.header.misc = 0; rec.header.size = sizeof(rec); -- cgit v1.2.3 From 04a22fae4cbc1f7d3f7471e9b36359f98bd3f043 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 1 Jul 2015 02:13:50 +0000 Subject: tracing, perf: Implement BPF programs attached to uprobes By copying the BPF-related operations to the uprobe processing path, this patch allows users to attach BPF programs to uprobes the same way they already do on kprobes. After this patch, users are allowed to use PERF_EVENT_IOC_SET_BPF on a uprobe perf event, which makes it possible to profile user space programs and kernel events together using BPF. Because of this patch, CONFIG_BPF_EVENTS should be selected by CONFIG_UPROBE_EVENT to ensure trace_call_bpf() is compiled even if KPROBE_EVENT is not set.
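From userspace, what this enables looks roughly like the following (a hedged sketch: the event-creation flow via uprobe_events/perf_event_open is assumed from the usual tracefs usage, and error handling is omitted):

	#include <sys/ioctl.h>
	#include <linux/perf_event.h>

	/* uprobe_fd: a perf_event_open()ed PERF_TYPE_TRACEPOINT event whose
	 * config is the tracefs id of a probe added via uprobe_events.
	 * prog_fd:   fd of a loaded BPF_PROG_TYPE_KPROBE program.          */
	static int attach_bpf_to_uprobe(int uprobe_fd, int prog_fd)
	{
		/* Before this patch the ioctl failed with -EINVAL here,
		 * because only TRACE_EVENT_FL_KPROBE events were accepted. */
		if (ioctl(uprobe_fd, PERF_EVENT_IOC_SET_BPF, prog_fd))
			return -1;
		return ioctl(uprobe_fd, PERF_EVENT_IOC_ENABLE, 0);
	}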
Signed-off-by: Wang Nan Acked-by: Alexei Starovoitov Cc: Brendan Gregg Cc: Daniel Borkmann Cc: David Ahern Cc: He Kuang Cc: Jiri Olsa Cc: Kaixu Xia Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1435716878-189507-3-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- include/linux/trace_events.h | 5 +++++ kernel/events/core.c | 4 ++-- kernel/trace/Kconfig | 2 +- kernel/trace/trace_uprobe.c | 5 +++++ 4 files changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 180dbf8720f9..ed27917cabc9 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -243,6 +243,7 @@ enum { TRACE_EVENT_FL_USE_CALL_FILTER_BIT, TRACE_EVENT_FL_TRACEPOINT_BIT, TRACE_EVENT_FL_KPROBE_BIT, + TRACE_EVENT_FL_UPROBE_BIT, }; /* @@ -257,6 +258,7 @@ enum { * USE_CALL_FILTER - For trace internal events, don't use file filter * TRACEPOINT - Event is a tracepoint * KPROBE - Event is a kprobe + * UPROBE - Event is a uprobe */ enum { TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT), @@ -267,8 +269,11 @@ enum { TRACE_EVENT_FL_USE_CALL_FILTER = (1 << TRACE_EVENT_FL_USE_CALL_FILTER_BIT), TRACE_EVENT_FL_TRACEPOINT = (1 << TRACE_EVENT_FL_TRACEPOINT_BIT), TRACE_EVENT_FL_KPROBE = (1 << TRACE_EVENT_FL_KPROBE_BIT), + TRACE_EVENT_FL_UPROBE = (1 << TRACE_EVENT_FL_UPROBE_BIT), }; +#define TRACE_EVENT_FL_UKPROBE (TRACE_EVENT_FL_KPROBE | TRACE_EVENT_FL_UPROBE) + struct trace_event_call { struct list_head list; struct trace_event_class *class; diff --git a/kernel/events/core.c b/kernel/events/core.c index bdea12924b11..77f9e5d0e2d1 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6846,8 +6846,8 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) if (event->tp_event->prog) return -EEXIST; - if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) - /* bpf programs can only be attached to kprobes */ + if (!(event->tp_event->flags & TRACE_EVENT_FL_UKPROBE)) + /* bpf programs can only be attached to u/kprobes */ return -EINVAL; prog = bpf_prog_get(prog_fd); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 3b9a48ae153a..1153c43428f3 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -434,7 +434,7 @@ config UPROBE_EVENT config BPF_EVENTS depends on BPF_SYSCALL - depends on KPROBE_EVENT + depends on KPROBE_EVENT || UPROBE_EVENT bool default y help diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index aa1ea7b36fa8..f97479f1ce35 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1095,11 +1095,15 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, { struct trace_event_call *call = &tu->tp.call; struct uprobe_trace_entry_head *entry; + struct bpf_prog *prog = call->prog; struct hlist_head *head; void *data; int size, esize; int rctx; + if (prog && !trace_call_bpf(prog, regs)) + return; + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); size = esize + tu->tp.size + dsize; @@ -1289,6 +1293,7 @@ static int register_uprobe_event(struct trace_uprobe *tu) return -ENODEV; } + call->flags = TRACE_EVENT_FL_UPROBE; call->class->reg = trace_uprobe_register; call->data = tu; ret = trace_add_event_call(call); -- cgit v1.2.3 From c2ad6b51efc5f27d70ce952decd2a15679b83600 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 28 Jul 2015 09:00:04 +0300 Subject: perf/ring-buffer: Clarify the use of page::private for high-order 
AUX allocations A question [1] was raised about the use of page::private in AUX buffer allocations, so let's add a clarification about its intended use. The private field and flag are used by perf's rb_alloc_aux() path to tell the pmu driver the size of each high-order allocation, so that the driver can program those appropriately into its hardware. This only matters for PMUs that don't support hardware scatter tables. Otherwise, every page in the buffer is just a page. This patch adds a comment about the private field to the AUX buffer allocation path. [1] http://marc.info/?l=linux-kernel&m=143803696607968 Reported-by: Mathieu Poirier Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1438063204-665-1-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index c8aa3f75bc4d..182bc30899d5 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -437,7 +437,10 @@ static struct page *rb_alloc_aux_page(int node, int order) if (page && order) { /* - * Communicate the allocation size to the driver + * Communicate the allocation size to the driver: + * if we managed to secure a high-order allocation, + * set its first page's private to this order; + * !PagePrivate(page) means it's just a normal page. */ split_page(page, order); SetPagePrivate(page); -- cgit v1.2.3 From a2fb3382edbea83c6f2bf6ac15e3673b2e254aad Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 26 Aug 2015 10:57:46 +0000 Subject: tracing/uprobes: Do not print '0x (null)' when offset is 0 When manually added uprobe point with zero address, 'uprobe_events' output '(null)' instead of 0x00000000: # echo p:probe_libc/abs_0 /path/to/lib.bin:0x0 arg1=%ax > \ /sys/kernel/debug/tracing/uprobe_events # cat /sys/kernel/debug/tracing/uprobe_events p:probe_libc/abs_0 /path/to/lib.bin:0x (null) arg1=%ax This patch fixes this behavior: # cat /sys/kernel/debug/tracing/uprobe_events p:probe_libc/abs_0 /path/to/lib.bin:0x0000000000000000 Signed-off-by: Wang Nan Acked-by: Masami Hiramatsu Cc: Namhyung Kim Cc: Steven Rostedt Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1440586666-235233-8-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- kernel/trace/trace_uprobe.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index f97479f1ce35..d2f6d0be3503 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -601,7 +601,22 @@ static int probes_seq_show(struct seq_file *m, void *v) seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system, trace_event_name(&tu->tp.call)); - seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset); + seq_printf(m, " %s:", tu->filename); + + /* Don't print "0x (null)" when offset is 0 */ + if (tu->offset) { + seq_printf(m, "0x%p", (void *)tu->offset); + } else { + switch (sizeof(void *)) { + case 4: + seq_printf(m, "0x00000000"); + break; + case 8: + default: + seq_printf(m, "0x0000000000000000"); + break; + } + } for (i = 0; i < tu->tp.nr_args; i++) seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm); -- cgit v1.2.3
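As a closing editorial note on the last hunk: the switch on sizeof(void *) could equally be written with a computed field width. A hedged alternative (same output; an editor's sketch, not what the patch does):

	/* Editor's sketch only: zero-pad the offset to pointer width. */
	seq_printf(m, "0x%0*lx",
		   (int)(sizeof(void *) * 2), (unsigned long)tu->offset);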