From 45ac1403f564f411c6a383a2448688ba8dd705a4 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 21 Jul 2015 12:44:02 +0300 Subject: perf: Add PERF_RECORD_SWITCH to indicate context switches There are already two events for context switches, namely the tracepoint sched:sched_switch and the software event context_switches. Unfortunately neither is suitable for use by non-privileged users for the purpose of synchronizing hardware trace data (e.g. Intel PT) to the context switch. Tracepoints are no good at all for non-privileged users because they need either CAP_SYS_ADMIN or /proc/sys/kernel/perf_event_paranoid <= -1. On the other hand, kernel software events need either CAP_SYS_ADMIN or /proc/sys/kernel/perf_event_paranoid <= 1. Many distributions now default perf_event_paranoid to 1, making context_switches a contender, except it has another problem (one also shared with sched:sched_switch): it happens before perf schedules events out instead of after perf schedules events in. Whereas a privileged user can see all the events anyway, a non-privileged user only sees events for their own processes; in other words, they see when their process was scheduled out, not when it was scheduled in. That presents two problems for using the event: 1. the information comes too late, so tools have to look ahead in the event stream to find out what the current state is; 2. if they are unlucky, tracing might have stopped before the context-switches event is recorded. This new PERF_RECORD_SWITCH event does not have those problems and it also has a couple of other small advantages. It is easier to use because it is an auxiliary event (like the mmap, comm and task events) which can be enabled by setting a single bit. It is smaller than sched:sched_switch and easier to parse. To make the event useful for privileged users as well, if the context is cpu-wide then the event record will be PERF_RECORD_SWITCH_CPU_WIDE, which is the same as PERF_RECORD_SWITCH except that it also provides the next or previous pid/tid.
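For readers wiring this up from userspace, a minimal, hedged sketch of a consumer follows (not part of the patch). It assumes the common trick of hanging the new bit off a software dummy event; the uapi half of this patch (not shown in this kernel/-limited view) adds attr.context_switch and the two record types, and that single bit is all the kernel side here requires.

	#include <linux/perf_event.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int open_switch_events(pid_t pid, int cpu)
	{
		struct perf_event_attr attr = {
			.size           = sizeof(attr),
			.type           = PERF_TYPE_SOFTWARE,
			.config         = PERF_COUNT_SW_DUMMY, /* carrier event; an assumption */
			.context_switch = 1,   /* the single bit described above */
			.sample_id_all  = 1,   /* id/time on the SWITCH records */
		};

		/* pid == -1 with cpu >= 0 is a cpu-wide context, so the kernel
		 * emits PERF_RECORD_SWITCH_CPU_WIDE (header + next/prev pid/tid);
		 * a per-task context gets the header-only PERF_RECORD_SWITCH. */
		return syscall(__NR_perf_event_open, &attr, pid, cpu, -1, 0);
	}

	/* Direction is carried in the header:
	 *   sched-out: header.misc & PERF_RECORD_MISC_SWITCH_OUT
	 *   sched-in:  that bit is clear                          */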
Signed-off-by: Adrian Hunter Acked-by: Peter Zijlstra (Intel) Tested-by: Jiri Olsa Cc: Andi Kleen Cc: Mathieu Poirier Cc: Pawel Moll Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1437471846-26995-2-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/core.c | 103 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index d3dae3419b99..ce21143c0d9e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -163,6 +163,7 @@ static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; static atomic_t nr_task_events __read_mostly; static atomic_t nr_freq_events __read_mostly; +static atomic_t nr_switch_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); @@ -2619,6 +2620,9 @@ static void perf_pmu_sched_task(struct task_struct *prev, local_irq_restore(flags); } +static void perf_event_switch(struct task_struct *task, + struct task_struct *next_prev, bool sched_in); + #define for_each_task_context_nr(ctxn) \ for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) @@ -2641,6 +2645,9 @@ void __perf_event_task_sched_out(struct task_struct *task, if (__this_cpu_read(perf_sched_cb_usages)) perf_pmu_sched_task(task, next, false); + if (atomic_read(&nr_switch_events)) + perf_event_switch(task, next, false); + for_each_task_context_nr(ctxn) perf_event_context_sched_out(task, ctxn, next); @@ -2831,6 +2838,9 @@ void __perf_event_task_sched_in(struct task_struct *prev, if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) perf_cgroup_sched_in(prev, task); + if (atomic_read(&nr_switch_events)) + perf_event_switch(task, prev, true); + if (__this_cpu_read(perf_sched_cb_usages)) perf_pmu_sched_task(prev, task, true); } @@ -3454,6 +3464,10 @@ static void unaccount_event(struct perf_event *event) atomic_dec(&nr_task_events); if (event->attr.freq) atomic_dec(&nr_freq_events); + if (event->attr.context_switch) { + static_key_slow_dec_deferred(&perf_sched_events); + atomic_dec(&nr_switch_events); + } if (is_cgroup_event(event)) static_key_slow_dec_deferred(&perf_sched_events); if (has_branch_stack(event)) @@ -5981,6 +5995,91 @@ void perf_log_lost_samples(struct perf_event *event, u64 lost) perf_output_end(&handle); } +/* + * context_switch tracking + */ + +struct perf_switch_event { + struct task_struct *task; + struct task_struct *next_prev; + + struct { + struct perf_event_header header; + u32 next_prev_pid; + u32 next_prev_tid; + } event_id; +}; + +static int perf_event_switch_match(struct perf_event *event) +{ + return event->attr.context_switch; +} + +static void perf_event_switch_output(struct perf_event *event, void *data) +{ + struct perf_switch_event *se = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + int ret; + + if (!perf_event_switch_match(event)) + return; + + /* Only CPU-wide events are allowed to see next/prev pid/tid */ + if (event->ctx->task) { + se->event_id.header.type = PERF_RECORD_SWITCH; + se->event_id.header.size = sizeof(se->event_id.header); + } else { + se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE; + se->event_id.header.size = sizeof(se->event_id); + se->event_id.next_prev_pid = + perf_event_pid(event, se->next_prev); + se->event_id.next_prev_tid = + perf_event_tid(event, se->next_prev); + } + + perf_event_header__init_id(&se->event_id.header, &sample, event); + + ret = perf_output_begin(&handle, event, 
se->event_id.header.size); + if (ret) + return; + + if (event->ctx->task) + perf_output_put(&handle, se->event_id.header); + else + perf_output_put(&handle, se->event_id); + + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + +static void perf_event_switch(struct task_struct *task, + struct task_struct *next_prev, bool sched_in) +{ + struct perf_switch_event switch_event; + + /* N.B. caller checks nr_switch_events != 0 */ + + switch_event = (struct perf_switch_event){ + .task = task, + .next_prev = next_prev, + .event_id = { + .header = { + /* .type */ + .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT, + /* .size */ + }, + /* .next_prev_pid */ + /* .next_prev_tid */ + }, + }; + + perf_event_aux(perf_event_switch_output, + &switch_event, + NULL); +} + /* * IRQ throttle logging */ @@ -7479,6 +7578,10 @@ static void account_event(struct perf_event *event) if (atomic_inc_return(&nr_freq_events) == 1) tick_nohz_full_kick_all(); } + if (event->attr.context_switch) { + atomic_inc(&nr_switch_events); + static_key_slow_inc(&perf_sched_events.key); + } if (has_branch_stack(event)) static_key_slow_inc(&perf_sched_events.key); if (is_cgroup_event(event)) -- cgit v1.2.3 From f231722a2b27ee99cbcd0c6bcf4c866612b78137 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:03 +0200 Subject: uprobes: Introduce get_uprobe() Cosmetic. Add the new trivial helper, get_uprobe(). It matches put_uprobe() we already have and we can simplify a couple of its users. Tested-by: Pratyush Anand Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Anton Arapov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134003.GA4736@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index cb346f26a22d..a9847b4ec1e7 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -366,6 +366,18 @@ set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long v return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)&auprobe->insn); } +static struct uprobe *get_uprobe(struct uprobe *uprobe) +{ + atomic_inc(&uprobe->ref); + return uprobe; +} + +static void put_uprobe(struct uprobe *uprobe) +{ + if (atomic_dec_and_test(&uprobe->ref)) + kfree(uprobe); +} + static int match_uprobe(struct uprobe *l, struct uprobe *r) { if (l->inode < r->inode) @@ -393,10 +405,8 @@ static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset) while (n) { uprobe = rb_entry(n, struct uprobe, rb_node); match = match_uprobe(&u, uprobe); - if (!match) { - atomic_inc(&uprobe->ref); - return uprobe; - } + if (!match) + return get_uprobe(uprobe); if (match < 0) n = n->rb_left; @@ -432,10 +442,8 @@ static struct uprobe *__insert_uprobe(struct uprobe *uprobe) parent = *p; u = rb_entry(parent, struct uprobe, rb_node); match = match_uprobe(uprobe, u); - if (!match) { - atomic_inc(&u->ref); - return u; - } + if (!match) + return get_uprobe(u); if (match < 0) p = &parent->rb_left; @@ -472,12 +480,6 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe) return u; } -static void put_uprobe(struct uprobe *uprobe) -{ - if (atomic_dec_and_test(&uprobe->ref)) - kfree(uprobe); -} - static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) { struct uprobe *uprobe, *cur_uprobe; @@ -1039,14 
+1041,14 @@ static void build_probe_list(struct inode *inode, if (u->inode != inode || u->offset < min) break; list_add(&u->pending_list, head); - atomic_inc(&u->ref); + get_uprobe(u); } for (t = n; (t = rb_next(t)); ) { u = rb_entry(t, struct uprobe, rb_node); if (u->inode != inode || u->offset > max) break; list_add(&u->pending_list, head); - atomic_inc(&u->ref); + get_uprobe(u); } } spin_unlock(&uprobes_treelock); @@ -1437,7 +1439,7 @@ static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask) return -ENOMEM; *n = *o; - atomic_inc(&n->uprobe->ref); + get_uprobe(n->uprobe); n->next = NULL; *p = n; @@ -1565,8 +1567,7 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) orig_ret_vaddr = utask->return_instances->orig_ret_vaddr; } - atomic_inc(&uprobe->ref); - ri->uprobe = uprobe; + ri->uprobe = get_uprobe(uprobe); ri->func = instruction_pointer(regs); ri->orig_ret_vaddr = orig_ret_vaddr; ri->chained = chained; -- cgit v1.2.3 From 2bb5e840e873f8778a41801141771f54f547fa65 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:06 +0200 Subject: uprobes: Introduce free_ret_instance() We can simplify uprobe_free_utask() and handle_uretprobe_chain() if we add a simple helper which does put_uprobe/kfree and returns the ->next return_instance. Tested-by: Pratyush Anand Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Anton Arapov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134006.GA4740@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index a9847b4ec1e7..d8c702fc836f 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1378,6 +1378,14 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs) return instruction_pointer(regs); } +static struct return_instance *free_ret_instance(struct return_instance *ri) +{ + struct return_instance *next = ri->next; + put_uprobe(ri->uprobe); + kfree(ri); + return next; +} + /* * Called with no locks held. * Called in context of a exiting or a exec-ing thread. 
@@ -1385,7 +1393,7 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs) void uprobe_free_utask(struct task_struct *t) { struct uprobe_task *utask = t->utask; - struct return_instance *ri, *tmp; + struct return_instance *ri; if (!utask) return; @@ -1394,13 +1402,8 @@ void uprobe_free_utask(struct task_struct *t) put_uprobe(utask->active_uprobe); ri = utask->return_instances; - while (ri) { - tmp = ri; - ri = ri->next; - - put_uprobe(tmp->uprobe); - kfree(tmp); - } + while (ri) + ri = free_ret_instance(ri); xol_free_insn_slot(t); kfree(utask); @@ -1770,7 +1773,7 @@ handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs) static bool handle_trampoline(struct pt_regs *regs) { struct uprobe_task *utask; - struct return_instance *ri, *tmp; + struct return_instance *ri; bool chained; utask = current->utask; @@ -1792,11 +1795,7 @@ static bool handle_trampoline(struct pt_regs *regs) handle_uretprobe_chain(ri, regs); chained = ri->chained; - put_uprobe(ri->uprobe); - - tmp = ri; - ri = ri->next; - kfree(tmp); + ri = free_ret_instance(ri); utask->depth--; if (!chained) -- cgit v1.2.3 From 0b5256c7f173258b19d98364adb57f707dda22f3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:08 +0200 Subject: uprobes: Send SIGILL if handle_trampoline() fails 1. It doesn't make sense to continue if handle_trampoline() fails, change handle_swbp() to always return after this call. 2. Turn pr_warn() into uprobe_warn(), and change handle_trampoline() to send SIGILL on failure. It is pointless to return to user mode with the corrupted instruction_pointer() which we can't restore. Tested-by: Pratyush Anand Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Anton Arapov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134008.GA4745@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d8c702fc836f..eabdc21366ee 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1770,7 +1770,7 @@ handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs) up_read(&uprobe->register_rwsem); } -static bool handle_trampoline(struct pt_regs *regs) +static void handle_trampoline(struct pt_regs *regs) { struct uprobe_task *utask; struct return_instance *ri; @@ -1778,11 +1778,11 @@ static bool handle_trampoline(struct pt_regs *regs) utask = current->utask; if (!utask) - return false; + goto sigill; ri = utask->return_instances; if (!ri) - return false; + goto sigill; /* * TODO: we should throw out return_instance's invalidated by @@ -1804,8 +1804,12 @@ static bool handle_trampoline(struct pt_regs *regs) } utask->return_instances = ri; + return; + + sigill: + uprobe_warn(current, "handle uretprobe, sending SIGILL."); + force_sig_info(SIGILL, SEND_SIG_FORCED, current); - return true; } bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs) @@ -1824,13 +1828,8 @@ static void handle_swbp(struct pt_regs *regs) int uninitialized_var(is_swbp); bp_vaddr = uprobe_get_swbp_addr(regs); - if (bp_vaddr == get_trampoline_vaddr()) { - if (handle_trampoline(regs)) - return; - - pr_warn("uprobe: unable to handle uretprobe pid/tgid=%d/%d\n", - current->pid, current->tgid); - } + if (bp_vaddr == get_trampoline_vaddr()) + return handle_trampoline(regs); uprobe = find_active_uprobe(bp_vaddr, &is_swbp); if 
(!uprobe) { -- cgit v1.2.3 From 6c58d0e4cc26ea8882928e64c0de9afed4fc37cb Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:10 +0200 Subject: uprobes: Change prepare_uretprobe() to use uprobe_warn() Turn the last pr_warn() in uprobes.c into uprobe_warn(). While at it: - s/kzalloc/kmalloc, we initialize every member of 'ri' - remove the pointless comment above the obvious code Tested-by: Pratyush Anand Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Anton Arapov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134010.GA4752@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index eabdc21366ee..4c941feae3a2 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1541,9 +1541,9 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) return; } - ri = kzalloc(sizeof(struct return_instance), GFP_KERNEL); + ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL); if (!ri) - goto fail; + return; trampoline_vaddr = get_trampoline_vaddr(); orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs); @@ -1561,8 +1561,7 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) * This situation is not possible. Likely we have an * attack from user-space. */ - pr_warn("uprobe: unable to set uretprobe pid/tgid=%d/%d\n", - current->pid, current->tgid); + uprobe_warn(current, "handle tail call"); goto fail; } @@ -1576,13 +1575,10 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) ri->chained = chained; utask->depth++; - - /* add instance to the stack */ ri->next = utask->return_instances; utask->return_instances = ri; return; - fail: kfree(ri); } -- cgit v1.2.3 From a83cfeb92132c279b20bbc8ed3cef833b0fe417e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:13 +0200 Subject: uprobes: Change handle_trampoline() to find the next chain beforehand No functional changes, preparation. Add the new helper, find_next_ret_chain(), which finds the first !chained entry and returns its ->next. Yes, it is suboptimal. We probably want to turn ->chained into ->start_of_this_chain pointer and avoid another loop. But this needs the boring changes in dup_utask(), so lets do this later. Change the main loop in handle_trampoline() to unwind the stack until ri is equal to the pointer returned by this new helper. 
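A worked illustration of the chain layout may help (hypothetical frames, not from the patch): an instance is "chained" when its function returns through the same hijacked slot as the instance below it, so the first !chained entry terminates the current chain.

	/*
	 * Hypothetical utask->return_instances stack:
	 *
	 *   ri_A (chained=true)  --.
	 *   ri_B (chained=true)    +-- one chain: reported together on
	 *   ri_C (chained=false) --'   a single trampoline hit
	 *   ri_D (chained=false)  -- start of the next chain
	 *   NULL
	 *
	 * find_next_ret_chain(ri_A) walks A, B, C (while ->chained) and
	 * returns ri_D, so handle_trampoline() unwinds exactly A..C.
	 */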
Tested-by: Pratyush Anand Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Anton Arapov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134013.GA4755@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4c941feae3a2..98e4d97b8c31 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1766,11 +1766,22 @@ handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs) up_read(&uprobe->register_rwsem); } +static struct return_instance *find_next_ret_chain(struct return_instance *ri) +{ + bool chained; + + do { + chained = ri->chained; + ri = ri->next; /* can't be NULL if chained */ + } while (chained); + + return ri; +} + static void handle_trampoline(struct pt_regs *regs) { struct uprobe_task *utask; - struct return_instance *ri; - bool chained; + struct return_instance *ri, *next; utask = current->utask; if (!utask) @@ -1780,24 +1791,18 @@ static void handle_trampoline(struct pt_regs *regs) if (!ri) goto sigill; + next = find_next_ret_chain(ri); /* * TODO: we should throw out return_instance's invalidated by * longjmp(), currently we assume that the probed function always * returns. */ instruction_pointer_set(regs, ri->orig_ret_vaddr); - - for (;;) { + do { handle_uretprobe_chain(ri, regs); - - chained = ri->chained; ri = free_ret_instance(ri); utask->depth--; - - if (!chained) - break; - BUG_ON(!ri); - } + } while (ri != next); utask->return_instances = ri; return; -- cgit v1.2.3 From 97da89767d398c1dfa1f34e5f312eb8ebb382f7f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:16 +0200 Subject: uprobes: Export 'struct return_instance', introduce arch_uretprobe_is_alive() Add the new "weak" helper, arch_uretprobe_is_alive(), used by the next patches. It should return true if this return_instance is still valid. The arch agnostic version just always returns true. The patch exports "struct return_instance" for the architectures which want to override this hook. We can also cleanup prepare_uretprobe() if we pass the new return_instance to arch_uretprobe_hijack_return_addr(). 
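The last paragraph only hints at that cleanup; a hedged sketch of the signature change it suggests (hypothetical, not implemented by this patch or the rest of this series):

	/* Hypothetical follow-up: pass the new struct so the arch helper
	 * can fill ri->stack and friends itself, instead of
	 * prepare_uretprobe() juggling the raw orig_ret_vaddr value. */
	extern unsigned long
	arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr,
					  struct return_instance *ri,
					  struct pt_regs *regs);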
Tested-by: Pratyush Anand Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Anton Arapov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134016.GA4762@redhat.com Signed-off-by: Ingo Molnar --- include/linux/uprobes.h | 10 ++++++++++ kernel/events/uprobes.c | 14 +++++--------- 2 files changed, 15 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 60beb5dc7977..50d2764d66a8 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -92,6 +92,15 @@ struct uprobe_task { unsigned int depth; }; +struct return_instance { + struct uprobe *uprobe; + unsigned long func; + unsigned long orig_ret_vaddr; /* original return address */ + bool chained; /* true, if instance is nested */ + + struct return_instance *next; /* keep as stack */ +}; + struct xol_area; struct uprobes_state { @@ -128,6 +137,7 @@ extern bool arch_uprobe_xol_was_trapped(struct task_struct *tsk); extern int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data); extern void arch_uprobe_abort_xol(struct arch_uprobe *aup, struct pt_regs *regs); extern unsigned long arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs); +extern bool arch_uretprobe_is_alive(struct return_instance *ret, struct pt_regs *regs); extern bool arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs); extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, void *src, unsigned long len); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 98e4d97b8c31..1c71b6242a7e 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -86,15 +86,6 @@ struct uprobe { struct arch_uprobe arch; }; -struct return_instance { - struct uprobe *uprobe; - unsigned long func; - unsigned long orig_ret_vaddr; /* original return address */ - bool chained; /* true, if instance is nested */ - - struct return_instance *next; /* keep as stack */ -}; - /* * Execute out of line area: anonymous executable mapping installed * by the probed task to execute the copy of the original instruction @@ -1818,6 +1809,11 @@ bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs) return false; } +bool __weak arch_uretprobe_is_alive(struct return_instance *ret, struct pt_regs *regs) +{ + return true; +} + /* * Run handler and ask thread to singlestep. * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. -- cgit v1.2.3 From 7b868e4802a86d867aad1be0471b5767d9c20e10 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:18 +0200 Subject: uprobes/x86: Reimplement arch_uretprobe_is_alive() Add the x86 specific version of arch_uretprobe_is_alive() helper. It returns true if the stack frame mangled by prepare_uretprobe() is still on stack. So if it returns false, we know that the probed function has already returned. We add the new return_instance->stack member and change the generic code to initialize it in prepare_uretprobe, but it should be equally useful for other architectures. TODO: this assumes that the probed application can't use multiple stacks (say sigaltstack). We will try to improve this logic later. 
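A short worked example of the check implemented below (made-up addresses; the x86 stack grows down):

	/*
	 * prepare_uretprobe() runs at function entry and records
	 *     ri->stack = user_stack_pointer(regs) = 0x7fff1000
	 *
	 * While the frame is live, sp only moves down:
	 *     regs->sp = 0x7fff0f80 -> 0x7fff0f80 <= 0x7fff1000: alive
	 *
	 * Once the function and its frame are gone, sp is above it:
	 *     regs->sp = 0x7fff1010 -> 0x7fff1010 <= 0x7fff1000 fails: dead
	 */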
Tested-by: Pratyush Anand Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Anton Arapov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134018.GA4766@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/uprobes.c | 5 +++++ include/linux/uprobes.h | 1 + kernel/events/uprobes.c | 1 + 3 files changed, 7 insertions(+) (limited to 'kernel') diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 66476244731e..58e9b842633f 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -985,3 +985,8 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs return -1; } + +bool arch_uretprobe_is_alive(struct return_instance *ret, struct pt_regs *regs) +{ + return regs->sp <= ret->stack; +} diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 50d2764d66a8..7ab6d2c8be49 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -95,6 +95,7 @@ struct uprobe_task { struct return_instance { struct uprobe *uprobe; unsigned long func; + unsigned long stack; /* stack pointer */ unsigned long orig_ret_vaddr; /* original return address */ bool chained; /* true, if instance is nested */ diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 1c71b6242a7e..c5f316e06dc0 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1562,6 +1562,7 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) ri->uprobe = get_uprobe(uprobe); ri->func = instruction_pointer(regs); + ri->stack = user_stack_pointer(regs); ri->orig_ret_vaddr = orig_ret_vaddr; ri->chained = chained; -- cgit v1.2.3 From 5eeb50de42fd3251845d03c556db012267c72b3f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:21 +0200 Subject: uprobes: Change handle_trampoline() to flush the frames invalidated by longjmp() Test-case: #include <stdio.h> #include <setjmp.h> jmp_buf jmp; void func_2(void) { longjmp(jmp, 1); } void func_1(void) { if (setjmp(jmp)) return; func_2(); printf("ERR!! I am running on the caller's stack\n"); } int main(void) { func_1(); return 0; } fails if you probe func_1() and func_2() because handle_trampoline() assumes that the probed function must return and hit the bp installed by prepare_uretprobe(). But in this case func_2() does not return, so when func_1() returns the kernel uses the no longer valid return_instance of func_2(). Change handle_trampoline() to unwind ->return_instances until we know that the next chain is alive or NULL; this ensures that the current chain is the last we need to report and free. Alternatively, every return_instance could use a unique trampoline_vaddr; in this case we could use it as a key. And this could solve the problem with sigaltstack() automatically. But this approach needs more changes, and it puts a "hard" limit on MAX_URETPROBE_DEPTH. Plus it cannot solve another problem partially fixed by the next patch. Note: this change has no effect on !x86, the arch-agnostic version of arch_uretprobe_is_alive() just returns "true". TODO: as documented by the previous change, arch_uretprobe_is_alive() can be fooled by sigaltstack/etc.
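To make the new control flow concrete, here is a hedged walk-through of handle_trampoline() (as changed below) on the state left by the test-case above; neither frame is chained, so each forms a one-entry chain:

	/*
	 * return_instances: ri_f2 (dead, skipped by longjmp) -> ri_f1 -> NULL
	 * The trampoline is hit when func_1() really returns.
	 *
	 * pass 1: next = ri_f1; arch_uretprobe_is_alive(ri_f1) is false
	 *         (func_1 has already popped its frame), so valid = false
	 *         and ri_f2 is freed *without* running its handlers.
	 * pass 2: ri = ri_f1, next = NULL, so valid = true: ri_f1 is
	 *         reported via handle_uretprobe_chain() and freed, and the
	 *         regs end up at ri_f1->orig_ret_vaddr as before.
	 */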
Tested-by: Pratyush Anand Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Anton Arapov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134021.GA4773@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index c5f316e06dc0..93d939c80cd9 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1774,6 +1774,7 @@ static void handle_trampoline(struct pt_regs *regs) { struct uprobe_task *utask; struct return_instance *ri, *next; + bool valid; utask = current->utask; if (!utask) @@ -1783,18 +1784,24 @@ static void handle_trampoline(struct pt_regs *regs) if (!ri) goto sigill; - next = find_next_ret_chain(ri); - /* - * TODO: we should throw out return_instance's invalidated by - * longjmp(), currently we assume that the probed function always - * returns. - */ - instruction_pointer_set(regs, ri->orig_ret_vaddr); do { - handle_uretprobe_chain(ri, regs); - ri = free_ret_instance(ri); - utask->depth--; - } while (ri != next); + /* + * We should throw out the frames invalidated by longjmp(). + * If this chain is valid, then the next one should be alive + * or NULL; the latter case means that nobody but ri->func + * could hit this trampoline on return. TODO: sigaltstack(). + */ + next = find_next_ret_chain(ri); + valid = !next || arch_uretprobe_is_alive(next, regs); + + instruction_pointer_set(regs, ri->orig_ret_vaddr); + do { + if (valid) + handle_uretprobe_chain(ri, regs); + ri = free_ret_instance(ri); + utask->depth--; + } while (ri != next); + } while (!valid); utask->return_instances = ri; return; -- cgit v1.2.3 From a5b7e1a89b820f2b9b23634ca4c59b555e8d9a0d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:23 +0200 Subject: uprobes: Change prepare_uretprobe() to (try to) flush the dead frames Change prepare_uretprobe() to flush the !arch_uretprobe_is_alive() return_instance's. This is not needed correctness-wise, but can help to avoid the failure caused by MAX_URETPROBE_DEPTH. Note: in this case arch_uretprobe_is_alive() can be false positive, the stack can grow after longjmp(). Unfortunately, the kernel can't 100% solve this problem, but see the next patch. 
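The failure being avoided is easy to picture (a hedged illustration; the value of MAX_URETPROBE_DEPTH, 64 in the kernel of this era, is an assumption not shown in these hunks):

	/*
	 * Each longjmp() out of a probed function leaves a dead
	 * return_instance behind, and utask->depth never comes back down:
	 *
	 *   enter probed func -> prepare_uretprobe(): depth = 1, 2, ... 64
	 *   longjmp()         -> frame dies, instance stays
	 *
	 * Once the limit is hit, prepare_uretprobe() refuses to arm the
	 * probe; flushing the !alive entries on the next call resets this.
	 */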
Tested-by: Pratyush Anand Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Anton Arapov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134023.GA4776@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 93d939c80cd9..7e61c8ca27e0 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1511,6 +1511,16 @@ static unsigned long get_trampoline_vaddr(void) return trampoline_vaddr; } +static void cleanup_return_instances(struct uprobe_task *utask, struct pt_regs *regs) +{ + struct return_instance *ri = utask->return_instances; + while (ri && !arch_uretprobe_is_alive(ri, regs)) { + ri = free_ret_instance(ri); + utask->depth--; + } + utask->return_instances = ri; +} + static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) { struct return_instance *ri; @@ -1541,6 +1551,9 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) if (orig_ret_vaddr == -1) goto fail; + /* drop the entries invalidated by longjmp() */ + cleanup_return_instances(utask, regs); + /* * We don't want to keep trampoline address in stack, rather keep the * original return address of first caller thru all the consequent -- cgit v1.2.3 From 86dcb702e74b8ab7d3b2d36984ef00671cea73b9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:26 +0200 Subject: uprobes: Add the "enum rp_check ctx" arg to arch_uretprobe_is_alive() arch/x86 doesn't care (so far), but as Pratyush Anand pointed out other architectures might want to know why arch_uretprobe_is_alive() was called and use different checks depending on the context. Add the new argument to distinguish the two callers.
Tested-by: Pratyush Anand Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Anton Arapov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134026.GA4779@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/uprobes.c | 3 ++- include/linux/uprobes.h | 7 ++++++- kernel/events/uprobes.c | 9 ++++++--- 3 files changed, 14 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 58e9b842633f..acf8b9010bbf 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -986,7 +986,8 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs return -1; } -bool arch_uretprobe_is_alive(struct return_instance *ret, struct pt_regs *regs) +bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, + struct pt_regs *regs) { return regs->sp <= ret->stack; } diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 7ab6d2c8be49..c0a540239ab6 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -102,6 +102,11 @@ struct return_instance { struct return_instance *next; /* keep as stack */ }; +enum rp_check { + RP_CHECK_CALL, + RP_CHECK_RET, +}; + struct xol_area; struct uprobes_state { @@ -138,7 +143,7 @@ extern bool arch_uprobe_xol_was_trapped(struct task_struct *tsk); extern int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data); extern void arch_uprobe_abort_xol(struct arch_uprobe *aup, struct pt_regs *regs); extern unsigned long arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs); -extern bool arch_uretprobe_is_alive(struct return_instance *ret, struct pt_regs *regs); +extern bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, struct pt_regs *regs); extern bool arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs); extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, void *src, unsigned long len); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7e61c8ca27e0..df5661a44e35 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1514,7 +1514,9 @@ static unsigned long get_trampoline_vaddr(void) static void cleanup_return_instances(struct uprobe_task *utask, struct pt_regs *regs) { struct return_instance *ri = utask->return_instances; - while (ri && !arch_uretprobe_is_alive(ri, regs)) { + enum rp_check ctx = RP_CHECK_CALL; + + while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) { ri = free_ret_instance(ri); utask->depth--; } @@ -1805,7 +1807,7 @@ static void handle_trampoline(struct pt_regs *regs) * could hit this trampoline on return. TODO: sigaltstack(). 
*/ next = find_next_ret_chain(ri); - valid = !next || arch_uretprobe_is_alive(next, regs); + valid = !next || arch_uretprobe_is_alive(next, RP_CHECK_RET, regs); instruction_pointer_set(regs, ri->orig_ret_vaddr); do { @@ -1830,7 +1832,8 @@ bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs) return false; } -bool __weak arch_uretprobe_is_alive(struct return_instance *ret, struct pt_regs *regs) +bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, + struct pt_regs *regs) { return true; } -- cgit v1.2.3 From db087ef69a2b155ae001665bf0b3806abde7ee34 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:28 +0200 Subject: uprobes/x86: Make arch_uretprobe_is_alive(RP_CHECK_CALL) more clever The previous change documents that cleanup_return_instances() can't always detect the dead frames; the stack can grow. But there is one special case which is imho worth fixing: arch_uretprobe_is_alive() can return true when the stack didn't actually grow, but the next "call" insn uses the already invalidated frame. Test-case: #include <stdio.h> #include <setjmp.h> jmp_buf jmp; int nr = 1024; void func_2(void) { if (--nr == 0) return; longjmp(jmp, 1); } void func_1(void) { setjmp(jmp); func_2(); } int main(void) { func_1(); return 0; } If you ret-probe func_1() and func_2(), prepare_uretprobe() hits the MAX_URETPROBE_DEPTH limit and the "return" from func_2() is not reported. When we know that the new call is not chained, we can do the more strict check. In this case "sp" points to the new ret-addr, so every frame which uses the same "sp" must be dead. The only complication is that arch_uretprobe_is_alive() needs to know whether it was chained or not, so we add the new RP_CHECK_CHAIN_CALL enum value and change prepare_uretprobe() to pass RP_CHECK_CALL only if !chained. Note: arch_uretprobe_is_alive() could also re-read *sp and check if this word is still trampoline_vaddr. This could obviously improve the logic, but I would like to avoid another copy_from_user(), especially in the case when we can't avoid the false "alive == T" positives.
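A worked example of why the stricter comparison is safe for a non-chained call (made-up addresses):

	/*
	 * "call" has just pushed the return address, so at the breakpoint
	 * regs->sp points at the brand-new ret-addr slot, say 0x7fff0ff8.
	 *
	 * Any older instance with ri->stack == 0x7fff0ff8 recorded its
	 * ret-addr in the very word the new call just overwrote, so that
	 * frame cannot still be live. Hence, per the hunks below:
	 *
	 *   RP_CHECK_CALL:                      regs->sp <  ri->stack
	 *   RP_CHECK_CHAIN_CALL, RP_CHECK_RET:  regs->sp <= ri->stack
	 */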
Tested-by: Pratyush Anand Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Anton Arapov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134028.GA4786@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/uprobes.c | 5 ++++- include/linux/uprobes.h | 1 + kernel/events/uprobes.c | 14 +++++++------- 3 files changed, 12 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index acf8b9010bbf..bf4db6eaec8f 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -989,5 +989,8 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, struct pt_regs *regs) { - return regs->sp <= ret->stack; + if (ctx == RP_CHECK_CALL) /* sp was just decremented by "call" insn */ + return regs->sp < ret->stack; + else + return regs->sp <= ret->stack; } diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index c0a540239ab6..0bdc72f36905 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -104,6 +104,7 @@ struct return_instance { enum rp_check { RP_CHECK_CALL, + RP_CHECK_CHAIN_CALL, RP_CHECK_RET, }; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index df5661a44e35..0f370ef57a02 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1511,10 +1511,11 @@ static unsigned long get_trampoline_vaddr(void) return trampoline_vaddr; } -static void cleanup_return_instances(struct uprobe_task *utask, struct pt_regs *regs) +static void cleanup_return_instances(struct uprobe_task *utask, bool chained, + struct pt_regs *regs) { struct return_instance *ri = utask->return_instances; - enum rp_check ctx = RP_CHECK_CALL; + enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL; while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) { ri = free_ret_instance(ri); @@ -1528,7 +1529,7 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) struct return_instance *ri; struct uprobe_task *utask; unsigned long orig_ret_vaddr, trampoline_vaddr; - bool chained = false; + bool chained; if (!get_xol_area()) return; @@ -1554,14 +1555,15 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) goto fail; /* drop the entries invalidated by longjmp() */ - cleanup_return_instances(utask, regs); + chained = (orig_ret_vaddr == trampoline_vaddr); + cleanup_return_instances(utask, chained, regs); /* * We don't want to keep trampoline address in stack, rather keep the * original return address of first caller thru all the consequent * instances. This also makes breakpoint unwrapping easier. */ - if (orig_ret_vaddr == trampoline_vaddr) { + if (chained) { if (!utask->return_instances) { /* * This situation is not possible. 
Likely we have an @@ -1570,8 +1572,6 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) uprobe_warn(current, "handle tail call"); goto fail; } - - chained = true; orig_ret_vaddr = utask->return_instances->orig_ret_vaddr; } -- cgit v1.2.3 From f58bea2fec63db72f8050ade709358257e9102ab Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:31 +0200 Subject: uprobes: Fix the usage of install_special_mapping() install_special_mapping(pages) expects that "pages" is the zero- terminated array while xol_add_vma() passes &area->page, this means that special_mapping_fault() can wrongly use the next member in xol_area (vaddr) as "struct page *". Fortunately, this area is not expandable so pgoff != 0 isn't possible (modulo bugs in special_mapping_vmops), but still this does not look good. Signed-off-by: Oleg Nesterov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Pratyush Anand Cc: Srikar Dronamraju Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134031.GA4789@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 0f370ef57a02..4b8ac5f13320 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -99,7 +99,7 @@ struct xol_area { wait_queue_head_t wq; /* if all slots are busy */ atomic_t slot_count; /* number of in-use slots */ unsigned long *bitmap; /* 0 = free slot */ - struct page *page; + struct page *pages[2]; /* * We keep the vma's vm_start rather than a pointer to the vma @@ -1142,7 +1142,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) } ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE, - VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, &area->page); + VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, area->pages); if (ret) goto fail; @@ -1168,21 +1168,22 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) if (!area->bitmap) goto free_area; - area->page = alloc_page(GFP_HIGHUSER); - if (!area->page) + area->pages[0] = alloc_page(GFP_HIGHUSER); + if (!area->pages[0]) goto free_bitmap; + area->pages[1] = NULL; area->vaddr = vaddr; init_waitqueue_head(&area->wq); /* Reserve the 1st slot for get_trampoline_vaddr() */ set_bit(0, area->bitmap); atomic_set(&area->slot_count, 1); - copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE); + copy_to_page(area->pages[0], 0, &insn, UPROBE_SWBP_INSN_SIZE); if (!xol_add_vma(mm, area)) return area; - __free_page(area->page); + __free_page(area->pages[0]); free_bitmap: kfree(area->bitmap); free_area: @@ -1220,7 +1221,7 @@ void uprobe_clear_state(struct mm_struct *mm) if (!area) return; - put_page(area->page); + put_page(area->pages[0]); kfree(area->bitmap); kfree(area); } @@ -1289,7 +1290,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe) if (unlikely(!xol_vaddr)) return 0; - arch_uprobe_copy_ixol(area->page, xol_vaddr, + arch_uprobe_copy_ixol(area->pages[0], xol_vaddr, &uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); return xol_vaddr; -- cgit v1.2.3 From 704bde3cc26a4cb34386c164107b59e09745a022 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:33 +0200 Subject: uprobes: Use vm_special_mapping to name the XOL vma Change xol_add_vma() to use _install_special_mapping(), this way we can name the vma installed by uprobes. Currently it looks like private anonymous mapping, this is confusing and complicates the debugging. 
With this change /proc/$pid/maps reports "[uprobes]". As a side effect this will cause core dumps to include the XOL vma and I think this is good; this can help to debug the problem if the app crashed because it was probed. Signed-off-by: Oleg Nesterov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Pratyush Anand Cc: Srikar Dronamraju Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134033.GA4796@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4b8ac5f13320..2d5b7bd337a7 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -96,17 +96,18 @@ struct uprobe { * allocated. */ struct xol_area { - wait_queue_head_t wq; /* if all slots are busy */ - atomic_t slot_count; /* number of in-use slots */ - unsigned long *bitmap; /* 0 = free slot */ - struct page *pages[2]; + wait_queue_head_t wq; /* if all slots are busy */ + atomic_t slot_count; /* number of in-use slots */ + unsigned long *bitmap; /* 0 = free slot */ + struct vm_special_mapping xol_mapping; + struct page *pages[2]; /* * We keep the vma's vm_start rather than a pointer to the vma * itself. The probed process or a naughty kernel module could make * the vma go away, and we must handle that reasonably gracefully. */ - unsigned long vaddr; /* Page(s) of instruction slots */ + unsigned long vaddr; /* Page(s) of instruction slots */ }; /* @@ -1125,11 +1126,14 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon /* Slot allocation for XOL */ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) { - int ret = -EALREADY; + struct vm_area_struct *vma; + int ret; down_write(&mm->mmap_sem); - if (mm->uprobes_state.xol_area) + if (mm->uprobes_state.xol_area) { + ret = -EALREADY; goto fail; + } if (!area->vaddr) { /* Try to map as high as possible, this is only a hint. */ @@ -1141,11 +1145,15 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) } } - ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE, - VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, area->pages); - if (ret) + vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE, + VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, + &area->xol_mapping); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); goto fail; + } + ret = 0; smp_wmb(); /* pairs with get_xol_area() */ mm->uprobes_state.xol_area = area; fail: @@ -1168,6 +1176,8 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) if (!area->bitmap) goto free_area; + area->xol_mapping.name = "[uprobes]"; + area->xol_mapping.pages = area->pages; area->pages[0] = alloc_page(GFP_HIGHUSER); if (!area->pages[0]) goto free_bitmap; -- cgit v1.2.3 From 2a742cedcf13572999436676cbe36c3a9b733b0f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:36 +0200 Subject: uprobes: Fix the waitqueue_active() check in xol_free_insn_slot() The xol_free_insn_slot()->waitqueue_active() check is buggy. We need mb() after we set the conditon for wait_event(), or xol_take_insn_slot() can miss the wakeup. 
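The missed wakeup being closed, as an interleaving sketch (schematic, simplified from xol_take_insn_slot()/xol_free_insn_slot()):

	/*
	 *   waiter                             waker
	 *   ------                             -----
	 *   prepare_to_wait(&area->wq);        clear_bit(slot, bitmap);
	 *   test_and_set_bit() -> still busy;  atomic_dec(&slot_count);
	 *   schedule();                        waitqueue_active(&area->wq)?
	 *
	 * Without a barrier, the waker's waitqueue_active() load can be
	 * reordered before its bitmap/count stores: it may see an empty
	 * queue (the waiter is only just enqueueing) and skip wake_up(),
	 * while the waiter still sees the slot as busy and sleeps forever.
	 * smp_mb__after_atomic() orders the stores against the load,
	 * pairing with the barrier implied by prepare_to_wait().
	 */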
Signed-off-by: Oleg Nesterov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Pratyush Anand Cc: Srikar Dronamraju Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134036.GA4799@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 2d5b7bd337a7..4e5e9798aa0c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1337,6 +1337,7 @@ static void xol_free_insn_slot(struct task_struct *tsk) clear_bit(slot_nr, area->bitmap); atomic_dec(&area->slot_count); + smp_mb__after_atomic(); /* pairs with prepare_to_wait() */ if (waitqueue_active(&area->wq)) wake_up(&area->wq); -- cgit v1.2.3 From e5779e8e12299f77c2421a707855d8d124171d85 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 30 Jul 2015 20:32:40 -0700 Subject: perf/x86/hw_breakpoints: Disallow kernel breakpoints unless kprobe-safe Code on the kprobe blacklist doesn't want unexpected int3 exceptions. It probably doesn't want unexpected debug exceptions either. Be safe: disallow breakpoints in nokprobes code. On non-CONFIG_KPROBES kernels, there is no kprobe blacklist. In that case, disallow kernel breakpoints entirely. It will be particularly important to keep hw breakpoints out of the entry and NMI code once we move debug exceptions off the IST stack. Signed-off-by: Andy Lutomirski Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Brian Gerst Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/e14b152af99640448d895e3c2a8c2d5ee19a1325.1438312874.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/hw_breakpoint.c | 15 +++++++++++++++ include/linux/kprobes.h | 2 ++ kernel/kprobes.c | 2 +- 3 files changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 7114ba220fd4..78f3e90c5659 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -243,6 +244,20 @@ static int arch_build_bp_info(struct perf_event *bp) info->type = X86_BREAKPOINT_RW; break; case HW_BREAKPOINT_X: + /* + * We don't allow kernel breakpoints in places that are not + * acceptable for kprobes. On non-kprobes kernels, we don't + * allow kernel breakpoints at all. + */ + if (bp->attr.bp_addr >= TASK_SIZE_MAX) { +#ifdef CONFIG_KPROBES + if (within_kprobe_blacklist(bp->attr.bp_addr)) + return -EINVAL; +#else + return -EINVAL; +#endif + } + info->type = X86_BREAKPOINT_EXECUTE; /* * x86 inst breakpoints need to have a specific undefined len. 
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 1ab54754a86d..8f6849084248 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -267,6 +267,8 @@ extern void show_registers(struct pt_regs *regs); extern void kprobes_inc_nmissed_count(struct kprobe *p); extern bool arch_within_kprobe_blacklist(unsigned long addr); +extern bool within_kprobe_blacklist(unsigned long addr); + struct kprobe_insn_cache { struct mutex mutex; void *(*alloc)(void); /* allocate insn page */ diff --git a/kernel/kprobes.c b/kernel/kprobes.c index c90e417bb963..d10ab6b9b5e0 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1332,7 +1332,7 @@ bool __weak arch_within_kprobe_blacklist(unsigned long addr) addr < (unsigned long)__kprobes_text_end; } -static bool within_kprobe_blacklist(unsigned long addr) +bool within_kprobe_blacklist(unsigned long addr) { struct kprobe_blacklist_entry *ent; -- cgit v1.2.3 From 9a6694cfa2390181dec936a17c0d9d21ef7b08d9 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Thu, 30 Jul 2015 16:48:24 +0300 Subject: perf/x86/intel/pt: Do not force sync packets on every schedule-in Currently, the PT driver zeroes out the status register every time before starting the event. However, all the writable bits are already taken care of in the pt_handle_status() function, except the new PacketByteCnt field, which in new versions of PT contains the number of packet bytes written since the last sync (PSB) packet. Zeroing it out before enabling PT forces a sync packet to be written. This means that, with the existing code, a sync packet (PSB and PSBEND, 18 bytes in total) will be generated every time a PT event is scheduled in. To avoid these unnecessary syncs and save a WRMSR in the fast path, this patch changes the default behavior to not clear the PacketByteCnt field, so that the sync packets will be generated with the period specified in the "psb_period" attribute config field. This has little impact on the trace data, as the other packets that are normally sent within PSB+ (between PSB and PSBEND) have their own generation scenarios which do not depend on the sync packets. One exception where we do need to force PSB like this is when tracing starts, so that the decoder has a clear sync point in the trace. For this purpose we already have the hw::itrace_started flag, which we are currently using to output PERF_RECORD_ITRACE_START. This patch moves the setting of itrace_started from the perf core to pmu::start, where it should still be 0 on the very first run.
Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: hpa@zytor.com Link: http://lkml.kernel.org/r/1438264104-16189-1-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_pt.c | 7 +++++-- kernel/events/core.c | 2 -- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c index 183de719628d..cc58ef8d9ada 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_pt.c +++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c @@ -191,6 +191,11 @@ static void pt_config(struct perf_event *event) { u64 reg; + if (!event->hw.itrace_started) { + event->hw.itrace_started = 1; + wrmsrl(MSR_IA32_RTIT_STATUS, 0); + } + reg = RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN; if (!event->attr.exclude_kernel) @@ -910,7 +915,6 @@ void intel_pt_interrupt(void) pt_config_buffer(buf->cur->table, buf->cur_idx, buf->output_off); - wrmsrl(MSR_IA32_RTIT_STATUS, 0); pt_config(event); } } @@ -934,7 +938,6 @@ static void pt_event_start(struct perf_event *event, int mode) pt_config_buffer(buf->cur->table, buf->cur_idx, buf->output_off); - wrmsrl(MSR_IA32_RTIT_STATUS, 0); pt_config(event); } diff --git a/kernel/events/core.c b/kernel/events/core.c index a9796c8ff7e0..bdea12924b11 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6139,8 +6139,6 @@ static void perf_log_itrace_start(struct perf_event *event) event->hw.itrace_started) return; - event->hw.itrace_started = 1; - rec.header.type = PERF_RECORD_ITRACE_START; rec.header.misc = 0; rec.header.size = sizeof(rec); -- cgit v1.2.3 From 04a22fae4cbc1f7d3f7471e9b36359f98bd3f043 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 1 Jul 2015 02:13:50 +0000 Subject: tracing, perf: Implement BPF programs attached to uprobes By copying the BPF-related operations to the uprobe processing path, this patch allows users to attach BPF programs to uprobes the same way they already do on kprobes. After this patch, users are allowed to use PERF_EVENT_IOC_SET_BPF on a uprobe perf event, which makes it possible to profile user space programs and kernel events together using BPF. Because of this patch, CONFIG_BPF_EVENTS should be selected by CONFIG_UPROBE_EVENT to ensure trace_call_bpf() is compiled even if KPROBE_EVENT is not set.
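From userspace, what this enables looks roughly like the following (a hedged sketch: the event-creation flow via uprobe_events/perf_event_open is assumed from the usual tracefs usage, and error handling is omitted):

	#include <sys/ioctl.h>
	#include <linux/perf_event.h>

	/* uprobe_fd: a perf_event_open()ed PERF_TYPE_TRACEPOINT event whose
	 * config is the tracefs id of a probe added via uprobe_events.
	 * prog_fd:   fd of a loaded BPF_PROG_TYPE_KPROBE program.          */
	static int attach_bpf_to_uprobe(int uprobe_fd, int prog_fd)
	{
		/* Before this patch the ioctl failed with -EINVAL here,
		 * because only TRACE_EVENT_FL_KPROBE events were accepted. */
		if (ioctl(uprobe_fd, PERF_EVENT_IOC_SET_BPF, prog_fd))
			return -1;
		return ioctl(uprobe_fd, PERF_EVENT_IOC_ENABLE, 0);
	}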
Signed-off-by: Wang Nan Acked-by: Alexei Starovoitov Cc: Brendan Gregg Cc: Daniel Borkmann Cc: David Ahern Cc: He Kuang Cc: Jiri Olsa Cc: Kaixu Xia Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1435716878-189507-3-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- include/linux/trace_events.h | 5 +++++ kernel/events/core.c | 4 ++-- kernel/trace/Kconfig | 2 +- kernel/trace/trace_uprobe.c | 5 +++++ 4 files changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 180dbf8720f9..ed27917cabc9 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -243,6 +243,7 @@ enum { TRACE_EVENT_FL_USE_CALL_FILTER_BIT, TRACE_EVENT_FL_TRACEPOINT_BIT, TRACE_EVENT_FL_KPROBE_BIT, + TRACE_EVENT_FL_UPROBE_BIT, }; /* @@ -257,6 +258,7 @@ enum { * USE_CALL_FILTER - For trace internal events, don't use file filter * TRACEPOINT - Event is a tracepoint * KPROBE - Event is a kprobe + * UPROBE - Event is a uprobe */ enum { TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT), @@ -267,8 +269,11 @@ enum { TRACE_EVENT_FL_USE_CALL_FILTER = (1 << TRACE_EVENT_FL_USE_CALL_FILTER_BIT), TRACE_EVENT_FL_TRACEPOINT = (1 << TRACE_EVENT_FL_TRACEPOINT_BIT), TRACE_EVENT_FL_KPROBE = (1 << TRACE_EVENT_FL_KPROBE_BIT), + TRACE_EVENT_FL_UPROBE = (1 << TRACE_EVENT_FL_UPROBE_BIT), }; +#define TRACE_EVENT_FL_UKPROBE (TRACE_EVENT_FL_KPROBE | TRACE_EVENT_FL_UPROBE) + struct trace_event_call { struct list_head list; struct trace_event_class *class; diff --git a/kernel/events/core.c b/kernel/events/core.c index bdea12924b11..77f9e5d0e2d1 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6846,8 +6846,8 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) if (event->tp_event->prog) return -EEXIST; - if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) - /* bpf programs can only be attached to kprobes */ + if (!(event->tp_event->flags & TRACE_EVENT_FL_UKPROBE)) + /* bpf programs can only be attached to u/kprobes */ return -EINVAL; prog = bpf_prog_get(prog_fd); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 3b9a48ae153a..1153c43428f3 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -434,7 +434,7 @@ config UPROBE_EVENT config BPF_EVENTS depends on BPF_SYSCALL - depends on KPROBE_EVENT + depends on KPROBE_EVENT || UPROBE_EVENT bool default y help diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index aa1ea7b36fa8..f97479f1ce35 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1095,11 +1095,15 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, { struct trace_event_call *call = &tu->tp.call; struct uprobe_trace_entry_head *entry; + struct bpf_prog *prog = call->prog; struct hlist_head *head; void *data; int size, esize; int rctx; + if (prog && !trace_call_bpf(prog, regs)) + return; + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); size = esize + tu->tp.size + dsize; @@ -1289,6 +1293,7 @@ static int register_uprobe_event(struct trace_uprobe *tu) return -ENODEV; } + call->flags = TRACE_EVENT_FL_UPROBE; call->class->reg = trace_uprobe_register; call->data = tu; ret = trace_add_event_call(call); -- cgit v1.2.3 From c2ad6b51efc5f27d70ce952decd2a15679b83600 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 28 Jul 2015 09:00:04 +0300 Subject: perf/ring-buffer: Clarify the use of page::private for high-order 
AUX allocations A question [1] was raised about the use of page::private in AUX buffer allocations, so let's add a clarification about its intended use. The private field and flag are used by perf's rb_alloc_aux() path to tell the pmu driver the size of each high-order allocation, so that the driver can program those appropriately into its hardware. This only matters for PMUs that don't support hardware scatter tables. Otherwise, every page in the buffer is just a page. This patch adds a comment about the private field to the AUX buffer allocation path. [1] http://marc.info/?l=linux-kernel&m=143803696607968 Reported-by: Mathieu Poirier Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1438063204-665-1-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index c8aa3f75bc4d..182bc30899d5 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -437,7 +437,10 @@ static struct page *rb_alloc_aux_page(int node, int order) if (page && order) { /* - * Communicate the allocation size to the driver + * Communicate the allocation size to the driver: + * if we managed to secure a high-order allocation, + * set its first page's private to this order; + * !PagePrivate(page) means it's just a normal page. */ split_page(page, order); SetPagePrivate(page); -- cgit v1.2.3 From a2fb3382edbea83c6f2bf6ac15e3673b2e254aad Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 26 Aug 2015 10:57:46 +0000 Subject: tracing/uprobes: Do not print '0x (null)' when offset is 0 When manually added uprobe point with zero address, 'uprobe_events' output '(null)' instead of 0x00000000: # echo p:probe_libc/abs_0 /path/to/lib.bin:0x0 arg1=%ax > \ /sys/kernel/debug/tracing/uprobe_events # cat /sys/kernel/debug/tracing/uprobe_events p:probe_libc/abs_0 /path/to/lib.bin:0x (null) arg1=%ax This patch fixes this behavior: # cat /sys/kernel/debug/tracing/uprobe_events p:probe_libc/abs_0 /path/to/lib.bin:0x0000000000000000 Signed-off-by: Wang Nan Acked-by: Masami Hiramatsu Cc: Namhyung Kim Cc: Steven Rostedt Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1440586666-235233-8-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- kernel/trace/trace_uprobe.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index f97479f1ce35..d2f6d0be3503 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -601,7 +601,22 @@ static int probes_seq_show(struct seq_file *m, void *v) seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system, trace_event_name(&tu->tp.call)); - seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset); + seq_printf(m, " %s:", tu->filename); + + /* Don't print "0x (null)" when offset is 0 */ + if (tu->offset) { + seq_printf(m, "0x%p", (void *)tu->offset); + } else { + switch (sizeof(void *)) { + case 4: + seq_printf(m, "0x00000000"); + break; + case 8: + default: + seq_printf(m, "0x0000000000000000"); + break; + } + } for (i = 0; i < tu->tp.nr_args; i++) seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm); -- cgit v1.2.3
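As a closing editorial note on the last hunk: the switch on sizeof(void *) could equally be written with a computed field width. A hedged alternative (same output; an editor's sketch, not what the patch does):

	/* Editor's sketch only: zero-pad the offset to pointer width. */
	seq_printf(m, "0x%0*lx",
		   (int)(sizeof(void *) * 2), (unsigned long)tu->offset);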