diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2013-04-30 18:41:01 +0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-04-30 18:41:01 +0400 |
commit | e0972916e8fe943f342b0dd1c9d43dbf5bc261c2 (patch) | |
tree | 690c436f1f9b839c4ba34d17ab3efa63b97a2dce /kernel/events | |
parent | 1f889ec62c3f0d8913f3c32f9aff2a1e15099346 (diff) | |
parent | 5ac2b5c2721501a8f5c5e1cd4116cbc31ace6886 (diff) | |
download | linux-e0972916e8fe943f342b0dd1c9d43dbf5bc261c2.tar.xz |
Merge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull perf updates from Ingo Molnar:
"Features:
- Add "uretprobes" - an optimization to uprobes, like kretprobes are
an optimization to kprobes. "perf probe -x file sym%return" now
works like kretprobes. By Oleg Nesterov.
- Introduce per core aggregation in 'perf stat', from Stephane
Eranian.
- Add memory profiling via PEBS, from Stephane Eranian.
- Event group view for 'annotate' in --stdio, --tui and --gtk, from
Namhyung Kim.
- Add support for AMD NB and L2I "uncore" counters, by Jacob Shin.
- Add Ivy Bridge-EP uncore support, by Zheng Yan
- IBM zEnterprise EC12 oprofile support patchlet from Robert Richter.
- Add perf test entries for checking breakpoint overflow signal
handler issues, from Jiri Olsa.
- Add perf test entry for for checking number of EXIT events, from
Namhyung Kim.
- Add perf test entries for checking --cpu in record and stat, from
Jiri Olsa.
- Introduce perf stat --repeat forever, from Frederik Deweerdt.
- Add --no-demangle to report/top, from Namhyung Kim.
- PowerPC fixes plus a couple of cleanups/optimizations in uprobes
and trace_uprobes, by Oleg Nesterov.
Various fixes and refactorings:
- Fix dependency of the python binding wrt libtraceevent, from
Naohiro Aota.
- Simplify some perf_evlist methods and to allow 'stat' to share code
with 'record' and 'trace', by Arnaldo Carvalho de Melo.
- Remove dead code in related to libtraceevent integration, from
Namhyung Kim.
- Revert "perf sched: Handle PERF_RECORD_EXIT events" to get 'perf
sched lat' back working, by Arnaldo Carvalho de Melo
- We don't use Newt anymore, just plain libslang, by Arnaldo Carvalho
de Melo.
- Kill a bunch of die() calls, from Namhyung Kim.
- Fix build on non-glibc systems due to libio.h absence, from Cody P
Schafer.
- Remove some perf_session and tracing dead code, from David Ahern.
- Honor parallel jobs, fix from Borislav Petkov
- Introduce tools/lib/lk library, initially just removing duplication
among tools/perf and tools/vm. from Borislav Petkov
... and many more I missed to list, see the shortlog and git log for
more details."
* 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (136 commits)
perf/x86/intel/P4: Robistify P4 PMU types
perf/x86/amd: Fix AMD NB and L2I "uncore" support
perf/x86/amd: Remove old-style NB counter support from perf_event_amd.c
perf/x86: Check all MSRs before passing hw check
perf/x86/amd: Add support for AMD NB and L2I "uncore" counters
perf/x86/intel: Add Ivy Bridge-EP uncore support
perf/x86/intel: Fix SNB-EP CBO and PCU uncore PMU filter management
perf/x86: Avoid kfree() in CPU_{STARTING,DYING}
uprobes/perf: Avoid perf_trace_buf_prepare/submit if ->perf_events is empty
uprobes/tracing: Don't pass addr=ip to perf_trace_buf_submit()
uprobes/tracing: Change create_trace_uprobe() to support uretprobes
uprobes/tracing: Make seq_printf() code uretprobe-friendly
uprobes/tracing: Make register_uprobe_event() paths uretprobe-friendly
uprobes/tracing: Make uprobe_{trace,perf}_print() uretprobe-friendly
uprobes/tracing: Introduce is_ret_probe() and uretprobe_dispatcher()
uprobes/tracing: Introduce uprobe_{trace,perf}_print() helpers
uprobes/tracing: Generalize struct uprobe_trace_entry_head
uprobes/tracing: Kill the pointless local_save_flags/preempt_count calls
uprobes/tracing: Kill the pointless seq_print_ip_sym() call
uprobes/tracing: Kill the pointless task_pt_regs() calls
...
Diffstat (limited to 'kernel/events')
-rw-r--r-- | kernel/events/core.c | 30 | ||||
-rw-r--r-- | kernel/events/uprobes.c | 300 |
2 files changed, 281 insertions, 49 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c index dce6e13cf9d7..3820e3cefbae 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -37,6 +37,7 @@ #include <linux/ftrace_event.h> #include <linux/hw_breakpoint.h> #include <linux/mm_types.h> +#include <linux/cgroup.h> #include "internal.h" @@ -234,6 +235,20 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, #ifdef CONFIG_CGROUP_PERF /* + * perf_cgroup_info keeps track of time_enabled for a cgroup. + * This is a per-cpu dynamically allocated data structure. + */ +struct perf_cgroup_info { + u64 time; + u64 timestamp; +}; + +struct perf_cgroup { + struct cgroup_subsys_state css; + struct perf_cgroup_info __percpu *info; +}; + +/* * Must ensure cgroup is pinned (css_get) before calling * this function. In other words, we cannot call this function * if there is no cgroup event for the current CPU context. @@ -976,9 +991,15 @@ static void perf_event__header_size(struct perf_event *event) if (sample_type & PERF_SAMPLE_PERIOD) size += sizeof(data->period); + if (sample_type & PERF_SAMPLE_WEIGHT) + size += sizeof(data->weight); + if (sample_type & PERF_SAMPLE_READ) size += event->read_size; + if (sample_type & PERF_SAMPLE_DATA_SRC) + size += sizeof(data->data_src.val); + event->header_size = size; } @@ -4193,6 +4214,12 @@ void perf_output_sample(struct perf_output_handle *handle, perf_output_sample_ustack(handle, data->stack_user_size, data->regs_user.regs); + + if (sample_type & PERF_SAMPLE_WEIGHT) + perf_output_put(handle, data->weight); + + if (sample_type & PERF_SAMPLE_DATA_SRC) + perf_output_put(handle, data->data_src.val); } void perf_prepare_sample(struct perf_event_header *header, @@ -4782,6 +4809,9 @@ got_name: mmap_event->file_name = name; mmap_event->file_size = size; + if (!(vma->vm_flags & VM_EXEC)) + mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA; + mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; rcu_read_lock(); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index a567c8c7ef31..f3569747d629 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -75,6 +75,15 @@ struct uprobe { struct arch_uprobe arch; }; +struct return_instance { + struct uprobe *uprobe; + unsigned long func; + unsigned long orig_ret_vaddr; /* original return address */ + bool chained; /* true, if instance is nested */ + + struct return_instance *next; /* keep as stack */ +}; + /* * valid_vma: Verify if the specified vma is an executable vma * Relax restrictions while unregistering: vm_flags might have @@ -173,10 +182,31 @@ bool __weak is_swbp_insn(uprobe_opcode_t *insn) return *insn == UPROBE_SWBP_INSN; } -static void copy_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *opcode) +/** + * is_trap_insn - check if instruction is breakpoint instruction. + * @insn: instruction to be checked. + * Default implementation of is_trap_insn + * Returns true if @insn is a breakpoint instruction. + * + * This function is needed for the case where an architecture has multiple + * trap instructions (like powerpc). + */ +bool __weak is_trap_insn(uprobe_opcode_t *insn) +{ + return is_swbp_insn(insn); +} + +static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len) { void *kaddr = kmap_atomic(page); - memcpy(opcode, kaddr + (vaddr & ~PAGE_MASK), UPROBE_SWBP_INSN_SIZE); + memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len); + kunmap_atomic(kaddr); +} + +static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len) +{ + void *kaddr = kmap_atomic(page); + memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len); kunmap_atomic(kaddr); } @@ -185,7 +215,16 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t uprobe_opcode_t old_opcode; bool is_swbp; - copy_opcode(page, vaddr, &old_opcode); + /* + * Note: We only check if the old_opcode is UPROBE_SWBP_INSN here. + * We do not check if it is any other 'trap variant' which could + * be conditional trap instruction such as the one powerpc supports. + * + * The logic is that we do not care if the underlying instruction + * is a trap variant; uprobes always wins over any other (gdb) + * breakpoint. + */ + copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE); is_swbp = is_swbp_insn(&old_opcode); if (is_swbp_insn(new_opcode)) { @@ -204,7 +243,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t * Expect the breakpoint instruction to be the smallest size instruction for * the architecture. If an arch has variable length instruction and the * breakpoint instruction is not of the smallest length instruction - * supported by that architecture then we need to modify is_swbp_at_addr and + * supported by that architecture then we need to modify is_trap_at_addr and * write_opcode accordingly. This would never be a problem for archs that * have fixed length instructions. */ @@ -225,7 +264,6 @@ static int write_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t opcode) { struct page *old_page, *new_page; - void *vaddr_old, *vaddr_new; struct vm_area_struct *vma; int ret; @@ -246,15 +284,8 @@ retry: __SetPageUptodate(new_page); - /* copy the page now that we've got it stable */ - vaddr_old = kmap_atomic(old_page); - vaddr_new = kmap_atomic(new_page); - - memcpy(vaddr_new, vaddr_old, PAGE_SIZE); - memcpy(vaddr_new + (vaddr & ~PAGE_MASK), &opcode, UPROBE_SWBP_INSN_SIZE); - - kunmap_atomic(vaddr_new); - kunmap_atomic(vaddr_old); + copy_highpage(new_page, old_page); + copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); ret = anon_vma_prepare(vma); if (ret) @@ -477,30 +508,18 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn, unsigned long nbytes, loff_t offset) { struct page *page; - void *vaddr; - unsigned long off; - pgoff_t idx; - - if (!filp) - return -EINVAL; if (!mapping->a_ops->readpage) return -EIO; - - idx = offset >> PAGE_CACHE_SHIFT; - off = offset & ~PAGE_MASK; - /* * Ensure that the page that has the original instruction is * populated and in page-cache. */ - page = read_mapping_page(mapping, idx, filp); + page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp); if (IS_ERR(page)) return PTR_ERR(page); - vaddr = kmap_atomic(page); - memcpy(insn, vaddr + off, nbytes); - kunmap_atomic(vaddr); + copy_from_page(page, offset, insn, nbytes); page_cache_release(page); return 0; @@ -550,7 +569,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, goto out; ret = -ENOTSUPP; - if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) + if (is_trap_insn((uprobe_opcode_t *)uprobe->arch.insn)) goto out; ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); @@ -758,7 +777,7 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) down_write(&mm->mmap_sem); vma = find_vma(mm, info->vaddr); if (!vma || !valid_vma(vma, is_register) || - vma->vm_file->f_mapping->host != uprobe->inode) + file_inode(vma->vm_file) != uprobe->inode) goto unlock; if (vma->vm_start > info->vaddr || @@ -828,6 +847,10 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * struct uprobe *uprobe; int ret; + /* Uprobe must have at least one set consumer */ + if (!uc->handler && !uc->ret_handler) + return -EINVAL; + /* Racy, just to catch the obvious mistakes */ if (offset > i_size_read(inode)) return -EINVAL; @@ -917,7 +940,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) loff_t offset; if (!valid_vma(vma, false) || - vma->vm_file->f_mapping->host != uprobe->inode) + file_inode(vma->vm_file) != uprobe->inode) continue; offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT; @@ -1010,7 +1033,7 @@ int uprobe_mmap(struct vm_area_struct *vma) if (no_uprobe_events() || !valid_vma(vma, true)) return 0; - inode = vma->vm_file->f_mapping->host; + inode = file_inode(vma->vm_file); if (!inode) return 0; @@ -1041,7 +1064,7 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e struct inode *inode; struct rb_node *n; - inode = vma->vm_file->f_mapping->host; + inode = file_inode(vma->vm_file); min = vaddr_to_offset(vma, start); max = min + (end - start) - 1; @@ -1114,6 +1137,7 @@ static struct xol_area *get_xol_area(void) { struct mm_struct *mm = current->mm; struct xol_area *area; + uprobe_opcode_t insn = UPROBE_SWBP_INSN; area = mm->uprobes_state.xol_area; if (area) @@ -1131,7 +1155,12 @@ static struct xol_area *get_xol_area(void) if (!area->page) goto free_bitmap; + /* allocate first slot of task's xol_area for the return probes */ + set_bit(0, area->bitmap); + copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE); + atomic_set(&area->slot_count, 1); init_waitqueue_head(&area->wq); + if (!xol_add_vma(area)) return area; @@ -1216,9 +1245,7 @@ static unsigned long xol_take_insn_slot(struct xol_area *area) static unsigned long xol_get_insn_slot(struct uprobe *uprobe) { struct xol_area *area; - unsigned long offset; unsigned long xol_vaddr; - void *vaddr; area = get_xol_area(); if (!area) @@ -1229,10 +1256,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe) return 0; /* Initialize the slot */ - offset = xol_vaddr & ~PAGE_MASK; - vaddr = kmap_atomic(area->page); - memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES); - kunmap_atomic(vaddr); + copy_to_page(area->page, xol_vaddr, uprobe->arch.insn, MAX_UINSN_BYTES); /* * We probably need flush_icache_user_range() but it needs vma. * This should work on supported architectures too. @@ -1298,6 +1322,7 @@ unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs) void uprobe_free_utask(struct task_struct *t) { struct uprobe_task *utask = t->utask; + struct return_instance *ri, *tmp; if (!utask) return; @@ -1305,6 +1330,15 @@ void uprobe_free_utask(struct task_struct *t) if (utask->active_uprobe) put_uprobe(utask->active_uprobe); + ri = utask->return_instances; + while (ri) { + tmp = ri; + ri = ri->next; + + put_uprobe(tmp->uprobe); + kfree(tmp); + } + xol_free_insn_slot(t); kfree(utask); t->utask = NULL; @@ -1333,6 +1367,93 @@ static struct uprobe_task *get_utask(void) return current->utask; } +/* + * Current area->vaddr notion assume the trampoline address is always + * equal area->vaddr. + * + * Returns -1 in case the xol_area is not allocated. + */ +static unsigned long get_trampoline_vaddr(void) +{ + struct xol_area *area; + unsigned long trampoline_vaddr = -1; + + area = current->mm->uprobes_state.xol_area; + smp_read_barrier_depends(); + if (area) + trampoline_vaddr = area->vaddr; + + return trampoline_vaddr; +} + +static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) +{ + struct return_instance *ri; + struct uprobe_task *utask; + unsigned long orig_ret_vaddr, trampoline_vaddr; + bool chained = false; + + if (!get_xol_area()) + return; + + utask = get_utask(); + if (!utask) + return; + + if (utask->depth >= MAX_URETPROBE_DEPTH) { + printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to" + " nestedness limit pid/tgid=%d/%d\n", + current->pid, current->tgid); + return; + } + + ri = kzalloc(sizeof(struct return_instance), GFP_KERNEL); + if (!ri) + goto fail; + + trampoline_vaddr = get_trampoline_vaddr(); + orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs); + if (orig_ret_vaddr == -1) + goto fail; + + /* + * We don't want to keep trampoline address in stack, rather keep the + * original return address of first caller thru all the consequent + * instances. This also makes breakpoint unwrapping easier. + */ + if (orig_ret_vaddr == trampoline_vaddr) { + if (!utask->return_instances) { + /* + * This situation is not possible. Likely we have an + * attack from user-space. + */ + pr_warn("uprobe: unable to set uretprobe pid/tgid=%d/%d\n", + current->pid, current->tgid); + goto fail; + } + + chained = true; + orig_ret_vaddr = utask->return_instances->orig_ret_vaddr; + } + + atomic_inc(&uprobe->ref); + ri->uprobe = uprobe; + ri->func = instruction_pointer(regs); + ri->orig_ret_vaddr = orig_ret_vaddr; + ri->chained = chained; + + utask->depth++; + + /* add instance to the stack */ + ri->next = utask->return_instances; + utask->return_instances = ri; + + return; + + fail: + kfree(ri); +} + /* Prepare to single-step probed instruction out of line. */ static int pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) @@ -1431,7 +1552,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm) clear_bit(MMF_HAS_UPROBES, &mm->flags); } -static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) +static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) { struct page *page; uprobe_opcode_t opcode; @@ -1449,10 +1570,11 @@ static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) if (result < 0) return result; - copy_opcode(page, vaddr, &opcode); + copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); put_page(page); out: - return is_swbp_insn(&opcode); + /* This needs to return true for any variant of the trap insn */ + return is_trap_insn(&opcode); } static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) @@ -1465,14 +1587,14 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) vma = find_vma(mm, bp_vaddr); if (vma && vma->vm_start <= bp_vaddr) { if (valid_vma(vma, false)) { - struct inode *inode = vma->vm_file->f_mapping->host; + struct inode *inode = file_inode(vma->vm_file); loff_t offset = vaddr_to_offset(vma, bp_vaddr); uprobe = find_uprobe(inode, offset); } if (!uprobe) - *is_swbp = is_swbp_at_addr(mm, bp_vaddr); + *is_swbp = is_trap_at_addr(mm, bp_vaddr); } else { *is_swbp = -EFAULT; } @@ -1488,16 +1610,27 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) { struct uprobe_consumer *uc; int remove = UPROBE_HANDLER_REMOVE; + bool need_prep = false; /* prepare return uprobe, when needed */ down_read(&uprobe->register_rwsem); for (uc = uprobe->consumers; uc; uc = uc->next) { - int rc = uc->handler(uc, regs); + int rc = 0; + + if (uc->handler) { + rc = uc->handler(uc, regs); + WARN(rc & ~UPROBE_HANDLER_MASK, + "bad rc=0x%x from %pf()\n", rc, uc->handler); + } + + if (uc->ret_handler) + need_prep = true; - WARN(rc & ~UPROBE_HANDLER_MASK, - "bad rc=0x%x from %pf()\n", rc, uc->handler); remove &= rc; } + if (need_prep && !remove) + prepare_uretprobe(uprobe, regs); /* put bp at return */ + if (remove && uprobe->consumers) { WARN_ON(!uprobe_is_active(uprobe)); unapply_uprobe(uprobe, current->mm); @@ -1505,6 +1638,64 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) up_read(&uprobe->register_rwsem); } +static void +handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs) +{ + struct uprobe *uprobe = ri->uprobe; + struct uprobe_consumer *uc; + + down_read(&uprobe->register_rwsem); + for (uc = uprobe->consumers; uc; uc = uc->next) { + if (uc->ret_handler) + uc->ret_handler(uc, ri->func, regs); + } + up_read(&uprobe->register_rwsem); +} + +static bool handle_trampoline(struct pt_regs *regs) +{ + struct uprobe_task *utask; + struct return_instance *ri, *tmp; + bool chained; + + utask = current->utask; + if (!utask) + return false; + + ri = utask->return_instances; + if (!ri) + return false; + + /* + * TODO: we should throw out return_instance's invalidated by + * longjmp(), currently we assume that the probed function always + * returns. + */ + instruction_pointer_set(regs, ri->orig_ret_vaddr); + + for (;;) { + handle_uretprobe_chain(ri, regs); + + chained = ri->chained; + put_uprobe(ri->uprobe); + + tmp = ri; + ri = ri->next; + kfree(tmp); + + if (!chained) + break; + + utask->depth--; + + BUG_ON(!ri); + } + + utask->return_instances = ri; + + return true; +} + /* * Run handler and ask thread to singlestep. * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. @@ -1516,8 +1707,15 @@ static void handle_swbp(struct pt_regs *regs) int uninitialized_var(is_swbp); bp_vaddr = uprobe_get_swbp_addr(regs); - uprobe = find_active_uprobe(bp_vaddr, &is_swbp); + if (bp_vaddr == get_trampoline_vaddr()) { + if (handle_trampoline(regs)) + return; + + pr_warn("uprobe: unable to handle uretprobe pid/tgid=%d/%d\n", + current->pid, current->tgid); + } + uprobe = find_active_uprobe(bp_vaddr, &is_swbp); if (!uprobe) { if (is_swbp > 0) { /* No matching uprobe; signal SIGTRAP. */ @@ -1616,7 +1814,11 @@ void uprobe_notify_resume(struct pt_regs *regs) */ int uprobe_pre_sstep_notifier(struct pt_regs *regs) { - if (!current->mm || !test_bit(MMF_HAS_UPROBES, ¤t->mm->flags)) + if (!current->mm) + return 0; + + if (!test_bit(MMF_HAS_UPROBES, ¤t->mm->flags) && + (!current->utask || !current->utask->return_instances)) return 0; set_thread_flag(TIF_UPROBE); |