From 3c502e7a0255d82621ff25d60cc816624830497e Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 4 Nov 2010 17:33:01 -0500 Subject: perf,hw_breakpoint: Initialize hardware api earlier When using early debugging, the kernel does not initialize the hw_breakpoint API early enough and causes the late initialization of the kernel debugger to fail. The boot arguments are: earlyprintk=vga ekgdboc=kbd kgdbwait Then simply type "go" at the kdb prompt and boot. The kernel will later emit the message: kgdb: Could not allocate hwbreakpoints And at that point the kernel debugger will cease to work correctly. The solution is to initialize the hw_breakpoint at the same time that all the other perf call backs are initialized instead of using a core_initcall() initialization which happens well after the kernel debugger can make use of hardware breakpoints. Signed-off-by: Jason Wessel CC: Frederic Weisbecker CC: Ingo Molnar CC: Peter Zijlstra LKML-Reference: <4CD3396D.1090308@windriver.com> Signed-off-by: Frederic Weisbecker --- kernel/perf_event.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 517d827f4982..05b7d8c72c6c 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -31,6 +31,7 @@ #include #include #include +#include #include @@ -6295,6 +6296,8 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) void __init perf_event_init(void) { + int ret; + perf_event_init_all_cpus(); init_srcu_struct(&pmus_srcu); perf_pmu_register(&perf_swevent); @@ -6302,4 +6305,7 @@ void __init perf_event_init(void) perf_pmu_register(&perf_task_clock); perf_tp_register(); perf_cpu_notifier(perf_cpu_notify); + + ret = init_hw_breakpoint(); + WARN(ret, "hw_breakpoint initialization failed with: %d", ret); } -- cgit v1.2.3 From 8882135bcd332f294df5455747ea43ba9e6f77ad Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 9 Nov 2010 19:01:43 +0100 Subject: perf: Fix owner-list vs exit Oleg noticed that a perf-fd keeping a reference on the creating task leads to a few funny side effects. There's two different aspects to this: - kernel based perf-events, these should not take out a reference on the creating task and appear on the task's event list since they're not bound to fds nor visible to userspace. - fork() and pthread_create(), these can lead to the creating task dying (and thus the task's event-list becomming useless) but keeping the list and ref alive until the event is closed. Combined they lead to malfunction of the ptrace hw_tracepoints. Cure this by not considering kernel based perf_events for the owner-list and destroying the owner-list when the owner dies. Reported-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Acked-by: Oleg Nesterov LKML-Reference: <1289576883.2084.286.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 63 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 51 insertions(+), 12 deletions(-) (limited to 'kernel/perf_event.c') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index f818d9d2dc93..671f6c8c8a32 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2235,11 +2235,6 @@ int perf_event_release_kernel(struct perf_event *event) raw_spin_unlock_irq(&ctx->lock); mutex_unlock(&ctx->mutex); - mutex_lock(&event->owner->perf_event_mutex); - list_del_init(&event->owner_entry); - mutex_unlock(&event->owner->perf_event_mutex); - put_task_struct(event->owner); - free_event(event); return 0; @@ -2252,9 +2247,43 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel); static int perf_release(struct inode *inode, struct file *file) { struct perf_event *event = file->private_data; + struct task_struct *owner; file->private_data = NULL; + rcu_read_lock(); + owner = ACCESS_ONCE(event->owner); + /* + * Matches the smp_wmb() in perf_event_exit_task(). If we observe + * !owner it means the list deletion is complete and we can indeed + * free this event, otherwise we need to serialize on + * owner->perf_event_mutex. + */ + smp_read_barrier_depends(); + if (owner) { + /* + * Since delayed_put_task_struct() also drops the last + * task reference we can safely take a new reference + * while holding the rcu_read_lock(). + */ + get_task_struct(owner); + } + rcu_read_unlock(); + + if (owner) { + mutex_lock(&owner->perf_event_mutex); + /* + * We have to re-check the event->owner field, if it is cleared + * we raced with perf_event_exit_task(), acquiring the mutex + * ensured they're done, and we can proceed with freeing the + * event. + */ + if (event->owner) + list_del_init(&event->owner_entry); + mutex_unlock(&owner->perf_event_mutex); + put_task_struct(owner); + } + return perf_event_release_kernel(event); } @@ -5678,7 +5707,7 @@ SYSCALL_DEFINE5(perf_event_open, mutex_unlock(&ctx->mutex); event->owner = current; - get_task_struct(current); + mutex_lock(¤t->perf_event_mutex); list_add_tail(&event->owner_entry, ¤t->perf_event_list); mutex_unlock(¤t->perf_event_mutex); @@ -5746,12 +5775,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, ++ctx->generation; mutex_unlock(&ctx->mutex); - event->owner = current; - get_task_struct(current); - mutex_lock(¤t->perf_event_mutex); - list_add_tail(&event->owner_entry, ¤t->perf_event_list); - mutex_unlock(¤t->perf_event_mutex); - return event; err_free: @@ -5902,8 +5925,24 @@ again: */ void perf_event_exit_task(struct task_struct *child) { + struct perf_event *event, *tmp; int ctxn; + mutex_lock(&child->perf_event_mutex); + list_for_each_entry_safe(event, tmp, &child->perf_event_list, + owner_entry) { + list_del_init(&event->owner_entry); + + /* + * Ensure the list deletion is visible before we clear + * the owner, closes a race against perf_release() where + * we need to serialize on the owner->perf_event_mutex. + */ + smp_wmb(); + event->owner = NULL; + } + mutex_unlock(&child->perf_event_mutex); + for_each_task_context_nr(ctxn) perf_event_exit_task_context(child, ctxn); } -- cgit v1.2.3 From dddd3379a619a4cb8247bfd3c94ca9ae3797aa2e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 24 Nov 2010 10:05:55 +0100 Subject: perf: Fix inherit vs. context rotation bug It was found that sometimes children of tasks with inherited events had one extra event. Eventually it turned out to be due to the list rotation no being exclusive with the list iteration in the inheritance code. Cure this by temporarily disabling the rotation while we inherit the events. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra LKML-Reference: Cc: Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 1 + kernel/perf_event.c | 22 ++++++++++++++++++++-- 2 files changed, 21 insertions(+), 2 deletions(-) (limited to 'kernel/perf_event.c') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 40150f345982..142e3d6042c7 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -850,6 +850,7 @@ struct perf_event_context { int nr_active; int is_active; int nr_stat; + int rotate_disable; atomic_t refcount; struct task_struct *task; diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 671f6c8c8a32..f365dd8ef8b0 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1622,8 +1622,12 @@ static void rotate_ctx(struct perf_event_context *ctx) { raw_spin_lock(&ctx->lock); - /* Rotate the first entry last of non-pinned groups */ - list_rotate_left(&ctx->flexible_groups); + /* + * Rotate the first entry last of non-pinned groups. Rotation might be + * disabled by the inheritance code. + */ + if (!ctx->rotate_disable) + list_rotate_left(&ctx->flexible_groups); raw_spin_unlock(&ctx->lock); } @@ -6162,6 +6166,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn) struct perf_event *event; struct task_struct *parent = current; int inherited_all = 1; + unsigned long flags; int ret = 0; child->perf_event_ctxp[ctxn] = NULL; @@ -6202,6 +6207,15 @@ int perf_event_init_context(struct task_struct *child, int ctxn) break; } + /* + * We can't hold ctx->lock when iterating the ->flexible_group list due + * to allocations, but we need to prevent rotation because + * rotate_ctx() will change the list from interrupt context. + */ + raw_spin_lock_irqsave(&parent_ctx->lock, flags); + parent_ctx->rotate_disable = 1; + raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); + list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { ret = inherit_task_group(event, parent, parent_ctx, child, ctxn, &inherited_all); @@ -6209,6 +6223,10 @@ int perf_event_init_context(struct task_struct *child, int ctxn) break; } + raw_spin_lock_irqsave(&parent_ctx->lock, flags); + parent_ctx->rotate_disable = 0; + raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); + child_ctx = child->perf_event_ctxp[ctxn]; if (child_ctx && inherited_all) { -- cgit v1.2.3 From ee6dcfa40a50fe12a3ae0fb4d2653c66c3ed6556 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Nov 2010 13:49:04 +0100 Subject: perf: Fix the software context switch counter Stephane noticed that because the perf_sw_event() call is inside the perf_event_task_sched_out() call it won't get called unless we have a per-task counter. Reported-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 29 +++++++++++++++-------------- kernel/perf_event.c | 2 -- 2 files changed, 15 insertions(+), 16 deletions(-) (limited to 'kernel/perf_event.c') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 142e3d6042c7..de2c41758e29 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -909,20 +909,6 @@ extern int perf_num_counters(void); extern const char *perf_pmu_name(void); extern void __perf_event_task_sched_in(struct task_struct *task); extern void __perf_event_task_sched_out(struct task_struct *task, struct task_struct *next); - -extern atomic_t perf_task_events; - -static inline void perf_event_task_sched_in(struct task_struct *task) -{ - COND_STMT(&perf_task_events, __perf_event_task_sched_in(task)); -} - -static inline -void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next) -{ - COND_STMT(&perf_task_events, __perf_event_task_sched_out(task, next)); -} - extern int perf_event_init_task(struct task_struct *child); extern void perf_event_exit_task(struct task_struct *child); extern void perf_event_free_task(struct task_struct *task); @@ -1031,6 +1017,21 @@ have_event: __perf_sw_event(event_id, nr, nmi, regs, addr); } +extern atomic_t perf_task_events; + +static inline void perf_event_task_sched_in(struct task_struct *task) +{ + COND_STMT(&perf_task_events, __perf_event_task_sched_in(task)); +} + +static inline +void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next) +{ + perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); + + COND_STMT(&perf_task_events, __perf_event_task_sched_out(task, next)); +} + extern void perf_event_mmap(struct vm_area_struct *vma); extern struct perf_guest_info_callbacks *perf_guest_cbs; extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); diff --git a/kernel/perf_event.c b/kernel/perf_event.c index f365dd8ef8b0..eac7e3364335 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1287,8 +1287,6 @@ void __perf_event_task_sched_out(struct task_struct *task, { int ctxn; - perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); - for_each_task_context_nr(ctxn) perf_event_context_sched_out(task, ctxn, next); } -- cgit v1.2.3