From 86038c5ea81b519a8a1fcfcd5e4599aab0cdd119 Mon Sep 17 00:00:00 2001
From: "Peter Zijlstra (Intel)"
Date: Tue, 16 Dec 2014 12:47:34 +0100
Subject: perf: Avoid horrible stack usage

Both Linus (most recent) and Steve (a while ago) reported that
perf-related callbacks have massive stack bloat.

The problem is that software events need a pt_regs in order to
properly report the event location and unwind the stack. And because we
could not assume one was present, we allocated one on the stack and
filled it with the minimal bits required for operation.

Now, pt_regs is quite large, so this is undesirable. Furthermore, it
turns out that most sites actually have a pt_regs pointer available,
making this even more onerous, as the stack space is a pointless waste.

This patch addresses the problem by observing that software events have
well-defined nesting semantics; therefore we can use static per-cpu
storage instead of on-stack.

Linus made the further observation that all but the scheduler callers
of perf_sw_event() have a pt_regs available, so we change the regular
perf_sw_event() to require a valid pt_regs (where it used to be
optional) and add perf_sw_event_sched() for the scheduler.

We have a scheduler-specific call instead of a more generic
_noregs()-like construct because we can assume non-recursion from the
scheduler and thereby simplify the code further (_noregs would have to
put the recursion context call inline in order to ascertain which
__perf_regs element to use).

One last note on the implementation of perf_trace_buf_prepare(): we
allow .regs = NULL for those cases where we already have a pt_regs
pointer available and do not need another.

Reported-by: Linus Torvalds
Reported-by: Steven Rostedt
Signed-off-by: Peter Zijlstra (Intel)
Cc: Arnaldo Carvalho de Melo
Cc: Javi Merino
Cc: Linus Torvalds
Cc: Mathieu Desnoyers
Cc: Oleg Nesterov
Cc: Paul Mackerras
Cc: Petr Mladek
Cc: Steven Rostedt
Cc: Tom Zanussi
Cc: Vaibhav Nagarnaik
Link: http://lkml.kernel.org/r/20141216115041.GW3337@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar
---
 include/linux/ftrace_event.h |  2 +-
 include/linux/perf_event.h   | 28 +++++++++++++++++++++-------
 2 files changed, 22 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 0bebb5c348b8..d36f68b08acc 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -595,7 +595,7 @@ extern int ftrace_profile_set_filter(struct perf_event *event, int event_id,
                                     char *filter_str);
 extern void ftrace_profile_free_filter(struct perf_event *event);
 extern void *perf_trace_buf_prepare(int size, unsigned short type,
-                                   struct pt_regs *regs, int *rctxp);
+                                   struct pt_regs **regs, int *rctxp);
 
 static inline void
 perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr,
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 4f7a61ca4b39..3a7bd80b4db8 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -665,6 +665,7 @@ static inline int is_software_event(struct perf_event *event)
 
 extern struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
 
+extern void ___perf_sw_event(u32, u64, struct pt_regs *, u64);
 extern void __perf_sw_event(u32, u64, struct pt_regs *, u64);
 
 #ifndef perf_arch_fetch_caller_regs
@@ -689,14 +690,25 @@ static inline void perf_fetch_caller_regs(struct pt_regs *regs)
 static __always_inline void
 perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
 {
-       struct pt_regs hot_regs;
+       if (static_key_false(&perf_swevent_enabled[event_id]))
+               __perf_sw_event(event_id, nr, regs, addr);
+}
+
+DECLARE_PER_CPU(struct pt_regs, __perf_regs[4]);
 
+/*
+ * 'Special' version for the scheduler, it hard assumes no recursion,
+ * which is guaranteed by us not actually scheduling inside other swevents
+ * because those disable preemption.
+ */
+static __always_inline void
+perf_sw_event_sched(u32 event_id, u64 nr, u64 addr)
+{
        if (static_key_false(&perf_swevent_enabled[event_id])) {
-               if (!regs) {
-                       perf_fetch_caller_regs(&hot_regs);
-                       regs = &hot_regs;
-               }
-               __perf_sw_event(event_id, nr, regs, addr);
+               struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]);
+
+               perf_fetch_caller_regs(regs);
+               ___perf_sw_event(event_id, nr, regs, addr);
        }
 }
 
@@ -712,7 +724,7 @@ static inline void perf_event_task_sched_in(struct task_struct *prev,
 static inline void perf_event_task_sched_out(struct task_struct *prev,
                                             struct task_struct *next)
 {
-       perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, NULL, 0);
+       perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0);
 
        if (static_key_false(&perf_sched_events.key))
                __perf_event_task_sched_out(prev, next);
@@ -823,6 +835,8 @@ static inline int perf_event_refresh(struct perf_event *event, int refresh)
 static inline void
 perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)     { }
 static inline void
+perf_sw_event_sched(u32 event_id, u64 nr, u64 addr)                     { }
+static inline void
 perf_bp_event(struct perf_event *event, void *data)                     { }
 
 static inline int perf_register_guest_info_callbacks
--
cgit v1.2.3
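The core trick in the patch above is easy to miss in the diff: because software
events nest in a strictly LIFO fashion (and the scheduler path cannot recurse at
all), a large pt_regs never has to live on the stack -- each nesting level can
borrow a slot of a small static per-cpu array instead. The standalone C sketch
below illustrates only that pattern; fake_regs, regs_slot, nest and emit_swevent
are names invented for this example, a __thread array stands in for the kernel's
per-cpu __perf_regs[4], and none of this is kernel code.

#include <stdio.h>
#include <string.h>

/* Invented stand-in for struct pt_regs, which is large (~170 bytes on x86-64). */
struct fake_regs {
        unsigned long gpr[32];
        unsigned long ip, sp, flags;
};

static __thread struct fake_regs regs_slot[4];  /* one slot per nesting level */
static __thread int nest;                       /* recursion depth, 0..3 */

static void emit_swevent(const char *what, int depth)
{
        /* Borrow a static slot instead of declaring fake_regs on the stack. */
        struct fake_regs *regs = &regs_slot[nest++];

        memset(regs, 0, sizeof(*regs));
        regs->sp = (unsigned long)&depth;       /* pretend to capture caller state */
        printf("event '%s' at nesting level %d, regs slot %p\n",
               what, nest - 1, (void *)regs);

        if (depth)                              /* a nested software event */
                emit_swevent("nested", depth - 1);

        nest--;                                 /* well-defined LIFO nesting */
}

int main(void)
{
        emit_swevent("context-switch", 2);
        return 0;
}

In the kernel the slot index comes from the swevent recursion context;
perf_sw_event_sched() can hard-code slot 0 precisely because, as the added
comment says, the scheduler callback cannot nest inside another software event.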
From 2fde4f94e0a9531251e706fa57131b51b0df042e Mon Sep 17 00:00:00 2001
From: Mark Rutland
Date: Wed, 7 Jan 2015 15:01:54 +0000
Subject: perf: Decouple unthrottling and rotating

Currently the adjustments made as part of perf_event_task_tick() use the
percpu rotation lists to iterate over any active PMU contexts, but these
are not used by the context rotation code, having been replaced by
separate (per-context) hrtimer callbacks.

However, some manipulation of the rotation lists (i.e. removal of
contexts) has remained in perf_rotate_context(). This leads to the
following issues:

* Contexts are not always removed from the rotation lists. Removal of
  PMUs which have been placed in rotation lists, but have not been
  removed by an hrtimer callback, can result in corruption of the
  rotation lists (when memory backing the context is freed).

  This has been observed to result in hangs when PMU drivers built as
  modules are inserted and removed around the creation of events for
  said PMUs.

* Contexts which do not require rotation may be removed from the
  rotation lists as a result of an hrtimer, and will not be considered
  by the unthrottling code in perf_event_task_tick.

This patch fixes the issue by updating the rotation list when events are
scheduled in/out, ensuring that each rotation list stays in sync with the
HW state. As each event holds a refcount on the module of its PMU, this
ensures that when a PMU module is unloaded none of its CPU contexts can
be in a rotation list. By maintaining a list of perf_event_contexts
rather than perf_event_cpu_contexts, we don't need separate paths to
handle the cpu and task contexts, which also makes the code a little
simpler.

As the rotation_list variables are not used for rotation, these are
renamed to active_ctx_list, which better matches their current function.
perf_pmu_rotate_{start,stop} are renamed to
perf_pmu_ctx_{activate,deactivate}.

Reported-by: Johannes Jensen
Signed-off-by: Mark Rutland
Signed-off-by: Peter Zijlstra (Intel)
Cc: Will Deacon
Cc: Arnaldo Carvalho de Melo
Cc: Fengguang Wu
Cc: Linus Torvalds
Link: http://lkml.kernel.org/r/20150129134511.GR17721@leverpostej
Signed-off-by: Ingo Molnar
---
 include/linux/perf_event.h |  2 +-
 kernel/events/core.c       | 81 +++++++++++++++++-------------------------------------
 2 files changed, 30 insertions(+), 53 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 216653466a67..5cad0e6f3552 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -469,6 +469,7 @@ struct perf_event_context {
         */
        struct mutex                    mutex;
 
+       struct list_head                active_ctx_list;
        struct list_head                pinned_groups;
        struct list_head                flexible_groups;
        struct list_head                event_list;
@@ -519,7 +520,6 @@ struct perf_cpu_context {
        int                             exclusive;
        struct hrtimer                  hrtimer;
        ktime_t                         hrtimer_interval;
-       struct list_head                rotation_list;
        struct pmu                      *unique_pmu;
        struct perf_cgroup              *cgrp;
 };
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 37cc20e8aa3b..7f2fbb8b5069 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -872,22 +872,32 @@ void perf_pmu_enable(struct pmu *pmu)
        pmu->pmu_enable(pmu);
 }
 
-static DEFINE_PER_CPU(struct list_head, rotation_list);
+static DEFINE_PER_CPU(struct list_head, active_ctx_list);
 
 /*
- * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
- * because they're strictly cpu affine and rotate_start is called with IRQs
- * disabled, while rotate_context is called from IRQ context.
+ * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
+ * perf_event_task_tick() are fully serialized because they're strictly cpu
+ * affine and perf_event_ctx{activate,deactivate} are called with IRQs
+ * disabled, while perf_event_task_tick is called from IRQ context.
  */
-static void perf_pmu_rotate_start(struct pmu *pmu)
+static void perf_event_ctx_activate(struct perf_event_context *ctx)
 {
-       struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-       struct list_head *head = this_cpu_ptr(&rotation_list);
+       struct list_head *head = this_cpu_ptr(&active_ctx_list);
 
        WARN_ON(!irqs_disabled());
 
-       if (list_empty(&cpuctx->rotation_list))
-               list_add(&cpuctx->rotation_list, head);
+       WARN_ON(!list_empty(&ctx->active_ctx_list));
+
+       list_add(&ctx->active_ctx_list, head);
+}
+
+static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
+{
+       WARN_ON(!irqs_disabled());
+
+       WARN_ON(list_empty(&ctx->active_ctx_list));
+
+       list_del_init(&ctx->active_ctx_list);
 }
 
 static void get_ctx(struct perf_event_context *ctx)
@@ -1233,8 +1243,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
                ctx->nr_branch_stack++;
 
        list_add_rcu(&event->event_entry, &ctx->event_list);
-       if (!ctx->nr_events)
-               perf_pmu_rotate_start(ctx->pmu);
        ctx->nr_events++;
        if (event->attr.inherit_stat)
                ctx->nr_stat++;
@@ -1561,7 +1569,8 @@ event_sched_out(struct perf_event *event,
 
        if (!is_software_event(event))
                cpuctx->active_oncpu--;
-       ctx->nr_active--;
+       if (!--ctx->nr_active)
+               perf_event_ctx_deactivate(ctx);
        if (event->attr.freq && event->attr.sample_freq)
                ctx->nr_freq--;
        if (event->attr.exclusive || !cpuctx->active_oncpu)
@@ -1885,7 +1894,8 @@ event_sched_in(struct perf_event *event,
 
        if (!is_software_event(event))
                cpuctx->active_oncpu++;
-       ctx->nr_active++;
+       if (!ctx->nr_active++)
+               perf_event_ctx_activate(ctx);
        if (event->attr.freq && event->attr.sample_freq)
                ctx->nr_freq++;
 
@@ -2742,12 +2752,6 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
 
        perf_pmu_enable(ctx->pmu);
        perf_ctx_unlock(cpuctx, ctx);
-
-       /*
-        * Since these rotations are per-cpu, we need to ensure the
-        * cpu-context we got scheduled on is actually rotating.
-        */
-       perf_pmu_rotate_start(ctx->pmu);
 }
 
 /*
@@ -3035,25 +3039,18 @@ static void rotate_ctx(struct perf_event_context *ctx)
        list_rotate_left(&ctx->flexible_groups);
 }
 
-/*
- * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
- * because they're strictly cpu affine and rotate_start is called with IRQs
- * disabled, while rotate_context is called from IRQ context.
- */
 static int perf_rotate_context(struct perf_cpu_context *cpuctx)
 {
        struct perf_event_context *ctx = NULL;
-       int rotate = 0, remove = 1;
+       int rotate = 0;
 
        if (cpuctx->ctx.nr_events) {
-               remove = 0;
                if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
                        rotate = 1;
        }
 
        ctx = cpuctx->task_ctx;
        if (ctx && ctx->nr_events) {
-               remove = 0;
                if (ctx->nr_events != ctx->nr_active)
                        rotate = 1;
        }
@@ -3077,8 +3074,6 @@ static int perf_rotate_context(struct perf_cpu_context *cpuctx)
        perf_pmu_enable(cpuctx->ctx.pmu);
        perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 done:
-       if (remove)
-               list_del_init(&cpuctx->rotation_list);
 
        return rotate;
 }
@@ -3096,9 +3091,8 @@ bool perf_event_can_stop_tick(void)
 
 void perf_event_task_tick(void)
 {
-       struct list_head *head = this_cpu_ptr(&rotation_list);
-       struct perf_cpu_context *cpuctx, *tmp;
-       struct perf_event_context *ctx;
+       struct list_head *head = this_cpu_ptr(&active_ctx_list);
+       struct perf_event_context *ctx, *tmp;
        int throttled;
 
        WARN_ON(!irqs_disabled());
@@ -3106,14 +3100,8 @@ void perf_event_task_tick(void)
        __this_cpu_inc(perf_throttled_seq);
        throttled = __this_cpu_xchg(perf_throttled_count, 0);
 
-       list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) {
-               ctx = &cpuctx->ctx;
+       list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
                perf_adjust_freq_unthr_context(ctx, throttled);
-
-               ctx = cpuctx->task_ctx;
-               if (ctx)
-                       perf_adjust_freq_unthr_context(ctx, throttled);
-       }
 }
 
 static int event_enable_on_exec(struct perf_event *event,
@@ -3272,6 +3260,7 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
 {
        raw_spin_lock_init(&ctx->lock);
        mutex_init(&ctx->mutex);
+       INIT_LIST_HEAD(&ctx->active_ctx_list);
        INIT_LIST_HEAD(&ctx->pinned_groups);
        INIT_LIST_HEAD(&ctx->flexible_groups);
        INIT_LIST_HEAD(&ctx->event_list);
@@ -6954,7 +6943,6 @@ skip_type:
 
                __perf_cpu_hrtimer_init(cpuctx, cpu);
 
-               INIT_LIST_HEAD(&cpuctx->rotation_list);
                cpuctx->unique_pmu = pmu;
        }
 
@@ -8384,7 +8372,7 @@ static void __init perf_event_init_all_cpus(void)
        for_each_possible_cpu(cpu) {
                swhash = &per_cpu(swevent_htable, cpu);
                mutex_init(&swhash->hlist_mutex);
-               INIT_LIST_HEAD(&per_cpu(rotation_list, cpu));
+               INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
        }
 }
 
@@ -8405,22 +8393,11 @@ static void perf_event_init_cpu(int cpu)
 }
 
 #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
-static void perf_pmu_rotate_stop(struct pmu *pmu)
-{
-       struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-
-       WARN_ON(!irqs_disabled());
-
-       list_del_init(&cpuctx->rotation_list);
-}
-
 static void __perf_event_exit_context(void *__info)
 {
        struct remove_event re = { .detach_group = true };
        struct perf_event_context *ctx = __info;
 
-       perf_pmu_rotate_stop(ctx->pmu);
-
        rcu_read_lock();
        list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
                __perf_remove_from_context(&re);
--
cgit v1.2.3
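The bookkeeping introduced above amounts to reference counting per context: a
context joins the per-cpu active list when its first event is scheduled in and
leaves it (via list_del_init()) when its last event is scheduled out, so
perf_event_task_tick() only ever walks contexts that genuinely have active
events, and a PMU module being unloaded can never leave a stale entry behind.
The standalone C sketch below illustrates just that idea; struct ctx,
ctx_sched_in(), ctx_sched_out() and tick() are invented names, the minimal
intrusive list stands in for <linux/list.h>, and the real kernel code
additionally asserts with WARN_ON() that IRQs are disabled and that the
context is (or is not) already on the list.

#include <stdio.h>
#include <stddef.h>

/* A minimal intrusive doubly-linked list, standing in for <linux/list.h>. */
struct list_node {
        struct list_node *prev, *next;
};

static void list_init(struct list_node *n)        { n->prev = n->next = n; }
static int  list_empty(const struct list_node *n) { return n->next == n; }

static void list_add(struct list_node *n, struct list_node *head)
{
        n->next = head->next;
        n->prev = head;
        head->next->prev = n;
        head->next = n;
}

static void list_del_init(struct list_node *n)
{
        n->prev->next = n->next;
        n->next->prev = n->prev;
        list_init(n);
}

/* Invented stand-in for struct perf_event_context. */
struct ctx {
        const char *name;
        int nr_active;                  /* events currently scheduled in */
        struct list_node active_entry;  /* our slot on the active list */
};

static struct list_node active_ctx_list;        /* per cpu in the real kernel */

static void ctx_sched_in(struct ctx *c)
{
        if (!c->nr_active++)            /* first active event: activate */
                list_add(&c->active_entry, &active_ctx_list);
}

static void ctx_sched_out(struct ctx *c)
{
        if (!--c->nr_active)            /* last active event: deactivate */
                list_del_init(&c->active_entry);
}

static void tick(void)
{
        struct list_node *n;

        /* Only contexts that still have active events are visited here. */
        for (n = active_ctx_list.next; n != &active_ctx_list; n = n->next) {
                struct ctx *c = (struct ctx *)((char *)n -
                                offsetof(struct ctx, active_entry));
                printf("adjust/unthrottle ctx '%s' (%d active events)\n",
                       c->name, c->nr_active);
        }
}

int main(void)
{
        struct ctx cpu_ctx  = { .name = "cpu"  };
        struct ctx task_ctx = { .name = "task" };

        list_init(&active_ctx_list);
        list_init(&cpu_ctx.active_entry);
        list_init(&task_ctx.active_entry);

        ctx_sched_in(&cpu_ctx);
        ctx_sched_in(&cpu_ctx);         /* second event: already on the list */
        ctx_sched_in(&task_ctx);
        tick();                         /* visits both contexts */

        ctx_sched_out(&task_ctx);       /* last task event: drops off the list */
        ctx_sched_out(&cpu_ctx);
        ctx_sched_out(&cpu_ctx);        /* last cpu event: drops off too */
        tick();                         /* visits nothing */

        printf("active list empty: %d\n", list_empty(&active_ctx_list));
        return 0;
}

Keeping the list membership keyed off nr_active at sched in/out time is what
lets the patch delete perf_pmu_rotate_stop() and the rotation-driven removal
in perf_rotate_context() without leaving the tick walking stale contexts.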