From 0588fa30db44fd2d4032b36a061c87478a43fbee Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 21 Mar 2011 22:59:21 -0400 Subject: tracing: Convert trace_printk() formats for module to const char * The trace_printk() formats for modules do not show up in the debugfs/tracing/printk_formats file. Only the formats that are for trace_printk()s that are in the kernel core. To facilitate the change to add trace_printk() formats from modules into that file as well, we need to convert the structure that holds the formats from char fmt[], into const char *fmt, and allocate them separately. Signed-off-by: Steven Rostedt --- kernel/trace/trace_printk.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 2547d8813cf0..b8b268158af0 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -32,7 +32,7 @@ static DEFINE_MUTEX(btrace_mutex); struct trace_bprintk_fmt { struct list_head list; - char fmt[0]; + const char *fmt; }; static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) @@ -49,6 +49,7 @@ static void hold_module_trace_bprintk_format(const char **start, const char **end) { const char **iter; + char *fmt; mutex_lock(&btrace_mutex); for (iter = start; iter < end; iter++) { @@ -58,14 +59,18 @@ void hold_module_trace_bprintk_format(const char **start, const char **end) continue; } - tb_fmt = kmalloc(offsetof(struct trace_bprintk_fmt, fmt) - + strlen(*iter) + 1, GFP_KERNEL); - if (tb_fmt) { + tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL); + if (tb_fmt) + fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL); + if (tb_fmt && fmt) { list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); - strcpy(tb_fmt->fmt, *iter); + strcpy(fmt, *iter); + tb_fmt->fmt = fmt; *iter = tb_fmt->fmt; - } else + } else { + kfree(tb_fmt); *iter = NULL; + } } mutex_unlock(&btrace_mutex); } -- cgit v1.2.3 From 1813dc3776c22ad4b0294a6df8434b9a02c98109 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 21 Mar 2011 23:36:31 -0400 Subject: tracing: Print trace_bprintk() formats for modules too The file debugfs/tracing/printk_formats maps the addresses to the formats that are used by trace_bprintk() so that userspace tools can read the buffer and be able to decode trace_bprintk events to get the format saved when reading the ring buffer directly. This is because trace_bprintk() does not store the format into the buffer, but just the address of the format, which is hidden in the kernel memory. But currently it only exports trace_bprintk()s from the kernel core and not for modules. The modules need their formats exported as well. Signed-off-by: Steven Rostedt --- kernel/trace/trace_printk.c | 103 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 97 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index b8b268158af0..dff763b7baf1 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -89,6 +89,76 @@ static int module_trace_bprintk_format_notify(struct notifier_block *self, return 0; } +/* + * The debugfs/tracing/printk_formats file maps the addresses with + * the ASCII formats that are used in the bprintk events in the + * buffer. For userspace tools to be able to decode the events from + * the buffer, they need to be able to map the address with the format. + * + * The addresses of the bprintk formats are in their own section + * __trace_printk_fmt. But for modules we copy them into a link list. + * The code to print the formats and their addresses passes around the + * address of the fmt string. If the fmt address passed into the seq + * functions is within the kernel core __trace_printk_fmt section, then + * it simply uses the next pointer in the list. + * + * When the fmt pointer is outside the kernel core __trace_printk_fmt + * section, then we need to read the link list pointers. The trick is + * we pass the address of the string to the seq function just like + * we do for the kernel core formats. To get back the structure that + * holds the format, we simply use containerof() and then go to the + * next format in the list. + */ +static const char ** +find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos) +{ + struct trace_bprintk_fmt *mod_fmt; + + if (list_empty(&trace_bprintk_fmt_list)) + return NULL; + + /* + * v will point to the address of the fmt record from t_next + * v will be NULL from t_start. + * If this is the first pointer or called from start + * then we need to walk the list. + */ + if (!v || start_index == *pos) { + struct trace_bprintk_fmt *p; + + /* search the module list */ + list_for_each_entry(p, &trace_bprintk_fmt_list, list) { + if (start_index == *pos) + return &p->fmt; + start_index++; + } + /* pos > index */ + return NULL; + } + + /* + * v points to the address of the fmt field in the mod list + * structure that holds the module print format. + */ + mod_fmt = container_of(v, typeof(*mod_fmt), fmt); + if (mod_fmt->list.next == &trace_bprintk_fmt_list) + return NULL; + + mod_fmt = container_of(mod_fmt->list.next, typeof(*mod_fmt), list); + + return &mod_fmt->fmt; +} + +static void format_mod_start(void) +{ + mutex_lock(&btrace_mutex); +} + +static void format_mod_stop(void) +{ + mutex_unlock(&btrace_mutex); +} + #else /* !CONFIG_MODULES */ __init static int module_trace_bprintk_format_notify(struct notifier_block *self, @@ -96,6 +166,13 @@ module_trace_bprintk_format_notify(struct notifier_block *self, { return 0; } +static inline const char ** +find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos) +{ + return NULL; +} +static inline void format_mod_start(void) { } +static inline void format_mod_stop(void) { } #endif /* CONFIG_MODULES */ @@ -158,20 +235,33 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) } EXPORT_SYMBOL_GPL(__ftrace_vprintk); +static const char **find_next(void *v, loff_t *pos) +{ + const char **fmt = v; + int start_index; + + if (!fmt) + fmt = __start___trace_bprintk_fmt + *pos; + + start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt; + + if (*pos < start_index) + return fmt; + + return find_next_mod_format(start_index, v, fmt, pos); +} + static void * t_start(struct seq_file *m, loff_t *pos) { - const char **fmt = __start___trace_bprintk_fmt + *pos; - - if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt) - return NULL; - return fmt; + format_mod_start(); + return find_next(NULL, pos); } static void *t_next(struct seq_file *m, void * v, loff_t *pos) { (*pos)++; - return t_start(m, pos); + return find_next(v, pos); } static int t_show(struct seq_file *m, void *v) @@ -210,6 +300,7 @@ static int t_show(struct seq_file *m, void *v) static void t_stop(struct seq_file *m, void *p) { + format_mod_stop(); } static const struct seq_operations show_format_seq_ops = { -- cgit v1.2.3 From ee5e51f51be755830f57445e268ba50e88ccbdbb Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 25 Mar 2011 12:05:18 +0100 Subject: tracing: Avoid soft lockup in trace_pipe running following commands: # enable the binary option echo 1 > ./options/bin # disable context info option echo 0 > ./options/context-info # tracing only events echo 1 > ./events/enable cat trace_pipe plus forcing system to generate many tracing events, is causing lockup (in NON preemptive kernels) inside tracing_read_pipe function. The issue is also easily reproduced by running ltp stress test. (ftrace_stress_test.sh) The reasons are: - bin/hex/raw output functions for events are set to trace_nop_print function, which prints nothing and returns TRACE_TYPE_HANDLED value - LOST EVENT trace do not handle trace_seq overflow These reasons force the while loop in tracing_read_pipe function never to break. The attached patch fixies handling of lost event trace, and changes trace_nop_print to print minimal info, which is needed for the correct tracing_read_pipe processing. v2 changes: - omit the cond_resched changes by trace_nop_print changes - WARN changed to WARN_ONCE and added info to be able to find out the culprit v3 changes: - make more accurate patch comment Signed-off-by: Jiri Olsa LKML-Reference: <20110325110518.GC1922@jolsa.brq.redhat.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 15 ++++++++++++--- kernel/trace/trace_output.c | 3 +++ 2 files changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9541c27c1cf2..5af42f478c06 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2013,9 +2013,10 @@ enum print_line_t print_trace_line(struct trace_iterator *iter) { enum print_line_t ret; - if (iter->lost_events) - trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", - iter->cpu, iter->lost_events); + if (iter->lost_events && + !trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", + iter->cpu, iter->lost_events)) + return TRACE_TYPE_PARTIAL_LINE; if (iter->trace && iter->trace->print_line) { ret = iter->trace->print_line(iter); @@ -3229,6 +3230,14 @@ waitagain: if (iter->seq.len >= cnt) break; + + /* + * Setting the full flag means we reached the trace_seq buffer + * size and we should leave by partial output condition above. + * One of the trace_seq_* functions is not used properly. + */ + WARN_ONCE(iter->seq.full, "full flag set for trace type %d", + iter->ent->type); } trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 456be9063c2d..cf535ccedc86 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -830,6 +830,9 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event); enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, struct trace_event *event) { + if (!trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type)) + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; } -- cgit v1.2.3 From d430d3d7e646eb1eac2bb4aa244a644312e67c76 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Wed, 16 Mar 2011 17:29:47 -0400 Subject: jump label: Introduce static_branch() interface Introduce: static __always_inline bool static_branch(struct jump_label_key *key); instead of the old JUMP_LABEL(key, label) macro. In this way, jump labels become really easy to use: Define: struct jump_label_key jump_key; Can be used as: if (static_branch(&jump_key)) do unlikely code enable/disale via: jump_label_inc(&jump_key); jump_label_dec(&jump_key); that's it! For the jump labels disabled case, the static_branch() becomes an atomic_read(), and jump_label_inc()/dec() are simply atomic_inc(), atomic_dec() operations. We show testing results for this change below. Thanks to H. Peter Anvin for suggesting the 'static_branch()' construct. Since we now require a 'struct jump_label_key *key', we can store a pointer into the jump table addresses. In this way, we can enable/disable jump labels, in basically constant time. This change allows us to completely remove the previous hashtable scheme. Thanks to Peter Zijlstra for this re-write. Testing: I ran a series of 'tbench 20' runs 5 times (with reboots) for 3 configurations, where tracepoints were disabled. jump label configured in avg: 815.6 jump label *not* configured in (using atomic reads) avg: 800.1 jump label *not* configured in (regular reads) avg: 803.4 Signed-off-by: Peter Zijlstra LKML-Reference: <20110316212947.GA8792@redhat.com> Signed-off-by: Jason Baron Suggested-by: H. Peter Anvin Tested-by: David Daney Acked-by: Ralf Baechle Acked-by: David S. Miller Acked-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt --- arch/mips/include/asm/jump_label.h | 22 +- arch/sparc/include/asm/jump_label.h | 25 +- arch/x86/include/asm/alternative.h | 3 +- arch/x86/include/asm/jump_label.h | 26 +- arch/x86/kernel/alternative.c | 2 +- arch/x86/kernel/module.c | 1 + include/asm-generic/vmlinux.lds.h | 14 +- include/linux/dynamic_debug.h | 2 - include/linux/jump_label.h | 89 +++--- include/linux/jump_label_ref.h | 44 --- include/linux/perf_event.h | 26 +- include/linux/tracepoint.h | 22 +- kernel/jump_label.c | 539 +++++++++++++++--------------------- kernel/perf_event.c | 4 +- kernel/tracepoint.c | 23 +- 15 files changed, 356 insertions(+), 486 deletions(-) delete mode 100644 include/linux/jump_label_ref.h (limited to 'kernel') diff --git a/arch/mips/include/asm/jump_label.h b/arch/mips/include/asm/jump_label.h index 7622ccf75076..1881b316ca45 100644 --- a/arch/mips/include/asm/jump_label.h +++ b/arch/mips/include/asm/jump_label.h @@ -20,16 +20,18 @@ #define WORD_INSN ".word" #endif -#define JUMP_LABEL(key, label) \ - do { \ - asm goto("1:\tnop\n\t" \ - "nop\n\t" \ - ".pushsection __jump_table, \"a\"\n\t" \ - WORD_INSN " 1b, %l[" #label "], %0\n\t" \ - ".popsection\n\t" \ - : : "i" (key) : : label); \ - } while (0) - +static __always_inline bool arch_static_branch(struct jump_label_key *key) +{ + asm goto("1:\tnop\n\t" + "nop\n\t" + ".pushsection __jump_table, \"aw\"\n\t" + WORD_INSN " 1b, %l[l_yes], %0\n\t" + ".popsection\n\t" + : : "i" (key) : : l_yes); + return false; +l_yes: + return true; +} #endif /* __KERNEL__ */ diff --git a/arch/sparc/include/asm/jump_label.h b/arch/sparc/include/asm/jump_label.h index 427d4684e0d2..fc73a82366f8 100644 --- a/arch/sparc/include/asm/jump_label.h +++ b/arch/sparc/include/asm/jump_label.h @@ -7,17 +7,20 @@ #define JUMP_LABEL_NOP_SIZE 4 -#define JUMP_LABEL(key, label) \ - do { \ - asm goto("1:\n\t" \ - "nop\n\t" \ - "nop\n\t" \ - ".pushsection __jump_table, \"a\"\n\t"\ - ".align 4\n\t" \ - ".word 1b, %l[" #label "], %c0\n\t" \ - ".popsection \n\t" \ - : : "i" (key) : : label);\ - } while (0) +static __always_inline bool arch_static_branch(struct jump_label_key *key) +{ + asm goto("1:\n\t" + "nop\n\t" + "nop\n\t" + ".pushsection __jump_table, \"aw\"\n\t" + ".align 4\n\t" + ".word 1b, %l[l_yes], %c0\n\t" + ".popsection \n\t" + : : "i" (key) : : l_yes); + return false; +l_yes: + return true; +} #endif /* __KERNEL__ */ diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 13009d1af99a..8cdd1e247975 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -4,7 +4,6 @@ #include #include #include -#include #include /* @@ -191,7 +190,7 @@ extern void *text_poke(void *addr, const void *opcode, size_t len); extern void *text_poke_smp(void *addr, const void *opcode, size_t len); extern void text_poke_smp_batch(struct text_poke_param *params, int n); -#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL) +#if defined(CONFIG_DYNAMIC_FTRACE) || defined(CONFIG_JUMP_LABEL) #define IDEAL_NOP_SIZE_5 5 extern unsigned char ideal_nop5[IDEAL_NOP_SIZE_5]; extern void arch_init_ideal_nop5(void); diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index 574dbc22893a..f217cee86533 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -5,20 +5,24 @@ #include #include +#include #define JUMP_LABEL_NOP_SIZE 5 -# define JUMP_LABEL_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t" - -# define JUMP_LABEL(key, label) \ - do { \ - asm goto("1:" \ - JUMP_LABEL_INITIAL_NOP \ - ".pushsection __jump_table, \"aw\" \n\t"\ - _ASM_PTR "1b, %l[" #label "], %c0 \n\t" \ - ".popsection \n\t" \ - : : "i" (key) : : label); \ - } while (0) +#define JUMP_LABEL_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t" + +static __always_inline bool arch_static_branch(struct jump_label_key *key) +{ + asm goto("1:" + JUMP_LABEL_INITIAL_NOP + ".pushsection __jump_table, \"aw\" \n\t" + _ASM_PTR "1b, %l[l_yes], %c0 \n\t" + ".popsection \n\t" + : : "i" (key) : : l_yes); + return false; +l_yes: + return true; +} #endif /* __KERNEL__ */ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 4a234677e213..651454b0c811 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -679,7 +679,7 @@ void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n) __stop_machine(stop_machine_text_poke, (void *)&tpp, NULL); } -#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL) +#if defined(CONFIG_DYNAMIC_FTRACE) || defined(CONFIG_JUMP_LABEL) #ifdef CONFIG_X86_64 unsigned char ideal_nop5[5] = { 0x66, 0x66, 0x66, 0x66, 0x90 }; diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index ab23f1ad4bf1..52f256f2cc81 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 32c45e5fe0ab..79522166d7f1 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -170,6 +170,10 @@ STRUCT_ALIGN(); \ *(__tracepoints) \ /* implement dynamic printk debug */ \ + . = ALIGN(8); \ + VMLINUX_SYMBOL(__start___jump_table) = .; \ + *(__jump_table) \ + VMLINUX_SYMBOL(__stop___jump_table) = .; \ . = ALIGN(8); \ VMLINUX_SYMBOL(__start___verbose) = .; \ *(__verbose) \ @@ -228,8 +232,6 @@ \ BUG_TABLE \ \ - JUMP_TABLE \ - \ /* PCI quirks */ \ .pci_fixup : AT(ADDR(.pci_fixup) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start_pci_fixups_early) = .; \ @@ -589,14 +591,6 @@ #define BUG_TABLE #endif -#define JUMP_TABLE \ - . = ALIGN(8); \ - __jump_table : AT(ADDR(__jump_table) - LOAD_OFFSET) { \ - VMLINUX_SYMBOL(__start___jump_table) = .; \ - *(__jump_table) \ - VMLINUX_SYMBOL(__stop___jump_table) = .; \ - } - #ifdef CONFIG_PM_TRACE #define TRACEDATA \ . = ALIGN(4); \ diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index 0c9653f11c18..e747ecd48e1c 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -1,8 +1,6 @@ #ifndef _DYNAMIC_DEBUG_H #define _DYNAMIC_DEBUG_H -#include - /* dynamic_printk_enabled, and dynamic_printk_enabled2 are bitmasks in which * bit n is set to 1 if any modname hashes into the bucket n, 0 otherwise. They * use independent hash functions, to reduce the chance of false positives. diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 7880f18e4b86..83e745f3ead7 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -1,20 +1,43 @@ #ifndef _LINUX_JUMP_LABEL_H #define _LINUX_JUMP_LABEL_H +#include +#include + #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_JUMP_LABEL) + +struct jump_label_key { + atomic_t enabled; + struct jump_entry *entries; +#ifdef CONFIG_MODULES + struct jump_label_mod *next; +#endif +}; + # include # define HAVE_JUMP_LABEL #endif enum jump_label_type { + JUMP_LABEL_DISABLE = 0, JUMP_LABEL_ENABLE, - JUMP_LABEL_DISABLE }; struct module; #ifdef HAVE_JUMP_LABEL +#ifdef CONFIG_MODULES +#define JUMP_LABEL_INIT {{ 0 }, NULL, NULL} +#else +#define JUMP_LABEL_INIT {{ 0 }, NULL} +#endif + +static __always_inline bool static_branch(struct jump_label_key *key) +{ + return arch_static_branch(key); +} + extern struct jump_entry __start___jump_table[]; extern struct jump_entry __stop___jump_table[]; @@ -23,37 +46,37 @@ extern void jump_label_unlock(void); extern void arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type); extern void arch_jump_label_text_poke_early(jump_label_t addr); -extern void jump_label_update(unsigned long key, enum jump_label_type type); -extern void jump_label_apply_nops(struct module *mod); extern int jump_label_text_reserved(void *start, void *end); +extern void jump_label_inc(struct jump_label_key *key); +extern void jump_label_dec(struct jump_label_key *key); +extern bool jump_label_enabled(struct jump_label_key *key); +extern void jump_label_apply_nops(struct module *mod); -#define jump_label_enable(key) \ - jump_label_update((unsigned long)key, JUMP_LABEL_ENABLE); +#else -#define jump_label_disable(key) \ - jump_label_update((unsigned long)key, JUMP_LABEL_DISABLE); +#include -#else +#define JUMP_LABEL_INIT {ATOMIC_INIT(0)} -#define JUMP_LABEL(key, label) \ -do { \ - if (unlikely(*key)) \ - goto label; \ -} while (0) +struct jump_label_key { + atomic_t enabled; +}; -#define jump_label_enable(cond_var) \ -do { \ - *(cond_var) = 1; \ -} while (0) +static __always_inline bool static_branch(struct jump_label_key *key) +{ + if (unlikely(atomic_read(&key->enabled))) + return true; + return false; +} -#define jump_label_disable(cond_var) \ -do { \ - *(cond_var) = 0; \ -} while (0) +static inline void jump_label_inc(struct jump_label_key *key) +{ + atomic_inc(&key->enabled); +} -static inline int jump_label_apply_nops(struct module *mod) +static inline void jump_label_dec(struct jump_label_key *key) { - return 0; + atomic_dec(&key->enabled); } static inline int jump_label_text_reserved(void *start, void *end) @@ -64,16 +87,16 @@ static inline int jump_label_text_reserved(void *start, void *end) static inline void jump_label_lock(void) {} static inline void jump_label_unlock(void) {} -#endif +static inline bool jump_label_enabled(struct jump_label_key *key) +{ + return !!atomic_read(&key->enabled); +} -#define COND_STMT(key, stmt) \ -do { \ - __label__ jl_enabled; \ - JUMP_LABEL(key, jl_enabled); \ - if (0) { \ -jl_enabled: \ - stmt; \ - } \ -} while (0) +static inline int jump_label_apply_nops(struct module *mod) +{ + return 0; +} + +#endif #endif diff --git a/include/linux/jump_label_ref.h b/include/linux/jump_label_ref.h deleted file mode 100644 index e5d012ad92c6..000000000000 --- a/include/linux/jump_label_ref.h +++ /dev/null @@ -1,44 +0,0 @@ -#ifndef _LINUX_JUMP_LABEL_REF_H -#define _LINUX_JUMP_LABEL_REF_H - -#include -#include - -#ifdef HAVE_JUMP_LABEL - -static inline void jump_label_inc(atomic_t *key) -{ - if (atomic_add_return(1, key) == 1) - jump_label_enable(key); -} - -static inline void jump_label_dec(atomic_t *key) -{ - if (atomic_dec_and_test(key)) - jump_label_disable(key); -} - -#else /* !HAVE_JUMP_LABEL */ - -static inline void jump_label_inc(atomic_t *key) -{ - atomic_inc(key); -} - -static inline void jump_label_dec(atomic_t *key) -{ - atomic_dec(key); -} - -#undef JUMP_LABEL -#define JUMP_LABEL(key, label) \ -do { \ - if (unlikely(__builtin_choose_expr( \ - __builtin_types_compatible_p(typeof(key), atomic_t *), \ - atomic_read((atomic_t *)(key)), *(key)))) \ - goto label; \ -} while (0) - -#endif /* HAVE_JUMP_LABEL */ - -#endif /* _LINUX_JUMP_LABEL_REF_H */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 311b4dc785a1..730b7821690f 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -505,7 +505,7 @@ struct perf_guest_info_callbacks { #include #include #include -#include +#include #include #include @@ -1034,7 +1034,7 @@ static inline int is_software_event(struct perf_event *event) return event->pmu->task_ctx_nr == perf_sw_context; } -extern atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; +extern struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; extern void __perf_sw_event(u32, u64, int, struct pt_regs *, u64); @@ -1063,22 +1063,21 @@ perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) { struct pt_regs hot_regs; - JUMP_LABEL(&perf_swevent_enabled[event_id], have_event); - return; - -have_event: - if (!regs) { - perf_fetch_caller_regs(&hot_regs); - regs = &hot_regs; + if (static_branch(&perf_swevent_enabled[event_id])) { + if (!regs) { + perf_fetch_caller_regs(&hot_regs); + regs = &hot_regs; + } + __perf_sw_event(event_id, nr, nmi, regs, addr); } - __perf_sw_event(event_id, nr, nmi, regs, addr); } -extern atomic_t perf_sched_events; +extern struct jump_label_key perf_sched_events; static inline void perf_event_task_sched_in(struct task_struct *task) { - COND_STMT(&perf_sched_events, __perf_event_task_sched_in(task)); + if (static_branch(&perf_sched_events)) + __perf_event_task_sched_in(task); } static inline @@ -1086,7 +1085,8 @@ void perf_event_task_sched_out(struct task_struct *task, struct task_struct *nex { perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); - COND_STMT(&perf_sched_events, __perf_event_task_sched_out(task, next)); + if (static_branch(&perf_sched_events)) + __perf_event_task_sched_out(task, next); } extern void perf_event_mmap(struct vm_area_struct *vma); diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 97c84a58efb8..d530a4460a0b 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -29,7 +29,7 @@ struct tracepoint_func { struct tracepoint { const char *name; /* Tracepoint name */ - int state; /* State. */ + struct jump_label_key key; void (*regfunc)(void); void (*unregfunc)(void); struct tracepoint_func __rcu *funcs; @@ -146,9 +146,7 @@ void tracepoint_update_probe_range(struct tracepoint * const *begin, extern struct tracepoint __tracepoint_##name; \ static inline void trace_##name(proto) \ { \ - JUMP_LABEL(&__tracepoint_##name.state, do_trace); \ - return; \ -do_trace: \ + if (static_branch(&__tracepoint_##name.key)) \ __DO_TRACE(&__tracepoint_##name, \ TP_PROTO(data_proto), \ TP_ARGS(data_args), \ @@ -176,14 +174,14 @@ do_trace: \ * structures, so we create an array of pointers that will be used for iteration * on the tracepoints. */ -#define DEFINE_TRACE_FN(name, reg, unreg) \ - static const char __tpstrtab_##name[] \ - __attribute__((section("__tracepoints_strings"))) = #name; \ - struct tracepoint __tracepoint_##name \ - __attribute__((section("__tracepoints"))) = \ - { __tpstrtab_##name, 0, reg, unreg, NULL }; \ - static struct tracepoint * const __tracepoint_ptr_##name __used \ - __attribute__((section("__tracepoints_ptrs"))) = \ +#define DEFINE_TRACE_FN(name, reg, unreg) \ + static const char __tpstrtab_##name[] \ + __attribute__((section("__tracepoints_strings"))) = #name; \ + struct tracepoint __tracepoint_##name \ + __attribute__((section("__tracepoints"))) = \ + { __tpstrtab_##name, JUMP_LABEL_INIT, reg, unreg, NULL };\ + static struct tracepoint * const __tracepoint_ptr_##name __used \ + __attribute__((section("__tracepoints_ptrs"))) = \ &__tracepoint_##name; #define DEFINE_TRACE(name) \ diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 3b79bd938330..74d1c099fbd1 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -2,43 +2,23 @@ * jump label support * * Copyright (C) 2009 Jason Baron + * Copyright (C) 2011 Peter Zijlstra * */ -#include #include #include #include #include -#include #include #include #include +#include #ifdef HAVE_JUMP_LABEL -#define JUMP_LABEL_HASH_BITS 6 -#define JUMP_LABEL_TABLE_SIZE (1 << JUMP_LABEL_HASH_BITS) -static struct hlist_head jump_label_table[JUMP_LABEL_TABLE_SIZE]; - /* mutex to protect coming/going of the the jump_label table */ static DEFINE_MUTEX(jump_label_mutex); -struct jump_label_entry { - struct hlist_node hlist; - struct jump_entry *table; - int nr_entries; - /* hang modules off here */ - struct hlist_head modules; - unsigned long key; -}; - -struct jump_label_module_entry { - struct hlist_node hlist; - struct jump_entry *table; - int nr_entries; - struct module *mod; -}; - void jump_label_lock(void) { mutex_lock(&jump_label_mutex); @@ -49,6 +29,11 @@ void jump_label_unlock(void) mutex_unlock(&jump_label_mutex); } +bool jump_label_enabled(struct jump_label_key *key) +{ + return !!atomic_read(&key->enabled); +} + static int jump_label_cmp(const void *a, const void *b) { const struct jump_entry *jea = a; @@ -64,7 +49,7 @@ static int jump_label_cmp(const void *a, const void *b) } static void -sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop) +jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop) { unsigned long size; @@ -73,118 +58,25 @@ sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop) sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); } -static struct jump_label_entry *get_jump_label_entry(jump_label_t key) -{ - struct hlist_head *head; - struct hlist_node *node; - struct jump_label_entry *e; - u32 hash = jhash((void *)&key, sizeof(jump_label_t), 0); - - head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)]; - hlist_for_each_entry(e, node, head, hlist) { - if (key == e->key) - return e; - } - return NULL; -} +static void jump_label_update(struct jump_label_key *key, int enable); -static struct jump_label_entry * -add_jump_label_entry(jump_label_t key, int nr_entries, struct jump_entry *table) +void jump_label_inc(struct jump_label_key *key) { - struct hlist_head *head; - struct jump_label_entry *e; - u32 hash; - - e = get_jump_label_entry(key); - if (e) - return ERR_PTR(-EEXIST); - - e = kmalloc(sizeof(struct jump_label_entry), GFP_KERNEL); - if (!e) - return ERR_PTR(-ENOMEM); - - hash = jhash((void *)&key, sizeof(jump_label_t), 0); - head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)]; - e->key = key; - e->table = table; - e->nr_entries = nr_entries; - INIT_HLIST_HEAD(&(e->modules)); - hlist_add_head(&e->hlist, head); - return e; -} + if (atomic_inc_not_zero(&key->enabled)) + return; -static int -build_jump_label_hashtable(struct jump_entry *start, struct jump_entry *stop) -{ - struct jump_entry *iter, *iter_begin; - struct jump_label_entry *entry; - int count; - - sort_jump_label_entries(start, stop); - iter = start; - while (iter < stop) { - entry = get_jump_label_entry(iter->key); - if (!entry) { - iter_begin = iter; - count = 0; - while ((iter < stop) && - (iter->key == iter_begin->key)) { - iter++; - count++; - } - entry = add_jump_label_entry(iter_begin->key, - count, iter_begin); - if (IS_ERR(entry)) - return PTR_ERR(entry); - } else { - WARN_ONCE(1, KERN_ERR "build_jump_hashtable: unexpected entry!\n"); - return -1; - } - } - return 0; + jump_label_lock(); + if (atomic_add_return(1, &key->enabled) == 1) + jump_label_update(key, JUMP_LABEL_ENABLE); + jump_label_unlock(); } -/*** - * jump_label_update - update jump label text - * @key - key value associated with a a jump label - * @type - enum set to JUMP_LABEL_ENABLE or JUMP_LABEL_DISABLE - * - * Will enable/disable the jump for jump label @key, depending on the - * value of @type. - * - */ - -void jump_label_update(unsigned long key, enum jump_label_type type) +void jump_label_dec(struct jump_label_key *key) { - struct jump_entry *iter; - struct jump_label_entry *entry; - struct hlist_node *module_node; - struct jump_label_module_entry *e_module; - int count; + if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) + return; - jump_label_lock(); - entry = get_jump_label_entry((jump_label_t)key); - if (entry) { - count = entry->nr_entries; - iter = entry->table; - while (count--) { - if (kernel_text_address(iter->code)) - arch_jump_label_transform(iter, type); - iter++; - } - /* eanble/disable jump labels in modules */ - hlist_for_each_entry(e_module, module_node, &(entry->modules), - hlist) { - count = e_module->nr_entries; - iter = e_module->table; - while (count--) { - if (iter->key && - kernel_text_address(iter->code)) - arch_jump_label_transform(iter, type); - iter++; - } - } - } + jump_label_update(key, JUMP_LABEL_DISABLE); jump_label_unlock(); } @@ -197,77 +89,33 @@ static int addr_conflict(struct jump_entry *entry, void *start, void *end) return 0; } -#ifdef CONFIG_MODULES - -static int module_conflict(void *start, void *end) +static int __jump_label_text_reserved(struct jump_entry *iter_start, + struct jump_entry *iter_stop, void *start, void *end) { - struct hlist_head *head; - struct hlist_node *node, *node_next, *module_node, *module_node_next; - struct jump_label_entry *e; - struct jump_label_module_entry *e_module; struct jump_entry *iter; - int i, count; - int conflict = 0; - - for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { - head = &jump_label_table[i]; - hlist_for_each_entry_safe(e, node, node_next, head, hlist) { - hlist_for_each_entry_safe(e_module, module_node, - module_node_next, - &(e->modules), hlist) { - count = e_module->nr_entries; - iter = e_module->table; - while (count--) { - if (addr_conflict(iter, start, end)) { - conflict = 1; - goto out; - } - iter++; - } - } - } - } -out: - return conflict; -} - -#endif - -/*** - * jump_label_text_reserved - check if addr range is reserved - * @start: start text addr - * @end: end text addr - * - * checks if the text addr located between @start and @end - * overlaps with any of the jump label patch addresses. Code - * that wants to modify kernel text should first verify that - * it does not overlap with any of the jump label addresses. - * Caller must hold jump_label_mutex. - * - * returns 1 if there is an overlap, 0 otherwise - */ -int jump_label_text_reserved(void *start, void *end) -{ - struct jump_entry *iter; - struct jump_entry *iter_start = __start___jump_table; - struct jump_entry *iter_stop = __start___jump_table; - int conflict = 0; iter = iter_start; while (iter < iter_stop) { - if (addr_conflict(iter, start, end)) { - conflict = 1; - goto out; - } + if (addr_conflict(iter, start, end)) + return 1; iter++; } - /* now check modules */ -#ifdef CONFIG_MODULES - conflict = module_conflict(start, end); -#endif -out: - return conflict; + return 0; +} + +static void __jump_label_update(struct jump_label_key *key, + struct jump_entry *entry, int enable) +{ + for (; entry->key == (jump_label_t)(unsigned long)key; entry++) { + /* + * entry->code set to 0 invalidates module init text sections + * kernel_text_address() verifies we are not in core kernel + * init code, see jump_label_invalidate_module_init(). + */ + if (entry->code && kernel_text_address(entry->code)) + arch_jump_label_transform(entry, enable); + } } /* @@ -277,142 +125,173 @@ void __weak arch_jump_label_text_poke_early(jump_label_t addr) { } -static __init int init_jump_label(void) +static __init int jump_label_init(void) { - int ret; struct jump_entry *iter_start = __start___jump_table; struct jump_entry *iter_stop = __stop___jump_table; + struct jump_label_key *key = NULL; struct jump_entry *iter; jump_label_lock(); - ret = build_jump_label_hashtable(__start___jump_table, - __stop___jump_table); - iter = iter_start; - while (iter < iter_stop) { + jump_label_sort_entries(iter_start, iter_stop); + + for (iter = iter_start; iter < iter_stop; iter++) { arch_jump_label_text_poke_early(iter->code); - iter++; + if (iter->key == (jump_label_t)(unsigned long)key) + continue; + + key = (struct jump_label_key *)(unsigned long)iter->key; + atomic_set(&key->enabled, 0); + key->entries = iter; +#ifdef CONFIG_MODULES + key->next = NULL; +#endif } jump_label_unlock(); - return ret; + + return 0; } -early_initcall(init_jump_label); +early_initcall(jump_label_init); #ifdef CONFIG_MODULES -static struct jump_label_module_entry * -add_jump_label_module_entry(struct jump_label_entry *entry, - struct jump_entry *iter_begin, - int count, struct module *mod) +struct jump_label_mod { + struct jump_label_mod *next; + struct jump_entry *entries; + struct module *mod; +}; + +static int __jump_label_mod_text_reserved(void *start, void *end) +{ + struct module *mod; + + mod = __module_text_address((unsigned long)start); + if (!mod) + return 0; + + WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); + + return __jump_label_text_reserved(mod->jump_entries, + mod->jump_entries + mod->num_jump_entries, + start, end); +} + +static void __jump_label_mod_update(struct jump_label_key *key, int enable) +{ + struct jump_label_mod *mod = key->next; + + while (mod) { + __jump_label_update(key, mod->entries, enable); + mod = mod->next; + } +} + +/*** + * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop() + * @mod: module to patch + * + * Allow for run-time selection of the optimal nops. Before the module + * loads patch these with arch_get_jump_label_nop(), which is specified by + * the arch specific jump label code. + */ +void jump_label_apply_nops(struct module *mod) { - struct jump_label_module_entry *e; - - e = kmalloc(sizeof(struct jump_label_module_entry), GFP_KERNEL); - if (!e) - return ERR_PTR(-ENOMEM); - e->mod = mod; - e->nr_entries = count; - e->table = iter_begin; - hlist_add_head(&e->hlist, &entry->modules); - return e; + struct jump_entry *iter_start = mod->jump_entries; + struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; + struct jump_entry *iter; + + /* if the module doesn't have jump label entries, just return */ + if (iter_start == iter_stop) + return; + + for (iter = iter_start; iter < iter_stop; iter++) + arch_jump_label_text_poke_early(iter->code); } -static int add_jump_label_module(struct module *mod) +static int jump_label_add_module(struct module *mod) { - struct jump_entry *iter, *iter_begin; - struct jump_label_entry *entry; - struct jump_label_module_entry *module_entry; - int count; + struct jump_entry *iter_start = mod->jump_entries; + struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; + struct jump_entry *iter; + struct jump_label_key *key = NULL; + struct jump_label_mod *jlm; /* if the module doesn't have jump label entries, just return */ - if (!mod->num_jump_entries) + if (iter_start == iter_stop) return 0; - sort_jump_label_entries(mod->jump_entries, - mod->jump_entries + mod->num_jump_entries); - iter = mod->jump_entries; - while (iter < mod->jump_entries + mod->num_jump_entries) { - entry = get_jump_label_entry(iter->key); - iter_begin = iter; - count = 0; - while ((iter < mod->jump_entries + mod->num_jump_entries) && - (iter->key == iter_begin->key)) { - iter++; - count++; - } - if (!entry) { - entry = add_jump_label_entry(iter_begin->key, 0, NULL); - if (IS_ERR(entry)) - return PTR_ERR(entry); + jump_label_sort_entries(iter_start, iter_stop); + + for (iter = iter_start; iter < iter_stop; iter++) { + if (iter->key == (jump_label_t)(unsigned long)key) + continue; + + key = (struct jump_label_key *)(unsigned long)iter->key; + + if (__module_address(iter->key) == mod) { + atomic_set(&key->enabled, 0); + key->entries = iter; + key->next = NULL; + continue; } - module_entry = add_jump_label_module_entry(entry, iter_begin, - count, mod); - if (IS_ERR(module_entry)) - return PTR_ERR(module_entry); + + jlm = kzalloc(sizeof(struct jump_label_mod), GFP_KERNEL); + if (!jlm) + return -ENOMEM; + + jlm->mod = mod; + jlm->entries = iter; + jlm->next = key->next; + key->next = jlm; + + if (jump_label_enabled(key)) + __jump_label_update(key, iter, JUMP_LABEL_ENABLE); } + return 0; } -static void remove_jump_label_module(struct module *mod) +static void jump_label_del_module(struct module *mod) { - struct hlist_head *head; - struct hlist_node *node, *node_next, *module_node, *module_node_next; - struct jump_label_entry *e; - struct jump_label_module_entry *e_module; - int i; + struct jump_entry *iter_start = mod->jump_entries; + struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; + struct jump_entry *iter; + struct jump_label_key *key = NULL; + struct jump_label_mod *jlm, **prev; - /* if the module doesn't have jump label entries, just return */ - if (!mod->num_jump_entries) - return; + for (iter = iter_start; iter < iter_stop; iter++) { + if (iter->key == (jump_label_t)(unsigned long)key) + continue; + + key = (struct jump_label_key *)(unsigned long)iter->key; + + if (__module_address(iter->key) == mod) + continue; + + prev = &key->next; + jlm = key->next; - for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { - head = &jump_label_table[i]; - hlist_for_each_entry_safe(e, node, node_next, head, hlist) { - hlist_for_each_entry_safe(e_module, module_node, - module_node_next, - &(e->modules), hlist) { - if (e_module->mod == mod) { - hlist_del(&e_module->hlist); - kfree(e_module); - } - } - if (hlist_empty(&e->modules) && (e->nr_entries == 0)) { - hlist_del(&e->hlist); - kfree(e); - } + while (jlm && jlm->mod != mod) { + prev = &jlm->next; + jlm = jlm->next; + } + + if (jlm) { + *prev = jlm->next; + kfree(jlm); } } } -static void remove_jump_label_module_init(struct module *mod) +static void jump_label_invalidate_module_init(struct module *mod) { - struct hlist_head *head; - struct hlist_node *node, *node_next, *module_node, *module_node_next; - struct jump_label_entry *e; - struct jump_label_module_entry *e_module; + struct jump_entry *iter_start = mod->jump_entries; + struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; struct jump_entry *iter; - int i, count; - - /* if the module doesn't have jump label entries, just return */ - if (!mod->num_jump_entries) - return; - for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { - head = &jump_label_table[i]; - hlist_for_each_entry_safe(e, node, node_next, head, hlist) { - hlist_for_each_entry_safe(e_module, module_node, - module_node_next, - &(e->modules), hlist) { - if (e_module->mod != mod) - continue; - count = e_module->nr_entries; - iter = e_module->table; - while (count--) { - if (within_module_init(iter->code, mod)) - iter->key = 0; - iter++; - } - } - } + for (iter = iter_start; iter < iter_stop; iter++) { + if (within_module_init(iter->code, mod)) + iter->code = 0; } } @@ -426,59 +305,77 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val, switch (val) { case MODULE_STATE_COMING: jump_label_lock(); - ret = add_jump_label_module(mod); + ret = jump_label_add_module(mod); if (ret) - remove_jump_label_module(mod); + jump_label_del_module(mod); jump_label_unlock(); break; case MODULE_STATE_GOING: jump_label_lock(); - remove_jump_label_module(mod); + jump_label_del_module(mod); jump_label_unlock(); break; case MODULE_STATE_LIVE: jump_label_lock(); - remove_jump_label_module_init(mod); + jump_label_invalidate_module_init(mod); jump_label_unlock(); break; } - return ret; -} -/*** - * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop() - * @mod: module to patch - * - * Allow for run-time selection of the optimal nops. Before the module - * loads patch these with arch_get_jump_label_nop(), which is specified by - * the arch specific jump label code. - */ -void jump_label_apply_nops(struct module *mod) -{ - struct jump_entry *iter; - - /* if the module doesn't have jump label entries, just return */ - if (!mod->num_jump_entries) - return; - - iter = mod->jump_entries; - while (iter < mod->jump_entries + mod->num_jump_entries) { - arch_jump_label_text_poke_early(iter->code); - iter++; - } + return notifier_from_errno(ret); } struct notifier_block jump_label_module_nb = { .notifier_call = jump_label_module_notify, - .priority = 0, + .priority = 1, /* higher than tracepoints */ }; -static __init int init_jump_label_module(void) +static __init int jump_label_init_module(void) { return register_module_notifier(&jump_label_module_nb); } -early_initcall(init_jump_label_module); +early_initcall(jump_label_init_module); #endif /* CONFIG_MODULES */ +/*** + * jump_label_text_reserved - check if addr range is reserved + * @start: start text addr + * @end: end text addr + * + * checks if the text addr located between @start and @end + * overlaps with any of the jump label patch addresses. Code + * that wants to modify kernel text should first verify that + * it does not overlap with any of the jump label addresses. + * Caller must hold jump_label_mutex. + * + * returns 1 if there is an overlap, 0 otherwise + */ +int jump_label_text_reserved(void *start, void *end) +{ + int ret = __jump_label_text_reserved(__start___jump_table, + __stop___jump_table, start, end); + + if (ret) + return ret; + +#ifdef CONFIG_MODULES + ret = __jump_label_mod_text_reserved(start, end); +#endif + return ret; +} + +static void jump_label_update(struct jump_label_key *key, int enable) +{ + struct jump_entry *entry = key->entries; + + /* if there are no users, entry can be NULL */ + if (entry) + __jump_label_update(key, entry, enable); + +#ifdef CONFIG_MODULES + __jump_label_mod_update(key, enable); +#endif +} + #endif diff --git a/kernel/perf_event.c b/kernel/perf_event.c index c75925c4d1e2..d665e92fbd44 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -125,7 +125,7 @@ enum event_type_t { * perf_sched_events : >0 events exist * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu */ -atomic_t perf_sched_events __read_mostly; +struct jump_label_key perf_sched_events __read_mostly; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); static atomic_t nr_mmap_events __read_mostly; @@ -5417,7 +5417,7 @@ fail: return err; } -atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; +struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; static void sw_perf_event_destroy(struct perf_event *event) { diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 68187af4889e..b219f1449c54 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -251,9 +251,9 @@ static void set_tracepoint(struct tracepoint_entry **entry, { WARN_ON(strcmp((*entry)->name, elem->name) != 0); - if (elem->regfunc && !elem->state && active) + if (elem->regfunc && !jump_label_enabled(&elem->key) && active) elem->regfunc(); - else if (elem->unregfunc && elem->state && !active) + else if (elem->unregfunc && jump_label_enabled(&elem->key) && !active) elem->unregfunc(); /* @@ -264,13 +264,10 @@ static void set_tracepoint(struct tracepoint_entry **entry, * is used. */ rcu_assign_pointer(elem->funcs, (*entry)->funcs); - if (!elem->state && active) { - jump_label_enable(&elem->state); - elem->state = active; - } else if (elem->state && !active) { - jump_label_disable(&elem->state); - elem->state = active; - } + if (active && !jump_label_enabled(&elem->key)) + jump_label_inc(&elem->key); + else if (!active && jump_label_enabled(&elem->key)) + jump_label_dec(&elem->key); } /* @@ -281,13 +278,11 @@ static void set_tracepoint(struct tracepoint_entry **entry, */ static void disable_tracepoint(struct tracepoint *elem) { - if (elem->unregfunc && elem->state) + if (elem->unregfunc && jump_label_enabled(&elem->key)) elem->unregfunc(); - if (elem->state) { - jump_label_disable(&elem->state); - elem->state = 0; - } + if (jump_label_enabled(&elem->key)) + jump_label_dec(&elem->key); rcu_assign_pointer(elem->funcs, NULL); } -- cgit v1.2.3 From 058e297d34a404caaa5ed277de15698d8dc43000 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 29 Apr 2011 22:35:33 -0400 Subject: ftrace: Only update the function code on write to filter files If function tracing is enabled, a read of the filter files will cause the call to stop_machine to update the function trace sites. It should only call stop_machine on write. Cc: stable@kernel.org Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ee24fa1935ac..666880d051ef 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2413,14 +2413,16 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) ftrace_match_records(parser->buffer, parser->idx, enable); } - mutex_lock(&ftrace_lock); - if (ftrace_start_up && ftrace_enabled) - ftrace_run_update_code(FTRACE_ENABLE_CALLS); - mutex_unlock(&ftrace_lock); - trace_parser_put(parser); kfree(iter); + if (file->f_mode & FMODE_WRITE) { + mutex_lock(&ftrace_lock); + if (ftrace_start_up && ftrace_enabled) + ftrace_run_update_code(FTRACE_ENABLE_CALLS); + mutex_unlock(&ftrace_lock); + } + mutex_unlock(&ftrace_regex_lock); return 0; } -- cgit v1.2.3 From 0778d9ad33898faab7bf6316108b471790376e35 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 29 Apr 2011 10:36:31 -0400 Subject: ftrace: Make FTRACE_WARN_ON() work in if condition Let FTRACE_WARN_ON() be used as a stand alone statement or inside a conditional: if (FTRACE_WARN_ON(x)) Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ee24fa1935ac..4ff65599c973 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -39,16 +39,20 @@ #include "trace_stat.h" #define FTRACE_WARN_ON(cond) \ - do { \ - if (WARN_ON(cond)) \ + ({ \ + int ___r = cond; \ + if (WARN_ON(___r)) \ ftrace_kill(); \ - } while (0) + ___r; \ + }) #define FTRACE_WARN_ON_ONCE(cond) \ - do { \ - if (WARN_ON_ONCE(cond)) \ + ({ \ + int ___r = cond; \ + if (WARN_ON_ONCE(___r)) \ ftrace_kill(); \ - } while (0) + ___r; \ + }) /* hash bits for specific function selection */ #define FTRACE_HASH_BITS 7 -- cgit v1.2.3 From 8ab2b7efd3e2ccf2c2dda3206b8171ecdbd0af40 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 21 Apr 2011 22:41:35 -0400 Subject: ftrace: Remove unnecessary disabling of irqs The disabling of interrupts around ftrace_update_code() was used to protect against the evil ftrace daemon from years past. But that daemon has long been killed. It is safe to keep interrupts enabled while updating the initial mcount into nops. The ftrace_mutex is also held which keeps other users at bay. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4ff65599c973..f199fb2e1d2c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2707,7 +2707,6 @@ static int ftrace_process_locs(struct module *mod, { unsigned long *p; unsigned long addr; - unsigned long flags; mutex_lock(&ftrace_lock); p = start; @@ -2724,10 +2723,7 @@ static int ftrace_process_locs(struct module *mod, ftrace_record_ip(addr); } - /* disable interrupts to prevent kstop machine */ - local_irq_save(flags); ftrace_update_code(mod); - local_irq_restore(flags); mutex_unlock(&ftrace_lock); return 0; -- cgit v1.2.3 From 3499e461147636bf55c41128d83b679ac6ab2d86 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 21 Apr 2011 22:59:12 -0400 Subject: ftrace: Remove failures file The failures file in the debugfs tracing directory would list the functions that failed to convert when the old dead ftrace daemon tried to update code but failed. Since this code is now dead along with the daemon the failures file is useless. Remove it. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 39 ++------------------------------------- 1 file changed, 2 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f199fb2e1d2c..97b30f818642 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1355,9 +1355,8 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) enum { FTRACE_ITER_FILTER = (1 << 0), FTRACE_ITER_NOTRACE = (1 << 1), - FTRACE_ITER_FAILURES = (1 << 2), - FTRACE_ITER_PRINTALL = (1 << 3), - FTRACE_ITER_HASH = (1 << 4), + FTRACE_ITER_PRINTALL = (1 << 2), + FTRACE_ITER_HASH = (1 << 3), }; #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ @@ -1487,12 +1486,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos) rec = &iter->pg->records[iter->idx++]; if ((rec->flags & FTRACE_FL_FREE) || - (!(iter->flags & FTRACE_ITER_FAILURES) && - (rec->flags & FTRACE_FL_FAILED)) || - - ((iter->flags & FTRACE_ITER_FAILURES) && - !(rec->flags & FTRACE_FL_FAILED)) || - ((iter->flags & FTRACE_ITER_FILTER) && !(rec->flags & FTRACE_FL_FILTER)) || @@ -1633,24 +1626,6 @@ ftrace_avail_open(struct inode *inode, struct file *file) return ret; } -static int -ftrace_failures_open(struct inode *inode, struct file *file) -{ - int ret; - struct seq_file *m; - struct ftrace_iterator *iter; - - ret = ftrace_avail_open(inode, file); - if (!ret) { - m = file->private_data; - iter = m->private; - iter->flags = FTRACE_ITER_FAILURES; - } - - return ret; -} - - static void ftrace_filter_reset(int enable) { struct ftrace_page *pg; @@ -2448,13 +2423,6 @@ static const struct file_operations ftrace_avail_fops = { .release = seq_release_private, }; -static const struct file_operations ftrace_failures_fops = { - .open = ftrace_failures_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_private, -}; - static const struct file_operations ftrace_filter_fops = { .open = ftrace_filter_open, .read = seq_read, @@ -2683,9 +2651,6 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) trace_create_file("available_filter_functions", 0444, d_tracer, NULL, &ftrace_avail_fops); - trace_create_file("failures", 0444, - d_tracer, NULL, &ftrace_failures_fops); - trace_create_file("set_ftrace_filter", 0644, d_tracer, NULL, &ftrace_filter_fops); -- cgit v1.2.3 From 45a4a2372b364107cabea79f255b333236626416 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 21 Apr 2011 23:16:46 -0400 Subject: ftrace: Remove FTRACE_FL_FAILED flag Since we disable all function tracer processing if we detect that a modification of a instruction had failed, we do not need to track that the record has failed. No more ftrace processing is allowed, and the FTRACE_FL_FAILED flag is pointless. Removing this flag simplifies some of the code, but some ftrace_disabled checks needed to be added or move around a little. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 9 +++--- kernel/trace/ftrace.c | 76 +++++++++++++++++++++++++++++++------------------- 2 files changed, 51 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index ca29e03c1fac..2a195ffd4269 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -147,11 +147,10 @@ extern int ftrace_text_reserved(void *start, void *end); enum { FTRACE_FL_FREE = (1 << 0), - FTRACE_FL_FAILED = (1 << 1), - FTRACE_FL_FILTER = (1 << 2), - FTRACE_FL_ENABLED = (1 << 3), - FTRACE_FL_NOTRACE = (1 << 4), - FTRACE_FL_CONVERTED = (1 << 5), + FTRACE_FL_FILTER = (1 << 1), + FTRACE_FL_ENABLED = (1 << 2), + FTRACE_FL_NOTRACE = (1 << 3), + FTRACE_FL_CONVERTED = (1 << 4), }; struct dyn_ftrace { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 97b30f818642..eb19fae2c54a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1083,19 +1083,20 @@ static void ftrace_replace_code(int enable) struct ftrace_page *pg; int failed; + if (unlikely(ftrace_disabled)) + return; + do_for_each_ftrace_rec(pg, rec) { /* * Skip over free records, records that have * failed and not converted. */ if (rec->flags & FTRACE_FL_FREE || - rec->flags & FTRACE_FL_FAILED || !(rec->flags & FTRACE_FL_CONVERTED)) continue; failed = __ftrace_replace_code(rec, enable); if (failed) { - rec->flags |= FTRACE_FL_FAILED; ftrace_bug(failed, rec->ip); /* Stop processing */ return; @@ -1111,10 +1112,12 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) ip = rec->ip; + if (unlikely(ftrace_disabled)) + return 0; + ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); if (ret) { ftrace_bug(ret, ip); - rec->flags |= FTRACE_FL_FAILED; return 0; } return 1; @@ -1466,6 +1469,9 @@ t_next(struct seq_file *m, void *v, loff_t *pos) struct ftrace_iterator *iter = m->private; struct dyn_ftrace *rec = NULL; + if (unlikely(ftrace_disabled)) + return NULL; + if (iter->flags & FTRACE_ITER_HASH) return t_hash_next(m, pos); @@ -1518,6 +1524,10 @@ static void *t_start(struct seq_file *m, loff_t *pos) loff_t l; mutex_lock(&ftrace_lock); + + if (unlikely(ftrace_disabled)) + return NULL; + /* * If an lseek was done, then reset and start from beginning. */ @@ -1636,8 +1646,6 @@ static void ftrace_filter_reset(int enable) if (enable) ftrace_filtered = 0; do_for_each_ftrace_rec(pg, rec) { - if (rec->flags & FTRACE_FL_FAILED) - continue; rec->flags &= ~type; } while_for_each_ftrace_rec(); mutex_unlock(&ftrace_lock); @@ -1767,9 +1775,6 @@ static int ftrace_match_records(char *buff, int len, int enable) mutex_lock(&ftrace_lock); do_for_each_ftrace_rec(pg, rec) { - if (rec->flags & FTRACE_FL_FAILED) - continue; - if (ftrace_match_record(rec, search, search_len, type)) { if (not) rec->flags &= ~flag; @@ -1837,10 +1842,11 @@ static int ftrace_match_module_records(char *buff, char *mod, int enable) } mutex_lock(&ftrace_lock); - do_for_each_ftrace_rec(pg, rec) { - if (rec->flags & FTRACE_FL_FAILED) - continue; + if (unlikely(ftrace_disabled)) + goto out_unlock; + + do_for_each_ftrace_rec(pg, rec) { if (ftrace_match_module_record(rec, mod, search, search_len, type)) { @@ -1854,6 +1860,7 @@ static int ftrace_match_module_records(char *buff, char *mod, int enable) ftrace_filtered = 1; } while_for_each_ftrace_rec(); + out_unlock: mutex_unlock(&ftrace_lock); return found; @@ -2008,10 +2015,11 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, return -EINVAL; mutex_lock(&ftrace_lock); - do_for_each_ftrace_rec(pg, rec) { - if (rec->flags & FTRACE_FL_FAILED) - continue; + if (unlikely(ftrace_disabled)) + goto out_unlock; + + do_for_each_ftrace_rec(pg, rec) { if (!ftrace_match_record(rec, search, len, type)) continue; @@ -2218,6 +2226,10 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, mutex_lock(&ftrace_regex_lock); + ret = -ENODEV; + if (unlikely(ftrace_disabled)) + goto out_unlock; + if (file->f_mode & FMODE_READ) { struct seq_file *m = file->private_data; iter = m->private; @@ -2545,9 +2557,6 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) bool exists; int i; - if (ftrace_disabled) - return -ENODEV; - /* decode regex */ type = filter_parse_regex(buffer, strlen(buffer), &search, ¬); if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS) @@ -2556,9 +2565,15 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) search_len = strlen(search); mutex_lock(&ftrace_lock); + + if (unlikely(ftrace_disabled)) { + mutex_unlock(&ftrace_lock); + return -ENODEV; + } + do_for_each_ftrace_rec(pg, rec) { - if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) + if (rec->flags & FTRACE_FL_FREE) continue; if (ftrace_match_record(rec, search, search_len, type)) { @@ -2700,10 +2715,11 @@ void ftrace_release_mod(struct module *mod) struct dyn_ftrace *rec; struct ftrace_page *pg; + mutex_lock(&ftrace_lock); + if (ftrace_disabled) - return; + goto out_unlock; - mutex_lock(&ftrace_lock); do_for_each_ftrace_rec(pg, rec) { if (within_module_core(rec->ip, mod)) { /* @@ -2714,6 +2730,7 @@ void ftrace_release_mod(struct module *mod) ftrace_free_rec(rec); } } while_for_each_ftrace_rec(); + out_unlock: mutex_unlock(&ftrace_lock); } @@ -3108,16 +3125,17 @@ void ftrace_kill(void) */ int register_ftrace_function(struct ftrace_ops *ops) { - int ret; - - if (unlikely(ftrace_disabled)) - return -1; + int ret = -1; mutex_lock(&ftrace_lock); + if (unlikely(ftrace_disabled)) + goto out_unlock; + ret = __register_ftrace_function(ops); ftrace_startup(0); + out_unlock: mutex_unlock(&ftrace_lock); return ret; } @@ -3145,14 +3163,14 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret; - - if (unlikely(ftrace_disabled)) - return -ENODEV; + int ret = -ENODEV; mutex_lock(&ftrace_lock); - ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (unlikely(ftrace_disabled)) + goto out; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) goto out; -- cgit v1.2.3 From d2c8c3eafbf715306ec891e7ca52d3d999acbe31 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 25 Apr 2011 14:32:42 -0400 Subject: ftrace: Remove FTRACE_FL_CONVERTED flag Since we disable all function tracer processing if we detect that a modification of a instruction had failed, we do not need to track that the record has failed. No more ftrace processing is allowed, and the FTRACE_FL_CONVERTED flag is pointless. The FTRACE_FL_CONVERTED flag was used to denote records that were successfully converted from mcount calls into nops. But if a single record fails, all of ftrace is disabled. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 1 - kernel/trace/ftrace.c | 12 ++++-------- 2 files changed, 4 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 2a195ffd4269..32047449b309 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -150,7 +150,6 @@ enum { FTRACE_FL_FILTER = (1 << 1), FTRACE_FL_ENABLED = (1 << 2), FTRACE_FL_NOTRACE = (1 << 3), - FTRACE_FL_CONVERTED = (1 << 4), }; struct dyn_ftrace { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index eb19fae2c54a..9abaaf46f212 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1087,12 +1087,8 @@ static void ftrace_replace_code(int enable) return; do_for_each_ftrace_rec(pg, rec) { - /* - * Skip over free records, records that have - * failed and not converted. - */ - if (rec->flags & FTRACE_FL_FREE || - !(rec->flags & FTRACE_FL_CONVERTED)) + /* Skip over free records */ + if (rec->flags & FTRACE_FL_FREE) continue; failed = __ftrace_replace_code(rec, enable); @@ -1280,10 +1276,10 @@ static int ftrace_update_code(struct module *mod) */ if (!ftrace_code_disable(mod, p)) { ftrace_free_rec(p); - continue; + /* Game over */ + break; } - p->flags |= FTRACE_FL_CONVERTED; ftrace_update_cnt++; /* -- cgit v1.2.3 From 996e87be7f537fb34638875dd37083166c733425 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 26 Apr 2011 16:11:03 -0400 Subject: ftrace: Move record update for normal and modules into a separate function The updating of a function record is moved to a single function. This will allow us to add specific changes in one location for both modules and kernel functions. Later patches will determine if the function record itself needs to be updated (which enables the mcount caller), or just the ftrace_ops needs the update. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9abaaf46f212..5b758ea344ce 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1743,6 +1743,15 @@ static int ftrace_match(char *str, char *regex, int len, int type) return matched; } +static void +update_record(struct dyn_ftrace *rec, unsigned long flag, int not) +{ + if (not) + rec->flags &= ~flag; + else + rec->flags |= flag; +} + static int ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type) { @@ -1772,10 +1781,7 @@ static int ftrace_match_records(char *buff, int len, int enable) do_for_each_ftrace_rec(pg, rec) { if (ftrace_match_record(rec, search, search_len, type)) { - if (not) - rec->flags &= ~flag; - else - rec->flags |= flag; + update_record(rec, flag, not); found = 1; } /* @@ -1846,10 +1852,7 @@ static int ftrace_match_module_records(char *buff, char *mod, int enable) if (ftrace_match_module_record(rec, mod, search, search_len, type)) { - if (not) - rec->flags &= ~flag; - else - rec->flags |= flag; + update_record(rec, flag, not); found = 1; } if (enable && (rec->flags & FTRACE_FL_FILTER)) -- cgit v1.2.3 From 491d0dcfb9707e1f83eff93ca503eb7573162ef2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 27 Apr 2011 21:43:36 -0400 Subject: ftrace: Consolidate updating of ftrace_trace_function There are three locations that perform almost identical functions in order to update the ftrace_trace_function (the ftrace function variable that gets called by mcount). Consolidate these into a single function called update_ftrace_function(). Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 95 ++++++++++++++++++--------------------------------- 1 file changed, 34 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5b758ea344ce..33bcc71ca09a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -151,6 +151,34 @@ static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip) } #endif +static void update_ftrace_function(void) +{ + ftrace_func_t func; + + /* + * If there's only one function registered, then call that + * function directly. Otherwise, we need to iterate over the + * registered callers. + */ + if (ftrace_list == &ftrace_list_end || + ftrace_list->next == &ftrace_list_end) + func = ftrace_list->func; + else + func = ftrace_list_func; + + /* If we filter on pids, update to use the pid function */ + if (!list_empty(&ftrace_pids)) { + set_ftrace_pid_function(func); + func = ftrace_pid_func; + } +#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST + ftrace_trace_function = func; +#else + __ftrace_trace_function = func; + ftrace_trace_function = ftrace_test_stop_func; +#endif +} + static int __register_ftrace_function(struct ftrace_ops *ops) { ops->next = ftrace_list; @@ -162,30 +190,8 @@ static int __register_ftrace_function(struct ftrace_ops *ops) */ rcu_assign_pointer(ftrace_list, ops); - if (ftrace_enabled) { - ftrace_func_t func; - - if (ops->next == &ftrace_list_end) - func = ops->func; - else - func = ftrace_list_func; - - if (!list_empty(&ftrace_pids)) { - set_ftrace_pid_function(func); - func = ftrace_pid_func; - } - - /* - * For one func, simply call it directly. - * For more than one func, call the chain. - */ -#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST - ftrace_trace_function = func; -#else - __ftrace_trace_function = func; - ftrace_trace_function = ftrace_test_stop_func; -#endif - } + if (ftrace_enabled) + update_ftrace_function(); return 0; } @@ -213,52 +219,19 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) *p = (*p)->next; - if (ftrace_enabled) { - /* If we only have one func left, then call that directly */ - if (ftrace_list->next == &ftrace_list_end) { - ftrace_func_t func = ftrace_list->func; - - if (!list_empty(&ftrace_pids)) { - set_ftrace_pid_function(func); - func = ftrace_pid_func; - } -#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST - ftrace_trace_function = func; -#else - __ftrace_trace_function = func; -#endif - } - } + if (ftrace_enabled) + update_ftrace_function(); return 0; } static void ftrace_update_pid_func(void) { - ftrace_func_t func; - + /* Only do something if we are tracing something */ if (ftrace_trace_function == ftrace_stub) return; -#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST - func = ftrace_trace_function; -#else - func = __ftrace_trace_function; -#endif - - if (!list_empty(&ftrace_pids)) { - set_ftrace_pid_function(func); - func = ftrace_pid_func; - } else { - if (func == ftrace_pid_func) - func = ftrace_pid_function; - } - -#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST - ftrace_trace_function = func; -#else - __ftrace_trace_function = func; -#endif + update_ftrace_function(); } #ifdef CONFIG_FUNCTION_PROFILER -- cgit v1.2.3 From b9df92d2a94eef8811061aecb1396290df440e2e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 28 Apr 2011 20:32:08 -0400 Subject: ftrace: Consolidate the function match routines for normal and mods The code used for matching functions is almost identical between normal selecting of functions and using the :mod: feature of set_ftrace_notrace. Consolidate the two users into one function. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 98 +++++++++++++++++++-------------------------------- 1 file changed, 36 insertions(+), 62 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 33bcc71ca09a..4f19dbba12f9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1726,34 +1726,52 @@ update_record(struct dyn_ftrace *rec, unsigned long flag, int not) } static int -ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type) +ftrace_match_record(struct dyn_ftrace *rec, char *mod, + char *regex, int len, int type) { char str[KSYM_SYMBOL_LEN]; + char *modname; + + kallsyms_lookup(rec->ip, NULL, NULL, &modname, str); + + if (mod) { + /* module lookup requires matching the module */ + if (!modname || strcmp(modname, mod)) + return 0; + + /* blank search means to match all funcs in the mod */ + if (!len) + return 1; + } - kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); return ftrace_match(str, regex, len, type); } -static int ftrace_match_records(char *buff, int len, int enable) +static int match_records(char *buff, int len, char *mod, int enable, int not) { - unsigned int search_len; + unsigned search_len = 0; struct ftrace_page *pg; struct dyn_ftrace *rec; + int type = MATCH_FULL; + char *search = buff; unsigned long flag; - char *search; - int type; - int not; int found = 0; - flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; - type = filter_parse_regex(buff, len, &search, ¬); + if (len) { + type = filter_parse_regex(buff, len, &search, ¬); + search_len = strlen(search); + } - search_len = strlen(search); + flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; mutex_lock(&ftrace_lock); + + if (unlikely(ftrace_disabled)) + goto out_unlock; + do_for_each_ftrace_rec(pg, rec) { - if (ftrace_match_record(rec, search, search_len, type)) { + if (ftrace_match_record(rec, mod, search, search_len, type)) { update_record(rec, flag, not); found = 1; } @@ -1763,43 +1781,23 @@ static int ftrace_match_records(char *buff, int len, int enable) */ if (enable && (rec->flags & FTRACE_FL_FILTER)) ftrace_filtered = 1; + } while_for_each_ftrace_rec(); + out_unlock: mutex_unlock(&ftrace_lock); return found; } static int -ftrace_match_module_record(struct dyn_ftrace *rec, char *mod, - char *regex, int len, int type) +ftrace_match_records(char *buff, int len, int enable) { - char str[KSYM_SYMBOL_LEN]; - char *modname; - - kallsyms_lookup(rec->ip, NULL, NULL, &modname, str); - - if (!modname || strcmp(modname, mod)) - return 0; - - /* blank search means to match all funcs in the mod */ - if (len) - return ftrace_match(str, regex, len, type); - else - return 1; + return match_records(buff, len, NULL, enable, 0); } static int ftrace_match_module_records(char *buff, char *mod, int enable) { - unsigned search_len = 0; - struct ftrace_page *pg; - struct dyn_ftrace *rec; - int type = MATCH_FULL; - char *search = buff; - unsigned long flag; int not = 0; - int found = 0; - - flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; /* blank or '*' mean the same */ if (strcmp(buff, "*") == 0) @@ -1811,31 +1809,7 @@ static int ftrace_match_module_records(char *buff, char *mod, int enable) not = 1; } - if (strlen(buff)) { - type = filter_parse_regex(buff, strlen(buff), &search, ¬); - search_len = strlen(search); - } - - mutex_lock(&ftrace_lock); - - if (unlikely(ftrace_disabled)) - goto out_unlock; - - do_for_each_ftrace_rec(pg, rec) { - - if (ftrace_match_module_record(rec, mod, - search, search_len, type)) { - update_record(rec, flag, not); - found = 1; - } - if (enable && (rec->flags & FTRACE_FL_FILTER)) - ftrace_filtered = 1; - - } while_for_each_ftrace_rec(); - out_unlock: - mutex_unlock(&ftrace_lock); - - return found; + return match_records(buff, strlen(buff), mod, enable, not); } /* @@ -1993,7 +1967,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, do_for_each_ftrace_rec(pg, rec) { - if (!ftrace_match_record(rec, search, len, type)) + if (!ftrace_match_record(rec, NULL, search, len, type)) continue; entry = kmalloc(sizeof(*entry), GFP_KERNEL); @@ -2548,7 +2522,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) if (rec->flags & FTRACE_FL_FREE) continue; - if (ftrace_match_record(rec, search, search_len, type)) { + if (ftrace_match_record(rec, NULL, search, search_len, type)) { /* if it is in the array */ exists = false; for (i = 0; i < *idx; i++) { -- cgit v1.2.3 From fae85b7c8bcc7de9c0a2698587e20c15beb7d5a6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 26 Oct 2010 20:24:03 +0200 Subject: perf: Start the restructuring mv kernel/perf_event.c -> kernel/events/core.c. From there, all further sensible splitting can happen. The idea is that due to perf_event.c becoming pretty sizable and with the advent of the marriage with ftrace, splitting functionality into its logical parts should help speeding up the unification and to manage the complexity of the subsystem. Signed-off-by: Borislav Petkov --- kernel/Makefile | 5 +- kernel/events/Makefile | 5 + kernel/events/core.c | 7455 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/perf_event.c | 7455 ------------------------------------------------ 4 files changed, 7463 insertions(+), 7457 deletions(-) create mode 100644 kernel/events/Makefile create mode 100644 kernel/events/core.c delete mode 100644 kernel/perf_event.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 85cbfb31e73e..79815306474f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -21,7 +21,6 @@ CFLAGS_REMOVE_mutex-debug.o = -pg CFLAGS_REMOVE_rtmutex-debug.o = -pg CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_sched_clock.o = -pg -CFLAGS_REMOVE_perf_event.o = -pg CFLAGS_REMOVE_irq_work.o = -pg endif @@ -103,7 +102,9 @@ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_TRACEPOINTS) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o obj-$(CONFIG_IRQ_WORK) += irq_work.o -obj-$(CONFIG_PERF_EVENTS) += perf_event.o + +obj-$(CONFIG_PERF_EVENTS) += events/ + obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o obj-$(CONFIG_PADATA) += padata.o diff --git a/kernel/events/Makefile b/kernel/events/Makefile new file mode 100644 index 000000000000..26c00e4570e5 --- /dev/null +++ b/kernel/events/Makefile @@ -0,0 +1,5 @@ +ifdef CONFIG_FUNCTION_TRACER +CFLAGS_REMOVE_core.o = -pg +endif + +obj-y += core.o diff --git a/kernel/events/core.c b/kernel/events/core.c new file mode 100644 index 000000000000..440bc485bbff --- /dev/null +++ b/kernel/events/core.c @@ -0,0 +1,7455 @@ +/* + * Performance events core code: + * + * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright © 2009 Paul Mackerras, IBM Corp. + * + * For licensing details see kernel-base/COPYING + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +struct remote_function_call { + struct task_struct *p; + int (*func)(void *info); + void *info; + int ret; +}; + +static void remote_function(void *data) +{ + struct remote_function_call *tfc = data; + struct task_struct *p = tfc->p; + + if (p) { + tfc->ret = -EAGAIN; + if (task_cpu(p) != smp_processor_id() || !task_curr(p)) + return; + } + + tfc->ret = tfc->func(tfc->info); +} + +/** + * task_function_call - call a function on the cpu on which a task runs + * @p: the task to evaluate + * @func: the function to be called + * @info: the function call argument + * + * Calls the function @func when the task is currently running. This might + * be on the current CPU, which just calls the function directly + * + * returns: @func return value, or + * -ESRCH - when the process isn't running + * -EAGAIN - when the process moved away + */ +static int +task_function_call(struct task_struct *p, int (*func) (void *info), void *info) +{ + struct remote_function_call data = { + .p = p, + .func = func, + .info = info, + .ret = -ESRCH, /* No such (running) process */ + }; + + if (task_curr(p)) + smp_call_function_single(task_cpu(p), remote_function, &data, 1); + + return data.ret; +} + +/** + * cpu_function_call - call a function on the cpu + * @func: the function to be called + * @info: the function call argument + * + * Calls the function @func on the remote cpu. + * + * returns: @func return value or -ENXIO when the cpu is offline + */ +static int cpu_function_call(int cpu, int (*func) (void *info), void *info) +{ + struct remote_function_call data = { + .p = NULL, + .func = func, + .info = info, + .ret = -ENXIO, /* No such CPU */ + }; + + smp_call_function_single(cpu, remote_function, &data, 1); + + return data.ret; +} + +#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ + PERF_FLAG_FD_OUTPUT |\ + PERF_FLAG_PID_CGROUP) + +enum event_type_t { + EVENT_FLEXIBLE = 0x1, + EVENT_PINNED = 0x2, + EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, +}; + +/* + * perf_sched_events : >0 events exist + * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu + */ +struct jump_label_key perf_sched_events __read_mostly; +static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); + +static atomic_t nr_mmap_events __read_mostly; +static atomic_t nr_comm_events __read_mostly; +static atomic_t nr_task_events __read_mostly; + +static LIST_HEAD(pmus); +static DEFINE_MUTEX(pmus_lock); +static struct srcu_struct pmus_srcu; + +/* + * perf event paranoia level: + * -1 - not paranoid at all + * 0 - disallow raw tracepoint access for unpriv + * 1 - disallow cpu events for unpriv + * 2 - disallow kernel profiling for unpriv + */ +int sysctl_perf_event_paranoid __read_mostly = 1; + +/* Minimum for 512 kiB + 1 user control page */ +int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ + +/* + * max perf event sample rate + */ +#define DEFAULT_MAX_SAMPLE_RATE 100000 +int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; +static int max_samples_per_tick __read_mostly = + DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); + +int perf_proc_update_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret = proc_dointvec(table, write, buffer, lenp, ppos); + + if (ret || !write) + return ret; + + max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); + + return 0; +} + +static atomic64_t perf_event_id; + +static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, + enum event_type_t event_type); + +static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, + enum event_type_t event_type, + struct task_struct *task); + +static void update_context_time(struct perf_event_context *ctx); +static u64 perf_event_time(struct perf_event *event); + +void __weak perf_event_print_debug(void) { } + +extern __weak const char *perf_pmu_name(void) +{ + return "pmu"; +} + +static inline u64 perf_clock(void) +{ + return local_clock(); +} + +static inline struct perf_cpu_context * +__get_cpu_context(struct perf_event_context *ctx) +{ + return this_cpu_ptr(ctx->pmu->pmu_cpu_context); +} + +#ifdef CONFIG_CGROUP_PERF + +/* + * Must ensure cgroup is pinned (css_get) before calling + * this function. In other words, we cannot call this function + * if there is no cgroup event for the current CPU context. + */ +static inline struct perf_cgroup * +perf_cgroup_from_task(struct task_struct *task) +{ + return container_of(task_subsys_state(task, perf_subsys_id), + struct perf_cgroup, css); +} + +static inline bool +perf_cgroup_match(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + + return !event->cgrp || event->cgrp == cpuctx->cgrp; +} + +static inline void perf_get_cgroup(struct perf_event *event) +{ + css_get(&event->cgrp->css); +} + +static inline void perf_put_cgroup(struct perf_event *event) +{ + css_put(&event->cgrp->css); +} + +static inline void perf_detach_cgroup(struct perf_event *event) +{ + perf_put_cgroup(event); + event->cgrp = NULL; +} + +static inline int is_cgroup_event(struct perf_event *event) +{ + return event->cgrp != NULL; +} + +static inline u64 perf_cgroup_event_time(struct perf_event *event) +{ + struct perf_cgroup_info *t; + + t = per_cpu_ptr(event->cgrp->info, event->cpu); + return t->time; +} + +static inline void __update_cgrp_time(struct perf_cgroup *cgrp) +{ + struct perf_cgroup_info *info; + u64 now; + + now = perf_clock(); + + info = this_cpu_ptr(cgrp->info); + + info->time += now - info->timestamp; + info->timestamp = now; +} + +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) +{ + struct perf_cgroup *cgrp_out = cpuctx->cgrp; + if (cgrp_out) + __update_cgrp_time(cgrp_out); +} + +static inline void update_cgrp_time_from_event(struct perf_event *event) +{ + struct perf_cgroup *cgrp; + + /* + * ensure we access cgroup data only when needed and + * when we know the cgroup is pinned (css_get) + */ + if (!is_cgroup_event(event)) + return; + + cgrp = perf_cgroup_from_task(current); + /* + * Do not update time when cgroup is not active + */ + if (cgrp == event->cgrp) + __update_cgrp_time(event->cgrp); +} + +static inline void +perf_cgroup_set_timestamp(struct task_struct *task, + struct perf_event_context *ctx) +{ + struct perf_cgroup *cgrp; + struct perf_cgroup_info *info; + + /* + * ctx->lock held by caller + * ensure we do not access cgroup data + * unless we have the cgroup pinned (css_get) + */ + if (!task || !ctx->nr_cgroups) + return; + + cgrp = perf_cgroup_from_task(task); + info = this_cpu_ptr(cgrp->info); + info->timestamp = ctx->timestamp; +} + +#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */ +#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */ + +/* + * reschedule events based on the cgroup constraint of task. + * + * mode SWOUT : schedule out everything + * mode SWIN : schedule in based on cgroup for next + */ +void perf_cgroup_switch(struct task_struct *task, int mode) +{ + struct perf_cpu_context *cpuctx; + struct pmu *pmu; + unsigned long flags; + + /* + * disable interrupts to avoid geting nr_cgroup + * changes via __perf_event_disable(). Also + * avoids preemption. + */ + local_irq_save(flags); + + /* + * we reschedule only in the presence of cgroup + * constrained events. + */ + rcu_read_lock(); + + list_for_each_entry_rcu(pmu, &pmus, entry) { + + cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + + perf_pmu_disable(cpuctx->ctx.pmu); + + /* + * perf_cgroup_events says at least one + * context on this CPU has cgroup events. + * + * ctx->nr_cgroups reports the number of cgroup + * events for a context. + */ + if (cpuctx->ctx.nr_cgroups > 0) { + + if (mode & PERF_CGROUP_SWOUT) { + cpu_ctx_sched_out(cpuctx, EVENT_ALL); + /* + * must not be done before ctxswout due + * to event_filter_match() in event_sched_out() + */ + cpuctx->cgrp = NULL; + } + + if (mode & PERF_CGROUP_SWIN) { + WARN_ON_ONCE(cpuctx->cgrp); + /* set cgrp before ctxsw in to + * allow event_filter_match() to not + * have to pass task around + */ + cpuctx->cgrp = perf_cgroup_from_task(task); + cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); + } + } + + perf_pmu_enable(cpuctx->ctx.pmu); + } + + rcu_read_unlock(); + + local_irq_restore(flags); +} + +static inline void perf_cgroup_sched_out(struct task_struct *task) +{ + perf_cgroup_switch(task, PERF_CGROUP_SWOUT); +} + +static inline void perf_cgroup_sched_in(struct task_struct *task) +{ + perf_cgroup_switch(task, PERF_CGROUP_SWIN); +} + +static inline int perf_cgroup_connect(int fd, struct perf_event *event, + struct perf_event_attr *attr, + struct perf_event *group_leader) +{ + struct perf_cgroup *cgrp; + struct cgroup_subsys_state *css; + struct file *file; + int ret = 0, fput_needed; + + file = fget_light(fd, &fput_needed); + if (!file) + return -EBADF; + + css = cgroup_css_from_dir(file, perf_subsys_id); + if (IS_ERR(css)) { + ret = PTR_ERR(css); + goto out; + } + + cgrp = container_of(css, struct perf_cgroup, css); + event->cgrp = cgrp; + + /* must be done before we fput() the file */ + perf_get_cgroup(event); + + /* + * all events in a group must monitor + * the same cgroup because a task belongs + * to only one perf cgroup at a time + */ + if (group_leader && group_leader->cgrp != cgrp) { + perf_detach_cgroup(event); + ret = -EINVAL; + } +out: + fput_light(file, fput_needed); + return ret; +} + +static inline void +perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) +{ + struct perf_cgroup_info *t; + t = per_cpu_ptr(event->cgrp->info, event->cpu); + event->shadow_ctx_time = now - t->timestamp; +} + +static inline void +perf_cgroup_defer_enabled(struct perf_event *event) +{ + /* + * when the current task's perf cgroup does not match + * the event's, we need to remember to call the + * perf_mark_enable() function the first time a task with + * a matching perf cgroup is scheduled in. + */ + if (is_cgroup_event(event) && !perf_cgroup_match(event)) + event->cgrp_defer_enabled = 1; +} + +static inline void +perf_cgroup_mark_enabled(struct perf_event *event, + struct perf_event_context *ctx) +{ + struct perf_event *sub; + u64 tstamp = perf_event_time(event); + + if (!event->cgrp_defer_enabled) + return; + + event->cgrp_defer_enabled = 0; + + event->tstamp_enabled = tstamp - event->total_time_enabled; + list_for_each_entry(sub, &event->sibling_list, group_entry) { + if (sub->state >= PERF_EVENT_STATE_INACTIVE) { + sub->tstamp_enabled = tstamp - sub->total_time_enabled; + sub->cgrp_defer_enabled = 0; + } + } +} +#else /* !CONFIG_CGROUP_PERF */ + +static inline bool +perf_cgroup_match(struct perf_event *event) +{ + return true; +} + +static inline void perf_detach_cgroup(struct perf_event *event) +{} + +static inline int is_cgroup_event(struct perf_event *event) +{ + return 0; +} + +static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event) +{ + return 0; +} + +static inline void update_cgrp_time_from_event(struct perf_event *event) +{ +} + +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) +{ +} + +static inline void perf_cgroup_sched_out(struct task_struct *task) +{ +} + +static inline void perf_cgroup_sched_in(struct task_struct *task) +{ +} + +static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, + struct perf_event_attr *attr, + struct perf_event *group_leader) +{ + return -EINVAL; +} + +static inline void +perf_cgroup_set_timestamp(struct task_struct *task, + struct perf_event_context *ctx) +{ +} + +void +perf_cgroup_switch(struct task_struct *task, struct task_struct *next) +{ +} + +static inline void +perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) +{ +} + +static inline u64 perf_cgroup_event_time(struct perf_event *event) +{ + return 0; +} + +static inline void +perf_cgroup_defer_enabled(struct perf_event *event) +{ +} + +static inline void +perf_cgroup_mark_enabled(struct perf_event *event, + struct perf_event_context *ctx) +{ +} +#endif + +void perf_pmu_disable(struct pmu *pmu) +{ + int *count = this_cpu_ptr(pmu->pmu_disable_count); + if (!(*count)++) + pmu->pmu_disable(pmu); +} + +void perf_pmu_enable(struct pmu *pmu) +{ + int *count = this_cpu_ptr(pmu->pmu_disable_count); + if (!--(*count)) + pmu->pmu_enable(pmu); +} + +static DEFINE_PER_CPU(struct list_head, rotation_list); + +/* + * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized + * because they're strictly cpu affine and rotate_start is called with IRQs + * disabled, while rotate_context is called from IRQ context. + */ +static void perf_pmu_rotate_start(struct pmu *pmu) +{ + struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + struct list_head *head = &__get_cpu_var(rotation_list); + + WARN_ON(!irqs_disabled()); + + if (list_empty(&cpuctx->rotation_list)) + list_add(&cpuctx->rotation_list, head); +} + +static void get_ctx(struct perf_event_context *ctx) +{ + WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); +} + +static void free_ctx(struct rcu_head *head) +{ + struct perf_event_context *ctx; + + ctx = container_of(head, struct perf_event_context, rcu_head); + kfree(ctx); +} + +static void put_ctx(struct perf_event_context *ctx) +{ + if (atomic_dec_and_test(&ctx->refcount)) { + if (ctx->parent_ctx) + put_ctx(ctx->parent_ctx); + if (ctx->task) + put_task_struct(ctx->task); + call_rcu(&ctx->rcu_head, free_ctx); + } +} + +static void unclone_ctx(struct perf_event_context *ctx) +{ + if (ctx->parent_ctx) { + put_ctx(ctx->parent_ctx); + ctx->parent_ctx = NULL; + } +} + +static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) +{ + /* + * only top level events have the pid namespace they were created in + */ + if (event->parent) + event = event->parent; + + return task_tgid_nr_ns(p, event->ns); +} + +static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) +{ + /* + * only top level events have the pid namespace they were created in + */ + if (event->parent) + event = event->parent; + + return task_pid_nr_ns(p, event->ns); +} + +/* + * If we inherit events we want to return the parent event id + * to userspace. + */ +static u64 primary_event_id(struct perf_event *event) +{ + u64 id = event->id; + + if (event->parent) + id = event->parent->id; + + return id; +} + +/* + * Get the perf_event_context for a task and lock it. + * This has to cope with with the fact that until it is locked, + * the context could get moved to another task. + */ +static struct perf_event_context * +perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags) +{ + struct perf_event_context *ctx; + + rcu_read_lock(); +retry: + ctx = rcu_dereference(task->perf_event_ctxp[ctxn]); + if (ctx) { + /* + * If this context is a clone of another, it might + * get swapped for another underneath us by + * perf_event_task_sched_out, though the + * rcu_read_lock() protects us from any context + * getting freed. Lock the context and check if it + * got swapped before we could get the lock, and retry + * if so. If we locked the right context, then it + * can't get swapped on us any more. + */ + raw_spin_lock_irqsave(&ctx->lock, *flags); + if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) { + raw_spin_unlock_irqrestore(&ctx->lock, *flags); + goto retry; + } + + if (!atomic_inc_not_zero(&ctx->refcount)) { + raw_spin_unlock_irqrestore(&ctx->lock, *flags); + ctx = NULL; + } + } + rcu_read_unlock(); + return ctx; +} + +/* + * Get the context for a task and increment its pin_count so it + * can't get swapped to another task. This also increments its + * reference count so that the context can't get freed. + */ +static struct perf_event_context * +perf_pin_task_context(struct task_struct *task, int ctxn) +{ + struct perf_event_context *ctx; + unsigned long flags; + + ctx = perf_lock_task_context(task, ctxn, &flags); + if (ctx) { + ++ctx->pin_count; + raw_spin_unlock_irqrestore(&ctx->lock, flags); + } + return ctx; +} + +static void perf_unpin_context(struct perf_event_context *ctx) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&ctx->lock, flags); + --ctx->pin_count; + raw_spin_unlock_irqrestore(&ctx->lock, flags); +} + +/* + * Update the record of the current time in a context. + */ +static void update_context_time(struct perf_event_context *ctx) +{ + u64 now = perf_clock(); + + ctx->time += now - ctx->timestamp; + ctx->timestamp = now; +} + +static u64 perf_event_time(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + + if (is_cgroup_event(event)) + return perf_cgroup_event_time(event); + + return ctx ? ctx->time : 0; +} + +/* + * Update the total_time_enabled and total_time_running fields for a event. + */ +static void update_event_times(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + u64 run_end; + + if (event->state < PERF_EVENT_STATE_INACTIVE || + event->group_leader->state < PERF_EVENT_STATE_INACTIVE) + return; + /* + * in cgroup mode, time_enabled represents + * the time the event was enabled AND active + * tasks were in the monitored cgroup. This is + * independent of the activity of the context as + * there may be a mix of cgroup and non-cgroup events. + * + * That is why we treat cgroup events differently + * here. + */ + if (is_cgroup_event(event)) + run_end = perf_event_time(event); + else if (ctx->is_active) + run_end = ctx->time; + else + run_end = event->tstamp_stopped; + + event->total_time_enabled = run_end - event->tstamp_enabled; + + if (event->state == PERF_EVENT_STATE_INACTIVE) + run_end = event->tstamp_stopped; + else + run_end = perf_event_time(event); + + event->total_time_running = run_end - event->tstamp_running; + +} + +/* + * Update total_time_enabled and total_time_running for all events in a group. + */ +static void update_group_times(struct perf_event *leader) +{ + struct perf_event *event; + + update_event_times(leader); + list_for_each_entry(event, &leader->sibling_list, group_entry) + update_event_times(event); +} + +static struct list_head * +ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) +{ + if (event->attr.pinned) + return &ctx->pinned_groups; + else + return &ctx->flexible_groups; +} + +/* + * Add a event from the lists for its context. + * Must be called with ctx->mutex and ctx->lock held. + */ +static void +list_add_event(struct perf_event *event, struct perf_event_context *ctx) +{ + WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); + event->attach_state |= PERF_ATTACH_CONTEXT; + + /* + * If we're a stand alone event or group leader, we go to the context + * list, group events are kept attached to the group so that + * perf_group_detach can, at all times, locate all siblings. + */ + if (event->group_leader == event) { + struct list_head *list; + + if (is_software_event(event)) + event->group_flags |= PERF_GROUP_SOFTWARE; + + list = ctx_group_list(event, ctx); + list_add_tail(&event->group_entry, list); + } + + if (is_cgroup_event(event)) + ctx->nr_cgroups++; + + list_add_rcu(&event->event_entry, &ctx->event_list); + if (!ctx->nr_events) + perf_pmu_rotate_start(ctx->pmu); + ctx->nr_events++; + if (event->attr.inherit_stat) + ctx->nr_stat++; +} + +/* + * Called at perf_event creation and when events are attached/detached from a + * group. + */ +static void perf_event__read_size(struct perf_event *event) +{ + int entry = sizeof(u64); /* value */ + int size = 0; + int nr = 1; + + if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + size += sizeof(u64); + + if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + size += sizeof(u64); + + if (event->attr.read_format & PERF_FORMAT_ID) + entry += sizeof(u64); + + if (event->attr.read_format & PERF_FORMAT_GROUP) { + nr += event->group_leader->nr_siblings; + size += sizeof(u64); + } + + size += entry * nr; + event->read_size = size; +} + +static void perf_event__header_size(struct perf_event *event) +{ + struct perf_sample_data *data; + u64 sample_type = event->attr.sample_type; + u16 size = 0; + + perf_event__read_size(event); + + if (sample_type & PERF_SAMPLE_IP) + size += sizeof(data->ip); + + if (sample_type & PERF_SAMPLE_ADDR) + size += sizeof(data->addr); + + if (sample_type & PERF_SAMPLE_PERIOD) + size += sizeof(data->period); + + if (sample_type & PERF_SAMPLE_READ) + size += event->read_size; + + event->header_size = size; +} + +static void perf_event__id_header_size(struct perf_event *event) +{ + struct perf_sample_data *data; + u64 sample_type = event->attr.sample_type; + u16 size = 0; + + if (sample_type & PERF_SAMPLE_TID) + size += sizeof(data->tid_entry); + + if (sample_type & PERF_SAMPLE_TIME) + size += sizeof(data->time); + + if (sample_type & PERF_SAMPLE_ID) + size += sizeof(data->id); + + if (sample_type & PERF_SAMPLE_STREAM_ID) + size += sizeof(data->stream_id); + + if (sample_type & PERF_SAMPLE_CPU) + size += sizeof(data->cpu_entry); + + event->id_header_size = size; +} + +static void perf_group_attach(struct perf_event *event) +{ + struct perf_event *group_leader = event->group_leader, *pos; + + /* + * We can have double attach due to group movement in perf_event_open. + */ + if (event->attach_state & PERF_ATTACH_GROUP) + return; + + event->attach_state |= PERF_ATTACH_GROUP; + + if (group_leader == event) + return; + + if (group_leader->group_flags & PERF_GROUP_SOFTWARE && + !is_software_event(event)) + group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; + + list_add_tail(&event->group_entry, &group_leader->sibling_list); + group_leader->nr_siblings++; + + perf_event__header_size(group_leader); + + list_for_each_entry(pos, &group_leader->sibling_list, group_entry) + perf_event__header_size(pos); +} + +/* + * Remove a event from the lists for its context. + * Must be called with ctx->mutex and ctx->lock held. + */ +static void +list_del_event(struct perf_event *event, struct perf_event_context *ctx) +{ + struct perf_cpu_context *cpuctx; + /* + * We can have double detach due to exit/hot-unplug + close. + */ + if (!(event->attach_state & PERF_ATTACH_CONTEXT)) + return; + + event->attach_state &= ~PERF_ATTACH_CONTEXT; + + if (is_cgroup_event(event)) { + ctx->nr_cgroups--; + cpuctx = __get_cpu_context(ctx); + /* + * if there are no more cgroup events + * then cler cgrp to avoid stale pointer + * in update_cgrp_time_from_cpuctx() + */ + if (!ctx->nr_cgroups) + cpuctx->cgrp = NULL; + } + + ctx->nr_events--; + if (event->attr.inherit_stat) + ctx->nr_stat--; + + list_del_rcu(&event->event_entry); + + if (event->group_leader == event) + list_del_init(&event->group_entry); + + update_group_times(event); + + /* + * If event was in error state, then keep it + * that way, otherwise bogus counts will be + * returned on read(). The only way to get out + * of error state is by explicit re-enabling + * of the event + */ + if (event->state > PERF_EVENT_STATE_OFF) + event->state = PERF_EVENT_STATE_OFF; +} + +static void perf_group_detach(struct perf_event *event) +{ + struct perf_event *sibling, *tmp; + struct list_head *list = NULL; + + /* + * We can have double detach due to exit/hot-unplug + close. + */ + if (!(event->attach_state & PERF_ATTACH_GROUP)) + return; + + event->attach_state &= ~PERF_ATTACH_GROUP; + + /* + * If this is a sibling, remove it from its group. + */ + if (event->group_leader != event) { + list_del_init(&event->group_entry); + event->group_leader->nr_siblings--; + goto out; + } + + if (!list_empty(&event->group_entry)) + list = &event->group_entry; + + /* + * If this was a group event with sibling events then + * upgrade the siblings to singleton events by adding them + * to whatever list we are on. + */ + list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { + if (list) + list_move_tail(&sibling->group_entry, list); + sibling->group_leader = sibling; + + /* Inherit group flags from the previous leader */ + sibling->group_flags = event->group_flags; + } + +out: + perf_event__header_size(event->group_leader); + + list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry) + perf_event__header_size(tmp); +} + +static inline int +event_filter_match(struct perf_event *event) +{ + return (event->cpu == -1 || event->cpu == smp_processor_id()) + && perf_cgroup_match(event); +} + +static void +event_sched_out(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + u64 tstamp = perf_event_time(event); + u64 delta; + /* + * An event which could not be activated because of + * filter mismatch still needs to have its timings + * maintained, otherwise bogus information is return + * via read() for time_enabled, time_running: + */ + if (event->state == PERF_EVENT_STATE_INACTIVE + && !event_filter_match(event)) { + delta = tstamp - event->tstamp_stopped; + event->tstamp_running += delta; + event->tstamp_stopped = tstamp; + } + + if (event->state != PERF_EVENT_STATE_ACTIVE) + return; + + event->state = PERF_EVENT_STATE_INACTIVE; + if (event->pending_disable) { + event->pending_disable = 0; + event->state = PERF_EVENT_STATE_OFF; + } + event->tstamp_stopped = tstamp; + event->pmu->del(event, 0); + event->oncpu = -1; + + if (!is_software_event(event)) + cpuctx->active_oncpu--; + ctx->nr_active--; + if (event->attr.exclusive || !cpuctx->active_oncpu) + cpuctx->exclusive = 0; +} + +static void +group_sched_out(struct perf_event *group_event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + struct perf_event *event; + int state = group_event->state; + + event_sched_out(group_event, cpuctx, ctx); + + /* + * Schedule out siblings (if any): + */ + list_for_each_entry(event, &group_event->sibling_list, group_entry) + event_sched_out(event, cpuctx, ctx); + + if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive) + cpuctx->exclusive = 0; +} + +/* + * Cross CPU call to remove a performance event + * + * We disable the event on the hardware level first. After that we + * remove it from the context list. + */ +static int __perf_remove_from_context(void *info) +{ + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + + raw_spin_lock(&ctx->lock); + event_sched_out(event, cpuctx, ctx); + list_del_event(event, ctx); + raw_spin_unlock(&ctx->lock); + + return 0; +} + + +/* + * Remove the event from a task's (or a CPU's) list of events. + * + * CPU events are removed with a smp call. For task events we only + * call when the task is on a CPU. + * + * If event->ctx is a cloned context, callers must make sure that + * every task struct that event->ctx->task could possibly point to + * remains valid. This is OK when called from perf_release since + * that only calls us on the top-level context, which can't be a clone. + * When called from perf_event_exit_task, it's OK because the + * context has been detached from its task. + */ +static void perf_remove_from_context(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct task_struct *task = ctx->task; + + lockdep_assert_held(&ctx->mutex); + + if (!task) { + /* + * Per cpu events are removed via an smp call and + * the removal is always successful. + */ + cpu_function_call(event->cpu, __perf_remove_from_context, event); + return; + } + +retry: + if (!task_function_call(task, __perf_remove_from_context, event)) + return; + + raw_spin_lock_irq(&ctx->lock); + /* + * If we failed to find a running task, but find the context active now + * that we've acquired the ctx->lock, retry. + */ + if (ctx->is_active) { + raw_spin_unlock_irq(&ctx->lock); + goto retry; + } + + /* + * Since the task isn't running, its safe to remove the event, us + * holding the ctx->lock ensures the task won't get scheduled in. + */ + list_del_event(event, ctx); + raw_spin_unlock_irq(&ctx->lock); +} + +/* + * Cross CPU call to disable a performance event + */ +static int __perf_event_disable(void *info) +{ + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + + /* + * If this is a per-task event, need to check whether this + * event's task is the current task on this cpu. + * + * Can trigger due to concurrent perf_event_context_sched_out() + * flipping contexts around. + */ + if (ctx->task && cpuctx->task_ctx != ctx) + return -EINVAL; + + raw_spin_lock(&ctx->lock); + + /* + * If the event is on, turn it off. + * If it is in error state, leave it in error state. + */ + if (event->state >= PERF_EVENT_STATE_INACTIVE) { + update_context_time(ctx); + update_cgrp_time_from_event(event); + update_group_times(event); + if (event == event->group_leader) + group_sched_out(event, cpuctx, ctx); + else + event_sched_out(event, cpuctx, ctx); + event->state = PERF_EVENT_STATE_OFF; + } + + raw_spin_unlock(&ctx->lock); + + return 0; +} + +/* + * Disable a event. + * + * If event->ctx is a cloned context, callers must make sure that + * every task struct that event->ctx->task could possibly point to + * remains valid. This condition is satisifed when called through + * perf_event_for_each_child or perf_event_for_each because they + * hold the top-level event's child_mutex, so any descendant that + * goes to exit will block in sync_child_event. + * When called from perf_pending_event it's OK because event->ctx + * is the current context on this CPU and preemption is disabled, + * hence we can't get into perf_event_task_sched_out for this context. + */ +void perf_event_disable(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Disable the event on the cpu that it's on + */ + cpu_function_call(event->cpu, __perf_event_disable, event); + return; + } + +retry: + if (!task_function_call(task, __perf_event_disable, event)) + return; + + raw_spin_lock_irq(&ctx->lock); + /* + * If the event is still active, we need to retry the cross-call. + */ + if (event->state == PERF_EVENT_STATE_ACTIVE) { + raw_spin_unlock_irq(&ctx->lock); + /* + * Reload the task pointer, it might have been changed by + * a concurrent perf_event_context_sched_out(). + */ + task = ctx->task; + goto retry; + } + + /* + * Since we have the lock this context can't be scheduled + * in, so we can change the state safely. + */ + if (event->state == PERF_EVENT_STATE_INACTIVE) { + update_group_times(event); + event->state = PERF_EVENT_STATE_OFF; + } + raw_spin_unlock_irq(&ctx->lock); +} + +static void perf_set_shadow_time(struct perf_event *event, + struct perf_event_context *ctx, + u64 tstamp) +{ + /* + * use the correct time source for the time snapshot + * + * We could get by without this by leveraging the + * fact that to get to this function, the caller + * has most likely already called update_context_time() + * and update_cgrp_time_xx() and thus both timestamp + * are identical (or very close). Given that tstamp is, + * already adjusted for cgroup, we could say that: + * tstamp - ctx->timestamp + * is equivalent to + * tstamp - cgrp->timestamp. + * + * Then, in perf_output_read(), the calculation would + * work with no changes because: + * - event is guaranteed scheduled in + * - no scheduled out in between + * - thus the timestamp would be the same + * + * But this is a bit hairy. + * + * So instead, we have an explicit cgroup call to remain + * within the time time source all along. We believe it + * is cleaner and simpler to understand. + */ + if (is_cgroup_event(event)) + perf_cgroup_set_shadow_time(event, tstamp); + else + event->shadow_ctx_time = tstamp - ctx->timestamp; +} + +#define MAX_INTERRUPTS (~0ULL) + +static void perf_log_throttle(struct perf_event *event, int enable); + +static int +event_sched_in(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + u64 tstamp = perf_event_time(event); + + if (event->state <= PERF_EVENT_STATE_OFF) + return 0; + + event->state = PERF_EVENT_STATE_ACTIVE; + event->oncpu = smp_processor_id(); + + /* + * Unthrottle events, since we scheduled we might have missed several + * ticks already, also for a heavily scheduling task there is little + * guarantee it'll get a tick in a timely manner. + */ + if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) { + perf_log_throttle(event, 1); + event->hw.interrupts = 0; + } + + /* + * The new state must be visible before we turn it on in the hardware: + */ + smp_wmb(); + + if (event->pmu->add(event, PERF_EF_START)) { + event->state = PERF_EVENT_STATE_INACTIVE; + event->oncpu = -1; + return -EAGAIN; + } + + event->tstamp_running += tstamp - event->tstamp_stopped; + + perf_set_shadow_time(event, ctx, tstamp); + + if (!is_software_event(event)) + cpuctx->active_oncpu++; + ctx->nr_active++; + + if (event->attr.exclusive) + cpuctx->exclusive = 1; + + return 0; +} + +static int +group_sched_in(struct perf_event *group_event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + struct perf_event *event, *partial_group = NULL; + struct pmu *pmu = group_event->pmu; + u64 now = ctx->time; + bool simulate = false; + + if (group_event->state == PERF_EVENT_STATE_OFF) + return 0; + + pmu->start_txn(pmu); + + if (event_sched_in(group_event, cpuctx, ctx)) { + pmu->cancel_txn(pmu); + return -EAGAIN; + } + + /* + * Schedule in siblings as one group (if any): + */ + list_for_each_entry(event, &group_event->sibling_list, group_entry) { + if (event_sched_in(event, cpuctx, ctx)) { + partial_group = event; + goto group_error; + } + } + + if (!pmu->commit_txn(pmu)) + return 0; + +group_error: + /* + * Groups can be scheduled in as one unit only, so undo any + * partial group before returning: + * The events up to the failed event are scheduled out normally, + * tstamp_stopped will be updated. + * + * The failed events and the remaining siblings need to have + * their timings updated as if they had gone thru event_sched_in() + * and event_sched_out(). This is required to get consistent timings + * across the group. This also takes care of the case where the group + * could never be scheduled by ensuring tstamp_stopped is set to mark + * the time the event was actually stopped, such that time delta + * calculation in update_event_times() is correct. + */ + list_for_each_entry(event, &group_event->sibling_list, group_entry) { + if (event == partial_group) + simulate = true; + + if (simulate) { + event->tstamp_running += now - event->tstamp_stopped; + event->tstamp_stopped = now; + } else { + event_sched_out(event, cpuctx, ctx); + } + } + event_sched_out(group_event, cpuctx, ctx); + + pmu->cancel_txn(pmu); + + return -EAGAIN; +} + +/* + * Work out whether we can put this event group on the CPU now. + */ +static int group_can_go_on(struct perf_event *event, + struct perf_cpu_context *cpuctx, + int can_add_hw) +{ + /* + * Groups consisting entirely of software events can always go on. + */ + if (event->group_flags & PERF_GROUP_SOFTWARE) + return 1; + /* + * If an exclusive group is already on, no other hardware + * events can go on. + */ + if (cpuctx->exclusive) + return 0; + /* + * If this group is exclusive and there are already + * events on the CPU, it can't go on. + */ + if (event->attr.exclusive && cpuctx->active_oncpu) + return 0; + /* + * Otherwise, try to add it if all previous groups were able + * to go on. + */ + return can_add_hw; +} + +static void add_event_to_ctx(struct perf_event *event, + struct perf_event_context *ctx) +{ + u64 tstamp = perf_event_time(event); + + list_add_event(event, ctx); + perf_group_attach(event); + event->tstamp_enabled = tstamp; + event->tstamp_running = tstamp; + event->tstamp_stopped = tstamp; +} + +static void perf_event_context_sched_in(struct perf_event_context *ctx, + struct task_struct *tsk); + +/* + * Cross CPU call to install and enable a performance event + * + * Must be called with ctx->mutex held + */ +static int __perf_install_in_context(void *info) +{ + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; + struct perf_event *leader = event->group_leader; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + int err; + + /* + * In case we're installing a new context to an already running task, + * could also happen before perf_event_task_sched_in() on architectures + * which do context switches with IRQs enabled. + */ + if (ctx->task && !cpuctx->task_ctx) + perf_event_context_sched_in(ctx, ctx->task); + + raw_spin_lock(&ctx->lock); + ctx->is_active = 1; + update_context_time(ctx); + /* + * update cgrp time only if current cgrp + * matches event->cgrp. Must be done before + * calling add_event_to_ctx() + */ + update_cgrp_time_from_event(event); + + add_event_to_ctx(event, ctx); + + if (!event_filter_match(event)) + goto unlock; + + /* + * Don't put the event on if it is disabled or if + * it is in a group and the group isn't on. + */ + if (event->state != PERF_EVENT_STATE_INACTIVE || + (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)) + goto unlock; + + /* + * An exclusive event can't go on if there are already active + * hardware events, and no hardware event can go on if there + * is already an exclusive event on. + */ + if (!group_can_go_on(event, cpuctx, 1)) + err = -EEXIST; + else + err = event_sched_in(event, cpuctx, ctx); + + if (err) { + /* + * This event couldn't go on. If it is in a group + * then we have to pull the whole group off. + * If the event group is pinned then put it in error state. + */ + if (leader != event) + group_sched_out(leader, cpuctx, ctx); + if (leader->attr.pinned) { + update_group_times(leader); + leader->state = PERF_EVENT_STATE_ERROR; + } + } + +unlock: + raw_spin_unlock(&ctx->lock); + + return 0; +} + +/* + * Attach a performance event to a context + * + * First we add the event to the list with the hardware enable bit + * in event->hw_config cleared. + * + * If the event is attached to a task which is on a CPU we use a smp + * call to enable it in the task context. The task might have been + * scheduled away, but we check this in the smp call again. + */ +static void +perf_install_in_context(struct perf_event_context *ctx, + struct perf_event *event, + int cpu) +{ + struct task_struct *task = ctx->task; + + lockdep_assert_held(&ctx->mutex); + + event->ctx = ctx; + + if (!task) { + /* + * Per cpu events are installed via an smp call and + * the install is always successful. + */ + cpu_function_call(cpu, __perf_install_in_context, event); + return; + } + +retry: + if (!task_function_call(task, __perf_install_in_context, event)) + return; + + raw_spin_lock_irq(&ctx->lock); + /* + * If we failed to find a running task, but find the context active now + * that we've acquired the ctx->lock, retry. + */ + if (ctx->is_active) { + raw_spin_unlock_irq(&ctx->lock); + goto retry; + } + + /* + * Since the task isn't running, its safe to add the event, us holding + * the ctx->lock ensures the task won't get scheduled in. + */ + add_event_to_ctx(event, ctx); + raw_spin_unlock_irq(&ctx->lock); +} + +/* + * Put a event into inactive state and update time fields. + * Enabling the leader of a group effectively enables all + * the group members that aren't explicitly disabled, so we + * have to update their ->tstamp_enabled also. + * Note: this works for group members as well as group leaders + * since the non-leader members' sibling_lists will be empty. + */ +static void __perf_event_mark_enabled(struct perf_event *event, + struct perf_event_context *ctx) +{ + struct perf_event *sub; + u64 tstamp = perf_event_time(event); + + event->state = PERF_EVENT_STATE_INACTIVE; + event->tstamp_enabled = tstamp - event->total_time_enabled; + list_for_each_entry(sub, &event->sibling_list, group_entry) { + if (sub->state >= PERF_EVENT_STATE_INACTIVE) + sub->tstamp_enabled = tstamp - sub->total_time_enabled; + } +} + +/* + * Cross CPU call to enable a performance event + */ +static int __perf_event_enable(void *info) +{ + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; + struct perf_event *leader = event->group_leader; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + int err; + + if (WARN_ON_ONCE(!ctx->is_active)) + return -EINVAL; + + raw_spin_lock(&ctx->lock); + update_context_time(ctx); + + if (event->state >= PERF_EVENT_STATE_INACTIVE) + goto unlock; + + /* + * set current task's cgroup time reference point + */ + perf_cgroup_set_timestamp(current, ctx); + + __perf_event_mark_enabled(event, ctx); + + if (!event_filter_match(event)) { + if (is_cgroup_event(event)) + perf_cgroup_defer_enabled(event); + goto unlock; + } + + /* + * If the event is in a group and isn't the group leader, + * then don't put it on unless the group is on. + */ + if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) + goto unlock; + + if (!group_can_go_on(event, cpuctx, 1)) { + err = -EEXIST; + } else { + if (event == leader) + err = group_sched_in(event, cpuctx, ctx); + else + err = event_sched_in(event, cpuctx, ctx); + } + + if (err) { + /* + * If this event can't go on and it's part of a + * group, then the whole group has to come off. + */ + if (leader != event) + group_sched_out(leader, cpuctx, ctx); + if (leader->attr.pinned) { + update_group_times(leader); + leader->state = PERF_EVENT_STATE_ERROR; + } + } + +unlock: + raw_spin_unlock(&ctx->lock); + + return 0; +} + +/* + * Enable a event. + * + * If event->ctx is a cloned context, callers must make sure that + * every task struct that event->ctx->task could possibly point to + * remains valid. This condition is satisfied when called through + * perf_event_for_each_child or perf_event_for_each as described + * for perf_event_disable. + */ +void perf_event_enable(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Enable the event on the cpu that it's on + */ + cpu_function_call(event->cpu, __perf_event_enable, event); + return; + } + + raw_spin_lock_irq(&ctx->lock); + if (event->state >= PERF_EVENT_STATE_INACTIVE) + goto out; + + /* + * If the event is in error state, clear that first. + * That way, if we see the event in error state below, we + * know that it has gone back into error state, as distinct + * from the task having been scheduled away before the + * cross-call arrived. + */ + if (event->state == PERF_EVENT_STATE_ERROR) + event->state = PERF_EVENT_STATE_OFF; + +retry: + if (!ctx->is_active) { + __perf_event_mark_enabled(event, ctx); + goto out; + } + + raw_spin_unlock_irq(&ctx->lock); + + if (!task_function_call(task, __perf_event_enable, event)) + return; + + raw_spin_lock_irq(&ctx->lock); + + /* + * If the context is active and the event is still off, + * we need to retry the cross-call. + */ + if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) { + /* + * task could have been flipped by a concurrent + * perf_event_context_sched_out() + */ + task = ctx->task; + goto retry; + } + +out: + raw_spin_unlock_irq(&ctx->lock); +} + +static int perf_event_refresh(struct perf_event *event, int refresh) +{ + /* + * not supported on inherited events + */ + if (event->attr.inherit || !is_sampling_event(event)) + return -EINVAL; + + atomic_add(refresh, &event->event_limit); + perf_event_enable(event); + + return 0; +} + +static void ctx_sched_out(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx, + enum event_type_t event_type) +{ + struct perf_event *event; + + raw_spin_lock(&ctx->lock); + perf_pmu_disable(ctx->pmu); + ctx->is_active = 0; + if (likely(!ctx->nr_events)) + goto out; + update_context_time(ctx); + update_cgrp_time_from_cpuctx(cpuctx); + + if (!ctx->nr_active) + goto out; + + if (event_type & EVENT_PINNED) { + list_for_each_entry(event, &ctx->pinned_groups, group_entry) + group_sched_out(event, cpuctx, ctx); + } + + if (event_type & EVENT_FLEXIBLE) { + list_for_each_entry(event, &ctx->flexible_groups, group_entry) + group_sched_out(event, cpuctx, ctx); + } +out: + perf_pmu_enable(ctx->pmu); + raw_spin_unlock(&ctx->lock); +} + +/* + * Test whether two contexts are equivalent, i.e. whether they + * have both been cloned from the same version of the same context + * and they both have the same number of enabled events. + * If the number of enabled events is the same, then the set + * of enabled events should be the same, because these are both + * inherited contexts, therefore we can't access individual events + * in them directly with an fd; we can only enable/disable all + * events via prctl, or enable/disable all events in a family + * via ioctl, which will have the same effect on both contexts. + */ +static int context_equiv(struct perf_event_context *ctx1, + struct perf_event_context *ctx2) +{ + return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx + && ctx1->parent_gen == ctx2->parent_gen + && !ctx1->pin_count && !ctx2->pin_count; +} + +static void __perf_event_sync_stat(struct perf_event *event, + struct perf_event *next_event) +{ + u64 value; + + if (!event->attr.inherit_stat) + return; + + /* + * Update the event value, we cannot use perf_event_read() + * because we're in the middle of a context switch and have IRQs + * disabled, which upsets smp_call_function_single(), however + * we know the event must be on the current CPU, therefore we + * don't need to use it. + */ + switch (event->state) { + case PERF_EVENT_STATE_ACTIVE: + event->pmu->read(event); + /* fall-through */ + + case PERF_EVENT_STATE_INACTIVE: + update_event_times(event); + break; + + default: + break; + } + + /* + * In order to keep per-task stats reliable we need to flip the event + * values when we flip the contexts. + */ + value = local64_read(&next_event->count); + value = local64_xchg(&event->count, value); + local64_set(&next_event->count, value); + + swap(event->total_time_enabled, next_event->total_time_enabled); + swap(event->total_time_running, next_event->total_time_running); + + /* + * Since we swizzled the values, update the user visible data too. + */ + perf_event_update_userpage(event); + perf_event_update_userpage(next_event); +} + +#define list_next_entry(pos, member) \ + list_entry(pos->member.next, typeof(*pos), member) + +static void perf_event_sync_stat(struct perf_event_context *ctx, + struct perf_event_context *next_ctx) +{ + struct perf_event *event, *next_event; + + if (!ctx->nr_stat) + return; + + update_context_time(ctx); + + event = list_first_entry(&ctx->event_list, + struct perf_event, event_entry); + + next_event = list_first_entry(&next_ctx->event_list, + struct perf_event, event_entry); + + while (&event->event_entry != &ctx->event_list && + &next_event->event_entry != &next_ctx->event_list) { + + __perf_event_sync_stat(event, next_event); + + event = list_next_entry(event, event_entry); + next_event = list_next_entry(next_event, event_entry); + } +} + +static void perf_event_context_sched_out(struct task_struct *task, int ctxn, + struct task_struct *next) +{ + struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; + struct perf_event_context *next_ctx; + struct perf_event_context *parent; + struct perf_cpu_context *cpuctx; + int do_switch = 1; + + if (likely(!ctx)) + return; + + cpuctx = __get_cpu_context(ctx); + if (!cpuctx->task_ctx) + return; + + rcu_read_lock(); + parent = rcu_dereference(ctx->parent_ctx); + next_ctx = next->perf_event_ctxp[ctxn]; + if (parent && next_ctx && + rcu_dereference(next_ctx->parent_ctx) == parent) { + /* + * Looks like the two contexts are clones, so we might be + * able to optimize the context switch. We lock both + * contexts and check that they are clones under the + * lock (including re-checking that neither has been + * uncloned in the meantime). It doesn't matter which + * order we take the locks because no other cpu could + * be trying to lock both of these tasks. + */ + raw_spin_lock(&ctx->lock); + raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); + if (context_equiv(ctx, next_ctx)) { + /* + * XXX do we need a memory barrier of sorts + * wrt to rcu_dereference() of perf_event_ctxp + */ + task->perf_event_ctxp[ctxn] = next_ctx; + next->perf_event_ctxp[ctxn] = ctx; + ctx->task = next; + next_ctx->task = task; + do_switch = 0; + + perf_event_sync_stat(ctx, next_ctx); + } + raw_spin_unlock(&next_ctx->lock); + raw_spin_unlock(&ctx->lock); + } + rcu_read_unlock(); + + if (do_switch) { + ctx_sched_out(ctx, cpuctx, EVENT_ALL); + cpuctx->task_ctx = NULL; + } +} + +#define for_each_task_context_nr(ctxn) \ + for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) + +/* + * Called from scheduler to remove the events of the current task, + * with interrupts disabled. + * + * We stop each event and update the event value in event->count. + * + * This does not protect us against NMI, but disable() + * sets the disabled bit in the control field of event _before_ + * accessing the event control register. If a NMI hits, then it will + * not restart the event. + */ +void __perf_event_task_sched_out(struct task_struct *task, + struct task_struct *next) +{ + int ctxn; + + for_each_task_context_nr(ctxn) + perf_event_context_sched_out(task, ctxn, next); + + /* + * if cgroup events exist on this CPU, then we need + * to check if we have to switch out PMU state. + * cgroup event are system-wide mode only + */ + if (atomic_read(&__get_cpu_var(perf_cgroup_events))) + perf_cgroup_sched_out(task); +} + +static void task_ctx_sched_out(struct perf_event_context *ctx, + enum event_type_t event_type) +{ + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + + if (!cpuctx->task_ctx) + return; + + if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) + return; + + ctx_sched_out(ctx, cpuctx, event_type); + cpuctx->task_ctx = NULL; +} + +/* + * Called with IRQs disabled + */ +static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, + enum event_type_t event_type) +{ + ctx_sched_out(&cpuctx->ctx, cpuctx, event_type); +} + +static void +ctx_pinned_sched_in(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx) +{ + struct perf_event *event; + + list_for_each_entry(event, &ctx->pinned_groups, group_entry) { + if (event->state <= PERF_EVENT_STATE_OFF) + continue; + if (!event_filter_match(event)) + continue; + + /* may need to reset tstamp_enabled */ + if (is_cgroup_event(event)) + perf_cgroup_mark_enabled(event, ctx); + + if (group_can_go_on(event, cpuctx, 1)) + group_sched_in(event, cpuctx, ctx); + + /* + * If this pinned group hasn't been scheduled, + * put it in error state. + */ + if (event->state == PERF_EVENT_STATE_INACTIVE) { + update_group_times(event); + event->state = PERF_EVENT_STATE_ERROR; + } + } +} + +static void +ctx_flexible_sched_in(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx) +{ + struct perf_event *event; + int can_add_hw = 1; + + list_for_each_entry(event, &ctx->flexible_groups, group_entry) { + /* Ignore events in OFF or ERROR state */ + if (event->state <= PERF_EVENT_STATE_OFF) + continue; + /* + * Listen to the 'cpu' scheduling filter constraint + * of events: + */ + if (!event_filter_match(event)) + continue; + + /* may need to reset tstamp_enabled */ + if (is_cgroup_event(event)) + perf_cgroup_mark_enabled(event, ctx); + + if (group_can_go_on(event, cpuctx, can_add_hw)) { + if (group_sched_in(event, cpuctx, ctx)) + can_add_hw = 0; + } + } +} + +static void +ctx_sched_in(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx, + enum event_type_t event_type, + struct task_struct *task) +{ + u64 now; + + raw_spin_lock(&ctx->lock); + ctx->is_active = 1; + if (likely(!ctx->nr_events)) + goto out; + + now = perf_clock(); + ctx->timestamp = now; + perf_cgroup_set_timestamp(task, ctx); + /* + * First go through the list and put on any pinned groups + * in order to give them the best chance of going on. + */ + if (event_type & EVENT_PINNED) + ctx_pinned_sched_in(ctx, cpuctx); + + /* Then walk through the lower prio flexible groups */ + if (event_type & EVENT_FLEXIBLE) + ctx_flexible_sched_in(ctx, cpuctx); + +out: + raw_spin_unlock(&ctx->lock); +} + +static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, + enum event_type_t event_type, + struct task_struct *task) +{ + struct perf_event_context *ctx = &cpuctx->ctx; + + ctx_sched_in(ctx, cpuctx, event_type, task); +} + +static void task_ctx_sched_in(struct perf_event_context *ctx, + enum event_type_t event_type) +{ + struct perf_cpu_context *cpuctx; + + cpuctx = __get_cpu_context(ctx); + if (cpuctx->task_ctx == ctx) + return; + + ctx_sched_in(ctx, cpuctx, event_type, NULL); + cpuctx->task_ctx = ctx; +} + +static void perf_event_context_sched_in(struct perf_event_context *ctx, + struct task_struct *task) +{ + struct perf_cpu_context *cpuctx; + + cpuctx = __get_cpu_context(ctx); + if (cpuctx->task_ctx == ctx) + return; + + perf_pmu_disable(ctx->pmu); + /* + * We want to keep the following priority order: + * cpu pinned (that don't need to move), task pinned, + * cpu flexible, task flexible. + */ + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); + + ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); + cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task); + ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); + + cpuctx->task_ctx = ctx; + + /* + * Since these rotations are per-cpu, we need to ensure the + * cpu-context we got scheduled on is actually rotating. + */ + perf_pmu_rotate_start(ctx->pmu); + perf_pmu_enable(ctx->pmu); +} + +/* + * Called from scheduler to add the events of the current task + * with interrupts disabled. + * + * We restore the event value and then enable it. + * + * This does not protect us against NMI, but enable() + * sets the enabled bit in the control field of event _before_ + * accessing the event control register. If a NMI hits, then it will + * keep the event running. + */ +void __perf_event_task_sched_in(struct task_struct *task) +{ + struct perf_event_context *ctx; + int ctxn; + + for_each_task_context_nr(ctxn) { + ctx = task->perf_event_ctxp[ctxn]; + if (likely(!ctx)) + continue; + + perf_event_context_sched_in(ctx, task); + } + /* + * if cgroup events exist on this CPU, then we need + * to check if we have to switch in PMU state. + * cgroup event are system-wide mode only + */ + if (atomic_read(&__get_cpu_var(perf_cgroup_events))) + perf_cgroup_sched_in(task); +} + +static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) +{ + u64 frequency = event->attr.sample_freq; + u64 sec = NSEC_PER_SEC; + u64 divisor, dividend; + + int count_fls, nsec_fls, frequency_fls, sec_fls; + + count_fls = fls64(count); + nsec_fls = fls64(nsec); + frequency_fls = fls64(frequency); + sec_fls = 30; + + /* + * We got @count in @nsec, with a target of sample_freq HZ + * the target period becomes: + * + * @count * 10^9 + * period = ------------------- + * @nsec * sample_freq + * + */ + + /* + * Reduce accuracy by one bit such that @a and @b converge + * to a similar magnitude. + */ +#define REDUCE_FLS(a, b) \ +do { \ + if (a##_fls > b##_fls) { \ + a >>= 1; \ + a##_fls--; \ + } else { \ + b >>= 1; \ + b##_fls--; \ + } \ +} while (0) + + /* + * Reduce accuracy until either term fits in a u64, then proceed with + * the other, so that finally we can do a u64/u64 division. + */ + while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) { + REDUCE_FLS(nsec, frequency); + REDUCE_FLS(sec, count); + } + + if (count_fls + sec_fls > 64) { + divisor = nsec * frequency; + + while (count_fls + sec_fls > 64) { + REDUCE_FLS(count, sec); + divisor >>= 1; + } + + dividend = count * sec; + } else { + dividend = count * sec; + + while (nsec_fls + frequency_fls > 64) { + REDUCE_FLS(nsec, frequency); + dividend >>= 1; + } + + divisor = nsec * frequency; + } + + if (!divisor) + return dividend; + + return div64_u64(dividend, divisor); +} + +static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) +{ + struct hw_perf_event *hwc = &event->hw; + s64 period, sample_period; + s64 delta; + + period = perf_calculate_period(event, nsec, count); + + delta = (s64)(period - hwc->sample_period); + delta = (delta + 7) / 8; /* low pass filter */ + + sample_period = hwc->sample_period + delta; + + if (!sample_period) + sample_period = 1; + + hwc->sample_period = sample_period; + + if (local64_read(&hwc->period_left) > 8*sample_period) { + event->pmu->stop(event, PERF_EF_UPDATE); + local64_set(&hwc->period_left, 0); + event->pmu->start(event, PERF_EF_RELOAD); + } +} + +static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) +{ + struct perf_event *event; + struct hw_perf_event *hwc; + u64 interrupts, now; + s64 delta; + + raw_spin_lock(&ctx->lock); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (event->state != PERF_EVENT_STATE_ACTIVE) + continue; + + if (!event_filter_match(event)) + continue; + + hwc = &event->hw; + + interrupts = hwc->interrupts; + hwc->interrupts = 0; + + /* + * unthrottle events on the tick + */ + if (interrupts == MAX_INTERRUPTS) { + perf_log_throttle(event, 1); + event->pmu->start(event, 0); + } + + if (!event->attr.freq || !event->attr.sample_freq) + continue; + + event->pmu->read(event); + now = local64_read(&event->count); + delta = now - hwc->freq_count_stamp; + hwc->freq_count_stamp = now; + + if (delta > 0) + perf_adjust_period(event, period, delta); + } + raw_spin_unlock(&ctx->lock); +} + +/* + * Round-robin a context's events: + */ +static void rotate_ctx(struct perf_event_context *ctx) +{ + raw_spin_lock(&ctx->lock); + + /* + * Rotate the first entry last of non-pinned groups. Rotation might be + * disabled by the inheritance code. + */ + if (!ctx->rotate_disable) + list_rotate_left(&ctx->flexible_groups); + + raw_spin_unlock(&ctx->lock); +} + +/* + * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized + * because they're strictly cpu affine and rotate_start is called with IRQs + * disabled, while rotate_context is called from IRQ context. + */ +static void perf_rotate_context(struct perf_cpu_context *cpuctx) +{ + u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC; + struct perf_event_context *ctx = NULL; + int rotate = 0, remove = 1; + + if (cpuctx->ctx.nr_events) { + remove = 0; + if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) + rotate = 1; + } + + ctx = cpuctx->task_ctx; + if (ctx && ctx->nr_events) { + remove = 0; + if (ctx->nr_events != ctx->nr_active) + rotate = 1; + } + + perf_pmu_disable(cpuctx->ctx.pmu); + perf_ctx_adjust_freq(&cpuctx->ctx, interval); + if (ctx) + perf_ctx_adjust_freq(ctx, interval); + + if (!rotate) + goto done; + + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); + if (ctx) + task_ctx_sched_out(ctx, EVENT_FLEXIBLE); + + rotate_ctx(&cpuctx->ctx); + if (ctx) + rotate_ctx(ctx); + + cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current); + if (ctx) + task_ctx_sched_in(ctx, EVENT_FLEXIBLE); + +done: + if (remove) + list_del_init(&cpuctx->rotation_list); + + perf_pmu_enable(cpuctx->ctx.pmu); +} + +void perf_event_task_tick(void) +{ + struct list_head *head = &__get_cpu_var(rotation_list); + struct perf_cpu_context *cpuctx, *tmp; + + WARN_ON(!irqs_disabled()); + + list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) { + if (cpuctx->jiffies_interval == 1 || + !(jiffies % cpuctx->jiffies_interval)) + perf_rotate_context(cpuctx); + } +} + +static int event_enable_on_exec(struct perf_event *event, + struct perf_event_context *ctx) +{ + if (!event->attr.enable_on_exec) + return 0; + + event->attr.enable_on_exec = 0; + if (event->state >= PERF_EVENT_STATE_INACTIVE) + return 0; + + __perf_event_mark_enabled(event, ctx); + + return 1; +} + +/* + * Enable all of a task's events that have been marked enable-on-exec. + * This expects task == current. + */ +static void perf_event_enable_on_exec(struct perf_event_context *ctx) +{ + struct perf_event *event; + unsigned long flags; + int enabled = 0; + int ret; + + local_irq_save(flags); + if (!ctx || !ctx->nr_events) + goto out; + + /* + * We must ctxsw out cgroup events to avoid conflict + * when invoking perf_task_event_sched_in() later on + * in this function. Otherwise we end up trying to + * ctxswin cgroup events which are already scheduled + * in. + */ + perf_cgroup_sched_out(current); + task_ctx_sched_out(ctx, EVENT_ALL); + + raw_spin_lock(&ctx->lock); + + list_for_each_entry(event, &ctx->pinned_groups, group_entry) { + ret = event_enable_on_exec(event, ctx); + if (ret) + enabled = 1; + } + + list_for_each_entry(event, &ctx->flexible_groups, group_entry) { + ret = event_enable_on_exec(event, ctx); + if (ret) + enabled = 1; + } + + /* + * Unclone this context if we enabled any event. + */ + if (enabled) + unclone_ctx(ctx); + + raw_spin_unlock(&ctx->lock); + + /* + * Also calls ctxswin for cgroup events, if any: + */ + perf_event_context_sched_in(ctx, ctx->task); +out: + local_irq_restore(flags); +} + +/* + * Cross CPU call to read the hardware event + */ +static void __perf_event_read(void *info) +{ + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + + /* + * If this is a task context, we need to check whether it is + * the current task context of this cpu. If not it has been + * scheduled out before the smp call arrived. In that case + * event->count would have been updated to a recent sample + * when the event was scheduled out. + */ + if (ctx->task && cpuctx->task_ctx != ctx) + return; + + raw_spin_lock(&ctx->lock); + if (ctx->is_active) { + update_context_time(ctx); + update_cgrp_time_from_event(event); + } + update_event_times(event); + if (event->state == PERF_EVENT_STATE_ACTIVE) + event->pmu->read(event); + raw_spin_unlock(&ctx->lock); +} + +static inline u64 perf_event_count(struct perf_event *event) +{ + return local64_read(&event->count) + atomic64_read(&event->child_count); +} + +static u64 perf_event_read(struct perf_event *event) +{ + /* + * If event is enabled and currently active on a CPU, update the + * value in the event structure: + */ + if (event->state == PERF_EVENT_STATE_ACTIVE) { + smp_call_function_single(event->oncpu, + __perf_event_read, event, 1); + } else if (event->state == PERF_EVENT_STATE_INACTIVE) { + struct perf_event_context *ctx = event->ctx; + unsigned long flags; + + raw_spin_lock_irqsave(&ctx->lock, flags); + /* + * may read while context is not active + * (e.g., thread is blocked), in that case + * we cannot update context time + */ + if (ctx->is_active) { + update_context_time(ctx); + update_cgrp_time_from_event(event); + } + update_event_times(event); + raw_spin_unlock_irqrestore(&ctx->lock, flags); + } + + return perf_event_count(event); +} + +/* + * Callchain support + */ + +struct callchain_cpus_entries { + struct rcu_head rcu_head; + struct perf_callchain_entry *cpu_entries[0]; +}; + +static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); +static atomic_t nr_callchain_events; +static DEFINE_MUTEX(callchain_mutex); +struct callchain_cpus_entries *callchain_cpus_entries; + + +__weak void perf_callchain_kernel(struct perf_callchain_entry *entry, + struct pt_regs *regs) +{ +} + +__weak void perf_callchain_user(struct perf_callchain_entry *entry, + struct pt_regs *regs) +{ +} + +static void release_callchain_buffers_rcu(struct rcu_head *head) +{ + struct callchain_cpus_entries *entries; + int cpu; + + entries = container_of(head, struct callchain_cpus_entries, rcu_head); + + for_each_possible_cpu(cpu) + kfree(entries->cpu_entries[cpu]); + + kfree(entries); +} + +static void release_callchain_buffers(void) +{ + struct callchain_cpus_entries *entries; + + entries = callchain_cpus_entries; + rcu_assign_pointer(callchain_cpus_entries, NULL); + call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); +} + +static int alloc_callchain_buffers(void) +{ + int cpu; + int size; + struct callchain_cpus_entries *entries; + + /* + * We can't use the percpu allocation API for data that can be + * accessed from NMI. Use a temporary manual per cpu allocation + * until that gets sorted out. + */ + size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]); + + entries = kzalloc(size, GFP_KERNEL); + if (!entries) + return -ENOMEM; + + size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS; + + for_each_possible_cpu(cpu) { + entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, + cpu_to_node(cpu)); + if (!entries->cpu_entries[cpu]) + goto fail; + } + + rcu_assign_pointer(callchain_cpus_entries, entries); + + return 0; + +fail: + for_each_possible_cpu(cpu) + kfree(entries->cpu_entries[cpu]); + kfree(entries); + + return -ENOMEM; +} + +static int get_callchain_buffers(void) +{ + int err = 0; + int count; + + mutex_lock(&callchain_mutex); + + count = atomic_inc_return(&nr_callchain_events); + if (WARN_ON_ONCE(count < 1)) { + err = -EINVAL; + goto exit; + } + + if (count > 1) { + /* If the allocation failed, give up */ + if (!callchain_cpus_entries) + err = -ENOMEM; + goto exit; + } + + err = alloc_callchain_buffers(); + if (err) + release_callchain_buffers(); +exit: + mutex_unlock(&callchain_mutex); + + return err; +} + +static void put_callchain_buffers(void) +{ + if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) { + release_callchain_buffers(); + mutex_unlock(&callchain_mutex); + } +} + +static int get_recursion_context(int *recursion) +{ + int rctx; + + if (in_nmi()) + rctx = 3; + else if (in_irq()) + rctx = 2; + else if (in_softirq()) + rctx = 1; + else + rctx = 0; + + if (recursion[rctx]) + return -1; + + recursion[rctx]++; + barrier(); + + return rctx; +} + +static inline void put_recursion_context(int *recursion, int rctx) +{ + barrier(); + recursion[rctx]--; +} + +static struct perf_callchain_entry *get_callchain_entry(int *rctx) +{ + int cpu; + struct callchain_cpus_entries *entries; + + *rctx = get_recursion_context(__get_cpu_var(callchain_recursion)); + if (*rctx == -1) + return NULL; + + entries = rcu_dereference(callchain_cpus_entries); + if (!entries) + return NULL; + + cpu = smp_processor_id(); + + return &entries->cpu_entries[cpu][*rctx]; +} + +static void +put_callchain_entry(int rctx) +{ + put_recursion_context(__get_cpu_var(callchain_recursion), rctx); +} + +static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +{ + int rctx; + struct perf_callchain_entry *entry; + + + entry = get_callchain_entry(&rctx); + if (rctx == -1) + return NULL; + + if (!entry) + goto exit_put; + + entry->nr = 0; + + if (!user_mode(regs)) { + perf_callchain_store(entry, PERF_CONTEXT_KERNEL); + perf_callchain_kernel(entry, regs); + if (current->mm) + regs = task_pt_regs(current); + else + regs = NULL; + } + + if (regs) { + perf_callchain_store(entry, PERF_CONTEXT_USER); + perf_callchain_user(entry, regs); + } + +exit_put: + put_callchain_entry(rctx); + + return entry; +} + +/* + * Initialize the perf_event context in a task_struct: + */ +static void __perf_event_init_context(struct perf_event_context *ctx) +{ + raw_spin_lock_init(&ctx->lock); + mutex_init(&ctx->mutex); + INIT_LIST_HEAD(&ctx->pinned_groups); + INIT_LIST_HEAD(&ctx->flexible_groups); + INIT_LIST_HEAD(&ctx->event_list); + atomic_set(&ctx->refcount, 1); +} + +static struct perf_event_context * +alloc_perf_context(struct pmu *pmu, struct task_struct *task) +{ + struct perf_event_context *ctx; + + ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); + if (!ctx) + return NULL; + + __perf_event_init_context(ctx); + if (task) { + ctx->task = task; + get_task_struct(task); + } + ctx->pmu = pmu; + + return ctx; +} + +static struct task_struct * +find_lively_task_by_vpid(pid_t vpid) +{ + struct task_struct *task; + int err; + + rcu_read_lock(); + if (!vpid) + task = current; + else + task = find_task_by_vpid(vpid); + if (task) + get_task_struct(task); + rcu_read_unlock(); + + if (!task) + return ERR_PTR(-ESRCH); + + /* Reuse ptrace permission checks for now. */ + err = -EACCES; + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto errout; + + return task; +errout: + put_task_struct(task); + return ERR_PTR(err); + +} + +/* + * Returns a matching context with refcount and pincount. + */ +static struct perf_event_context * +find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) +{ + struct perf_event_context *ctx; + struct perf_cpu_context *cpuctx; + unsigned long flags; + int ctxn, err; + + if (!task) { + /* Must be root to operate on a CPU event: */ + if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EACCES); + + /* + * We could be clever and allow to attach a event to an + * offline CPU and activate it when the CPU comes up, but + * that's for later. + */ + if (!cpu_online(cpu)) + return ERR_PTR(-ENODEV); + + cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); + ctx = &cpuctx->ctx; + get_ctx(ctx); + ++ctx->pin_count; + + return ctx; + } + + err = -EINVAL; + ctxn = pmu->task_ctx_nr; + if (ctxn < 0) + goto errout; + +retry: + ctx = perf_lock_task_context(task, ctxn, &flags); + if (ctx) { + unclone_ctx(ctx); + ++ctx->pin_count; + raw_spin_unlock_irqrestore(&ctx->lock, flags); + } + + if (!ctx) { + ctx = alloc_perf_context(pmu, task); + err = -ENOMEM; + if (!ctx) + goto errout; + + get_ctx(ctx); + + err = 0; + mutex_lock(&task->perf_event_mutex); + /* + * If it has already passed perf_event_exit_task(). + * we must see PF_EXITING, it takes this mutex too. + */ + if (task->flags & PF_EXITING) + err = -ESRCH; + else if (task->perf_event_ctxp[ctxn]) + err = -EAGAIN; + else { + ++ctx->pin_count; + rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); + } + mutex_unlock(&task->perf_event_mutex); + + if (unlikely(err)) { + put_task_struct(task); + kfree(ctx); + + if (err == -EAGAIN) + goto retry; + goto errout; + } + } + + return ctx; + +errout: + return ERR_PTR(err); +} + +static void perf_event_free_filter(struct perf_event *event); + +static void free_event_rcu(struct rcu_head *head) +{ + struct perf_event *event; + + event = container_of(head, struct perf_event, rcu_head); + if (event->ns) + put_pid_ns(event->ns); + perf_event_free_filter(event); + kfree(event); +} + +static void perf_buffer_put(struct perf_buffer *buffer); + +static void free_event(struct perf_event *event) +{ + irq_work_sync(&event->pending); + + if (!event->parent) { + if (event->attach_state & PERF_ATTACH_TASK) + jump_label_dec(&perf_sched_events); + if (event->attr.mmap || event->attr.mmap_data) + atomic_dec(&nr_mmap_events); + if (event->attr.comm) + atomic_dec(&nr_comm_events); + if (event->attr.task) + atomic_dec(&nr_task_events); + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) + put_callchain_buffers(); + if (is_cgroup_event(event)) { + atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); + jump_label_dec(&perf_sched_events); + } + } + + if (event->buffer) { + perf_buffer_put(event->buffer); + event->buffer = NULL; + } + + if (is_cgroup_event(event)) + perf_detach_cgroup(event); + + if (event->destroy) + event->destroy(event); + + if (event->ctx) + put_ctx(event->ctx); + + call_rcu(&event->rcu_head, free_event_rcu); +} + +int perf_event_release_kernel(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + + /* + * Remove from the PMU, can't get re-enabled since we got + * here because the last ref went. + */ + perf_event_disable(event); + + WARN_ON_ONCE(ctx->parent_ctx); + /* + * There are two ways this annotation is useful: + * + * 1) there is a lock recursion from perf_event_exit_task + * see the comment there. + * + * 2) there is a lock-inversion with mmap_sem through + * perf_event_read_group(), which takes faults while + * holding ctx->mutex, however this is called after + * the last filedesc died, so there is no possibility + * to trigger the AB-BA case. + */ + mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); + raw_spin_lock_irq(&ctx->lock); + perf_group_detach(event); + list_del_event(event, ctx); + raw_spin_unlock_irq(&ctx->lock); + mutex_unlock(&ctx->mutex); + + free_event(event); + + return 0; +} +EXPORT_SYMBOL_GPL(perf_event_release_kernel); + +/* + * Called when the last reference to the file is gone. + */ +static int perf_release(struct inode *inode, struct file *file) +{ + struct perf_event *event = file->private_data; + struct task_struct *owner; + + file->private_data = NULL; + + rcu_read_lock(); + owner = ACCESS_ONCE(event->owner); + /* + * Matches the smp_wmb() in perf_event_exit_task(). If we observe + * !owner it means the list deletion is complete and we can indeed + * free this event, otherwise we need to serialize on + * owner->perf_event_mutex. + */ + smp_read_barrier_depends(); + if (owner) { + /* + * Since delayed_put_task_struct() also drops the last + * task reference we can safely take a new reference + * while holding the rcu_read_lock(). + */ + get_task_struct(owner); + } + rcu_read_unlock(); + + if (owner) { + mutex_lock(&owner->perf_event_mutex); + /* + * We have to re-check the event->owner field, if it is cleared + * we raced with perf_event_exit_task(), acquiring the mutex + * ensured they're done, and we can proceed with freeing the + * event. + */ + if (event->owner) + list_del_init(&event->owner_entry); + mutex_unlock(&owner->perf_event_mutex); + put_task_struct(owner); + } + + return perf_event_release_kernel(event); +} + +u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) +{ + struct perf_event *child; + u64 total = 0; + + *enabled = 0; + *running = 0; + + mutex_lock(&event->child_mutex); + total += perf_event_read(event); + *enabled += event->total_time_enabled + + atomic64_read(&event->child_total_time_enabled); + *running += event->total_time_running + + atomic64_read(&event->child_total_time_running); + + list_for_each_entry(child, &event->child_list, child_list) { + total += perf_event_read(child); + *enabled += child->total_time_enabled; + *running += child->total_time_running; + } + mutex_unlock(&event->child_mutex); + + return total; +} +EXPORT_SYMBOL_GPL(perf_event_read_value); + +static int perf_event_read_group(struct perf_event *event, + u64 read_format, char __user *buf) +{ + struct perf_event *leader = event->group_leader, *sub; + int n = 0, size = 0, ret = -EFAULT; + struct perf_event_context *ctx = leader->ctx; + u64 values[5]; + u64 count, enabled, running; + + mutex_lock(&ctx->mutex); + count = perf_event_read_value(leader, &enabled, &running); + + values[n++] = 1 + leader->nr_siblings; + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + values[n++] = enabled; + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + values[n++] = running; + values[n++] = count; + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(leader); + + size = n * sizeof(u64); + + if (copy_to_user(buf, values, size)) + goto unlock; + + ret = size; + + list_for_each_entry(sub, &leader->sibling_list, group_entry) { + n = 0; + + values[n++] = perf_event_read_value(sub, &enabled, &running); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(sub); + + size = n * sizeof(u64); + + if (copy_to_user(buf + ret, values, size)) { + ret = -EFAULT; + goto unlock; + } + + ret += size; + } +unlock: + mutex_unlock(&ctx->mutex); + + return ret; +} + +static int perf_event_read_one(struct perf_event *event, + u64 read_format, char __user *buf) +{ + u64 enabled, running; + u64 values[4]; + int n = 0; + + values[n++] = perf_event_read_value(event, &enabled, &running); + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + values[n++] = enabled; + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + values[n++] = running; + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(event); + + if (copy_to_user(buf, values, n * sizeof(u64))) + return -EFAULT; + + return n * sizeof(u64); +} + +/* + * Read the performance event - simple non blocking version for now + */ +static ssize_t +perf_read_hw(struct perf_event *event, char __user *buf, size_t count) +{ + u64 read_format = event->attr.read_format; + int ret; + + /* + * Return end-of-file for a read on a event that is in + * error state (i.e. because it was pinned but it couldn't be + * scheduled on to the CPU at some point). + */ + if (event->state == PERF_EVENT_STATE_ERROR) + return 0; + + if (count < event->read_size) + return -ENOSPC; + + WARN_ON_ONCE(event->ctx->parent_ctx); + if (read_format & PERF_FORMAT_GROUP) + ret = perf_event_read_group(event, read_format, buf); + else + ret = perf_event_read_one(event, read_format, buf); + + return ret; +} + +static ssize_t +perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ + struct perf_event *event = file->private_data; + + return perf_read_hw(event, buf, count); +} + +static unsigned int perf_poll(struct file *file, poll_table *wait) +{ + struct perf_event *event = file->private_data; + struct perf_buffer *buffer; + unsigned int events = POLL_HUP; + + rcu_read_lock(); + buffer = rcu_dereference(event->buffer); + if (buffer) + events = atomic_xchg(&buffer->poll, 0); + rcu_read_unlock(); + + poll_wait(file, &event->waitq, wait); + + return events; +} + +static void perf_event_reset(struct perf_event *event) +{ + (void)perf_event_read(event); + local64_set(&event->count, 0); + perf_event_update_userpage(event); +} + +/* + * Holding the top-level event's child_mutex means that any + * descendant process that has inherited this event will block + * in sync_child_event if it goes to exit, thus satisfying the + * task existence requirements of perf_event_enable/disable. + */ +static void perf_event_for_each_child(struct perf_event *event, + void (*func)(struct perf_event *)) +{ + struct perf_event *child; + + WARN_ON_ONCE(event->ctx->parent_ctx); + mutex_lock(&event->child_mutex); + func(event); + list_for_each_entry(child, &event->child_list, child_list) + func(child); + mutex_unlock(&event->child_mutex); +} + +static void perf_event_for_each(struct perf_event *event, + void (*func)(struct perf_event *)) +{ + struct perf_event_context *ctx = event->ctx; + struct perf_event *sibling; + + WARN_ON_ONCE(ctx->parent_ctx); + mutex_lock(&ctx->mutex); + event = event->group_leader; + + perf_event_for_each_child(event, func); + func(event); + list_for_each_entry(sibling, &event->sibling_list, group_entry) + perf_event_for_each_child(event, func); + mutex_unlock(&ctx->mutex); +} + +static int perf_event_period(struct perf_event *event, u64 __user *arg) +{ + struct perf_event_context *ctx = event->ctx; + int ret = 0; + u64 value; + + if (!is_sampling_event(event)) + return -EINVAL; + + if (copy_from_user(&value, arg, sizeof(value))) + return -EFAULT; + + if (!value) + return -EINVAL; + + raw_spin_lock_irq(&ctx->lock); + if (event->attr.freq) { + if (value > sysctl_perf_event_sample_rate) { + ret = -EINVAL; + goto unlock; + } + + event->attr.sample_freq = value; + } else { + event->attr.sample_period = value; + event->hw.sample_period = value; + } +unlock: + raw_spin_unlock_irq(&ctx->lock); + + return ret; +} + +static const struct file_operations perf_fops; + +static struct perf_event *perf_fget_light(int fd, int *fput_needed) +{ + struct file *file; + + file = fget_light(fd, fput_needed); + if (!file) + return ERR_PTR(-EBADF); + + if (file->f_op != &perf_fops) { + fput_light(file, *fput_needed); + *fput_needed = 0; + return ERR_PTR(-EBADF); + } + + return file->private_data; +} + +static int perf_event_set_output(struct perf_event *event, + struct perf_event *output_event); +static int perf_event_set_filter(struct perf_event *event, void __user *arg); + +static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct perf_event *event = file->private_data; + void (*func)(struct perf_event *); + u32 flags = arg; + + switch (cmd) { + case PERF_EVENT_IOC_ENABLE: + func = perf_event_enable; + break; + case PERF_EVENT_IOC_DISABLE: + func = perf_event_disable; + break; + case PERF_EVENT_IOC_RESET: + func = perf_event_reset; + break; + + case PERF_EVENT_IOC_REFRESH: + return perf_event_refresh(event, arg); + + case PERF_EVENT_IOC_PERIOD: + return perf_event_period(event, (u64 __user *)arg); + + case PERF_EVENT_IOC_SET_OUTPUT: + { + struct perf_event *output_event = NULL; + int fput_needed = 0; + int ret; + + if (arg != -1) { + output_event = perf_fget_light(arg, &fput_needed); + if (IS_ERR(output_event)) + return PTR_ERR(output_event); + } + + ret = perf_event_set_output(event, output_event); + if (output_event) + fput_light(output_event->filp, fput_needed); + + return ret; + } + + case PERF_EVENT_IOC_SET_FILTER: + return perf_event_set_filter(event, (void __user *)arg); + + default: + return -ENOTTY; + } + + if (flags & PERF_IOC_FLAG_GROUP) + perf_event_for_each(event, func); + else + perf_event_for_each_child(event, func); + + return 0; +} + +int perf_event_task_enable(void) +{ + struct perf_event *event; + + mutex_lock(¤t->perf_event_mutex); + list_for_each_entry(event, ¤t->perf_event_list, owner_entry) + perf_event_for_each_child(event, perf_event_enable); + mutex_unlock(¤t->perf_event_mutex); + + return 0; +} + +int perf_event_task_disable(void) +{ + struct perf_event *event; + + mutex_lock(¤t->perf_event_mutex); + list_for_each_entry(event, ¤t->perf_event_list, owner_entry) + perf_event_for_each_child(event, perf_event_disable); + mutex_unlock(¤t->perf_event_mutex); + + return 0; +} + +#ifndef PERF_EVENT_INDEX_OFFSET +# define PERF_EVENT_INDEX_OFFSET 0 +#endif + +static int perf_event_index(struct perf_event *event) +{ + if (event->hw.state & PERF_HES_STOPPED) + return 0; + + if (event->state != PERF_EVENT_STATE_ACTIVE) + return 0; + + return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET; +} + +/* + * Callers need to ensure there can be no nesting of this function, otherwise + * the seqlock logic goes bad. We can not serialize this because the arch + * code calls this from NMI context. + */ +void perf_event_update_userpage(struct perf_event *event) +{ + struct perf_event_mmap_page *userpg; + struct perf_buffer *buffer; + + rcu_read_lock(); + buffer = rcu_dereference(event->buffer); + if (!buffer) + goto unlock; + + userpg = buffer->user_page; + + /* + * Disable preemption so as to not let the corresponding user-space + * spin too long if we get preempted. + */ + preempt_disable(); + ++userpg->lock; + barrier(); + userpg->index = perf_event_index(event); + userpg->offset = perf_event_count(event); + if (event->state == PERF_EVENT_STATE_ACTIVE) + userpg->offset -= local64_read(&event->hw.prev_count); + + userpg->time_enabled = event->total_time_enabled + + atomic64_read(&event->child_total_time_enabled); + + userpg->time_running = event->total_time_running + + atomic64_read(&event->child_total_time_running); + + barrier(); + ++userpg->lock; + preempt_enable(); +unlock: + rcu_read_unlock(); +} + +static unsigned long perf_data_size(struct perf_buffer *buffer); + +static void +perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags) +{ + long max_size = perf_data_size(buffer); + + if (watermark) + buffer->watermark = min(max_size, watermark); + + if (!buffer->watermark) + buffer->watermark = max_size / 2; + + if (flags & PERF_BUFFER_WRITABLE) + buffer->writable = 1; + + atomic_set(&buffer->refcount, 1); +} + +#ifndef CONFIG_PERF_USE_VMALLOC + +/* + * Back perf_mmap() with regular GFP_KERNEL-0 pages. + */ + +static struct page * +perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff) +{ + if (pgoff > buffer->nr_pages) + return NULL; + + if (pgoff == 0) + return virt_to_page(buffer->user_page); + + return virt_to_page(buffer->data_pages[pgoff - 1]); +} + +static void *perf_mmap_alloc_page(int cpu) +{ + struct page *page; + int node; + + node = (cpu == -1) ? cpu : cpu_to_node(cpu); + page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); + if (!page) + return NULL; + + return page_address(page); +} + +static struct perf_buffer * +perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags) +{ + struct perf_buffer *buffer; + unsigned long size; + int i; + + size = sizeof(struct perf_buffer); + size += nr_pages * sizeof(void *); + + buffer = kzalloc(size, GFP_KERNEL); + if (!buffer) + goto fail; + + buffer->user_page = perf_mmap_alloc_page(cpu); + if (!buffer->user_page) + goto fail_user_page; + + for (i = 0; i < nr_pages; i++) { + buffer->data_pages[i] = perf_mmap_alloc_page(cpu); + if (!buffer->data_pages[i]) + goto fail_data_pages; + } + + buffer->nr_pages = nr_pages; + + perf_buffer_init(buffer, watermark, flags); + + return buffer; + +fail_data_pages: + for (i--; i >= 0; i--) + free_page((unsigned long)buffer->data_pages[i]); + + free_page((unsigned long)buffer->user_page); + +fail_user_page: + kfree(buffer); + +fail: + return NULL; +} + +static void perf_mmap_free_page(unsigned long addr) +{ + struct page *page = virt_to_page((void *)addr); + + page->mapping = NULL; + __free_page(page); +} + +static void perf_buffer_free(struct perf_buffer *buffer) +{ + int i; + + perf_mmap_free_page((unsigned long)buffer->user_page); + for (i = 0; i < buffer->nr_pages; i++) + perf_mmap_free_page((unsigned long)buffer->data_pages[i]); + kfree(buffer); +} + +static inline int page_order(struct perf_buffer *buffer) +{ + return 0; +} + +#else + +/* + * Back perf_mmap() with vmalloc memory. + * + * Required for architectures that have d-cache aliasing issues. + */ + +static inline int page_order(struct perf_buffer *buffer) +{ + return buffer->page_order; +} + +static struct page * +perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff) +{ + if (pgoff > (1UL << page_order(buffer))) + return NULL; + + return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE); +} + +static void perf_mmap_unmark_page(void *addr) +{ + struct page *page = vmalloc_to_page(addr); + + page->mapping = NULL; +} + +static void perf_buffer_free_work(struct work_struct *work) +{ + struct perf_buffer *buffer; + void *base; + int i, nr; + + buffer = container_of(work, struct perf_buffer, work); + nr = 1 << page_order(buffer); + + base = buffer->user_page; + for (i = 0; i < nr + 1; i++) + perf_mmap_unmark_page(base + (i * PAGE_SIZE)); + + vfree(base); + kfree(buffer); +} + +static void perf_buffer_free(struct perf_buffer *buffer) +{ + schedule_work(&buffer->work); +} + +static struct perf_buffer * +perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags) +{ + struct perf_buffer *buffer; + unsigned long size; + void *all_buf; + + size = sizeof(struct perf_buffer); + size += sizeof(void *); + + buffer = kzalloc(size, GFP_KERNEL); + if (!buffer) + goto fail; + + INIT_WORK(&buffer->work, perf_buffer_free_work); + + all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); + if (!all_buf) + goto fail_all_buf; + + buffer->user_page = all_buf; + buffer->data_pages[0] = all_buf + PAGE_SIZE; + buffer->page_order = ilog2(nr_pages); + buffer->nr_pages = 1; + + perf_buffer_init(buffer, watermark, flags); + + return buffer; + +fail_all_buf: + kfree(buffer); + +fail: + return NULL; +} + +#endif + +static unsigned long perf_data_size(struct perf_buffer *buffer) +{ + return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer)); +} + +static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct perf_event *event = vma->vm_file->private_data; + struct perf_buffer *buffer; + int ret = VM_FAULT_SIGBUS; + + if (vmf->flags & FAULT_FLAG_MKWRITE) { + if (vmf->pgoff == 0) + ret = 0; + return ret; + } + + rcu_read_lock(); + buffer = rcu_dereference(event->buffer); + if (!buffer) + goto unlock; + + if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) + goto unlock; + + vmf->page = perf_mmap_to_page(buffer, vmf->pgoff); + if (!vmf->page) + goto unlock; + + get_page(vmf->page); + vmf->page->mapping = vma->vm_file->f_mapping; + vmf->page->index = vmf->pgoff; + + ret = 0; +unlock: + rcu_read_unlock(); + + return ret; +} + +static void perf_buffer_free_rcu(struct rcu_head *rcu_head) +{ + struct perf_buffer *buffer; + + buffer = container_of(rcu_head, struct perf_buffer, rcu_head); + perf_buffer_free(buffer); +} + +static struct perf_buffer *perf_buffer_get(struct perf_event *event) +{ + struct perf_buffer *buffer; + + rcu_read_lock(); + buffer = rcu_dereference(event->buffer); + if (buffer) { + if (!atomic_inc_not_zero(&buffer->refcount)) + buffer = NULL; + } + rcu_read_unlock(); + + return buffer; +} + +static void perf_buffer_put(struct perf_buffer *buffer) +{ + if (!atomic_dec_and_test(&buffer->refcount)) + return; + + call_rcu(&buffer->rcu_head, perf_buffer_free_rcu); +} + +static void perf_mmap_open(struct vm_area_struct *vma) +{ + struct perf_event *event = vma->vm_file->private_data; + + atomic_inc(&event->mmap_count); +} + +static void perf_mmap_close(struct vm_area_struct *vma) +{ + struct perf_event *event = vma->vm_file->private_data; + + if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { + unsigned long size = perf_data_size(event->buffer); + struct user_struct *user = event->mmap_user; + struct perf_buffer *buffer = event->buffer; + + atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); + vma->vm_mm->locked_vm -= event->mmap_locked; + rcu_assign_pointer(event->buffer, NULL); + mutex_unlock(&event->mmap_mutex); + + perf_buffer_put(buffer); + free_uid(user); + } +} + +static const struct vm_operations_struct perf_mmap_vmops = { + .open = perf_mmap_open, + .close = perf_mmap_close, + .fault = perf_mmap_fault, + .page_mkwrite = perf_mmap_fault, +}; + +static int perf_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct perf_event *event = file->private_data; + unsigned long user_locked, user_lock_limit; + struct user_struct *user = current_user(); + unsigned long locked, lock_limit; + struct perf_buffer *buffer; + unsigned long vma_size; + unsigned long nr_pages; + long user_extra, extra; + int ret = 0, flags = 0; + + /* + * Don't allow mmap() of inherited per-task counters. This would + * create a performance issue due to all children writing to the + * same buffer. + */ + if (event->cpu == -1 && event->attr.inherit) + return -EINVAL; + + if (!(vma->vm_flags & VM_SHARED)) + return -EINVAL; + + vma_size = vma->vm_end - vma->vm_start; + nr_pages = (vma_size / PAGE_SIZE) - 1; + + /* + * If we have buffer pages ensure they're a power-of-two number, so we + * can do bitmasks instead of modulo. + */ + if (nr_pages != 0 && !is_power_of_2(nr_pages)) + return -EINVAL; + + if (vma_size != PAGE_SIZE * (1 + nr_pages)) + return -EINVAL; + + if (vma->vm_pgoff != 0) + return -EINVAL; + + WARN_ON_ONCE(event->ctx->parent_ctx); + mutex_lock(&event->mmap_mutex); + if (event->buffer) { + if (event->buffer->nr_pages == nr_pages) + atomic_inc(&event->buffer->refcount); + else + ret = -EINVAL; + goto unlock; + } + + user_extra = nr_pages + 1; + user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); + + /* + * Increase the limit linearly with more CPUs: + */ + user_lock_limit *= num_online_cpus(); + + user_locked = atomic_long_read(&user->locked_vm) + user_extra; + + extra = 0; + if (user_locked > user_lock_limit) + extra = user_locked - user_lock_limit; + + lock_limit = rlimit(RLIMIT_MEMLOCK); + lock_limit >>= PAGE_SHIFT; + locked = vma->vm_mm->locked_vm + extra; + + if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && + !capable(CAP_IPC_LOCK)) { + ret = -EPERM; + goto unlock; + } + + WARN_ON(event->buffer); + + if (vma->vm_flags & VM_WRITE) + flags |= PERF_BUFFER_WRITABLE; + + buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark, + event->cpu, flags); + if (!buffer) { + ret = -ENOMEM; + goto unlock; + } + rcu_assign_pointer(event->buffer, buffer); + + atomic_long_add(user_extra, &user->locked_vm); + event->mmap_locked = extra; + event->mmap_user = get_current_user(); + vma->vm_mm->locked_vm += event->mmap_locked; + +unlock: + if (!ret) + atomic_inc(&event->mmap_count); + mutex_unlock(&event->mmap_mutex); + + vma->vm_flags |= VM_RESERVED; + vma->vm_ops = &perf_mmap_vmops; + + return ret; +} + +static int perf_fasync(int fd, struct file *filp, int on) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct perf_event *event = filp->private_data; + int retval; + + mutex_lock(&inode->i_mutex); + retval = fasync_helper(fd, filp, on, &event->fasync); + mutex_unlock(&inode->i_mutex); + + if (retval < 0) + return retval; + + return 0; +} + +static const struct file_operations perf_fops = { + .llseek = no_llseek, + .release = perf_release, + .read = perf_read, + .poll = perf_poll, + .unlocked_ioctl = perf_ioctl, + .compat_ioctl = perf_ioctl, + .mmap = perf_mmap, + .fasync = perf_fasync, +}; + +/* + * Perf event wakeup + * + * If there's data, ensure we set the poll() state and publish everything + * to user-space before waking everybody up. + */ + +void perf_event_wakeup(struct perf_event *event) +{ + wake_up_all(&event->waitq); + + if (event->pending_kill) { + kill_fasync(&event->fasync, SIGIO, event->pending_kill); + event->pending_kill = 0; + } +} + +static void perf_pending_event(struct irq_work *entry) +{ + struct perf_event *event = container_of(entry, + struct perf_event, pending); + + if (event->pending_disable) { + event->pending_disable = 0; + __perf_event_disable(event); + } + + if (event->pending_wakeup) { + event->pending_wakeup = 0; + perf_event_wakeup(event); + } +} + +/* + * We assume there is only KVM supporting the callbacks. + * Later on, we might change it to a list if there is + * another virtualization implementation supporting the callbacks. + */ +struct perf_guest_info_callbacks *perf_guest_cbs; + +int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) +{ + perf_guest_cbs = cbs; + return 0; +} +EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); + +int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) +{ + perf_guest_cbs = NULL; + return 0; +} +EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); + +/* + * Output + */ +static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail, + unsigned long offset, unsigned long head) +{ + unsigned long mask; + + if (!buffer->writable) + return true; + + mask = perf_data_size(buffer) - 1; + + offset = (offset - tail) & mask; + head = (head - tail) & mask; + + if ((int)(head - offset) < 0) + return false; + + return true; +} + +static void perf_output_wakeup(struct perf_output_handle *handle) +{ + atomic_set(&handle->buffer->poll, POLL_IN); + + if (handle->nmi) { + handle->event->pending_wakeup = 1; + irq_work_queue(&handle->event->pending); + } else + perf_event_wakeup(handle->event); +} + +/* + * We need to ensure a later event_id doesn't publish a head when a former + * event isn't done writing. However since we need to deal with NMIs we + * cannot fully serialize things. + * + * We only publish the head (and generate a wakeup) when the outer-most + * event completes. + */ +static void perf_output_get_handle(struct perf_output_handle *handle) +{ + struct perf_buffer *buffer = handle->buffer; + + preempt_disable(); + local_inc(&buffer->nest); + handle->wakeup = local_read(&buffer->wakeup); +} + +static void perf_output_put_handle(struct perf_output_handle *handle) +{ + struct perf_buffer *buffer = handle->buffer; + unsigned long head; + +again: + head = local_read(&buffer->head); + + /* + * IRQ/NMI can happen here, which means we can miss a head update. + */ + + if (!local_dec_and_test(&buffer->nest)) + goto out; + + /* + * Publish the known good head. Rely on the full barrier implied + * by atomic_dec_and_test() order the buffer->head read and this + * write. + */ + buffer->user_page->data_head = head; + + /* + * Now check if we missed an update, rely on the (compiler) + * barrier in atomic_dec_and_test() to re-read buffer->head. + */ + if (unlikely(head != local_read(&buffer->head))) { + local_inc(&buffer->nest); + goto again; + } + + if (handle->wakeup != local_read(&buffer->wakeup)) + perf_output_wakeup(handle); + +out: + preempt_enable(); +} + +__always_inline void perf_output_copy(struct perf_output_handle *handle, + const void *buf, unsigned int len) +{ + do { + unsigned long size = min_t(unsigned long, handle->size, len); + + memcpy(handle->addr, buf, size); + + len -= size; + handle->addr += size; + buf += size; + handle->size -= size; + if (!handle->size) { + struct perf_buffer *buffer = handle->buffer; + + handle->page++; + handle->page &= buffer->nr_pages - 1; + handle->addr = buffer->data_pages[handle->page]; + handle->size = PAGE_SIZE << page_order(buffer); + } + } while (len); +} + +static void __perf_event_header__init_id(struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event) +{ + u64 sample_type = event->attr.sample_type; + + data->type = sample_type; + header->size += event->id_header_size; + + if (sample_type & PERF_SAMPLE_TID) { + /* namespace issues */ + data->tid_entry.pid = perf_event_pid(event, current); + data->tid_entry.tid = perf_event_tid(event, current); + } + + if (sample_type & PERF_SAMPLE_TIME) + data->time = perf_clock(); + + if (sample_type & PERF_SAMPLE_ID) + data->id = primary_event_id(event); + + if (sample_type & PERF_SAMPLE_STREAM_ID) + data->stream_id = event->id; + + if (sample_type & PERF_SAMPLE_CPU) { + data->cpu_entry.cpu = raw_smp_processor_id(); + data->cpu_entry.reserved = 0; + } +} + +static void perf_event_header__init_id(struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event) +{ + if (event->attr.sample_id_all) + __perf_event_header__init_id(header, data, event); +} + +static void __perf_event__output_id_sample(struct perf_output_handle *handle, + struct perf_sample_data *data) +{ + u64 sample_type = data->type; + + if (sample_type & PERF_SAMPLE_TID) + perf_output_put(handle, data->tid_entry); + + if (sample_type & PERF_SAMPLE_TIME) + perf_output_put(handle, data->time); + + if (sample_type & PERF_SAMPLE_ID) + perf_output_put(handle, data->id); + + if (sample_type & PERF_SAMPLE_STREAM_ID) + perf_output_put(handle, data->stream_id); + + if (sample_type & PERF_SAMPLE_CPU) + perf_output_put(handle, data->cpu_entry); +} + +static void perf_event__output_id_sample(struct perf_event *event, + struct perf_output_handle *handle, + struct perf_sample_data *sample) +{ + if (event->attr.sample_id_all) + __perf_event__output_id_sample(handle, sample); +} + +int perf_output_begin(struct perf_output_handle *handle, + struct perf_event *event, unsigned int size, + int nmi, int sample) +{ + struct perf_buffer *buffer; + unsigned long tail, offset, head; + int have_lost; + struct perf_sample_data sample_data; + struct { + struct perf_event_header header; + u64 id; + u64 lost; + } lost_event; + + rcu_read_lock(); + /* + * For inherited events we send all the output towards the parent. + */ + if (event->parent) + event = event->parent; + + buffer = rcu_dereference(event->buffer); + if (!buffer) + goto out; + + handle->buffer = buffer; + handle->event = event; + handle->nmi = nmi; + handle->sample = sample; + + if (!buffer->nr_pages) + goto out; + + have_lost = local_read(&buffer->lost); + if (have_lost) { + lost_event.header.size = sizeof(lost_event); + perf_event_header__init_id(&lost_event.header, &sample_data, + event); + size += lost_event.header.size; + } + + perf_output_get_handle(handle); + + do { + /* + * Userspace could choose to issue a mb() before updating the + * tail pointer. So that all reads will be completed before the + * write is issued. + */ + tail = ACCESS_ONCE(buffer->user_page->data_tail); + smp_rmb(); + offset = head = local_read(&buffer->head); + head += size; + if (unlikely(!perf_output_space(buffer, tail, offset, head))) + goto fail; + } while (local_cmpxchg(&buffer->head, offset, head) != offset); + + if (head - local_read(&buffer->wakeup) > buffer->watermark) + local_add(buffer->watermark, &buffer->wakeup); + + handle->page = offset >> (PAGE_SHIFT + page_order(buffer)); + handle->page &= buffer->nr_pages - 1; + handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1); + handle->addr = buffer->data_pages[handle->page]; + handle->addr += handle->size; + handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size; + + if (have_lost) { + lost_event.header.type = PERF_RECORD_LOST; + lost_event.header.misc = 0; + lost_event.id = event->id; + lost_event.lost = local_xchg(&buffer->lost, 0); + + perf_output_put(handle, lost_event); + perf_event__output_id_sample(event, handle, &sample_data); + } + + return 0; + +fail: + local_inc(&buffer->lost); + perf_output_put_handle(handle); +out: + rcu_read_unlock(); + + return -ENOSPC; +} + +void perf_output_end(struct perf_output_handle *handle) +{ + struct perf_event *event = handle->event; + struct perf_buffer *buffer = handle->buffer; + + int wakeup_events = event->attr.wakeup_events; + + if (handle->sample && wakeup_events) { + int events = local_inc_return(&buffer->events); + if (events >= wakeup_events) { + local_sub(wakeup_events, &buffer->events); + local_inc(&buffer->wakeup); + } + } + + perf_output_put_handle(handle); + rcu_read_unlock(); +} + +static void perf_output_read_one(struct perf_output_handle *handle, + struct perf_event *event, + u64 enabled, u64 running) +{ + u64 read_format = event->attr.read_format; + u64 values[4]; + int n = 0; + + values[n++] = perf_event_count(event); + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = enabled + + atomic64_read(&event->child_total_time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = running + + atomic64_read(&event->child_total_time_running); + } + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(event); + + perf_output_copy(handle, values, n * sizeof(u64)); +} + +/* + * XXX PERF_FORMAT_GROUP vs inherited events seems difficult. + */ +static void perf_output_read_group(struct perf_output_handle *handle, + struct perf_event *event, + u64 enabled, u64 running) +{ + struct perf_event *leader = event->group_leader, *sub; + u64 read_format = event->attr.read_format; + u64 values[5]; + int n = 0; + + values[n++] = 1 + leader->nr_siblings; + + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + values[n++] = enabled; + + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + values[n++] = running; + + if (leader != event) + leader->pmu->read(leader); + + values[n++] = perf_event_count(leader); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(leader); + + perf_output_copy(handle, values, n * sizeof(u64)); + + list_for_each_entry(sub, &leader->sibling_list, group_entry) { + n = 0; + + if (sub != event) + sub->pmu->read(sub); + + values[n++] = perf_event_count(sub); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(sub); + + perf_output_copy(handle, values, n * sizeof(u64)); + } +} + +#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\ + PERF_FORMAT_TOTAL_TIME_RUNNING) + +static void perf_output_read(struct perf_output_handle *handle, + struct perf_event *event) +{ + u64 enabled = 0, running = 0, now, ctx_time; + u64 read_format = event->attr.read_format; + + /* + * compute total_time_enabled, total_time_running + * based on snapshot values taken when the event + * was last scheduled in. + * + * we cannot simply called update_context_time() + * because of locking issue as we are called in + * NMI context + */ + if (read_format & PERF_FORMAT_TOTAL_TIMES) { + now = perf_clock(); + ctx_time = event->shadow_ctx_time + now; + enabled = ctx_time - event->tstamp_enabled; + running = ctx_time - event->tstamp_running; + } + + if (event->attr.read_format & PERF_FORMAT_GROUP) + perf_output_read_group(handle, event, enabled, running); + else + perf_output_read_one(handle, event, enabled, running); +} + +void perf_output_sample(struct perf_output_handle *handle, + struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event) +{ + u64 sample_type = data->type; + + perf_output_put(handle, *header); + + if (sample_type & PERF_SAMPLE_IP) + perf_output_put(handle, data->ip); + + if (sample_type & PERF_SAMPLE_TID) + perf_output_put(handle, data->tid_entry); + + if (sample_type & PERF_SAMPLE_TIME) + perf_output_put(handle, data->time); + + if (sample_type & PERF_SAMPLE_ADDR) + perf_output_put(handle, data->addr); + + if (sample_type & PERF_SAMPLE_ID) + perf_output_put(handle, data->id); + + if (sample_type & PERF_SAMPLE_STREAM_ID) + perf_output_put(handle, data->stream_id); + + if (sample_type & PERF_SAMPLE_CPU) + perf_output_put(handle, data->cpu_entry); + + if (sample_type & PERF_SAMPLE_PERIOD) + perf_output_put(handle, data->period); + + if (sample_type & PERF_SAMPLE_READ) + perf_output_read(handle, event); + + if (sample_type & PERF_SAMPLE_CALLCHAIN) { + if (data->callchain) { + int size = 1; + + if (data->callchain) + size += data->callchain->nr; + + size *= sizeof(u64); + + perf_output_copy(handle, data->callchain, size); + } else { + u64 nr = 0; + perf_output_put(handle, nr); + } + } + + if (sample_type & PERF_SAMPLE_RAW) { + if (data->raw) { + perf_output_put(handle, data->raw->size); + perf_output_copy(handle, data->raw->data, + data->raw->size); + } else { + struct { + u32 size; + u32 data; + } raw = { + .size = sizeof(u32), + .data = 0, + }; + perf_output_put(handle, raw); + } + } +} + +void perf_prepare_sample(struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event, + struct pt_regs *regs) +{ + u64 sample_type = event->attr.sample_type; + + header->type = PERF_RECORD_SAMPLE; + header->size = sizeof(*header) + event->header_size; + + header->misc = 0; + header->misc |= perf_misc_flags(regs); + + __perf_event_header__init_id(header, data, event); + + if (sample_type & PERF_SAMPLE_IP) + data->ip = perf_instruction_pointer(regs); + + if (sample_type & PERF_SAMPLE_CALLCHAIN) { + int size = 1; + + data->callchain = perf_callchain(regs); + + if (data->callchain) + size += data->callchain->nr; + + header->size += size * sizeof(u64); + } + + if (sample_type & PERF_SAMPLE_RAW) { + int size = sizeof(u32); + + if (data->raw) + size += data->raw->size; + else + size += sizeof(u32); + + WARN_ON_ONCE(size & (sizeof(u64)-1)); + header->size += size; + } +} + +static void perf_event_output(struct perf_event *event, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct perf_output_handle handle; + struct perf_event_header header; + + /* protect the callchain buffers */ + rcu_read_lock(); + + perf_prepare_sample(&header, data, event, regs); + + if (perf_output_begin(&handle, event, header.size, nmi, 1)) + goto exit; + + perf_output_sample(&handle, &header, data, event); + + perf_output_end(&handle); + +exit: + rcu_read_unlock(); +} + +/* + * read event_id + */ + +struct perf_read_event { + struct perf_event_header header; + + u32 pid; + u32 tid; +}; + +static void +perf_event_read_event(struct perf_event *event, + struct task_struct *task) +{ + struct perf_output_handle handle; + struct perf_sample_data sample; + struct perf_read_event read_event = { + .header = { + .type = PERF_RECORD_READ, + .misc = 0, + .size = sizeof(read_event) + event->read_size, + }, + .pid = perf_event_pid(event, task), + .tid = perf_event_tid(event, task), + }; + int ret; + + perf_event_header__init_id(&read_event.header, &sample, event); + ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); + if (ret) + return; + + perf_output_put(&handle, read_event); + perf_output_read(&handle, event); + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + +/* + * task tracking -- fork/exit + * + * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task + */ + +struct perf_task_event { + struct task_struct *task; + struct perf_event_context *task_ctx; + + struct { + struct perf_event_header header; + + u32 pid; + u32 ppid; + u32 tid; + u32 ptid; + u64 time; + } event_id; +}; + +static void perf_event_task_output(struct perf_event *event, + struct perf_task_event *task_event) +{ + struct perf_output_handle handle; + struct perf_sample_data sample; + struct task_struct *task = task_event->task; + int ret, size = task_event->event_id.header.size; + + perf_event_header__init_id(&task_event->event_id.header, &sample, event); + + ret = perf_output_begin(&handle, event, + task_event->event_id.header.size, 0, 0); + if (ret) + goto out; + + task_event->event_id.pid = perf_event_pid(event, task); + task_event->event_id.ppid = perf_event_pid(event, current); + + task_event->event_id.tid = perf_event_tid(event, task); + task_event->event_id.ptid = perf_event_tid(event, current); + + perf_output_put(&handle, task_event->event_id); + + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +out: + task_event->event_id.header.size = size; +} + +static int perf_event_task_match(struct perf_event *event) +{ + if (event->state < PERF_EVENT_STATE_INACTIVE) + return 0; + + if (!event_filter_match(event)) + return 0; + + if (event->attr.comm || event->attr.mmap || + event->attr.mmap_data || event->attr.task) + return 1; + + return 0; +} + +static void perf_event_task_ctx(struct perf_event_context *ctx, + struct perf_task_event *task_event) +{ + struct perf_event *event; + + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (perf_event_task_match(event)) + perf_event_task_output(event, task_event); + } +} + +static void perf_event_task_event(struct perf_task_event *task_event) +{ + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; + struct pmu *pmu; + int ctxn; + + rcu_read_lock(); + list_for_each_entry_rcu(pmu, &pmus, entry) { + cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); + if (cpuctx->active_pmu != pmu) + goto next; + perf_event_task_ctx(&cpuctx->ctx, task_event); + + ctx = task_event->task_ctx; + if (!ctx) { + ctxn = pmu->task_ctx_nr; + if (ctxn < 0) + goto next; + ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + } + if (ctx) + perf_event_task_ctx(ctx, task_event); +next: + put_cpu_ptr(pmu->pmu_cpu_context); + } + rcu_read_unlock(); +} + +static void perf_event_task(struct task_struct *task, + struct perf_event_context *task_ctx, + int new) +{ + struct perf_task_event task_event; + + if (!atomic_read(&nr_comm_events) && + !atomic_read(&nr_mmap_events) && + !atomic_read(&nr_task_events)) + return; + + task_event = (struct perf_task_event){ + .task = task, + .task_ctx = task_ctx, + .event_id = { + .header = { + .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT, + .misc = 0, + .size = sizeof(task_event.event_id), + }, + /* .pid */ + /* .ppid */ + /* .tid */ + /* .ptid */ + .time = perf_clock(), + }, + }; + + perf_event_task_event(&task_event); +} + +void perf_event_fork(struct task_struct *task) +{ + perf_event_task(task, NULL, 1); +} + +/* + * comm tracking + */ + +struct perf_comm_event { + struct task_struct *task; + char *comm; + int comm_size; + + struct { + struct perf_event_header header; + + u32 pid; + u32 tid; + } event_id; +}; + +static void perf_event_comm_output(struct perf_event *event, + struct perf_comm_event *comm_event) +{ + struct perf_output_handle handle; + struct perf_sample_data sample; + int size = comm_event->event_id.header.size; + int ret; + + perf_event_header__init_id(&comm_event->event_id.header, &sample, event); + ret = perf_output_begin(&handle, event, + comm_event->event_id.header.size, 0, 0); + + if (ret) + goto out; + + comm_event->event_id.pid = perf_event_pid(event, comm_event->task); + comm_event->event_id.tid = perf_event_tid(event, comm_event->task); + + perf_output_put(&handle, comm_event->event_id); + perf_output_copy(&handle, comm_event->comm, + comm_event->comm_size); + + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +out: + comm_event->event_id.header.size = size; +} + +static int perf_event_comm_match(struct perf_event *event) +{ + if (event->state < PERF_EVENT_STATE_INACTIVE) + return 0; + + if (!event_filter_match(event)) + return 0; + + if (event->attr.comm) + return 1; + + return 0; +} + +static void perf_event_comm_ctx(struct perf_event_context *ctx, + struct perf_comm_event *comm_event) +{ + struct perf_event *event; + + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (perf_event_comm_match(event)) + perf_event_comm_output(event, comm_event); + } +} + +static void perf_event_comm_event(struct perf_comm_event *comm_event) +{ + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; + char comm[TASK_COMM_LEN]; + unsigned int size; + struct pmu *pmu; + int ctxn; + + memset(comm, 0, sizeof(comm)); + strlcpy(comm, comm_event->task->comm, sizeof(comm)); + size = ALIGN(strlen(comm)+1, sizeof(u64)); + + comm_event->comm = comm; + comm_event->comm_size = size; + + comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; + rcu_read_lock(); + list_for_each_entry_rcu(pmu, &pmus, entry) { + cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); + if (cpuctx->active_pmu != pmu) + goto next; + perf_event_comm_ctx(&cpuctx->ctx, comm_event); + + ctxn = pmu->task_ctx_nr; + if (ctxn < 0) + goto next; + + ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + if (ctx) + perf_event_comm_ctx(ctx, comm_event); +next: + put_cpu_ptr(pmu->pmu_cpu_context); + } + rcu_read_unlock(); +} + +void perf_event_comm(struct task_struct *task) +{ + struct perf_comm_event comm_event; + struct perf_event_context *ctx; + int ctxn; + + for_each_task_context_nr(ctxn) { + ctx = task->perf_event_ctxp[ctxn]; + if (!ctx) + continue; + + perf_event_enable_on_exec(ctx); + } + + if (!atomic_read(&nr_comm_events)) + return; + + comm_event = (struct perf_comm_event){ + .task = task, + /* .comm */ + /* .comm_size */ + .event_id = { + .header = { + .type = PERF_RECORD_COMM, + .misc = 0, + /* .size */ + }, + /* .pid */ + /* .tid */ + }, + }; + + perf_event_comm_event(&comm_event); +} + +/* + * mmap tracking + */ + +struct perf_mmap_event { + struct vm_area_struct *vma; + + const char *file_name; + int file_size; + + struct { + struct perf_event_header header; + + u32 pid; + u32 tid; + u64 start; + u64 len; + u64 pgoff; + } event_id; +}; + +static void perf_event_mmap_output(struct perf_event *event, + struct perf_mmap_event *mmap_event) +{ + struct perf_output_handle handle; + struct perf_sample_data sample; + int size = mmap_event->event_id.header.size; + int ret; + + perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); + ret = perf_output_begin(&handle, event, + mmap_event->event_id.header.size, 0, 0); + if (ret) + goto out; + + mmap_event->event_id.pid = perf_event_pid(event, current); + mmap_event->event_id.tid = perf_event_tid(event, current); + + perf_output_put(&handle, mmap_event->event_id); + perf_output_copy(&handle, mmap_event->file_name, + mmap_event->file_size); + + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +out: + mmap_event->event_id.header.size = size; +} + +static int perf_event_mmap_match(struct perf_event *event, + struct perf_mmap_event *mmap_event, + int executable) +{ + if (event->state < PERF_EVENT_STATE_INACTIVE) + return 0; + + if (!event_filter_match(event)) + return 0; + + if ((!executable && event->attr.mmap_data) || + (executable && event->attr.mmap)) + return 1; + + return 0; +} + +static void perf_event_mmap_ctx(struct perf_event_context *ctx, + struct perf_mmap_event *mmap_event, + int executable) +{ + struct perf_event *event; + + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (perf_event_mmap_match(event, mmap_event, executable)) + perf_event_mmap_output(event, mmap_event); + } +} + +static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) +{ + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; + struct vm_area_struct *vma = mmap_event->vma; + struct file *file = vma->vm_file; + unsigned int size; + char tmp[16]; + char *buf = NULL; + const char *name; + struct pmu *pmu; + int ctxn; + + memset(tmp, 0, sizeof(tmp)); + + if (file) { + /* + * d_path works from the end of the buffer backwards, so we + * need to add enough zero bytes after the string to handle + * the 64bit alignment we do later. + */ + buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL); + if (!buf) { + name = strncpy(tmp, "//enomem", sizeof(tmp)); + goto got_name; + } + name = d_path(&file->f_path, buf, PATH_MAX); + if (IS_ERR(name)) { + name = strncpy(tmp, "//toolong", sizeof(tmp)); + goto got_name; + } + } else { + if (arch_vma_name(mmap_event->vma)) { + name = strncpy(tmp, arch_vma_name(mmap_event->vma), + sizeof(tmp)); + goto got_name; + } + + if (!vma->vm_mm) { + name = strncpy(tmp, "[vdso]", sizeof(tmp)); + goto got_name; + } else if (vma->vm_start <= vma->vm_mm->start_brk && + vma->vm_end >= vma->vm_mm->brk) { + name = strncpy(tmp, "[heap]", sizeof(tmp)); + goto got_name; + } else if (vma->vm_start <= vma->vm_mm->start_stack && + vma->vm_end >= vma->vm_mm->start_stack) { + name = strncpy(tmp, "[stack]", sizeof(tmp)); + goto got_name; + } + + name = strncpy(tmp, "//anon", sizeof(tmp)); + goto got_name; + } + +got_name: + size = ALIGN(strlen(name)+1, sizeof(u64)); + + mmap_event->file_name = name; + mmap_event->file_size = size; + + mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; + + rcu_read_lock(); + list_for_each_entry_rcu(pmu, &pmus, entry) { + cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); + if (cpuctx->active_pmu != pmu) + goto next; + perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, + vma->vm_flags & VM_EXEC); + + ctxn = pmu->task_ctx_nr; + if (ctxn < 0) + goto next; + + ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + if (ctx) { + perf_event_mmap_ctx(ctx, mmap_event, + vma->vm_flags & VM_EXEC); + } +next: + put_cpu_ptr(pmu->pmu_cpu_context); + } + rcu_read_unlock(); + + kfree(buf); +} + +void perf_event_mmap(struct vm_area_struct *vma) +{ + struct perf_mmap_event mmap_event; + + if (!atomic_read(&nr_mmap_events)) + return; + + mmap_event = (struct perf_mmap_event){ + .vma = vma, + /* .file_name */ + /* .file_size */ + .event_id = { + .header = { + .type = PERF_RECORD_MMAP, + .misc = PERF_RECORD_MISC_USER, + /* .size */ + }, + /* .pid */ + /* .tid */ + .start = vma->vm_start, + .len = vma->vm_end - vma->vm_start, + .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT, + }, + }; + + perf_event_mmap_event(&mmap_event); +} + +/* + * IRQ throttle logging + */ + +static void perf_log_throttle(struct perf_event *event, int enable) +{ + struct perf_output_handle handle; + struct perf_sample_data sample; + int ret; + + struct { + struct perf_event_header header; + u64 time; + u64 id; + u64 stream_id; + } throttle_event = { + .header = { + .type = PERF_RECORD_THROTTLE, + .misc = 0, + .size = sizeof(throttle_event), + }, + .time = perf_clock(), + .id = primary_event_id(event), + .stream_id = event->id, + }; + + if (enable) + throttle_event.header.type = PERF_RECORD_UNTHROTTLE; + + perf_event_header__init_id(&throttle_event.header, &sample, event); + + ret = perf_output_begin(&handle, event, + throttle_event.header.size, 1, 0); + if (ret) + return; + + perf_output_put(&handle, throttle_event); + perf_event__output_id_sample(event, &handle, &sample); + perf_output_end(&handle); +} + +/* + * Generic event overflow handling, sampling. + */ + +static int __perf_event_overflow(struct perf_event *event, int nmi, + int throttle, struct perf_sample_data *data, + struct pt_regs *regs) +{ + int events = atomic_read(&event->event_limit); + struct hw_perf_event *hwc = &event->hw; + int ret = 0; + + /* + * Non-sampling counters might still use the PMI to fold short + * hardware counters, ignore those. + */ + if (unlikely(!is_sampling_event(event))) + return 0; + + if (unlikely(hwc->interrupts >= max_samples_per_tick)) { + if (throttle) { + hwc->interrupts = MAX_INTERRUPTS; + perf_log_throttle(event, 0); + ret = 1; + } + } else + hwc->interrupts++; + + if (event->attr.freq) { + u64 now = perf_clock(); + s64 delta = now - hwc->freq_time_stamp; + + hwc->freq_time_stamp = now; + + if (delta > 0 && delta < 2*TICK_NSEC) + perf_adjust_period(event, delta, hwc->last_period); + } + + /* + * XXX event_limit might not quite work as expected on inherited + * events + */ + + event->pending_kill = POLL_IN; + if (events && atomic_dec_and_test(&event->event_limit)) { + ret = 1; + event->pending_kill = POLL_HUP; + if (nmi) { + event->pending_disable = 1; + irq_work_queue(&event->pending); + } else + perf_event_disable(event); + } + + if (event->overflow_handler) + event->overflow_handler(event, nmi, data, regs); + else + perf_event_output(event, nmi, data, regs); + + return ret; +} + +int perf_event_overflow(struct perf_event *event, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + return __perf_event_overflow(event, nmi, 1, data, regs); +} + +/* + * Generic software event infrastructure + */ + +struct swevent_htable { + struct swevent_hlist *swevent_hlist; + struct mutex hlist_mutex; + int hlist_refcount; + + /* Recursion avoidance in each contexts */ + int recursion[PERF_NR_CONTEXTS]; +}; + +static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); + +/* + * We directly increment event->count and keep a second value in + * event->hw.period_left to count intervals. This period event + * is kept in the range [-sample_period, 0] so that we can use the + * sign as trigger. + */ + +static u64 perf_swevent_set_period(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 period = hwc->last_period; + u64 nr, offset; + s64 old, val; + + hwc->last_period = hwc->sample_period; + +again: + old = val = local64_read(&hwc->period_left); + if (val < 0) + return 0; + + nr = div64_u64(period + val, period); + offset = nr * period; + val -= offset; + if (local64_cmpxchg(&hwc->period_left, old, val) != old) + goto again; + + return nr; +} + +static void perf_swevent_overflow(struct perf_event *event, u64 overflow, + int nmi, struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct hw_perf_event *hwc = &event->hw; + int throttle = 0; + + data->period = event->hw.last_period; + if (!overflow) + overflow = perf_swevent_set_period(event); + + if (hwc->interrupts == MAX_INTERRUPTS) + return; + + for (; overflow; overflow--) { + if (__perf_event_overflow(event, nmi, throttle, + data, regs)) { + /* + * We inhibit the overflow from happening when + * hwc->interrupts == MAX_INTERRUPTS. + */ + break; + } + throttle = 1; + } +} + +static void perf_swevent_event(struct perf_event *event, u64 nr, + int nmi, struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct hw_perf_event *hwc = &event->hw; + + local64_add(nr, &event->count); + + if (!regs) + return; + + if (!is_sampling_event(event)) + return; + + if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) + return perf_swevent_overflow(event, 1, nmi, data, regs); + + if (local64_add_negative(nr, &hwc->period_left)) + return; + + perf_swevent_overflow(event, 0, nmi, data, regs); +} + +static int perf_exclude_event(struct perf_event *event, + struct pt_regs *regs) +{ + if (event->hw.state & PERF_HES_STOPPED) + return 1; + + if (regs) { + if (event->attr.exclude_user && user_mode(regs)) + return 1; + + if (event->attr.exclude_kernel && !user_mode(regs)) + return 1; + } + + return 0; +} + +static int perf_swevent_match(struct perf_event *event, + enum perf_type_id type, + u32 event_id, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + if (event->attr.type != type) + return 0; + + if (event->attr.config != event_id) + return 0; + + if (perf_exclude_event(event, regs)) + return 0; + + return 1; +} + +static inline u64 swevent_hash(u64 type, u32 event_id) +{ + u64 val = event_id | (type << 32); + + return hash_64(val, SWEVENT_HLIST_BITS); +} + +static inline struct hlist_head * +__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id) +{ + u64 hash = swevent_hash(type, event_id); + + return &hlist->heads[hash]; +} + +/* For the read side: events when they trigger */ +static inline struct hlist_head * +find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id) +{ + struct swevent_hlist *hlist; + + hlist = rcu_dereference(swhash->swevent_hlist); + if (!hlist) + return NULL; + + return __find_swevent_head(hlist, type, event_id); +} + +/* For the event head insertion and removal in the hlist */ +static inline struct hlist_head * +find_swevent_head(struct swevent_htable *swhash, struct perf_event *event) +{ + struct swevent_hlist *hlist; + u32 event_id = event->attr.config; + u64 type = event->attr.type; + + /* + * Event scheduling is always serialized against hlist allocation + * and release. Which makes the protected version suitable here. + * The context lock guarantees that. + */ + hlist = rcu_dereference_protected(swhash->swevent_hlist, + lockdep_is_held(&event->ctx->lock)); + if (!hlist) + return NULL; + + return __find_swevent_head(hlist, type, event_id); +} + +static void do_perf_sw_event(enum perf_type_id type, u32 event_id, + u64 nr, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); + struct perf_event *event; + struct hlist_node *node; + struct hlist_head *head; + + rcu_read_lock(); + head = find_swevent_head_rcu(swhash, type, event_id); + if (!head) + goto end; + + hlist_for_each_entry_rcu(event, node, head, hlist_entry) { + if (perf_swevent_match(event, type, event_id, data, regs)) + perf_swevent_event(event, nr, nmi, data, regs); + } +end: + rcu_read_unlock(); +} + +int perf_swevent_get_recursion_context(void) +{ + struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); + + return get_recursion_context(swhash->recursion); +} +EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); + +inline void perf_swevent_put_recursion_context(int rctx) +{ + struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); + + put_recursion_context(swhash->recursion, rctx); +} + +void __perf_sw_event(u32 event_id, u64 nr, int nmi, + struct pt_regs *regs, u64 addr) +{ + struct perf_sample_data data; + int rctx; + + preempt_disable_notrace(); + rctx = perf_swevent_get_recursion_context(); + if (rctx < 0) + return; + + perf_sample_data_init(&data, addr); + + do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); + + perf_swevent_put_recursion_context(rctx); + preempt_enable_notrace(); +} + +static void perf_swevent_read(struct perf_event *event) +{ +} + +static int perf_swevent_add(struct perf_event *event, int flags) +{ + struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); + struct hw_perf_event *hwc = &event->hw; + struct hlist_head *head; + + if (is_sampling_event(event)) { + hwc->last_period = hwc->sample_period; + perf_swevent_set_period(event); + } + + hwc->state = !(flags & PERF_EF_START); + + head = find_swevent_head(swhash, event); + if (WARN_ON_ONCE(!head)) + return -EINVAL; + + hlist_add_head_rcu(&event->hlist_entry, head); + + return 0; +} + +static void perf_swevent_del(struct perf_event *event, int flags) +{ + hlist_del_rcu(&event->hlist_entry); +} + +static void perf_swevent_start(struct perf_event *event, int flags) +{ + event->hw.state = 0; +} + +static void perf_swevent_stop(struct perf_event *event, int flags) +{ + event->hw.state = PERF_HES_STOPPED; +} + +/* Deref the hlist from the update side */ +static inline struct swevent_hlist * +swevent_hlist_deref(struct swevent_htable *swhash) +{ + return rcu_dereference_protected(swhash->swevent_hlist, + lockdep_is_held(&swhash->hlist_mutex)); +} + +static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) +{ + struct swevent_hlist *hlist; + + hlist = container_of(rcu_head, struct swevent_hlist, rcu_head); + kfree(hlist); +} + +static void swevent_hlist_release(struct swevent_htable *swhash) +{ + struct swevent_hlist *hlist = swevent_hlist_deref(swhash); + + if (!hlist) + return; + + rcu_assign_pointer(swhash->swevent_hlist, NULL); + call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); +} + +static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) +{ + struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); + + mutex_lock(&swhash->hlist_mutex); + + if (!--swhash->hlist_refcount) + swevent_hlist_release(swhash); + + mutex_unlock(&swhash->hlist_mutex); +} + +static void swevent_hlist_put(struct perf_event *event) +{ + int cpu; + + if (event->cpu != -1) { + swevent_hlist_put_cpu(event, event->cpu); + return; + } + + for_each_possible_cpu(cpu) + swevent_hlist_put_cpu(event, cpu); +} + +static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) +{ + struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); + int err = 0; + + mutex_lock(&swhash->hlist_mutex); + + if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) { + struct swevent_hlist *hlist; + + hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); + if (!hlist) { + err = -ENOMEM; + goto exit; + } + rcu_assign_pointer(swhash->swevent_hlist, hlist); + } + swhash->hlist_refcount++; +exit: + mutex_unlock(&swhash->hlist_mutex); + + return err; +} + +static int swevent_hlist_get(struct perf_event *event) +{ + int err; + int cpu, failed_cpu; + + if (event->cpu != -1) + return swevent_hlist_get_cpu(event, event->cpu); + + get_online_cpus(); + for_each_possible_cpu(cpu) { + err = swevent_hlist_get_cpu(event, cpu); + if (err) { + failed_cpu = cpu; + goto fail; + } + } + put_online_cpus(); + + return 0; +fail: + for_each_possible_cpu(cpu) { + if (cpu == failed_cpu) + break; + swevent_hlist_put_cpu(event, cpu); + } + + put_online_cpus(); + return err; +} + +struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; + +static void sw_perf_event_destroy(struct perf_event *event) +{ + u64 event_id = event->attr.config; + + WARN_ON(event->parent); + + jump_label_dec(&perf_swevent_enabled[event_id]); + swevent_hlist_put(event); +} + +static int perf_swevent_init(struct perf_event *event) +{ + int event_id = event->attr.config; + + if (event->attr.type != PERF_TYPE_SOFTWARE) + return -ENOENT; + + switch (event_id) { + case PERF_COUNT_SW_CPU_CLOCK: + case PERF_COUNT_SW_TASK_CLOCK: + return -ENOENT; + + default: + break; + } + + if (event_id >= PERF_COUNT_SW_MAX) + return -ENOENT; + + if (!event->parent) { + int err; + + err = swevent_hlist_get(event); + if (err) + return err; + + jump_label_inc(&perf_swevent_enabled[event_id]); + event->destroy = sw_perf_event_destroy; + } + + return 0; +} + +static struct pmu perf_swevent = { + .task_ctx_nr = perf_sw_context, + + .event_init = perf_swevent_init, + .add = perf_swevent_add, + .del = perf_swevent_del, + .start = perf_swevent_start, + .stop = perf_swevent_stop, + .read = perf_swevent_read, +}; + +#ifdef CONFIG_EVENT_TRACING + +static int perf_tp_filter_match(struct perf_event *event, + struct perf_sample_data *data) +{ + void *record = data->raw->data; + + if (likely(!event->filter) || filter_match_preds(event->filter, record)) + return 1; + return 0; +} + +static int perf_tp_event_match(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + if (event->hw.state & PERF_HES_STOPPED) + return 0; + /* + * All tracepoints are from kernel-space. + */ + if (event->attr.exclude_kernel) + return 0; + + if (!perf_tp_filter_match(event, data)) + return 0; + + return 1; +} + +void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, + struct pt_regs *regs, struct hlist_head *head, int rctx) +{ + struct perf_sample_data data; + struct perf_event *event; + struct hlist_node *node; + + struct perf_raw_record raw = { + .size = entry_size, + .data = record, + }; + + perf_sample_data_init(&data, addr); + data.raw = &raw; + + hlist_for_each_entry_rcu(event, node, head, hlist_entry) { + if (perf_tp_event_match(event, &data, regs)) + perf_swevent_event(event, count, 1, &data, regs); + } + + perf_swevent_put_recursion_context(rctx); +} +EXPORT_SYMBOL_GPL(perf_tp_event); + +static void tp_perf_event_destroy(struct perf_event *event) +{ + perf_trace_destroy(event); +} + +static int perf_tp_event_init(struct perf_event *event) +{ + int err; + + if (event->attr.type != PERF_TYPE_TRACEPOINT) + return -ENOENT; + + err = perf_trace_init(event); + if (err) + return err; + + event->destroy = tp_perf_event_destroy; + + return 0; +} + +static struct pmu perf_tracepoint = { + .task_ctx_nr = perf_sw_context, + + .event_init = perf_tp_event_init, + .add = perf_trace_add, + .del = perf_trace_del, + .start = perf_swevent_start, + .stop = perf_swevent_stop, + .read = perf_swevent_read, +}; + +static inline void perf_tp_register(void) +{ + perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); +} + +static int perf_event_set_filter(struct perf_event *event, void __user *arg) +{ + char *filter_str; + int ret; + + if (event->attr.type != PERF_TYPE_TRACEPOINT) + return -EINVAL; + + filter_str = strndup_user(arg, PAGE_SIZE); + if (IS_ERR(filter_str)) + return PTR_ERR(filter_str); + + ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); + + kfree(filter_str); + return ret; +} + +static void perf_event_free_filter(struct perf_event *event) +{ + ftrace_profile_free_filter(event); +} + +#else + +static inline void perf_tp_register(void) +{ +} + +static int perf_event_set_filter(struct perf_event *event, void __user *arg) +{ + return -ENOENT; +} + +static void perf_event_free_filter(struct perf_event *event) +{ +} + +#endif /* CONFIG_EVENT_TRACING */ + +#ifdef CONFIG_HAVE_HW_BREAKPOINT +void perf_bp_event(struct perf_event *bp, void *data) +{ + struct perf_sample_data sample; + struct pt_regs *regs = data; + + perf_sample_data_init(&sample, bp->attr.bp_addr); + + if (!bp->hw.state && !perf_exclude_event(bp, regs)) + perf_swevent_event(bp, 1, 1, &sample, regs); +} +#endif + +/* + * hrtimer based swevent callback + */ + +static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) +{ + enum hrtimer_restart ret = HRTIMER_RESTART; + struct perf_sample_data data; + struct pt_regs *regs; + struct perf_event *event; + u64 period; + + event = container_of(hrtimer, struct perf_event, hw.hrtimer); + + if (event->state != PERF_EVENT_STATE_ACTIVE) + return HRTIMER_NORESTART; + + event->pmu->read(event); + + perf_sample_data_init(&data, 0); + data.period = event->hw.last_period; + regs = get_irq_regs(); + + if (regs && !perf_exclude_event(event, regs)) { + if (!(event->attr.exclude_idle && current->pid == 0)) + if (perf_event_overflow(event, 0, &data, regs)) + ret = HRTIMER_NORESTART; + } + + period = max_t(u64, 10000, event->hw.sample_period); + hrtimer_forward_now(hrtimer, ns_to_ktime(period)); + + return ret; +} + +static void perf_swevent_start_hrtimer(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + s64 period; + + if (!is_sampling_event(event)) + return; + + period = local64_read(&hwc->period_left); + if (period) { + if (period < 0) + period = 10000; + + local64_set(&hwc->period_left, 0); + } else { + period = max_t(u64, 10000, hwc->sample_period); + } + __hrtimer_start_range_ns(&hwc->hrtimer, + ns_to_ktime(period), 0, + HRTIMER_MODE_REL_PINNED, 0); +} + +static void perf_swevent_cancel_hrtimer(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (is_sampling_event(event)) { + ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); + local64_set(&hwc->period_left, ktime_to_ns(remaining)); + + hrtimer_cancel(&hwc->hrtimer); + } +} + +static void perf_swevent_init_hrtimer(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (!is_sampling_event(event)) + return; + + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swevent_hrtimer; + + /* + * Since hrtimers have a fixed rate, we can do a static freq->period + * mapping and avoid the whole period adjust feedback stuff. + */ + if (event->attr.freq) { + long freq = event->attr.sample_freq; + + event->attr.sample_period = NSEC_PER_SEC / freq; + hwc->sample_period = event->attr.sample_period; + local64_set(&hwc->period_left, hwc->sample_period); + event->attr.freq = 0; + } +} + +/* + * Software event: cpu wall time clock + */ + +static void cpu_clock_event_update(struct perf_event *event) +{ + s64 prev; + u64 now; + + now = local_clock(); + prev = local64_xchg(&event->hw.prev_count, now); + local64_add(now - prev, &event->count); +} + +static void cpu_clock_event_start(struct perf_event *event, int flags) +{ + local64_set(&event->hw.prev_count, local_clock()); + perf_swevent_start_hrtimer(event); +} + +static void cpu_clock_event_stop(struct perf_event *event, int flags) +{ + perf_swevent_cancel_hrtimer(event); + cpu_clock_event_update(event); +} + +static int cpu_clock_event_add(struct perf_event *event, int flags) +{ + if (flags & PERF_EF_START) + cpu_clock_event_start(event, flags); + + return 0; +} + +static void cpu_clock_event_del(struct perf_event *event, int flags) +{ + cpu_clock_event_stop(event, flags); +} + +static void cpu_clock_event_read(struct perf_event *event) +{ + cpu_clock_event_update(event); +} + +static int cpu_clock_event_init(struct perf_event *event) +{ + if (event->attr.type != PERF_TYPE_SOFTWARE) + return -ENOENT; + + if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) + return -ENOENT; + + perf_swevent_init_hrtimer(event); + + return 0; +} + +static struct pmu perf_cpu_clock = { + .task_ctx_nr = perf_sw_context, + + .event_init = cpu_clock_event_init, + .add = cpu_clock_event_add, + .del = cpu_clock_event_del, + .start = cpu_clock_event_start, + .stop = cpu_clock_event_stop, + .read = cpu_clock_event_read, +}; + +/* + * Software event: task time clock + */ + +static void task_clock_event_update(struct perf_event *event, u64 now) +{ + u64 prev; + s64 delta; + + prev = local64_xchg(&event->hw.prev_count, now); + delta = now - prev; + local64_add(delta, &event->count); +} + +static void task_clock_event_start(struct perf_event *event, int flags) +{ + local64_set(&event->hw.prev_count, event->ctx->time); + perf_swevent_start_hrtimer(event); +} + +static void task_clock_event_stop(struct perf_event *event, int flags) +{ + perf_swevent_cancel_hrtimer(event); + task_clock_event_update(event, event->ctx->time); +} + +static int task_clock_event_add(struct perf_event *event, int flags) +{ + if (flags & PERF_EF_START) + task_clock_event_start(event, flags); + + return 0; +} + +static void task_clock_event_del(struct perf_event *event, int flags) +{ + task_clock_event_stop(event, PERF_EF_UPDATE); +} + +static void task_clock_event_read(struct perf_event *event) +{ + u64 now = perf_clock(); + u64 delta = now - event->ctx->timestamp; + u64 time = event->ctx->time + delta; + + task_clock_event_update(event, time); +} + +static int task_clock_event_init(struct perf_event *event) +{ + if (event->attr.type != PERF_TYPE_SOFTWARE) + return -ENOENT; + + if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) + return -ENOENT; + + perf_swevent_init_hrtimer(event); + + return 0; +} + +static struct pmu perf_task_clock = { + .task_ctx_nr = perf_sw_context, + + .event_init = task_clock_event_init, + .add = task_clock_event_add, + .del = task_clock_event_del, + .start = task_clock_event_start, + .stop = task_clock_event_stop, + .read = task_clock_event_read, +}; + +static void perf_pmu_nop_void(struct pmu *pmu) +{ +} + +static int perf_pmu_nop_int(struct pmu *pmu) +{ + return 0; +} + +static void perf_pmu_start_txn(struct pmu *pmu) +{ + perf_pmu_disable(pmu); +} + +static int perf_pmu_commit_txn(struct pmu *pmu) +{ + perf_pmu_enable(pmu); + return 0; +} + +static void perf_pmu_cancel_txn(struct pmu *pmu) +{ + perf_pmu_enable(pmu); +} + +/* + * Ensures all contexts with the same task_ctx_nr have the same + * pmu_cpu_context too. + */ +static void *find_pmu_context(int ctxn) +{ + struct pmu *pmu; + + if (ctxn < 0) + return NULL; + + list_for_each_entry(pmu, &pmus, entry) { + if (pmu->task_ctx_nr == ctxn) + return pmu->pmu_cpu_context; + } + + return NULL; +} + +static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct perf_cpu_context *cpuctx; + + cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); + + if (cpuctx->active_pmu == old_pmu) + cpuctx->active_pmu = pmu; + } +} + +static void free_pmu_context(struct pmu *pmu) +{ + struct pmu *i; + + mutex_lock(&pmus_lock); + /* + * Like a real lame refcount. + */ + list_for_each_entry(i, &pmus, entry) { + if (i->pmu_cpu_context == pmu->pmu_cpu_context) { + update_pmu_context(i, pmu); + goto out; + } + } + + free_percpu(pmu->pmu_cpu_context); +out: + mutex_unlock(&pmus_lock); +} +static struct idr pmu_idr; + +static ssize_t +type_show(struct device *dev, struct device_attribute *attr, char *page) +{ + struct pmu *pmu = dev_get_drvdata(dev); + + return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); +} + +static struct device_attribute pmu_dev_attrs[] = { + __ATTR_RO(type), + __ATTR_NULL, +}; + +static int pmu_bus_running; +static struct bus_type pmu_bus = { + .name = "event_source", + .dev_attrs = pmu_dev_attrs, +}; + +static void pmu_dev_release(struct device *dev) +{ + kfree(dev); +} + +static int pmu_dev_alloc(struct pmu *pmu) +{ + int ret = -ENOMEM; + + pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL); + if (!pmu->dev) + goto out; + + device_initialize(pmu->dev); + ret = dev_set_name(pmu->dev, "%s", pmu->name); + if (ret) + goto free_dev; + + dev_set_drvdata(pmu->dev, pmu); + pmu->dev->bus = &pmu_bus; + pmu->dev->release = pmu_dev_release; + ret = device_add(pmu->dev); + if (ret) + goto free_dev; + +out: + return ret; + +free_dev: + put_device(pmu->dev); + goto out; +} + +static struct lock_class_key cpuctx_mutex; + +int perf_pmu_register(struct pmu *pmu, char *name, int type) +{ + int cpu, ret; + + mutex_lock(&pmus_lock); + ret = -ENOMEM; + pmu->pmu_disable_count = alloc_percpu(int); + if (!pmu->pmu_disable_count) + goto unlock; + + pmu->type = -1; + if (!name) + goto skip_type; + pmu->name = name; + + if (type < 0) { + int err = idr_pre_get(&pmu_idr, GFP_KERNEL); + if (!err) + goto free_pdc; + + err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type); + if (err) { + ret = err; + goto free_pdc; + } + } + pmu->type = type; + + if (pmu_bus_running) { + ret = pmu_dev_alloc(pmu); + if (ret) + goto free_idr; + } + +skip_type: + pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); + if (pmu->pmu_cpu_context) + goto got_cpu_context; + + pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); + if (!pmu->pmu_cpu_context) + goto free_dev; + + for_each_possible_cpu(cpu) { + struct perf_cpu_context *cpuctx; + + cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); + __perf_event_init_context(&cpuctx->ctx); + lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); + cpuctx->ctx.type = cpu_context; + cpuctx->ctx.pmu = pmu; + cpuctx->jiffies_interval = 1; + INIT_LIST_HEAD(&cpuctx->rotation_list); + cpuctx->active_pmu = pmu; + } + +got_cpu_context: + if (!pmu->start_txn) { + if (pmu->pmu_enable) { + /* + * If we have pmu_enable/pmu_disable calls, install + * transaction stubs that use that to try and batch + * hardware accesses. + */ + pmu->start_txn = perf_pmu_start_txn; + pmu->commit_txn = perf_pmu_commit_txn; + pmu->cancel_txn = perf_pmu_cancel_txn; + } else { + pmu->start_txn = perf_pmu_nop_void; + pmu->commit_txn = perf_pmu_nop_int; + pmu->cancel_txn = perf_pmu_nop_void; + } + } + + if (!pmu->pmu_enable) { + pmu->pmu_enable = perf_pmu_nop_void; + pmu->pmu_disable = perf_pmu_nop_void; + } + + list_add_rcu(&pmu->entry, &pmus); + ret = 0; +unlock: + mutex_unlock(&pmus_lock); + + return ret; + +free_dev: + device_del(pmu->dev); + put_device(pmu->dev); + +free_idr: + if (pmu->type >= PERF_TYPE_MAX) + idr_remove(&pmu_idr, pmu->type); + +free_pdc: + free_percpu(pmu->pmu_disable_count); + goto unlock; +} + +void perf_pmu_unregister(struct pmu *pmu) +{ + mutex_lock(&pmus_lock); + list_del_rcu(&pmu->entry); + mutex_unlock(&pmus_lock); + + /* + * We dereference the pmu list under both SRCU and regular RCU, so + * synchronize against both of those. + */ + synchronize_srcu(&pmus_srcu); + synchronize_rcu(); + + free_percpu(pmu->pmu_disable_count); + if (pmu->type >= PERF_TYPE_MAX) + idr_remove(&pmu_idr, pmu->type); + device_del(pmu->dev); + put_device(pmu->dev); + free_pmu_context(pmu); +} + +struct pmu *perf_init_event(struct perf_event *event) +{ + struct pmu *pmu = NULL; + int idx; + int ret; + + idx = srcu_read_lock(&pmus_srcu); + + rcu_read_lock(); + pmu = idr_find(&pmu_idr, event->attr.type); + rcu_read_unlock(); + if (pmu) { + ret = pmu->event_init(event); + if (ret) + pmu = ERR_PTR(ret); + goto unlock; + } + + list_for_each_entry_rcu(pmu, &pmus, entry) { + ret = pmu->event_init(event); + if (!ret) + goto unlock; + + if (ret != -ENOENT) { + pmu = ERR_PTR(ret); + goto unlock; + } + } + pmu = ERR_PTR(-ENOENT); +unlock: + srcu_read_unlock(&pmus_srcu, idx); + + return pmu; +} + +/* + * Allocate and initialize a event structure + */ +static struct perf_event * +perf_event_alloc(struct perf_event_attr *attr, int cpu, + struct task_struct *task, + struct perf_event *group_leader, + struct perf_event *parent_event, + perf_overflow_handler_t overflow_handler) +{ + struct pmu *pmu; + struct perf_event *event; + struct hw_perf_event *hwc; + long err; + + if ((unsigned)cpu >= nr_cpu_ids) { + if (!task || cpu != -1) + return ERR_PTR(-EINVAL); + } + + event = kzalloc(sizeof(*event), GFP_KERNEL); + if (!event) + return ERR_PTR(-ENOMEM); + + /* + * Single events are their own group leaders, with an + * empty sibling list: + */ + if (!group_leader) + group_leader = event; + + mutex_init(&event->child_mutex); + INIT_LIST_HEAD(&event->child_list); + + INIT_LIST_HEAD(&event->group_entry); + INIT_LIST_HEAD(&event->event_entry); + INIT_LIST_HEAD(&event->sibling_list); + init_waitqueue_head(&event->waitq); + init_irq_work(&event->pending, perf_pending_event); + + mutex_init(&event->mmap_mutex); + + event->cpu = cpu; + event->attr = *attr; + event->group_leader = group_leader; + event->pmu = NULL; + event->oncpu = -1; + + event->parent = parent_event; + + event->ns = get_pid_ns(current->nsproxy->pid_ns); + event->id = atomic64_inc_return(&perf_event_id); + + event->state = PERF_EVENT_STATE_INACTIVE; + + if (task) { + event->attach_state = PERF_ATTACH_TASK; +#ifdef CONFIG_HAVE_HW_BREAKPOINT + /* + * hw_breakpoint is a bit difficult here.. + */ + if (attr->type == PERF_TYPE_BREAKPOINT) + event->hw.bp_target = task; +#endif + } + + if (!overflow_handler && parent_event) + overflow_handler = parent_event->overflow_handler; + + event->overflow_handler = overflow_handler; + + if (attr->disabled) + event->state = PERF_EVENT_STATE_OFF; + + pmu = NULL; + + hwc = &event->hw; + hwc->sample_period = attr->sample_period; + if (attr->freq && attr->sample_freq) + hwc->sample_period = 1; + hwc->last_period = hwc->sample_period; + + local64_set(&hwc->period_left, hwc->sample_period); + + /* + * we currently do not support PERF_FORMAT_GROUP on inherited events + */ + if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) + goto done; + + pmu = perf_init_event(event); + +done: + err = 0; + if (!pmu) + err = -EINVAL; + else if (IS_ERR(pmu)) + err = PTR_ERR(pmu); + + if (err) { + if (event->ns) + put_pid_ns(event->ns); + kfree(event); + return ERR_PTR(err); + } + + event->pmu = pmu; + + if (!event->parent) { + if (event->attach_state & PERF_ATTACH_TASK) + jump_label_inc(&perf_sched_events); + if (event->attr.mmap || event->attr.mmap_data) + atomic_inc(&nr_mmap_events); + if (event->attr.comm) + atomic_inc(&nr_comm_events); + if (event->attr.task) + atomic_inc(&nr_task_events); + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { + err = get_callchain_buffers(); + if (err) { + free_event(event); + return ERR_PTR(err); + } + } + } + + return event; +} + +static int perf_copy_attr(struct perf_event_attr __user *uattr, + struct perf_event_attr *attr) +{ + u32 size; + int ret; + + if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0)) + return -EFAULT; + + /* + * zero the full structure, so that a short copy will be nice. + */ + memset(attr, 0, sizeof(*attr)); + + ret = get_user(size, &uattr->size); + if (ret) + return ret; + + if (size > PAGE_SIZE) /* silly large */ + goto err_size; + + if (!size) /* abi compat */ + size = PERF_ATTR_SIZE_VER0; + + if (size < PERF_ATTR_SIZE_VER0) + goto err_size; + + /* + * If we're handed a bigger struct than we know of, + * ensure all the unknown bits are 0 - i.e. new + * user-space does not rely on any kernel feature + * extensions we dont know about yet. + */ + if (size > sizeof(*attr)) { + unsigned char __user *addr; + unsigned char __user *end; + unsigned char val; + + addr = (void __user *)uattr + sizeof(*attr); + end = (void __user *)uattr + size; + + for (; addr < end; addr++) { + ret = get_user(val, addr); + if (ret) + return ret; + if (val) + goto err_size; + } + size = sizeof(*attr); + } + + ret = copy_from_user(attr, uattr, size); + if (ret) + return -EFAULT; + + /* + * If the type exists, the corresponding creation will verify + * the attr->config. + */ + if (attr->type >= PERF_TYPE_MAX) + return -EINVAL; + + if (attr->__reserved_1) + return -EINVAL; + + if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) + return -EINVAL; + + if (attr->read_format & ~(PERF_FORMAT_MAX-1)) + return -EINVAL; + +out: + return ret; + +err_size: + put_user(sizeof(*attr), &uattr->size); + ret = -E2BIG; + goto out; +} + +static int +perf_event_set_output(struct perf_event *event, struct perf_event *output_event) +{ + struct perf_buffer *buffer = NULL, *old_buffer = NULL; + int ret = -EINVAL; + + if (!output_event) + goto set; + + /* don't allow circular references */ + if (event == output_event) + goto out; + + /* + * Don't allow cross-cpu buffers + */ + if (output_event->cpu != event->cpu) + goto out; + + /* + * If its not a per-cpu buffer, it must be the same task. + */ + if (output_event->cpu == -1 && output_event->ctx != event->ctx) + goto out; + +set: + mutex_lock(&event->mmap_mutex); + /* Can't redirect output if we've got an active mmap() */ + if (atomic_read(&event->mmap_count)) + goto unlock; + + if (output_event) { + /* get the buffer we want to redirect to */ + buffer = perf_buffer_get(output_event); + if (!buffer) + goto unlock; + } + + old_buffer = event->buffer; + rcu_assign_pointer(event->buffer, buffer); + ret = 0; +unlock: + mutex_unlock(&event->mmap_mutex); + + if (old_buffer) + perf_buffer_put(old_buffer); +out: + return ret; +} + +/** + * sys_perf_event_open - open a performance event, associate it to a task/cpu + * + * @attr_uptr: event_id type attributes for monitoring/sampling + * @pid: target pid + * @cpu: target cpu + * @group_fd: group leader event fd + */ +SYSCALL_DEFINE5(perf_event_open, + struct perf_event_attr __user *, attr_uptr, + pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) +{ + struct perf_event *group_leader = NULL, *output_event = NULL; + struct perf_event *event, *sibling; + struct perf_event_attr attr; + struct perf_event_context *ctx; + struct file *event_file = NULL; + struct file *group_file = NULL; + struct task_struct *task = NULL; + struct pmu *pmu; + int event_fd; + int move_group = 0; + int fput_needed = 0; + int err; + + /* for future expandability... */ + if (flags & ~PERF_FLAG_ALL) + return -EINVAL; + + err = perf_copy_attr(attr_uptr, &attr); + if (err) + return err; + + if (!attr.exclude_kernel) { + if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) + return -EACCES; + } + + if (attr.freq) { + if (attr.sample_freq > sysctl_perf_event_sample_rate) + return -EINVAL; + } + + /* + * In cgroup mode, the pid argument is used to pass the fd + * opened to the cgroup directory in cgroupfs. The cpu argument + * designates the cpu on which to monitor threads from that + * cgroup. + */ + if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) + return -EINVAL; + + event_fd = get_unused_fd_flags(O_RDWR); + if (event_fd < 0) + return event_fd; + + if (group_fd != -1) { + group_leader = perf_fget_light(group_fd, &fput_needed); + if (IS_ERR(group_leader)) { + err = PTR_ERR(group_leader); + goto err_fd; + } + group_file = group_leader->filp; + if (flags & PERF_FLAG_FD_OUTPUT) + output_event = group_leader; + if (flags & PERF_FLAG_FD_NO_GROUP) + group_leader = NULL; + } + + if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) { + task = find_lively_task_by_vpid(pid); + if (IS_ERR(task)) { + err = PTR_ERR(task); + goto err_group_fd; + } + } + + event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL); + if (IS_ERR(event)) { + err = PTR_ERR(event); + goto err_task; + } + + if (flags & PERF_FLAG_PID_CGROUP) { + err = perf_cgroup_connect(pid, event, &attr, group_leader); + if (err) + goto err_alloc; + /* + * one more event: + * - that has cgroup constraint on event->cpu + * - that may need work on context switch + */ + atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); + jump_label_inc(&perf_sched_events); + } + + /* + * Special case software events and allow them to be part of + * any hardware group. + */ + pmu = event->pmu; + + if (group_leader && + (is_software_event(event) != is_software_event(group_leader))) { + if (is_software_event(event)) { + /* + * If event and group_leader are not both a software + * event, and event is, then group leader is not. + * + * Allow the addition of software events to !software + * groups, this is safe because software events never + * fail to schedule. + */ + pmu = group_leader->pmu; + } else if (is_software_event(group_leader) && + (group_leader->group_flags & PERF_GROUP_SOFTWARE)) { + /* + * In case the group is a pure software group, and we + * try to add a hardware event, move the whole group to + * the hardware context. + */ + move_group = 1; + } + } + + /* + * Get the target context (task or percpu): + */ + ctx = find_get_context(pmu, task, cpu); + if (IS_ERR(ctx)) { + err = PTR_ERR(ctx); + goto err_alloc; + } + + if (task) { + put_task_struct(task); + task = NULL; + } + + /* + * Look up the group leader (we will attach this event to it): + */ + if (group_leader) { + err = -EINVAL; + + /* + * Do not allow a recursive hierarchy (this new sibling + * becoming part of another group-sibling): + */ + if (group_leader->group_leader != group_leader) + goto err_context; + /* + * Do not allow to attach to a group in a different + * task or CPU context: + */ + if (move_group) { + if (group_leader->ctx->type != ctx->type) + goto err_context; + } else { + if (group_leader->ctx != ctx) + goto err_context; + } + + /* + * Only a group leader can be exclusive or pinned + */ + if (attr.exclusive || attr.pinned) + goto err_context; + } + + if (output_event) { + err = perf_event_set_output(event, output_event); + if (err) + goto err_context; + } + + event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); + if (IS_ERR(event_file)) { + err = PTR_ERR(event_file); + goto err_context; + } + + if (move_group) { + struct perf_event_context *gctx = group_leader->ctx; + + mutex_lock(&gctx->mutex); + perf_remove_from_context(group_leader); + list_for_each_entry(sibling, &group_leader->sibling_list, + group_entry) { + perf_remove_from_context(sibling); + put_ctx(gctx); + } + mutex_unlock(&gctx->mutex); + put_ctx(gctx); + } + + event->filp = event_file; + WARN_ON_ONCE(ctx->parent_ctx); + mutex_lock(&ctx->mutex); + + if (move_group) { + perf_install_in_context(ctx, group_leader, cpu); + get_ctx(ctx); + list_for_each_entry(sibling, &group_leader->sibling_list, + group_entry) { + perf_install_in_context(ctx, sibling, cpu); + get_ctx(ctx); + } + } + + perf_install_in_context(ctx, event, cpu); + ++ctx->generation; + perf_unpin_context(ctx); + mutex_unlock(&ctx->mutex); + + event->owner = current; + + mutex_lock(¤t->perf_event_mutex); + list_add_tail(&event->owner_entry, ¤t->perf_event_list); + mutex_unlock(¤t->perf_event_mutex); + + /* + * Precalculate sample_data sizes + */ + perf_event__header_size(event); + perf_event__id_header_size(event); + + /* + * Drop the reference on the group_event after placing the + * new event on the sibling_list. This ensures destruction + * of the group leader will find the pointer to itself in + * perf_group_detach(). + */ + fput_light(group_file, fput_needed); + fd_install(event_fd, event_file); + return event_fd; + +err_context: + perf_unpin_context(ctx); + put_ctx(ctx); +err_alloc: + free_event(event); +err_task: + if (task) + put_task_struct(task); +err_group_fd: + fput_light(group_file, fput_needed); +err_fd: + put_unused_fd(event_fd); + return err; +} + +/** + * perf_event_create_kernel_counter + * + * @attr: attributes of the counter to create + * @cpu: cpu in which the counter is bound + * @task: task to profile (NULL for percpu) + */ +struct perf_event * +perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, + struct task_struct *task, + perf_overflow_handler_t overflow_handler) +{ + struct perf_event_context *ctx; + struct perf_event *event; + int err; + + /* + * Get the target context (task or percpu): + */ + + event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler); + if (IS_ERR(event)) { + err = PTR_ERR(event); + goto err; + } + + ctx = find_get_context(event->pmu, task, cpu); + if (IS_ERR(ctx)) { + err = PTR_ERR(ctx); + goto err_free; + } + + event->filp = NULL; + WARN_ON_ONCE(ctx->parent_ctx); + mutex_lock(&ctx->mutex); + perf_install_in_context(ctx, event, cpu); + ++ctx->generation; + perf_unpin_context(ctx); + mutex_unlock(&ctx->mutex); + + return event; + +err_free: + free_event(event); +err: + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); + +static void sync_child_event(struct perf_event *child_event, + struct task_struct *child) +{ + struct perf_event *parent_event = child_event->parent; + u64 child_val; + + if (child_event->attr.inherit_stat) + perf_event_read_event(child_event, child); + + child_val = perf_event_count(child_event); + + /* + * Add back the child's count to the parent's count: + */ + atomic64_add(child_val, &parent_event->child_count); + atomic64_add(child_event->total_time_enabled, + &parent_event->child_total_time_enabled); + atomic64_add(child_event->total_time_running, + &parent_event->child_total_time_running); + + /* + * Remove this event from the parent's list + */ + WARN_ON_ONCE(parent_event->ctx->parent_ctx); + mutex_lock(&parent_event->child_mutex); + list_del_init(&child_event->child_list); + mutex_unlock(&parent_event->child_mutex); + + /* + * Release the parent event, if this was the last + * reference to it. + */ + fput(parent_event->filp); +} + +static void +__perf_event_exit_task(struct perf_event *child_event, + struct perf_event_context *child_ctx, + struct task_struct *child) +{ + if (child_event->parent) { + raw_spin_lock_irq(&child_ctx->lock); + perf_group_detach(child_event); + raw_spin_unlock_irq(&child_ctx->lock); + } + + perf_remove_from_context(child_event); + + /* + * It can happen that the parent exits first, and has events + * that are still around due to the child reference. These + * events need to be zapped. + */ + if (child_event->parent) { + sync_child_event(child_event, child); + free_event(child_event); + } +} + +static void perf_event_exit_task_context(struct task_struct *child, int ctxn) +{ + struct perf_event *child_event, *tmp; + struct perf_event_context *child_ctx; + unsigned long flags; + + if (likely(!child->perf_event_ctxp[ctxn])) { + perf_event_task(child, NULL, 0); + return; + } + + local_irq_save(flags); + /* + * We can't reschedule here because interrupts are disabled, + * and either child is current or it is a task that can't be + * scheduled, so we are now safe from rescheduling changing + * our context. + */ + child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]); + task_ctx_sched_out(child_ctx, EVENT_ALL); + + /* + * Take the context lock here so that if find_get_context is + * reading child->perf_event_ctxp, we wait until it has + * incremented the context's refcount before we do put_ctx below. + */ + raw_spin_lock(&child_ctx->lock); + child->perf_event_ctxp[ctxn] = NULL; + /* + * If this context is a clone; unclone it so it can't get + * swapped to another process while we're removing all + * the events from it. + */ + unclone_ctx(child_ctx); + update_context_time(child_ctx); + raw_spin_unlock_irqrestore(&child_ctx->lock, flags); + + /* + * Report the task dead after unscheduling the events so that we + * won't get any samples after PERF_RECORD_EXIT. We can however still + * get a few PERF_RECORD_READ events. + */ + perf_event_task(child, child_ctx, 0); + + /* + * We can recurse on the same lock type through: + * + * __perf_event_exit_task() + * sync_child_event() + * fput(parent_event->filp) + * perf_release() + * mutex_lock(&ctx->mutex) + * + * But since its the parent context it won't be the same instance. + */ + mutex_lock(&child_ctx->mutex); + +again: + list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups, + group_entry) + __perf_event_exit_task(child_event, child_ctx, child); + + list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups, + group_entry) + __perf_event_exit_task(child_event, child_ctx, child); + + /* + * If the last event was a group event, it will have appended all + * its siblings to the list, but we obtained 'tmp' before that which + * will still point to the list head terminating the iteration. + */ + if (!list_empty(&child_ctx->pinned_groups) || + !list_empty(&child_ctx->flexible_groups)) + goto again; + + mutex_unlock(&child_ctx->mutex); + + put_ctx(child_ctx); +} + +/* + * When a child task exits, feed back event values to parent events. + */ +void perf_event_exit_task(struct task_struct *child) +{ + struct perf_event *event, *tmp; + int ctxn; + + mutex_lock(&child->perf_event_mutex); + list_for_each_entry_safe(event, tmp, &child->perf_event_list, + owner_entry) { + list_del_init(&event->owner_entry); + + /* + * Ensure the list deletion is visible before we clear + * the owner, closes a race against perf_release() where + * we need to serialize on the owner->perf_event_mutex. + */ + smp_wmb(); + event->owner = NULL; + } + mutex_unlock(&child->perf_event_mutex); + + for_each_task_context_nr(ctxn) + perf_event_exit_task_context(child, ctxn); +} + +static void perf_free_event(struct perf_event *event, + struct perf_event_context *ctx) +{ + struct perf_event *parent = event->parent; + + if (WARN_ON_ONCE(!parent)) + return; + + mutex_lock(&parent->child_mutex); + list_del_init(&event->child_list); + mutex_unlock(&parent->child_mutex); + + fput(parent->filp); + + perf_group_detach(event); + list_del_event(event, ctx); + free_event(event); +} + +/* + * free an unexposed, unused context as created by inheritance by + * perf_event_init_task below, used by fork() in case of fail. + */ +void perf_event_free_task(struct task_struct *task) +{ + struct perf_event_context *ctx; + struct perf_event *event, *tmp; + int ctxn; + + for_each_task_context_nr(ctxn) { + ctx = task->perf_event_ctxp[ctxn]; + if (!ctx) + continue; + + mutex_lock(&ctx->mutex); +again: + list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, + group_entry) + perf_free_event(event, ctx); + + list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, + group_entry) + perf_free_event(event, ctx); + + if (!list_empty(&ctx->pinned_groups) || + !list_empty(&ctx->flexible_groups)) + goto again; + + mutex_unlock(&ctx->mutex); + + put_ctx(ctx); + } +} + +void perf_event_delayed_put(struct task_struct *task) +{ + int ctxn; + + for_each_task_context_nr(ctxn) + WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); +} + +/* + * inherit a event from parent task to child task: + */ +static struct perf_event * +inherit_event(struct perf_event *parent_event, + struct task_struct *parent, + struct perf_event_context *parent_ctx, + struct task_struct *child, + struct perf_event *group_leader, + struct perf_event_context *child_ctx) +{ + struct perf_event *child_event; + unsigned long flags; + + /* + * Instead of creating recursive hierarchies of events, + * we link inherited events back to the original parent, + * which has a filp for sure, which we use as the reference + * count: + */ + if (parent_event->parent) + parent_event = parent_event->parent; + + child_event = perf_event_alloc(&parent_event->attr, + parent_event->cpu, + child, + group_leader, parent_event, + NULL); + if (IS_ERR(child_event)) + return child_event; + get_ctx(child_ctx); + + /* + * Make the child state follow the state of the parent event, + * not its attr.disabled bit. We hold the parent's mutex, + * so we won't race with perf_event_{en, dis}able_family. + */ + if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) + child_event->state = PERF_EVENT_STATE_INACTIVE; + else + child_event->state = PERF_EVENT_STATE_OFF; + + if (parent_event->attr.freq) { + u64 sample_period = parent_event->hw.sample_period; + struct hw_perf_event *hwc = &child_event->hw; + + hwc->sample_period = sample_period; + hwc->last_period = sample_period; + + local64_set(&hwc->period_left, sample_period); + } + + child_event->ctx = child_ctx; + child_event->overflow_handler = parent_event->overflow_handler; + + /* + * Precalculate sample_data sizes + */ + perf_event__header_size(child_event); + perf_event__id_header_size(child_event); + + /* + * Link it up in the child's context: + */ + raw_spin_lock_irqsave(&child_ctx->lock, flags); + add_event_to_ctx(child_event, child_ctx); + raw_spin_unlock_irqrestore(&child_ctx->lock, flags); + + /* + * Get a reference to the parent filp - we will fput it + * when the child event exits. This is safe to do because + * we are in the parent and we know that the filp still + * exists and has a nonzero count: + */ + atomic_long_inc(&parent_event->filp->f_count); + + /* + * Link this into the parent event's child list + */ + WARN_ON_ONCE(parent_event->ctx->parent_ctx); + mutex_lock(&parent_event->child_mutex); + list_add_tail(&child_event->child_list, &parent_event->child_list); + mutex_unlock(&parent_event->child_mutex); + + return child_event; +} + +static int inherit_group(struct perf_event *parent_event, + struct task_struct *parent, + struct perf_event_context *parent_ctx, + struct task_struct *child, + struct perf_event_context *child_ctx) +{ + struct perf_event *leader; + struct perf_event *sub; + struct perf_event *child_ctr; + + leader = inherit_event(parent_event, parent, parent_ctx, + child, NULL, child_ctx); + if (IS_ERR(leader)) + return PTR_ERR(leader); + list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { + child_ctr = inherit_event(sub, parent, parent_ctx, + child, leader, child_ctx); + if (IS_ERR(child_ctr)) + return PTR_ERR(child_ctr); + } + return 0; +} + +static int +inherit_task_group(struct perf_event *event, struct task_struct *parent, + struct perf_event_context *parent_ctx, + struct task_struct *child, int ctxn, + int *inherited_all) +{ + int ret; + struct perf_event_context *child_ctx; + + if (!event->attr.inherit) { + *inherited_all = 0; + return 0; + } + + child_ctx = child->perf_event_ctxp[ctxn]; + if (!child_ctx) { + /* + * This is executed from the parent task context, so + * inherit events that have been marked for cloning. + * First allocate and initialize a context for the + * child. + */ + + child_ctx = alloc_perf_context(event->pmu, child); + if (!child_ctx) + return -ENOMEM; + + child->perf_event_ctxp[ctxn] = child_ctx; + } + + ret = inherit_group(event, parent, parent_ctx, + child, child_ctx); + + if (ret) + *inherited_all = 0; + + return ret; +} + +/* + * Initialize the perf_event context in task_struct + */ +int perf_event_init_context(struct task_struct *child, int ctxn) +{ + struct perf_event_context *child_ctx, *parent_ctx; + struct perf_event_context *cloned_ctx; + struct perf_event *event; + struct task_struct *parent = current; + int inherited_all = 1; + unsigned long flags; + int ret = 0; + + if (likely(!parent->perf_event_ctxp[ctxn])) + return 0; + + /* + * If the parent's context is a clone, pin it so it won't get + * swapped under us. + */ + parent_ctx = perf_pin_task_context(parent, ctxn); + + /* + * No need to check if parent_ctx != NULL here; since we saw + * it non-NULL earlier, the only reason for it to become NULL + * is if we exit, and since we're currently in the middle of + * a fork we can't be exiting at the same time. + */ + + /* + * Lock the parent list. No need to lock the child - not PID + * hashed yet and not running, so nobody can access it. + */ + mutex_lock(&parent_ctx->mutex); + + /* + * We dont have to disable NMIs - we are only looking at + * the list, not manipulating it: + */ + list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { + ret = inherit_task_group(event, parent, parent_ctx, + child, ctxn, &inherited_all); + if (ret) + break; + } + + /* + * We can't hold ctx->lock when iterating the ->flexible_group list due + * to allocations, but we need to prevent rotation because + * rotate_ctx() will change the list from interrupt context. + */ + raw_spin_lock_irqsave(&parent_ctx->lock, flags); + parent_ctx->rotate_disable = 1; + raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); + + list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { + ret = inherit_task_group(event, parent, parent_ctx, + child, ctxn, &inherited_all); + if (ret) + break; + } + + raw_spin_lock_irqsave(&parent_ctx->lock, flags); + parent_ctx->rotate_disable = 0; + + child_ctx = child->perf_event_ctxp[ctxn]; + + if (child_ctx && inherited_all) { + /* + * Mark the child context as a clone of the parent + * context, or of whatever the parent is a clone of. + * + * Note that if the parent is a clone, the holding of + * parent_ctx->lock avoids it from being uncloned. + */ + cloned_ctx = parent_ctx->parent_ctx; + if (cloned_ctx) { + child_ctx->parent_ctx = cloned_ctx; + child_ctx->parent_gen = parent_ctx->parent_gen; + } else { + child_ctx->parent_ctx = parent_ctx; + child_ctx->parent_gen = parent_ctx->generation; + } + get_ctx(child_ctx->parent_ctx); + } + + raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); + mutex_unlock(&parent_ctx->mutex); + + perf_unpin_context(parent_ctx); + put_ctx(parent_ctx); + + return ret; +} + +/* + * Initialize the perf_event context in task_struct + */ +int perf_event_init_task(struct task_struct *child) +{ + int ctxn, ret; + + memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp)); + mutex_init(&child->perf_event_mutex); + INIT_LIST_HEAD(&child->perf_event_list); + + for_each_task_context_nr(ctxn) { + ret = perf_event_init_context(child, ctxn); + if (ret) + return ret; + } + + return 0; +} + +static void __init perf_event_init_all_cpus(void) +{ + struct swevent_htable *swhash; + int cpu; + + for_each_possible_cpu(cpu) { + swhash = &per_cpu(swevent_htable, cpu); + mutex_init(&swhash->hlist_mutex); + INIT_LIST_HEAD(&per_cpu(rotation_list, cpu)); + } +} + +static void __cpuinit perf_event_init_cpu(int cpu) +{ + struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); + + mutex_lock(&swhash->hlist_mutex); + if (swhash->hlist_refcount > 0) { + struct swevent_hlist *hlist; + + hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu)); + WARN_ON(!hlist); + rcu_assign_pointer(swhash->swevent_hlist, hlist); + } + mutex_unlock(&swhash->hlist_mutex); +} + +#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC +static void perf_pmu_rotate_stop(struct pmu *pmu) +{ + struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + + WARN_ON(!irqs_disabled()); + + list_del_init(&cpuctx->rotation_list); +} + +static void __perf_event_exit_context(void *__info) +{ + struct perf_event_context *ctx = __info; + struct perf_event *event, *tmp; + + perf_pmu_rotate_stop(ctx->pmu); + + list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) + __perf_remove_from_context(event); + list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) + __perf_remove_from_context(event); +} + +static void perf_event_exit_cpu_context(int cpu) +{ + struct perf_event_context *ctx; + struct pmu *pmu; + int idx; + + idx = srcu_read_lock(&pmus_srcu); + list_for_each_entry_rcu(pmu, &pmus, entry) { + ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx; + + mutex_lock(&ctx->mutex); + smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); + mutex_unlock(&ctx->mutex); + } + srcu_read_unlock(&pmus_srcu, idx); +} + +static void perf_event_exit_cpu(int cpu) +{ + struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); + + mutex_lock(&swhash->hlist_mutex); + swevent_hlist_release(swhash); + mutex_unlock(&swhash->hlist_mutex); + + perf_event_exit_cpu_context(cpu); +} +#else +static inline void perf_event_exit_cpu(int cpu) { } +#endif + +static int +perf_reboot(struct notifier_block *notifier, unsigned long val, void *v) +{ + int cpu; + + for_each_online_cpu(cpu) + perf_event_exit_cpu(cpu); + + return NOTIFY_OK; +} + +/* + * Run the perf reboot notifier at the very last possible moment so that + * the generic watchdog code runs as long as possible. + */ +static struct notifier_block perf_reboot_notifier = { + .notifier_call = perf_reboot, + .priority = INT_MIN, +}; + +static int __cpuinit +perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + + case CPU_UP_PREPARE: + case CPU_DOWN_FAILED: + perf_event_init_cpu(cpu); + break; + + case CPU_UP_CANCELED: + case CPU_DOWN_PREPARE: + perf_event_exit_cpu(cpu); + break; + + default: + break; + } + + return NOTIFY_OK; +} + +void __init perf_event_init(void) +{ + int ret; + + idr_init(&pmu_idr); + + perf_event_init_all_cpus(); + init_srcu_struct(&pmus_srcu); + perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE); + perf_pmu_register(&perf_cpu_clock, NULL, -1); + perf_pmu_register(&perf_task_clock, NULL, -1); + perf_tp_register(); + perf_cpu_notifier(perf_cpu_notify); + register_reboot_notifier(&perf_reboot_notifier); + + ret = init_hw_breakpoint(); + WARN(ret, "hw_breakpoint initialization failed with: %d", ret); +} + +static int __init perf_event_sysfs_init(void) +{ + struct pmu *pmu; + int ret; + + mutex_lock(&pmus_lock); + + ret = bus_register(&pmu_bus); + if (ret) + goto unlock; + + list_for_each_entry(pmu, &pmus, entry) { + if (!pmu->name || pmu->type < 0) + continue; + + ret = pmu_dev_alloc(pmu); + WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret); + } + pmu_bus_running = 1; + ret = 0; + +unlock: + mutex_unlock(&pmus_lock); + + return ret; +} +device_initcall(perf_event_sysfs_init); + +#ifdef CONFIG_CGROUP_PERF +static struct cgroup_subsys_state *perf_cgroup_create( + struct cgroup_subsys *ss, struct cgroup *cont) +{ + struct perf_cgroup *jc; + + jc = kzalloc(sizeof(*jc), GFP_KERNEL); + if (!jc) + return ERR_PTR(-ENOMEM); + + jc->info = alloc_percpu(struct perf_cgroup_info); + if (!jc->info) { + kfree(jc); + return ERR_PTR(-ENOMEM); + } + + return &jc->css; +} + +static void perf_cgroup_destroy(struct cgroup_subsys *ss, + struct cgroup *cont) +{ + struct perf_cgroup *jc; + jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), + struct perf_cgroup, css); + free_percpu(jc->info); + kfree(jc); +} + +static int __perf_cgroup_move(void *info) +{ + struct task_struct *task = info; + perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN); + return 0; +} + +static void perf_cgroup_move(struct task_struct *task) +{ + task_function_call(task, __perf_cgroup_move, task); +} + +static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup *old_cgrp, struct task_struct *task, + bool threadgroup) +{ + perf_cgroup_move(task); + if (threadgroup) { + struct task_struct *c; + rcu_read_lock(); + list_for_each_entry_rcu(c, &task->thread_group, thread_group) { + perf_cgroup_move(c); + } + rcu_read_unlock(); + } +} + +static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup *old_cgrp, struct task_struct *task) +{ + /* + * cgroup_exit() is called in the copy_process() failure path. + * Ignore this case since the task hasn't ran yet, this avoids + * trying to poke a half freed task state from generic code. + */ + if (!(task->flags & PF_EXITING)) + return; + + perf_cgroup_move(task); +} + +struct cgroup_subsys perf_subsys = { + .name = "perf_event", + .subsys_id = perf_subsys_id, + .create = perf_cgroup_create, + .destroy = perf_cgroup_destroy, + .exit = perf_cgroup_exit, + .attach = perf_cgroup_attach, +}; +#endif /* CONFIG_CGROUP_PERF */ diff --git a/kernel/perf_event.c b/kernel/perf_event.c deleted file mode 100644 index 440bc485bbff..000000000000 --- a/kernel/perf_event.c +++ /dev/null @@ -1,7455 +0,0 @@ -/* - * Performance events core code: - * - * Copyright (C) 2008 Thomas Gleixner - * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra - * Copyright © 2009 Paul Mackerras, IBM Corp. - * - * For licensing details see kernel-base/COPYING - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -struct remote_function_call { - struct task_struct *p; - int (*func)(void *info); - void *info; - int ret; -}; - -static void remote_function(void *data) -{ - struct remote_function_call *tfc = data; - struct task_struct *p = tfc->p; - - if (p) { - tfc->ret = -EAGAIN; - if (task_cpu(p) != smp_processor_id() || !task_curr(p)) - return; - } - - tfc->ret = tfc->func(tfc->info); -} - -/** - * task_function_call - call a function on the cpu on which a task runs - * @p: the task to evaluate - * @func: the function to be called - * @info: the function call argument - * - * Calls the function @func when the task is currently running. This might - * be on the current CPU, which just calls the function directly - * - * returns: @func return value, or - * -ESRCH - when the process isn't running - * -EAGAIN - when the process moved away - */ -static int -task_function_call(struct task_struct *p, int (*func) (void *info), void *info) -{ - struct remote_function_call data = { - .p = p, - .func = func, - .info = info, - .ret = -ESRCH, /* No such (running) process */ - }; - - if (task_curr(p)) - smp_call_function_single(task_cpu(p), remote_function, &data, 1); - - return data.ret; -} - -/** - * cpu_function_call - call a function on the cpu - * @func: the function to be called - * @info: the function call argument - * - * Calls the function @func on the remote cpu. - * - * returns: @func return value or -ENXIO when the cpu is offline - */ -static int cpu_function_call(int cpu, int (*func) (void *info), void *info) -{ - struct remote_function_call data = { - .p = NULL, - .func = func, - .info = info, - .ret = -ENXIO, /* No such CPU */ - }; - - smp_call_function_single(cpu, remote_function, &data, 1); - - return data.ret; -} - -#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ - PERF_FLAG_FD_OUTPUT |\ - PERF_FLAG_PID_CGROUP) - -enum event_type_t { - EVENT_FLEXIBLE = 0x1, - EVENT_PINNED = 0x2, - EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, -}; - -/* - * perf_sched_events : >0 events exist - * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu - */ -struct jump_label_key perf_sched_events __read_mostly; -static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); - -static atomic_t nr_mmap_events __read_mostly; -static atomic_t nr_comm_events __read_mostly; -static atomic_t nr_task_events __read_mostly; - -static LIST_HEAD(pmus); -static DEFINE_MUTEX(pmus_lock); -static struct srcu_struct pmus_srcu; - -/* - * perf event paranoia level: - * -1 - not paranoid at all - * 0 - disallow raw tracepoint access for unpriv - * 1 - disallow cpu events for unpriv - * 2 - disallow kernel profiling for unpriv - */ -int sysctl_perf_event_paranoid __read_mostly = 1; - -/* Minimum for 512 kiB + 1 user control page */ -int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ - -/* - * max perf event sample rate - */ -#define DEFAULT_MAX_SAMPLE_RATE 100000 -int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; -static int max_samples_per_tick __read_mostly = - DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); - -int perf_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - int ret = proc_dointvec(table, write, buffer, lenp, ppos); - - if (ret || !write) - return ret; - - max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); - - return 0; -} - -static atomic64_t perf_event_id; - -static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, - enum event_type_t event_type); - -static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, - enum event_type_t event_type, - struct task_struct *task); - -static void update_context_time(struct perf_event_context *ctx); -static u64 perf_event_time(struct perf_event *event); - -void __weak perf_event_print_debug(void) { } - -extern __weak const char *perf_pmu_name(void) -{ - return "pmu"; -} - -static inline u64 perf_clock(void) -{ - return local_clock(); -} - -static inline struct perf_cpu_context * -__get_cpu_context(struct perf_event_context *ctx) -{ - return this_cpu_ptr(ctx->pmu->pmu_cpu_context); -} - -#ifdef CONFIG_CGROUP_PERF - -/* - * Must ensure cgroup is pinned (css_get) before calling - * this function. In other words, we cannot call this function - * if there is no cgroup event for the current CPU context. - */ -static inline struct perf_cgroup * -perf_cgroup_from_task(struct task_struct *task) -{ - return container_of(task_subsys_state(task, perf_subsys_id), - struct perf_cgroup, css); -} - -static inline bool -perf_cgroup_match(struct perf_event *event) -{ - struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - - return !event->cgrp || event->cgrp == cpuctx->cgrp; -} - -static inline void perf_get_cgroup(struct perf_event *event) -{ - css_get(&event->cgrp->css); -} - -static inline void perf_put_cgroup(struct perf_event *event) -{ - css_put(&event->cgrp->css); -} - -static inline void perf_detach_cgroup(struct perf_event *event) -{ - perf_put_cgroup(event); - event->cgrp = NULL; -} - -static inline int is_cgroup_event(struct perf_event *event) -{ - return event->cgrp != NULL; -} - -static inline u64 perf_cgroup_event_time(struct perf_event *event) -{ - struct perf_cgroup_info *t; - - t = per_cpu_ptr(event->cgrp->info, event->cpu); - return t->time; -} - -static inline void __update_cgrp_time(struct perf_cgroup *cgrp) -{ - struct perf_cgroup_info *info; - u64 now; - - now = perf_clock(); - - info = this_cpu_ptr(cgrp->info); - - info->time += now - info->timestamp; - info->timestamp = now; -} - -static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) -{ - struct perf_cgroup *cgrp_out = cpuctx->cgrp; - if (cgrp_out) - __update_cgrp_time(cgrp_out); -} - -static inline void update_cgrp_time_from_event(struct perf_event *event) -{ - struct perf_cgroup *cgrp; - - /* - * ensure we access cgroup data only when needed and - * when we know the cgroup is pinned (css_get) - */ - if (!is_cgroup_event(event)) - return; - - cgrp = perf_cgroup_from_task(current); - /* - * Do not update time when cgroup is not active - */ - if (cgrp == event->cgrp) - __update_cgrp_time(event->cgrp); -} - -static inline void -perf_cgroup_set_timestamp(struct task_struct *task, - struct perf_event_context *ctx) -{ - struct perf_cgroup *cgrp; - struct perf_cgroup_info *info; - - /* - * ctx->lock held by caller - * ensure we do not access cgroup data - * unless we have the cgroup pinned (css_get) - */ - if (!task || !ctx->nr_cgroups) - return; - - cgrp = perf_cgroup_from_task(task); - info = this_cpu_ptr(cgrp->info); - info->timestamp = ctx->timestamp; -} - -#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */ -#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */ - -/* - * reschedule events based on the cgroup constraint of task. - * - * mode SWOUT : schedule out everything - * mode SWIN : schedule in based on cgroup for next - */ -void perf_cgroup_switch(struct task_struct *task, int mode) -{ - struct perf_cpu_context *cpuctx; - struct pmu *pmu; - unsigned long flags; - - /* - * disable interrupts to avoid geting nr_cgroup - * changes via __perf_event_disable(). Also - * avoids preemption. - */ - local_irq_save(flags); - - /* - * we reschedule only in the presence of cgroup - * constrained events. - */ - rcu_read_lock(); - - list_for_each_entry_rcu(pmu, &pmus, entry) { - - cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - - perf_pmu_disable(cpuctx->ctx.pmu); - - /* - * perf_cgroup_events says at least one - * context on this CPU has cgroup events. - * - * ctx->nr_cgroups reports the number of cgroup - * events for a context. - */ - if (cpuctx->ctx.nr_cgroups > 0) { - - if (mode & PERF_CGROUP_SWOUT) { - cpu_ctx_sched_out(cpuctx, EVENT_ALL); - /* - * must not be done before ctxswout due - * to event_filter_match() in event_sched_out() - */ - cpuctx->cgrp = NULL; - } - - if (mode & PERF_CGROUP_SWIN) { - WARN_ON_ONCE(cpuctx->cgrp); - /* set cgrp before ctxsw in to - * allow event_filter_match() to not - * have to pass task around - */ - cpuctx->cgrp = perf_cgroup_from_task(task); - cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); - } - } - - perf_pmu_enable(cpuctx->ctx.pmu); - } - - rcu_read_unlock(); - - local_irq_restore(flags); -} - -static inline void perf_cgroup_sched_out(struct task_struct *task) -{ - perf_cgroup_switch(task, PERF_CGROUP_SWOUT); -} - -static inline void perf_cgroup_sched_in(struct task_struct *task) -{ - perf_cgroup_switch(task, PERF_CGROUP_SWIN); -} - -static inline int perf_cgroup_connect(int fd, struct perf_event *event, - struct perf_event_attr *attr, - struct perf_event *group_leader) -{ - struct perf_cgroup *cgrp; - struct cgroup_subsys_state *css; - struct file *file; - int ret = 0, fput_needed; - - file = fget_light(fd, &fput_needed); - if (!file) - return -EBADF; - - css = cgroup_css_from_dir(file, perf_subsys_id); - if (IS_ERR(css)) { - ret = PTR_ERR(css); - goto out; - } - - cgrp = container_of(css, struct perf_cgroup, css); - event->cgrp = cgrp; - - /* must be done before we fput() the file */ - perf_get_cgroup(event); - - /* - * all events in a group must monitor - * the same cgroup because a task belongs - * to only one perf cgroup at a time - */ - if (group_leader && group_leader->cgrp != cgrp) { - perf_detach_cgroup(event); - ret = -EINVAL; - } -out: - fput_light(file, fput_needed); - return ret; -} - -static inline void -perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) -{ - struct perf_cgroup_info *t; - t = per_cpu_ptr(event->cgrp->info, event->cpu); - event->shadow_ctx_time = now - t->timestamp; -} - -static inline void -perf_cgroup_defer_enabled(struct perf_event *event) -{ - /* - * when the current task's perf cgroup does not match - * the event's, we need to remember to call the - * perf_mark_enable() function the first time a task with - * a matching perf cgroup is scheduled in. - */ - if (is_cgroup_event(event) && !perf_cgroup_match(event)) - event->cgrp_defer_enabled = 1; -} - -static inline void -perf_cgroup_mark_enabled(struct perf_event *event, - struct perf_event_context *ctx) -{ - struct perf_event *sub; - u64 tstamp = perf_event_time(event); - - if (!event->cgrp_defer_enabled) - return; - - event->cgrp_defer_enabled = 0; - - event->tstamp_enabled = tstamp - event->total_time_enabled; - list_for_each_entry(sub, &event->sibling_list, group_entry) { - if (sub->state >= PERF_EVENT_STATE_INACTIVE) { - sub->tstamp_enabled = tstamp - sub->total_time_enabled; - sub->cgrp_defer_enabled = 0; - } - } -} -#else /* !CONFIG_CGROUP_PERF */ - -static inline bool -perf_cgroup_match(struct perf_event *event) -{ - return true; -} - -static inline void perf_detach_cgroup(struct perf_event *event) -{} - -static inline int is_cgroup_event(struct perf_event *event) -{ - return 0; -} - -static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event) -{ - return 0; -} - -static inline void update_cgrp_time_from_event(struct perf_event *event) -{ -} - -static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) -{ -} - -static inline void perf_cgroup_sched_out(struct task_struct *task) -{ -} - -static inline void perf_cgroup_sched_in(struct task_struct *task) -{ -} - -static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, - struct perf_event_attr *attr, - struct perf_event *group_leader) -{ - return -EINVAL; -} - -static inline void -perf_cgroup_set_timestamp(struct task_struct *task, - struct perf_event_context *ctx) -{ -} - -void -perf_cgroup_switch(struct task_struct *task, struct task_struct *next) -{ -} - -static inline void -perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) -{ -} - -static inline u64 perf_cgroup_event_time(struct perf_event *event) -{ - return 0; -} - -static inline void -perf_cgroup_defer_enabled(struct perf_event *event) -{ -} - -static inline void -perf_cgroup_mark_enabled(struct perf_event *event, - struct perf_event_context *ctx) -{ -} -#endif - -void perf_pmu_disable(struct pmu *pmu) -{ - int *count = this_cpu_ptr(pmu->pmu_disable_count); - if (!(*count)++) - pmu->pmu_disable(pmu); -} - -void perf_pmu_enable(struct pmu *pmu) -{ - int *count = this_cpu_ptr(pmu->pmu_disable_count); - if (!--(*count)) - pmu->pmu_enable(pmu); -} - -static DEFINE_PER_CPU(struct list_head, rotation_list); - -/* - * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized - * because they're strictly cpu affine and rotate_start is called with IRQs - * disabled, while rotate_context is called from IRQ context. - */ -static void perf_pmu_rotate_start(struct pmu *pmu) -{ - struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - struct list_head *head = &__get_cpu_var(rotation_list); - - WARN_ON(!irqs_disabled()); - - if (list_empty(&cpuctx->rotation_list)) - list_add(&cpuctx->rotation_list, head); -} - -static void get_ctx(struct perf_event_context *ctx) -{ - WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); -} - -static void free_ctx(struct rcu_head *head) -{ - struct perf_event_context *ctx; - - ctx = container_of(head, struct perf_event_context, rcu_head); - kfree(ctx); -} - -static void put_ctx(struct perf_event_context *ctx) -{ - if (atomic_dec_and_test(&ctx->refcount)) { - if (ctx->parent_ctx) - put_ctx(ctx->parent_ctx); - if (ctx->task) - put_task_struct(ctx->task); - call_rcu(&ctx->rcu_head, free_ctx); - } -} - -static void unclone_ctx(struct perf_event_context *ctx) -{ - if (ctx->parent_ctx) { - put_ctx(ctx->parent_ctx); - ctx->parent_ctx = NULL; - } -} - -static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) -{ - /* - * only top level events have the pid namespace they were created in - */ - if (event->parent) - event = event->parent; - - return task_tgid_nr_ns(p, event->ns); -} - -static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) -{ - /* - * only top level events have the pid namespace they were created in - */ - if (event->parent) - event = event->parent; - - return task_pid_nr_ns(p, event->ns); -} - -/* - * If we inherit events we want to return the parent event id - * to userspace. - */ -static u64 primary_event_id(struct perf_event *event) -{ - u64 id = event->id; - - if (event->parent) - id = event->parent->id; - - return id; -} - -/* - * Get the perf_event_context for a task and lock it. - * This has to cope with with the fact that until it is locked, - * the context could get moved to another task. - */ -static struct perf_event_context * -perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags) -{ - struct perf_event_context *ctx; - - rcu_read_lock(); -retry: - ctx = rcu_dereference(task->perf_event_ctxp[ctxn]); - if (ctx) { - /* - * If this context is a clone of another, it might - * get swapped for another underneath us by - * perf_event_task_sched_out, though the - * rcu_read_lock() protects us from any context - * getting freed. Lock the context and check if it - * got swapped before we could get the lock, and retry - * if so. If we locked the right context, then it - * can't get swapped on us any more. - */ - raw_spin_lock_irqsave(&ctx->lock, *flags); - if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) { - raw_spin_unlock_irqrestore(&ctx->lock, *flags); - goto retry; - } - - if (!atomic_inc_not_zero(&ctx->refcount)) { - raw_spin_unlock_irqrestore(&ctx->lock, *flags); - ctx = NULL; - } - } - rcu_read_unlock(); - return ctx; -} - -/* - * Get the context for a task and increment its pin_count so it - * can't get swapped to another task. This also increments its - * reference count so that the context can't get freed. - */ -static struct perf_event_context * -perf_pin_task_context(struct task_struct *task, int ctxn) -{ - struct perf_event_context *ctx; - unsigned long flags; - - ctx = perf_lock_task_context(task, ctxn, &flags); - if (ctx) { - ++ctx->pin_count; - raw_spin_unlock_irqrestore(&ctx->lock, flags); - } - return ctx; -} - -static void perf_unpin_context(struct perf_event_context *ctx) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&ctx->lock, flags); - --ctx->pin_count; - raw_spin_unlock_irqrestore(&ctx->lock, flags); -} - -/* - * Update the record of the current time in a context. - */ -static void update_context_time(struct perf_event_context *ctx) -{ - u64 now = perf_clock(); - - ctx->time += now - ctx->timestamp; - ctx->timestamp = now; -} - -static u64 perf_event_time(struct perf_event *event) -{ - struct perf_event_context *ctx = event->ctx; - - if (is_cgroup_event(event)) - return perf_cgroup_event_time(event); - - return ctx ? ctx->time : 0; -} - -/* - * Update the total_time_enabled and total_time_running fields for a event. - */ -static void update_event_times(struct perf_event *event) -{ - struct perf_event_context *ctx = event->ctx; - u64 run_end; - - if (event->state < PERF_EVENT_STATE_INACTIVE || - event->group_leader->state < PERF_EVENT_STATE_INACTIVE) - return; - /* - * in cgroup mode, time_enabled represents - * the time the event was enabled AND active - * tasks were in the monitored cgroup. This is - * independent of the activity of the context as - * there may be a mix of cgroup and non-cgroup events. - * - * That is why we treat cgroup events differently - * here. - */ - if (is_cgroup_event(event)) - run_end = perf_event_time(event); - else if (ctx->is_active) - run_end = ctx->time; - else - run_end = event->tstamp_stopped; - - event->total_time_enabled = run_end - event->tstamp_enabled; - - if (event->state == PERF_EVENT_STATE_INACTIVE) - run_end = event->tstamp_stopped; - else - run_end = perf_event_time(event); - - event->total_time_running = run_end - event->tstamp_running; - -} - -/* - * Update total_time_enabled and total_time_running for all events in a group. - */ -static void update_group_times(struct perf_event *leader) -{ - struct perf_event *event; - - update_event_times(leader); - list_for_each_entry(event, &leader->sibling_list, group_entry) - update_event_times(event); -} - -static struct list_head * -ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) -{ - if (event->attr.pinned) - return &ctx->pinned_groups; - else - return &ctx->flexible_groups; -} - -/* - * Add a event from the lists for its context. - * Must be called with ctx->mutex and ctx->lock held. - */ -static void -list_add_event(struct perf_event *event, struct perf_event_context *ctx) -{ - WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); - event->attach_state |= PERF_ATTACH_CONTEXT; - - /* - * If we're a stand alone event or group leader, we go to the context - * list, group events are kept attached to the group so that - * perf_group_detach can, at all times, locate all siblings. - */ - if (event->group_leader == event) { - struct list_head *list; - - if (is_software_event(event)) - event->group_flags |= PERF_GROUP_SOFTWARE; - - list = ctx_group_list(event, ctx); - list_add_tail(&event->group_entry, list); - } - - if (is_cgroup_event(event)) - ctx->nr_cgroups++; - - list_add_rcu(&event->event_entry, &ctx->event_list); - if (!ctx->nr_events) - perf_pmu_rotate_start(ctx->pmu); - ctx->nr_events++; - if (event->attr.inherit_stat) - ctx->nr_stat++; -} - -/* - * Called at perf_event creation and when events are attached/detached from a - * group. - */ -static void perf_event__read_size(struct perf_event *event) -{ - int entry = sizeof(u64); /* value */ - int size = 0; - int nr = 1; - - if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - size += sizeof(u64); - - if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - size += sizeof(u64); - - if (event->attr.read_format & PERF_FORMAT_ID) - entry += sizeof(u64); - - if (event->attr.read_format & PERF_FORMAT_GROUP) { - nr += event->group_leader->nr_siblings; - size += sizeof(u64); - } - - size += entry * nr; - event->read_size = size; -} - -static void perf_event__header_size(struct perf_event *event) -{ - struct perf_sample_data *data; - u64 sample_type = event->attr.sample_type; - u16 size = 0; - - perf_event__read_size(event); - - if (sample_type & PERF_SAMPLE_IP) - size += sizeof(data->ip); - - if (sample_type & PERF_SAMPLE_ADDR) - size += sizeof(data->addr); - - if (sample_type & PERF_SAMPLE_PERIOD) - size += sizeof(data->period); - - if (sample_type & PERF_SAMPLE_READ) - size += event->read_size; - - event->header_size = size; -} - -static void perf_event__id_header_size(struct perf_event *event) -{ - struct perf_sample_data *data; - u64 sample_type = event->attr.sample_type; - u16 size = 0; - - if (sample_type & PERF_SAMPLE_TID) - size += sizeof(data->tid_entry); - - if (sample_type & PERF_SAMPLE_TIME) - size += sizeof(data->time); - - if (sample_type & PERF_SAMPLE_ID) - size += sizeof(data->id); - - if (sample_type & PERF_SAMPLE_STREAM_ID) - size += sizeof(data->stream_id); - - if (sample_type & PERF_SAMPLE_CPU) - size += sizeof(data->cpu_entry); - - event->id_header_size = size; -} - -static void perf_group_attach(struct perf_event *event) -{ - struct perf_event *group_leader = event->group_leader, *pos; - - /* - * We can have double attach due to group movement in perf_event_open. - */ - if (event->attach_state & PERF_ATTACH_GROUP) - return; - - event->attach_state |= PERF_ATTACH_GROUP; - - if (group_leader == event) - return; - - if (group_leader->group_flags & PERF_GROUP_SOFTWARE && - !is_software_event(event)) - group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; - - list_add_tail(&event->group_entry, &group_leader->sibling_list); - group_leader->nr_siblings++; - - perf_event__header_size(group_leader); - - list_for_each_entry(pos, &group_leader->sibling_list, group_entry) - perf_event__header_size(pos); -} - -/* - * Remove a event from the lists for its context. - * Must be called with ctx->mutex and ctx->lock held. - */ -static void -list_del_event(struct perf_event *event, struct perf_event_context *ctx) -{ - struct perf_cpu_context *cpuctx; - /* - * We can have double detach due to exit/hot-unplug + close. - */ - if (!(event->attach_state & PERF_ATTACH_CONTEXT)) - return; - - event->attach_state &= ~PERF_ATTACH_CONTEXT; - - if (is_cgroup_event(event)) { - ctx->nr_cgroups--; - cpuctx = __get_cpu_context(ctx); - /* - * if there are no more cgroup events - * then cler cgrp to avoid stale pointer - * in update_cgrp_time_from_cpuctx() - */ - if (!ctx->nr_cgroups) - cpuctx->cgrp = NULL; - } - - ctx->nr_events--; - if (event->attr.inherit_stat) - ctx->nr_stat--; - - list_del_rcu(&event->event_entry); - - if (event->group_leader == event) - list_del_init(&event->group_entry); - - update_group_times(event); - - /* - * If event was in error state, then keep it - * that way, otherwise bogus counts will be - * returned on read(). The only way to get out - * of error state is by explicit re-enabling - * of the event - */ - if (event->state > PERF_EVENT_STATE_OFF) - event->state = PERF_EVENT_STATE_OFF; -} - -static void perf_group_detach(struct perf_event *event) -{ - struct perf_event *sibling, *tmp; - struct list_head *list = NULL; - - /* - * We can have double detach due to exit/hot-unplug + close. - */ - if (!(event->attach_state & PERF_ATTACH_GROUP)) - return; - - event->attach_state &= ~PERF_ATTACH_GROUP; - - /* - * If this is a sibling, remove it from its group. - */ - if (event->group_leader != event) { - list_del_init(&event->group_entry); - event->group_leader->nr_siblings--; - goto out; - } - - if (!list_empty(&event->group_entry)) - list = &event->group_entry; - - /* - * If this was a group event with sibling events then - * upgrade the siblings to singleton events by adding them - * to whatever list we are on. - */ - list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { - if (list) - list_move_tail(&sibling->group_entry, list); - sibling->group_leader = sibling; - - /* Inherit group flags from the previous leader */ - sibling->group_flags = event->group_flags; - } - -out: - perf_event__header_size(event->group_leader); - - list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry) - perf_event__header_size(tmp); -} - -static inline int -event_filter_match(struct perf_event *event) -{ - return (event->cpu == -1 || event->cpu == smp_processor_id()) - && perf_cgroup_match(event); -} - -static void -event_sched_out(struct perf_event *event, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - u64 tstamp = perf_event_time(event); - u64 delta; - /* - * An event which could not be activated because of - * filter mismatch still needs to have its timings - * maintained, otherwise bogus information is return - * via read() for time_enabled, time_running: - */ - if (event->state == PERF_EVENT_STATE_INACTIVE - && !event_filter_match(event)) { - delta = tstamp - event->tstamp_stopped; - event->tstamp_running += delta; - event->tstamp_stopped = tstamp; - } - - if (event->state != PERF_EVENT_STATE_ACTIVE) - return; - - event->state = PERF_EVENT_STATE_INACTIVE; - if (event->pending_disable) { - event->pending_disable = 0; - event->state = PERF_EVENT_STATE_OFF; - } - event->tstamp_stopped = tstamp; - event->pmu->del(event, 0); - event->oncpu = -1; - - if (!is_software_event(event)) - cpuctx->active_oncpu--; - ctx->nr_active--; - if (event->attr.exclusive || !cpuctx->active_oncpu) - cpuctx->exclusive = 0; -} - -static void -group_sched_out(struct perf_event *group_event, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - struct perf_event *event; - int state = group_event->state; - - event_sched_out(group_event, cpuctx, ctx); - - /* - * Schedule out siblings (if any): - */ - list_for_each_entry(event, &group_event->sibling_list, group_entry) - event_sched_out(event, cpuctx, ctx); - - if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive) - cpuctx->exclusive = 0; -} - -/* - * Cross CPU call to remove a performance event - * - * We disable the event on the hardware level first. After that we - * remove it from the context list. - */ -static int __perf_remove_from_context(void *info) -{ - struct perf_event *event = info; - struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - - raw_spin_lock(&ctx->lock); - event_sched_out(event, cpuctx, ctx); - list_del_event(event, ctx); - raw_spin_unlock(&ctx->lock); - - return 0; -} - - -/* - * Remove the event from a task's (or a CPU's) list of events. - * - * CPU events are removed with a smp call. For task events we only - * call when the task is on a CPU. - * - * If event->ctx is a cloned context, callers must make sure that - * every task struct that event->ctx->task could possibly point to - * remains valid. This is OK when called from perf_release since - * that only calls us on the top-level context, which can't be a clone. - * When called from perf_event_exit_task, it's OK because the - * context has been detached from its task. - */ -static void perf_remove_from_context(struct perf_event *event) -{ - struct perf_event_context *ctx = event->ctx; - struct task_struct *task = ctx->task; - - lockdep_assert_held(&ctx->mutex); - - if (!task) { - /* - * Per cpu events are removed via an smp call and - * the removal is always successful. - */ - cpu_function_call(event->cpu, __perf_remove_from_context, event); - return; - } - -retry: - if (!task_function_call(task, __perf_remove_from_context, event)) - return; - - raw_spin_lock_irq(&ctx->lock); - /* - * If we failed to find a running task, but find the context active now - * that we've acquired the ctx->lock, retry. - */ - if (ctx->is_active) { - raw_spin_unlock_irq(&ctx->lock); - goto retry; - } - - /* - * Since the task isn't running, its safe to remove the event, us - * holding the ctx->lock ensures the task won't get scheduled in. - */ - list_del_event(event, ctx); - raw_spin_unlock_irq(&ctx->lock); -} - -/* - * Cross CPU call to disable a performance event - */ -static int __perf_event_disable(void *info) -{ - struct perf_event *event = info; - struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - - /* - * If this is a per-task event, need to check whether this - * event's task is the current task on this cpu. - * - * Can trigger due to concurrent perf_event_context_sched_out() - * flipping contexts around. - */ - if (ctx->task && cpuctx->task_ctx != ctx) - return -EINVAL; - - raw_spin_lock(&ctx->lock); - - /* - * If the event is on, turn it off. - * If it is in error state, leave it in error state. - */ - if (event->state >= PERF_EVENT_STATE_INACTIVE) { - update_context_time(ctx); - update_cgrp_time_from_event(event); - update_group_times(event); - if (event == event->group_leader) - group_sched_out(event, cpuctx, ctx); - else - event_sched_out(event, cpuctx, ctx); - event->state = PERF_EVENT_STATE_OFF; - } - - raw_spin_unlock(&ctx->lock); - - return 0; -} - -/* - * Disable a event. - * - * If event->ctx is a cloned context, callers must make sure that - * every task struct that event->ctx->task could possibly point to - * remains valid. This condition is satisifed when called through - * perf_event_for_each_child or perf_event_for_each because they - * hold the top-level event's child_mutex, so any descendant that - * goes to exit will block in sync_child_event. - * When called from perf_pending_event it's OK because event->ctx - * is the current context on this CPU and preemption is disabled, - * hence we can't get into perf_event_task_sched_out for this context. - */ -void perf_event_disable(struct perf_event *event) -{ - struct perf_event_context *ctx = event->ctx; - struct task_struct *task = ctx->task; - - if (!task) { - /* - * Disable the event on the cpu that it's on - */ - cpu_function_call(event->cpu, __perf_event_disable, event); - return; - } - -retry: - if (!task_function_call(task, __perf_event_disable, event)) - return; - - raw_spin_lock_irq(&ctx->lock); - /* - * If the event is still active, we need to retry the cross-call. - */ - if (event->state == PERF_EVENT_STATE_ACTIVE) { - raw_spin_unlock_irq(&ctx->lock); - /* - * Reload the task pointer, it might have been changed by - * a concurrent perf_event_context_sched_out(). - */ - task = ctx->task; - goto retry; - } - - /* - * Since we have the lock this context can't be scheduled - * in, so we can change the state safely. - */ - if (event->state == PERF_EVENT_STATE_INACTIVE) { - update_group_times(event); - event->state = PERF_EVENT_STATE_OFF; - } - raw_spin_unlock_irq(&ctx->lock); -} - -static void perf_set_shadow_time(struct perf_event *event, - struct perf_event_context *ctx, - u64 tstamp) -{ - /* - * use the correct time source for the time snapshot - * - * We could get by without this by leveraging the - * fact that to get to this function, the caller - * has most likely already called update_context_time() - * and update_cgrp_time_xx() and thus both timestamp - * are identical (or very close). Given that tstamp is, - * already adjusted for cgroup, we could say that: - * tstamp - ctx->timestamp - * is equivalent to - * tstamp - cgrp->timestamp. - * - * Then, in perf_output_read(), the calculation would - * work with no changes because: - * - event is guaranteed scheduled in - * - no scheduled out in between - * - thus the timestamp would be the same - * - * But this is a bit hairy. - * - * So instead, we have an explicit cgroup call to remain - * within the time time source all along. We believe it - * is cleaner and simpler to understand. - */ - if (is_cgroup_event(event)) - perf_cgroup_set_shadow_time(event, tstamp); - else - event->shadow_ctx_time = tstamp - ctx->timestamp; -} - -#define MAX_INTERRUPTS (~0ULL) - -static void perf_log_throttle(struct perf_event *event, int enable); - -static int -event_sched_in(struct perf_event *event, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - u64 tstamp = perf_event_time(event); - - if (event->state <= PERF_EVENT_STATE_OFF) - return 0; - - event->state = PERF_EVENT_STATE_ACTIVE; - event->oncpu = smp_processor_id(); - - /* - * Unthrottle events, since we scheduled we might have missed several - * ticks already, also for a heavily scheduling task there is little - * guarantee it'll get a tick in a timely manner. - */ - if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) { - perf_log_throttle(event, 1); - event->hw.interrupts = 0; - } - - /* - * The new state must be visible before we turn it on in the hardware: - */ - smp_wmb(); - - if (event->pmu->add(event, PERF_EF_START)) { - event->state = PERF_EVENT_STATE_INACTIVE; - event->oncpu = -1; - return -EAGAIN; - } - - event->tstamp_running += tstamp - event->tstamp_stopped; - - perf_set_shadow_time(event, ctx, tstamp); - - if (!is_software_event(event)) - cpuctx->active_oncpu++; - ctx->nr_active++; - - if (event->attr.exclusive) - cpuctx->exclusive = 1; - - return 0; -} - -static int -group_sched_in(struct perf_event *group_event, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - struct perf_event *event, *partial_group = NULL; - struct pmu *pmu = group_event->pmu; - u64 now = ctx->time; - bool simulate = false; - - if (group_event->state == PERF_EVENT_STATE_OFF) - return 0; - - pmu->start_txn(pmu); - - if (event_sched_in(group_event, cpuctx, ctx)) { - pmu->cancel_txn(pmu); - return -EAGAIN; - } - - /* - * Schedule in siblings as one group (if any): - */ - list_for_each_entry(event, &group_event->sibling_list, group_entry) { - if (event_sched_in(event, cpuctx, ctx)) { - partial_group = event; - goto group_error; - } - } - - if (!pmu->commit_txn(pmu)) - return 0; - -group_error: - /* - * Groups can be scheduled in as one unit only, so undo any - * partial group before returning: - * The events up to the failed event are scheduled out normally, - * tstamp_stopped will be updated. - * - * The failed events and the remaining siblings need to have - * their timings updated as if they had gone thru event_sched_in() - * and event_sched_out(). This is required to get consistent timings - * across the group. This also takes care of the case where the group - * could never be scheduled by ensuring tstamp_stopped is set to mark - * the time the event was actually stopped, such that time delta - * calculation in update_event_times() is correct. - */ - list_for_each_entry(event, &group_event->sibling_list, group_entry) { - if (event == partial_group) - simulate = true; - - if (simulate) { - event->tstamp_running += now - event->tstamp_stopped; - event->tstamp_stopped = now; - } else { - event_sched_out(event, cpuctx, ctx); - } - } - event_sched_out(group_event, cpuctx, ctx); - - pmu->cancel_txn(pmu); - - return -EAGAIN; -} - -/* - * Work out whether we can put this event group on the CPU now. - */ -static int group_can_go_on(struct perf_event *event, - struct perf_cpu_context *cpuctx, - int can_add_hw) -{ - /* - * Groups consisting entirely of software events can always go on. - */ - if (event->group_flags & PERF_GROUP_SOFTWARE) - return 1; - /* - * If an exclusive group is already on, no other hardware - * events can go on. - */ - if (cpuctx->exclusive) - return 0; - /* - * If this group is exclusive and there are already - * events on the CPU, it can't go on. - */ - if (event->attr.exclusive && cpuctx->active_oncpu) - return 0; - /* - * Otherwise, try to add it if all previous groups were able - * to go on. - */ - return can_add_hw; -} - -static void add_event_to_ctx(struct perf_event *event, - struct perf_event_context *ctx) -{ - u64 tstamp = perf_event_time(event); - - list_add_event(event, ctx); - perf_group_attach(event); - event->tstamp_enabled = tstamp; - event->tstamp_running = tstamp; - event->tstamp_stopped = tstamp; -} - -static void perf_event_context_sched_in(struct perf_event_context *ctx, - struct task_struct *tsk); - -/* - * Cross CPU call to install and enable a performance event - * - * Must be called with ctx->mutex held - */ -static int __perf_install_in_context(void *info) -{ - struct perf_event *event = info; - struct perf_event_context *ctx = event->ctx; - struct perf_event *leader = event->group_leader; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - int err; - - /* - * In case we're installing a new context to an already running task, - * could also happen before perf_event_task_sched_in() on architectures - * which do context switches with IRQs enabled. - */ - if (ctx->task && !cpuctx->task_ctx) - perf_event_context_sched_in(ctx, ctx->task); - - raw_spin_lock(&ctx->lock); - ctx->is_active = 1; - update_context_time(ctx); - /* - * update cgrp time only if current cgrp - * matches event->cgrp. Must be done before - * calling add_event_to_ctx() - */ - update_cgrp_time_from_event(event); - - add_event_to_ctx(event, ctx); - - if (!event_filter_match(event)) - goto unlock; - - /* - * Don't put the event on if it is disabled or if - * it is in a group and the group isn't on. - */ - if (event->state != PERF_EVENT_STATE_INACTIVE || - (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)) - goto unlock; - - /* - * An exclusive event can't go on if there are already active - * hardware events, and no hardware event can go on if there - * is already an exclusive event on. - */ - if (!group_can_go_on(event, cpuctx, 1)) - err = -EEXIST; - else - err = event_sched_in(event, cpuctx, ctx); - - if (err) { - /* - * This event couldn't go on. If it is in a group - * then we have to pull the whole group off. - * If the event group is pinned then put it in error state. - */ - if (leader != event) - group_sched_out(leader, cpuctx, ctx); - if (leader->attr.pinned) { - update_group_times(leader); - leader->state = PERF_EVENT_STATE_ERROR; - } - } - -unlock: - raw_spin_unlock(&ctx->lock); - - return 0; -} - -/* - * Attach a performance event to a context - * - * First we add the event to the list with the hardware enable bit - * in event->hw_config cleared. - * - * If the event is attached to a task which is on a CPU we use a smp - * call to enable it in the task context. The task might have been - * scheduled away, but we check this in the smp call again. - */ -static void -perf_install_in_context(struct perf_event_context *ctx, - struct perf_event *event, - int cpu) -{ - struct task_struct *task = ctx->task; - - lockdep_assert_held(&ctx->mutex); - - event->ctx = ctx; - - if (!task) { - /* - * Per cpu events are installed via an smp call and - * the install is always successful. - */ - cpu_function_call(cpu, __perf_install_in_context, event); - return; - } - -retry: - if (!task_function_call(task, __perf_install_in_context, event)) - return; - - raw_spin_lock_irq(&ctx->lock); - /* - * If we failed to find a running task, but find the context active now - * that we've acquired the ctx->lock, retry. - */ - if (ctx->is_active) { - raw_spin_unlock_irq(&ctx->lock); - goto retry; - } - - /* - * Since the task isn't running, its safe to add the event, us holding - * the ctx->lock ensures the task won't get scheduled in. - */ - add_event_to_ctx(event, ctx); - raw_spin_unlock_irq(&ctx->lock); -} - -/* - * Put a event into inactive state and update time fields. - * Enabling the leader of a group effectively enables all - * the group members that aren't explicitly disabled, so we - * have to update their ->tstamp_enabled also. - * Note: this works for group members as well as group leaders - * since the non-leader members' sibling_lists will be empty. - */ -static void __perf_event_mark_enabled(struct perf_event *event, - struct perf_event_context *ctx) -{ - struct perf_event *sub; - u64 tstamp = perf_event_time(event); - - event->state = PERF_EVENT_STATE_INACTIVE; - event->tstamp_enabled = tstamp - event->total_time_enabled; - list_for_each_entry(sub, &event->sibling_list, group_entry) { - if (sub->state >= PERF_EVENT_STATE_INACTIVE) - sub->tstamp_enabled = tstamp - sub->total_time_enabled; - } -} - -/* - * Cross CPU call to enable a performance event - */ -static int __perf_event_enable(void *info) -{ - struct perf_event *event = info; - struct perf_event_context *ctx = event->ctx; - struct perf_event *leader = event->group_leader; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - int err; - - if (WARN_ON_ONCE(!ctx->is_active)) - return -EINVAL; - - raw_spin_lock(&ctx->lock); - update_context_time(ctx); - - if (event->state >= PERF_EVENT_STATE_INACTIVE) - goto unlock; - - /* - * set current task's cgroup time reference point - */ - perf_cgroup_set_timestamp(current, ctx); - - __perf_event_mark_enabled(event, ctx); - - if (!event_filter_match(event)) { - if (is_cgroup_event(event)) - perf_cgroup_defer_enabled(event); - goto unlock; - } - - /* - * If the event is in a group and isn't the group leader, - * then don't put it on unless the group is on. - */ - if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) - goto unlock; - - if (!group_can_go_on(event, cpuctx, 1)) { - err = -EEXIST; - } else { - if (event == leader) - err = group_sched_in(event, cpuctx, ctx); - else - err = event_sched_in(event, cpuctx, ctx); - } - - if (err) { - /* - * If this event can't go on and it's part of a - * group, then the whole group has to come off. - */ - if (leader != event) - group_sched_out(leader, cpuctx, ctx); - if (leader->attr.pinned) { - update_group_times(leader); - leader->state = PERF_EVENT_STATE_ERROR; - } - } - -unlock: - raw_spin_unlock(&ctx->lock); - - return 0; -} - -/* - * Enable a event. - * - * If event->ctx is a cloned context, callers must make sure that - * every task struct that event->ctx->task could possibly point to - * remains valid. This condition is satisfied when called through - * perf_event_for_each_child or perf_event_for_each as described - * for perf_event_disable. - */ -void perf_event_enable(struct perf_event *event) -{ - struct perf_event_context *ctx = event->ctx; - struct task_struct *task = ctx->task; - - if (!task) { - /* - * Enable the event on the cpu that it's on - */ - cpu_function_call(event->cpu, __perf_event_enable, event); - return; - } - - raw_spin_lock_irq(&ctx->lock); - if (event->state >= PERF_EVENT_STATE_INACTIVE) - goto out; - - /* - * If the event is in error state, clear that first. - * That way, if we see the event in error state below, we - * know that it has gone back into error state, as distinct - * from the task having been scheduled away before the - * cross-call arrived. - */ - if (event->state == PERF_EVENT_STATE_ERROR) - event->state = PERF_EVENT_STATE_OFF; - -retry: - if (!ctx->is_active) { - __perf_event_mark_enabled(event, ctx); - goto out; - } - - raw_spin_unlock_irq(&ctx->lock); - - if (!task_function_call(task, __perf_event_enable, event)) - return; - - raw_spin_lock_irq(&ctx->lock); - - /* - * If the context is active and the event is still off, - * we need to retry the cross-call. - */ - if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) { - /* - * task could have been flipped by a concurrent - * perf_event_context_sched_out() - */ - task = ctx->task; - goto retry; - } - -out: - raw_spin_unlock_irq(&ctx->lock); -} - -static int perf_event_refresh(struct perf_event *event, int refresh) -{ - /* - * not supported on inherited events - */ - if (event->attr.inherit || !is_sampling_event(event)) - return -EINVAL; - - atomic_add(refresh, &event->event_limit); - perf_event_enable(event); - - return 0; -} - -static void ctx_sched_out(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx, - enum event_type_t event_type) -{ - struct perf_event *event; - - raw_spin_lock(&ctx->lock); - perf_pmu_disable(ctx->pmu); - ctx->is_active = 0; - if (likely(!ctx->nr_events)) - goto out; - update_context_time(ctx); - update_cgrp_time_from_cpuctx(cpuctx); - - if (!ctx->nr_active) - goto out; - - if (event_type & EVENT_PINNED) { - list_for_each_entry(event, &ctx->pinned_groups, group_entry) - group_sched_out(event, cpuctx, ctx); - } - - if (event_type & EVENT_FLEXIBLE) { - list_for_each_entry(event, &ctx->flexible_groups, group_entry) - group_sched_out(event, cpuctx, ctx); - } -out: - perf_pmu_enable(ctx->pmu); - raw_spin_unlock(&ctx->lock); -} - -/* - * Test whether two contexts are equivalent, i.e. whether they - * have both been cloned from the same version of the same context - * and they both have the same number of enabled events. - * If the number of enabled events is the same, then the set - * of enabled events should be the same, because these are both - * inherited contexts, therefore we can't access individual events - * in them directly with an fd; we can only enable/disable all - * events via prctl, or enable/disable all events in a family - * via ioctl, which will have the same effect on both contexts. - */ -static int context_equiv(struct perf_event_context *ctx1, - struct perf_event_context *ctx2) -{ - return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx - && ctx1->parent_gen == ctx2->parent_gen - && !ctx1->pin_count && !ctx2->pin_count; -} - -static void __perf_event_sync_stat(struct perf_event *event, - struct perf_event *next_event) -{ - u64 value; - - if (!event->attr.inherit_stat) - return; - - /* - * Update the event value, we cannot use perf_event_read() - * because we're in the middle of a context switch and have IRQs - * disabled, which upsets smp_call_function_single(), however - * we know the event must be on the current CPU, therefore we - * don't need to use it. - */ - switch (event->state) { - case PERF_EVENT_STATE_ACTIVE: - event->pmu->read(event); - /* fall-through */ - - case PERF_EVENT_STATE_INACTIVE: - update_event_times(event); - break; - - default: - break; - } - - /* - * In order to keep per-task stats reliable we need to flip the event - * values when we flip the contexts. - */ - value = local64_read(&next_event->count); - value = local64_xchg(&event->count, value); - local64_set(&next_event->count, value); - - swap(event->total_time_enabled, next_event->total_time_enabled); - swap(event->total_time_running, next_event->total_time_running); - - /* - * Since we swizzled the values, update the user visible data too. - */ - perf_event_update_userpage(event); - perf_event_update_userpage(next_event); -} - -#define list_next_entry(pos, member) \ - list_entry(pos->member.next, typeof(*pos), member) - -static void perf_event_sync_stat(struct perf_event_context *ctx, - struct perf_event_context *next_ctx) -{ - struct perf_event *event, *next_event; - - if (!ctx->nr_stat) - return; - - update_context_time(ctx); - - event = list_first_entry(&ctx->event_list, - struct perf_event, event_entry); - - next_event = list_first_entry(&next_ctx->event_list, - struct perf_event, event_entry); - - while (&event->event_entry != &ctx->event_list && - &next_event->event_entry != &next_ctx->event_list) { - - __perf_event_sync_stat(event, next_event); - - event = list_next_entry(event, event_entry); - next_event = list_next_entry(next_event, event_entry); - } -} - -static void perf_event_context_sched_out(struct task_struct *task, int ctxn, - struct task_struct *next) -{ - struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; - struct perf_event_context *next_ctx; - struct perf_event_context *parent; - struct perf_cpu_context *cpuctx; - int do_switch = 1; - - if (likely(!ctx)) - return; - - cpuctx = __get_cpu_context(ctx); - if (!cpuctx->task_ctx) - return; - - rcu_read_lock(); - parent = rcu_dereference(ctx->parent_ctx); - next_ctx = next->perf_event_ctxp[ctxn]; - if (parent && next_ctx && - rcu_dereference(next_ctx->parent_ctx) == parent) { - /* - * Looks like the two contexts are clones, so we might be - * able to optimize the context switch. We lock both - * contexts and check that they are clones under the - * lock (including re-checking that neither has been - * uncloned in the meantime). It doesn't matter which - * order we take the locks because no other cpu could - * be trying to lock both of these tasks. - */ - raw_spin_lock(&ctx->lock); - raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); - if (context_equiv(ctx, next_ctx)) { - /* - * XXX do we need a memory barrier of sorts - * wrt to rcu_dereference() of perf_event_ctxp - */ - task->perf_event_ctxp[ctxn] = next_ctx; - next->perf_event_ctxp[ctxn] = ctx; - ctx->task = next; - next_ctx->task = task; - do_switch = 0; - - perf_event_sync_stat(ctx, next_ctx); - } - raw_spin_unlock(&next_ctx->lock); - raw_spin_unlock(&ctx->lock); - } - rcu_read_unlock(); - - if (do_switch) { - ctx_sched_out(ctx, cpuctx, EVENT_ALL); - cpuctx->task_ctx = NULL; - } -} - -#define for_each_task_context_nr(ctxn) \ - for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) - -/* - * Called from scheduler to remove the events of the current task, - * with interrupts disabled. - * - * We stop each event and update the event value in event->count. - * - * This does not protect us against NMI, but disable() - * sets the disabled bit in the control field of event _before_ - * accessing the event control register. If a NMI hits, then it will - * not restart the event. - */ -void __perf_event_task_sched_out(struct task_struct *task, - struct task_struct *next) -{ - int ctxn; - - for_each_task_context_nr(ctxn) - perf_event_context_sched_out(task, ctxn, next); - - /* - * if cgroup events exist on this CPU, then we need - * to check if we have to switch out PMU state. - * cgroup event are system-wide mode only - */ - if (atomic_read(&__get_cpu_var(perf_cgroup_events))) - perf_cgroup_sched_out(task); -} - -static void task_ctx_sched_out(struct perf_event_context *ctx, - enum event_type_t event_type) -{ - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - - if (!cpuctx->task_ctx) - return; - - if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) - return; - - ctx_sched_out(ctx, cpuctx, event_type); - cpuctx->task_ctx = NULL; -} - -/* - * Called with IRQs disabled - */ -static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, - enum event_type_t event_type) -{ - ctx_sched_out(&cpuctx->ctx, cpuctx, event_type); -} - -static void -ctx_pinned_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx) -{ - struct perf_event *event; - - list_for_each_entry(event, &ctx->pinned_groups, group_entry) { - if (event->state <= PERF_EVENT_STATE_OFF) - continue; - if (!event_filter_match(event)) - continue; - - /* may need to reset tstamp_enabled */ - if (is_cgroup_event(event)) - perf_cgroup_mark_enabled(event, ctx); - - if (group_can_go_on(event, cpuctx, 1)) - group_sched_in(event, cpuctx, ctx); - - /* - * If this pinned group hasn't been scheduled, - * put it in error state. - */ - if (event->state == PERF_EVENT_STATE_INACTIVE) { - update_group_times(event); - event->state = PERF_EVENT_STATE_ERROR; - } - } -} - -static void -ctx_flexible_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx) -{ - struct perf_event *event; - int can_add_hw = 1; - - list_for_each_entry(event, &ctx->flexible_groups, group_entry) { - /* Ignore events in OFF or ERROR state */ - if (event->state <= PERF_EVENT_STATE_OFF) - continue; - /* - * Listen to the 'cpu' scheduling filter constraint - * of events: - */ - if (!event_filter_match(event)) - continue; - - /* may need to reset tstamp_enabled */ - if (is_cgroup_event(event)) - perf_cgroup_mark_enabled(event, ctx); - - if (group_can_go_on(event, cpuctx, can_add_hw)) { - if (group_sched_in(event, cpuctx, ctx)) - can_add_hw = 0; - } - } -} - -static void -ctx_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx, - enum event_type_t event_type, - struct task_struct *task) -{ - u64 now; - - raw_spin_lock(&ctx->lock); - ctx->is_active = 1; - if (likely(!ctx->nr_events)) - goto out; - - now = perf_clock(); - ctx->timestamp = now; - perf_cgroup_set_timestamp(task, ctx); - /* - * First go through the list and put on any pinned groups - * in order to give them the best chance of going on. - */ - if (event_type & EVENT_PINNED) - ctx_pinned_sched_in(ctx, cpuctx); - - /* Then walk through the lower prio flexible groups */ - if (event_type & EVENT_FLEXIBLE) - ctx_flexible_sched_in(ctx, cpuctx); - -out: - raw_spin_unlock(&ctx->lock); -} - -static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, - enum event_type_t event_type, - struct task_struct *task) -{ - struct perf_event_context *ctx = &cpuctx->ctx; - - ctx_sched_in(ctx, cpuctx, event_type, task); -} - -static void task_ctx_sched_in(struct perf_event_context *ctx, - enum event_type_t event_type) -{ - struct perf_cpu_context *cpuctx; - - cpuctx = __get_cpu_context(ctx); - if (cpuctx->task_ctx == ctx) - return; - - ctx_sched_in(ctx, cpuctx, event_type, NULL); - cpuctx->task_ctx = ctx; -} - -static void perf_event_context_sched_in(struct perf_event_context *ctx, - struct task_struct *task) -{ - struct perf_cpu_context *cpuctx; - - cpuctx = __get_cpu_context(ctx); - if (cpuctx->task_ctx == ctx) - return; - - perf_pmu_disable(ctx->pmu); - /* - * We want to keep the following priority order: - * cpu pinned (that don't need to move), task pinned, - * cpu flexible, task flexible. - */ - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - - ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); - cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task); - ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); - - cpuctx->task_ctx = ctx; - - /* - * Since these rotations are per-cpu, we need to ensure the - * cpu-context we got scheduled on is actually rotating. - */ - perf_pmu_rotate_start(ctx->pmu); - perf_pmu_enable(ctx->pmu); -} - -/* - * Called from scheduler to add the events of the current task - * with interrupts disabled. - * - * We restore the event value and then enable it. - * - * This does not protect us against NMI, but enable() - * sets the enabled bit in the control field of event _before_ - * accessing the event control register. If a NMI hits, then it will - * keep the event running. - */ -void __perf_event_task_sched_in(struct task_struct *task) -{ - struct perf_event_context *ctx; - int ctxn; - - for_each_task_context_nr(ctxn) { - ctx = task->perf_event_ctxp[ctxn]; - if (likely(!ctx)) - continue; - - perf_event_context_sched_in(ctx, task); - } - /* - * if cgroup events exist on this CPU, then we need - * to check if we have to switch in PMU state. - * cgroup event are system-wide mode only - */ - if (atomic_read(&__get_cpu_var(perf_cgroup_events))) - perf_cgroup_sched_in(task); -} - -static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) -{ - u64 frequency = event->attr.sample_freq; - u64 sec = NSEC_PER_SEC; - u64 divisor, dividend; - - int count_fls, nsec_fls, frequency_fls, sec_fls; - - count_fls = fls64(count); - nsec_fls = fls64(nsec); - frequency_fls = fls64(frequency); - sec_fls = 30; - - /* - * We got @count in @nsec, with a target of sample_freq HZ - * the target period becomes: - * - * @count * 10^9 - * period = ------------------- - * @nsec * sample_freq - * - */ - - /* - * Reduce accuracy by one bit such that @a and @b converge - * to a similar magnitude. - */ -#define REDUCE_FLS(a, b) \ -do { \ - if (a##_fls > b##_fls) { \ - a >>= 1; \ - a##_fls--; \ - } else { \ - b >>= 1; \ - b##_fls--; \ - } \ -} while (0) - - /* - * Reduce accuracy until either term fits in a u64, then proceed with - * the other, so that finally we can do a u64/u64 division. - */ - while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) { - REDUCE_FLS(nsec, frequency); - REDUCE_FLS(sec, count); - } - - if (count_fls + sec_fls > 64) { - divisor = nsec * frequency; - - while (count_fls + sec_fls > 64) { - REDUCE_FLS(count, sec); - divisor >>= 1; - } - - dividend = count * sec; - } else { - dividend = count * sec; - - while (nsec_fls + frequency_fls > 64) { - REDUCE_FLS(nsec, frequency); - dividend >>= 1; - } - - divisor = nsec * frequency; - } - - if (!divisor) - return dividend; - - return div64_u64(dividend, divisor); -} - -static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) -{ - struct hw_perf_event *hwc = &event->hw; - s64 period, sample_period; - s64 delta; - - period = perf_calculate_period(event, nsec, count); - - delta = (s64)(period - hwc->sample_period); - delta = (delta + 7) / 8; /* low pass filter */ - - sample_period = hwc->sample_period + delta; - - if (!sample_period) - sample_period = 1; - - hwc->sample_period = sample_period; - - if (local64_read(&hwc->period_left) > 8*sample_period) { - event->pmu->stop(event, PERF_EF_UPDATE); - local64_set(&hwc->period_left, 0); - event->pmu->start(event, PERF_EF_RELOAD); - } -} - -static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) -{ - struct perf_event *event; - struct hw_perf_event *hwc; - u64 interrupts, now; - s64 delta; - - raw_spin_lock(&ctx->lock); - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (event->state != PERF_EVENT_STATE_ACTIVE) - continue; - - if (!event_filter_match(event)) - continue; - - hwc = &event->hw; - - interrupts = hwc->interrupts; - hwc->interrupts = 0; - - /* - * unthrottle events on the tick - */ - if (interrupts == MAX_INTERRUPTS) { - perf_log_throttle(event, 1); - event->pmu->start(event, 0); - } - - if (!event->attr.freq || !event->attr.sample_freq) - continue; - - event->pmu->read(event); - now = local64_read(&event->count); - delta = now - hwc->freq_count_stamp; - hwc->freq_count_stamp = now; - - if (delta > 0) - perf_adjust_period(event, period, delta); - } - raw_spin_unlock(&ctx->lock); -} - -/* - * Round-robin a context's events: - */ -static void rotate_ctx(struct perf_event_context *ctx) -{ - raw_spin_lock(&ctx->lock); - - /* - * Rotate the first entry last of non-pinned groups. Rotation might be - * disabled by the inheritance code. - */ - if (!ctx->rotate_disable) - list_rotate_left(&ctx->flexible_groups); - - raw_spin_unlock(&ctx->lock); -} - -/* - * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized - * because they're strictly cpu affine and rotate_start is called with IRQs - * disabled, while rotate_context is called from IRQ context. - */ -static void perf_rotate_context(struct perf_cpu_context *cpuctx) -{ - u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC; - struct perf_event_context *ctx = NULL; - int rotate = 0, remove = 1; - - if (cpuctx->ctx.nr_events) { - remove = 0; - if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) - rotate = 1; - } - - ctx = cpuctx->task_ctx; - if (ctx && ctx->nr_events) { - remove = 0; - if (ctx->nr_events != ctx->nr_active) - rotate = 1; - } - - perf_pmu_disable(cpuctx->ctx.pmu); - perf_ctx_adjust_freq(&cpuctx->ctx, interval); - if (ctx) - perf_ctx_adjust_freq(ctx, interval); - - if (!rotate) - goto done; - - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - if (ctx) - task_ctx_sched_out(ctx, EVENT_FLEXIBLE); - - rotate_ctx(&cpuctx->ctx); - if (ctx) - rotate_ctx(ctx); - - cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current); - if (ctx) - task_ctx_sched_in(ctx, EVENT_FLEXIBLE); - -done: - if (remove) - list_del_init(&cpuctx->rotation_list); - - perf_pmu_enable(cpuctx->ctx.pmu); -} - -void perf_event_task_tick(void) -{ - struct list_head *head = &__get_cpu_var(rotation_list); - struct perf_cpu_context *cpuctx, *tmp; - - WARN_ON(!irqs_disabled()); - - list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) { - if (cpuctx->jiffies_interval == 1 || - !(jiffies % cpuctx->jiffies_interval)) - perf_rotate_context(cpuctx); - } -} - -static int event_enable_on_exec(struct perf_event *event, - struct perf_event_context *ctx) -{ - if (!event->attr.enable_on_exec) - return 0; - - event->attr.enable_on_exec = 0; - if (event->state >= PERF_EVENT_STATE_INACTIVE) - return 0; - - __perf_event_mark_enabled(event, ctx); - - return 1; -} - -/* - * Enable all of a task's events that have been marked enable-on-exec. - * This expects task == current. - */ -static void perf_event_enable_on_exec(struct perf_event_context *ctx) -{ - struct perf_event *event; - unsigned long flags; - int enabled = 0; - int ret; - - local_irq_save(flags); - if (!ctx || !ctx->nr_events) - goto out; - - /* - * We must ctxsw out cgroup events to avoid conflict - * when invoking perf_task_event_sched_in() later on - * in this function. Otherwise we end up trying to - * ctxswin cgroup events which are already scheduled - * in. - */ - perf_cgroup_sched_out(current); - task_ctx_sched_out(ctx, EVENT_ALL); - - raw_spin_lock(&ctx->lock); - - list_for_each_entry(event, &ctx->pinned_groups, group_entry) { - ret = event_enable_on_exec(event, ctx); - if (ret) - enabled = 1; - } - - list_for_each_entry(event, &ctx->flexible_groups, group_entry) { - ret = event_enable_on_exec(event, ctx); - if (ret) - enabled = 1; - } - - /* - * Unclone this context if we enabled any event. - */ - if (enabled) - unclone_ctx(ctx); - - raw_spin_unlock(&ctx->lock); - - /* - * Also calls ctxswin for cgroup events, if any: - */ - perf_event_context_sched_in(ctx, ctx->task); -out: - local_irq_restore(flags); -} - -/* - * Cross CPU call to read the hardware event - */ -static void __perf_event_read(void *info) -{ - struct perf_event *event = info; - struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - - /* - * If this is a task context, we need to check whether it is - * the current task context of this cpu. If not it has been - * scheduled out before the smp call arrived. In that case - * event->count would have been updated to a recent sample - * when the event was scheduled out. - */ - if (ctx->task && cpuctx->task_ctx != ctx) - return; - - raw_spin_lock(&ctx->lock); - if (ctx->is_active) { - update_context_time(ctx); - update_cgrp_time_from_event(event); - } - update_event_times(event); - if (event->state == PERF_EVENT_STATE_ACTIVE) - event->pmu->read(event); - raw_spin_unlock(&ctx->lock); -} - -static inline u64 perf_event_count(struct perf_event *event) -{ - return local64_read(&event->count) + atomic64_read(&event->child_count); -} - -static u64 perf_event_read(struct perf_event *event) -{ - /* - * If event is enabled and currently active on a CPU, update the - * value in the event structure: - */ - if (event->state == PERF_EVENT_STATE_ACTIVE) { - smp_call_function_single(event->oncpu, - __perf_event_read, event, 1); - } else if (event->state == PERF_EVENT_STATE_INACTIVE) { - struct perf_event_context *ctx = event->ctx; - unsigned long flags; - - raw_spin_lock_irqsave(&ctx->lock, flags); - /* - * may read while context is not active - * (e.g., thread is blocked), in that case - * we cannot update context time - */ - if (ctx->is_active) { - update_context_time(ctx); - update_cgrp_time_from_event(event); - } - update_event_times(event); - raw_spin_unlock_irqrestore(&ctx->lock, flags); - } - - return perf_event_count(event); -} - -/* - * Callchain support - */ - -struct callchain_cpus_entries { - struct rcu_head rcu_head; - struct perf_callchain_entry *cpu_entries[0]; -}; - -static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); -static atomic_t nr_callchain_events; -static DEFINE_MUTEX(callchain_mutex); -struct callchain_cpus_entries *callchain_cpus_entries; - - -__weak void perf_callchain_kernel(struct perf_callchain_entry *entry, - struct pt_regs *regs) -{ -} - -__weak void perf_callchain_user(struct perf_callchain_entry *entry, - struct pt_regs *regs) -{ -} - -static void release_callchain_buffers_rcu(struct rcu_head *head) -{ - struct callchain_cpus_entries *entries; - int cpu; - - entries = container_of(head, struct callchain_cpus_entries, rcu_head); - - for_each_possible_cpu(cpu) - kfree(entries->cpu_entries[cpu]); - - kfree(entries); -} - -static void release_callchain_buffers(void) -{ - struct callchain_cpus_entries *entries; - - entries = callchain_cpus_entries; - rcu_assign_pointer(callchain_cpus_entries, NULL); - call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); -} - -static int alloc_callchain_buffers(void) -{ - int cpu; - int size; - struct callchain_cpus_entries *entries; - - /* - * We can't use the percpu allocation API for data that can be - * accessed from NMI. Use a temporary manual per cpu allocation - * until that gets sorted out. - */ - size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]); - - entries = kzalloc(size, GFP_KERNEL); - if (!entries) - return -ENOMEM; - - size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS; - - for_each_possible_cpu(cpu) { - entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, - cpu_to_node(cpu)); - if (!entries->cpu_entries[cpu]) - goto fail; - } - - rcu_assign_pointer(callchain_cpus_entries, entries); - - return 0; - -fail: - for_each_possible_cpu(cpu) - kfree(entries->cpu_entries[cpu]); - kfree(entries); - - return -ENOMEM; -} - -static int get_callchain_buffers(void) -{ - int err = 0; - int count; - - mutex_lock(&callchain_mutex); - - count = atomic_inc_return(&nr_callchain_events); - if (WARN_ON_ONCE(count < 1)) { - err = -EINVAL; - goto exit; - } - - if (count > 1) { - /* If the allocation failed, give up */ - if (!callchain_cpus_entries) - err = -ENOMEM; - goto exit; - } - - err = alloc_callchain_buffers(); - if (err) - release_callchain_buffers(); -exit: - mutex_unlock(&callchain_mutex); - - return err; -} - -static void put_callchain_buffers(void) -{ - if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) { - release_callchain_buffers(); - mutex_unlock(&callchain_mutex); - } -} - -static int get_recursion_context(int *recursion) -{ - int rctx; - - if (in_nmi()) - rctx = 3; - else if (in_irq()) - rctx = 2; - else if (in_softirq()) - rctx = 1; - else - rctx = 0; - - if (recursion[rctx]) - return -1; - - recursion[rctx]++; - barrier(); - - return rctx; -} - -static inline void put_recursion_context(int *recursion, int rctx) -{ - barrier(); - recursion[rctx]--; -} - -static struct perf_callchain_entry *get_callchain_entry(int *rctx) -{ - int cpu; - struct callchain_cpus_entries *entries; - - *rctx = get_recursion_context(__get_cpu_var(callchain_recursion)); - if (*rctx == -1) - return NULL; - - entries = rcu_dereference(callchain_cpus_entries); - if (!entries) - return NULL; - - cpu = smp_processor_id(); - - return &entries->cpu_entries[cpu][*rctx]; -} - -static void -put_callchain_entry(int rctx) -{ - put_recursion_context(__get_cpu_var(callchain_recursion), rctx); -} - -static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) -{ - int rctx; - struct perf_callchain_entry *entry; - - - entry = get_callchain_entry(&rctx); - if (rctx == -1) - return NULL; - - if (!entry) - goto exit_put; - - entry->nr = 0; - - if (!user_mode(regs)) { - perf_callchain_store(entry, PERF_CONTEXT_KERNEL); - perf_callchain_kernel(entry, regs); - if (current->mm) - regs = task_pt_regs(current); - else - regs = NULL; - } - - if (regs) { - perf_callchain_store(entry, PERF_CONTEXT_USER); - perf_callchain_user(entry, regs); - } - -exit_put: - put_callchain_entry(rctx); - - return entry; -} - -/* - * Initialize the perf_event context in a task_struct: - */ -static void __perf_event_init_context(struct perf_event_context *ctx) -{ - raw_spin_lock_init(&ctx->lock); - mutex_init(&ctx->mutex); - INIT_LIST_HEAD(&ctx->pinned_groups); - INIT_LIST_HEAD(&ctx->flexible_groups); - INIT_LIST_HEAD(&ctx->event_list); - atomic_set(&ctx->refcount, 1); -} - -static struct perf_event_context * -alloc_perf_context(struct pmu *pmu, struct task_struct *task) -{ - struct perf_event_context *ctx; - - ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); - if (!ctx) - return NULL; - - __perf_event_init_context(ctx); - if (task) { - ctx->task = task; - get_task_struct(task); - } - ctx->pmu = pmu; - - return ctx; -} - -static struct task_struct * -find_lively_task_by_vpid(pid_t vpid) -{ - struct task_struct *task; - int err; - - rcu_read_lock(); - if (!vpid) - task = current; - else - task = find_task_by_vpid(vpid); - if (task) - get_task_struct(task); - rcu_read_unlock(); - - if (!task) - return ERR_PTR(-ESRCH); - - /* Reuse ptrace permission checks for now. */ - err = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - goto errout; - - return task; -errout: - put_task_struct(task); - return ERR_PTR(err); - -} - -/* - * Returns a matching context with refcount and pincount. - */ -static struct perf_event_context * -find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) -{ - struct perf_event_context *ctx; - struct perf_cpu_context *cpuctx; - unsigned long flags; - int ctxn, err; - - if (!task) { - /* Must be root to operate on a CPU event: */ - if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) - return ERR_PTR(-EACCES); - - /* - * We could be clever and allow to attach a event to an - * offline CPU and activate it when the CPU comes up, but - * that's for later. - */ - if (!cpu_online(cpu)) - return ERR_PTR(-ENODEV); - - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - ctx = &cpuctx->ctx; - get_ctx(ctx); - ++ctx->pin_count; - - return ctx; - } - - err = -EINVAL; - ctxn = pmu->task_ctx_nr; - if (ctxn < 0) - goto errout; - -retry: - ctx = perf_lock_task_context(task, ctxn, &flags); - if (ctx) { - unclone_ctx(ctx); - ++ctx->pin_count; - raw_spin_unlock_irqrestore(&ctx->lock, flags); - } - - if (!ctx) { - ctx = alloc_perf_context(pmu, task); - err = -ENOMEM; - if (!ctx) - goto errout; - - get_ctx(ctx); - - err = 0; - mutex_lock(&task->perf_event_mutex); - /* - * If it has already passed perf_event_exit_task(). - * we must see PF_EXITING, it takes this mutex too. - */ - if (task->flags & PF_EXITING) - err = -ESRCH; - else if (task->perf_event_ctxp[ctxn]) - err = -EAGAIN; - else { - ++ctx->pin_count; - rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); - } - mutex_unlock(&task->perf_event_mutex); - - if (unlikely(err)) { - put_task_struct(task); - kfree(ctx); - - if (err == -EAGAIN) - goto retry; - goto errout; - } - } - - return ctx; - -errout: - return ERR_PTR(err); -} - -static void perf_event_free_filter(struct perf_event *event); - -static void free_event_rcu(struct rcu_head *head) -{ - struct perf_event *event; - - event = container_of(head, struct perf_event, rcu_head); - if (event->ns) - put_pid_ns(event->ns); - perf_event_free_filter(event); - kfree(event); -} - -static void perf_buffer_put(struct perf_buffer *buffer); - -static void free_event(struct perf_event *event) -{ - irq_work_sync(&event->pending); - - if (!event->parent) { - if (event->attach_state & PERF_ATTACH_TASK) - jump_label_dec(&perf_sched_events); - if (event->attr.mmap || event->attr.mmap_data) - atomic_dec(&nr_mmap_events); - if (event->attr.comm) - atomic_dec(&nr_comm_events); - if (event->attr.task) - atomic_dec(&nr_task_events); - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) - put_callchain_buffers(); - if (is_cgroup_event(event)) { - atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); - jump_label_dec(&perf_sched_events); - } - } - - if (event->buffer) { - perf_buffer_put(event->buffer); - event->buffer = NULL; - } - - if (is_cgroup_event(event)) - perf_detach_cgroup(event); - - if (event->destroy) - event->destroy(event); - - if (event->ctx) - put_ctx(event->ctx); - - call_rcu(&event->rcu_head, free_event_rcu); -} - -int perf_event_release_kernel(struct perf_event *event) -{ - struct perf_event_context *ctx = event->ctx; - - /* - * Remove from the PMU, can't get re-enabled since we got - * here because the last ref went. - */ - perf_event_disable(event); - - WARN_ON_ONCE(ctx->parent_ctx); - /* - * There are two ways this annotation is useful: - * - * 1) there is a lock recursion from perf_event_exit_task - * see the comment there. - * - * 2) there is a lock-inversion with mmap_sem through - * perf_event_read_group(), which takes faults while - * holding ctx->mutex, however this is called after - * the last filedesc died, so there is no possibility - * to trigger the AB-BA case. - */ - mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); - raw_spin_lock_irq(&ctx->lock); - perf_group_detach(event); - list_del_event(event, ctx); - raw_spin_unlock_irq(&ctx->lock); - mutex_unlock(&ctx->mutex); - - free_event(event); - - return 0; -} -EXPORT_SYMBOL_GPL(perf_event_release_kernel); - -/* - * Called when the last reference to the file is gone. - */ -static int perf_release(struct inode *inode, struct file *file) -{ - struct perf_event *event = file->private_data; - struct task_struct *owner; - - file->private_data = NULL; - - rcu_read_lock(); - owner = ACCESS_ONCE(event->owner); - /* - * Matches the smp_wmb() in perf_event_exit_task(). If we observe - * !owner it means the list deletion is complete and we can indeed - * free this event, otherwise we need to serialize on - * owner->perf_event_mutex. - */ - smp_read_barrier_depends(); - if (owner) { - /* - * Since delayed_put_task_struct() also drops the last - * task reference we can safely take a new reference - * while holding the rcu_read_lock(). - */ - get_task_struct(owner); - } - rcu_read_unlock(); - - if (owner) { - mutex_lock(&owner->perf_event_mutex); - /* - * We have to re-check the event->owner field, if it is cleared - * we raced with perf_event_exit_task(), acquiring the mutex - * ensured they're done, and we can proceed with freeing the - * event. - */ - if (event->owner) - list_del_init(&event->owner_entry); - mutex_unlock(&owner->perf_event_mutex); - put_task_struct(owner); - } - - return perf_event_release_kernel(event); -} - -u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) -{ - struct perf_event *child; - u64 total = 0; - - *enabled = 0; - *running = 0; - - mutex_lock(&event->child_mutex); - total += perf_event_read(event); - *enabled += event->total_time_enabled + - atomic64_read(&event->child_total_time_enabled); - *running += event->total_time_running + - atomic64_read(&event->child_total_time_running); - - list_for_each_entry(child, &event->child_list, child_list) { - total += perf_event_read(child); - *enabled += child->total_time_enabled; - *running += child->total_time_running; - } - mutex_unlock(&event->child_mutex); - - return total; -} -EXPORT_SYMBOL_GPL(perf_event_read_value); - -static int perf_event_read_group(struct perf_event *event, - u64 read_format, char __user *buf) -{ - struct perf_event *leader = event->group_leader, *sub; - int n = 0, size = 0, ret = -EFAULT; - struct perf_event_context *ctx = leader->ctx; - u64 values[5]; - u64 count, enabled, running; - - mutex_lock(&ctx->mutex); - count = perf_event_read_value(leader, &enabled, &running); - - values[n++] = 1 + leader->nr_siblings; - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - values[n++] = enabled; - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - values[n++] = running; - values[n++] = count; - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_event_id(leader); - - size = n * sizeof(u64); - - if (copy_to_user(buf, values, size)) - goto unlock; - - ret = size; - - list_for_each_entry(sub, &leader->sibling_list, group_entry) { - n = 0; - - values[n++] = perf_event_read_value(sub, &enabled, &running); - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_event_id(sub); - - size = n * sizeof(u64); - - if (copy_to_user(buf + ret, values, size)) { - ret = -EFAULT; - goto unlock; - } - - ret += size; - } -unlock: - mutex_unlock(&ctx->mutex); - - return ret; -} - -static int perf_event_read_one(struct perf_event *event, - u64 read_format, char __user *buf) -{ - u64 enabled, running; - u64 values[4]; - int n = 0; - - values[n++] = perf_event_read_value(event, &enabled, &running); - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - values[n++] = enabled; - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - values[n++] = running; - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_event_id(event); - - if (copy_to_user(buf, values, n * sizeof(u64))) - return -EFAULT; - - return n * sizeof(u64); -} - -/* - * Read the performance event - simple non blocking version for now - */ -static ssize_t -perf_read_hw(struct perf_event *event, char __user *buf, size_t count) -{ - u64 read_format = event->attr.read_format; - int ret; - - /* - * Return end-of-file for a read on a event that is in - * error state (i.e. because it was pinned but it couldn't be - * scheduled on to the CPU at some point). - */ - if (event->state == PERF_EVENT_STATE_ERROR) - return 0; - - if (count < event->read_size) - return -ENOSPC; - - WARN_ON_ONCE(event->ctx->parent_ctx); - if (read_format & PERF_FORMAT_GROUP) - ret = perf_event_read_group(event, read_format, buf); - else - ret = perf_event_read_one(event, read_format, buf); - - return ret; -} - -static ssize_t -perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) -{ - struct perf_event *event = file->private_data; - - return perf_read_hw(event, buf, count); -} - -static unsigned int perf_poll(struct file *file, poll_table *wait) -{ - struct perf_event *event = file->private_data; - struct perf_buffer *buffer; - unsigned int events = POLL_HUP; - - rcu_read_lock(); - buffer = rcu_dereference(event->buffer); - if (buffer) - events = atomic_xchg(&buffer->poll, 0); - rcu_read_unlock(); - - poll_wait(file, &event->waitq, wait); - - return events; -} - -static void perf_event_reset(struct perf_event *event) -{ - (void)perf_event_read(event); - local64_set(&event->count, 0); - perf_event_update_userpage(event); -} - -/* - * Holding the top-level event's child_mutex means that any - * descendant process that has inherited this event will block - * in sync_child_event if it goes to exit, thus satisfying the - * task existence requirements of perf_event_enable/disable. - */ -static void perf_event_for_each_child(struct perf_event *event, - void (*func)(struct perf_event *)) -{ - struct perf_event *child; - - WARN_ON_ONCE(event->ctx->parent_ctx); - mutex_lock(&event->child_mutex); - func(event); - list_for_each_entry(child, &event->child_list, child_list) - func(child); - mutex_unlock(&event->child_mutex); -} - -static void perf_event_for_each(struct perf_event *event, - void (*func)(struct perf_event *)) -{ - struct perf_event_context *ctx = event->ctx; - struct perf_event *sibling; - - WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); - event = event->group_leader; - - perf_event_for_each_child(event, func); - func(event); - list_for_each_entry(sibling, &event->sibling_list, group_entry) - perf_event_for_each_child(event, func); - mutex_unlock(&ctx->mutex); -} - -static int perf_event_period(struct perf_event *event, u64 __user *arg) -{ - struct perf_event_context *ctx = event->ctx; - int ret = 0; - u64 value; - - if (!is_sampling_event(event)) - return -EINVAL; - - if (copy_from_user(&value, arg, sizeof(value))) - return -EFAULT; - - if (!value) - return -EINVAL; - - raw_spin_lock_irq(&ctx->lock); - if (event->attr.freq) { - if (value > sysctl_perf_event_sample_rate) { - ret = -EINVAL; - goto unlock; - } - - event->attr.sample_freq = value; - } else { - event->attr.sample_period = value; - event->hw.sample_period = value; - } -unlock: - raw_spin_unlock_irq(&ctx->lock); - - return ret; -} - -static const struct file_operations perf_fops; - -static struct perf_event *perf_fget_light(int fd, int *fput_needed) -{ - struct file *file; - - file = fget_light(fd, fput_needed); - if (!file) - return ERR_PTR(-EBADF); - - if (file->f_op != &perf_fops) { - fput_light(file, *fput_needed); - *fput_needed = 0; - return ERR_PTR(-EBADF); - } - - return file->private_data; -} - -static int perf_event_set_output(struct perf_event *event, - struct perf_event *output_event); -static int perf_event_set_filter(struct perf_event *event, void __user *arg); - -static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - struct perf_event *event = file->private_data; - void (*func)(struct perf_event *); - u32 flags = arg; - - switch (cmd) { - case PERF_EVENT_IOC_ENABLE: - func = perf_event_enable; - break; - case PERF_EVENT_IOC_DISABLE: - func = perf_event_disable; - break; - case PERF_EVENT_IOC_RESET: - func = perf_event_reset; - break; - - case PERF_EVENT_IOC_REFRESH: - return perf_event_refresh(event, arg); - - case PERF_EVENT_IOC_PERIOD: - return perf_event_period(event, (u64 __user *)arg); - - case PERF_EVENT_IOC_SET_OUTPUT: - { - struct perf_event *output_event = NULL; - int fput_needed = 0; - int ret; - - if (arg != -1) { - output_event = perf_fget_light(arg, &fput_needed); - if (IS_ERR(output_event)) - return PTR_ERR(output_event); - } - - ret = perf_event_set_output(event, output_event); - if (output_event) - fput_light(output_event->filp, fput_needed); - - return ret; - } - - case PERF_EVENT_IOC_SET_FILTER: - return perf_event_set_filter(event, (void __user *)arg); - - default: - return -ENOTTY; - } - - if (flags & PERF_IOC_FLAG_GROUP) - perf_event_for_each(event, func); - else - perf_event_for_each_child(event, func); - - return 0; -} - -int perf_event_task_enable(void) -{ - struct perf_event *event; - - mutex_lock(¤t->perf_event_mutex); - list_for_each_entry(event, ¤t->perf_event_list, owner_entry) - perf_event_for_each_child(event, perf_event_enable); - mutex_unlock(¤t->perf_event_mutex); - - return 0; -} - -int perf_event_task_disable(void) -{ - struct perf_event *event; - - mutex_lock(¤t->perf_event_mutex); - list_for_each_entry(event, ¤t->perf_event_list, owner_entry) - perf_event_for_each_child(event, perf_event_disable); - mutex_unlock(¤t->perf_event_mutex); - - return 0; -} - -#ifndef PERF_EVENT_INDEX_OFFSET -# define PERF_EVENT_INDEX_OFFSET 0 -#endif - -static int perf_event_index(struct perf_event *event) -{ - if (event->hw.state & PERF_HES_STOPPED) - return 0; - - if (event->state != PERF_EVENT_STATE_ACTIVE) - return 0; - - return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET; -} - -/* - * Callers need to ensure there can be no nesting of this function, otherwise - * the seqlock logic goes bad. We can not serialize this because the arch - * code calls this from NMI context. - */ -void perf_event_update_userpage(struct perf_event *event) -{ - struct perf_event_mmap_page *userpg; - struct perf_buffer *buffer; - - rcu_read_lock(); - buffer = rcu_dereference(event->buffer); - if (!buffer) - goto unlock; - - userpg = buffer->user_page; - - /* - * Disable preemption so as to not let the corresponding user-space - * spin too long if we get preempted. - */ - preempt_disable(); - ++userpg->lock; - barrier(); - userpg->index = perf_event_index(event); - userpg->offset = perf_event_count(event); - if (event->state == PERF_EVENT_STATE_ACTIVE) - userpg->offset -= local64_read(&event->hw.prev_count); - - userpg->time_enabled = event->total_time_enabled + - atomic64_read(&event->child_total_time_enabled); - - userpg->time_running = event->total_time_running + - atomic64_read(&event->child_total_time_running); - - barrier(); - ++userpg->lock; - preempt_enable(); -unlock: - rcu_read_unlock(); -} - -static unsigned long perf_data_size(struct perf_buffer *buffer); - -static void -perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags) -{ - long max_size = perf_data_size(buffer); - - if (watermark) - buffer->watermark = min(max_size, watermark); - - if (!buffer->watermark) - buffer->watermark = max_size / 2; - - if (flags & PERF_BUFFER_WRITABLE) - buffer->writable = 1; - - atomic_set(&buffer->refcount, 1); -} - -#ifndef CONFIG_PERF_USE_VMALLOC - -/* - * Back perf_mmap() with regular GFP_KERNEL-0 pages. - */ - -static struct page * -perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff) -{ - if (pgoff > buffer->nr_pages) - return NULL; - - if (pgoff == 0) - return virt_to_page(buffer->user_page); - - return virt_to_page(buffer->data_pages[pgoff - 1]); -} - -static void *perf_mmap_alloc_page(int cpu) -{ - struct page *page; - int node; - - node = (cpu == -1) ? cpu : cpu_to_node(cpu); - page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); - if (!page) - return NULL; - - return page_address(page); -} - -static struct perf_buffer * -perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags) -{ - struct perf_buffer *buffer; - unsigned long size; - int i; - - size = sizeof(struct perf_buffer); - size += nr_pages * sizeof(void *); - - buffer = kzalloc(size, GFP_KERNEL); - if (!buffer) - goto fail; - - buffer->user_page = perf_mmap_alloc_page(cpu); - if (!buffer->user_page) - goto fail_user_page; - - for (i = 0; i < nr_pages; i++) { - buffer->data_pages[i] = perf_mmap_alloc_page(cpu); - if (!buffer->data_pages[i]) - goto fail_data_pages; - } - - buffer->nr_pages = nr_pages; - - perf_buffer_init(buffer, watermark, flags); - - return buffer; - -fail_data_pages: - for (i--; i >= 0; i--) - free_page((unsigned long)buffer->data_pages[i]); - - free_page((unsigned long)buffer->user_page); - -fail_user_page: - kfree(buffer); - -fail: - return NULL; -} - -static void perf_mmap_free_page(unsigned long addr) -{ - struct page *page = virt_to_page((void *)addr); - - page->mapping = NULL; - __free_page(page); -} - -static void perf_buffer_free(struct perf_buffer *buffer) -{ - int i; - - perf_mmap_free_page((unsigned long)buffer->user_page); - for (i = 0; i < buffer->nr_pages; i++) - perf_mmap_free_page((unsigned long)buffer->data_pages[i]); - kfree(buffer); -} - -static inline int page_order(struct perf_buffer *buffer) -{ - return 0; -} - -#else - -/* - * Back perf_mmap() with vmalloc memory. - * - * Required for architectures that have d-cache aliasing issues. - */ - -static inline int page_order(struct perf_buffer *buffer) -{ - return buffer->page_order; -} - -static struct page * -perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff) -{ - if (pgoff > (1UL << page_order(buffer))) - return NULL; - - return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE); -} - -static void perf_mmap_unmark_page(void *addr) -{ - struct page *page = vmalloc_to_page(addr); - - page->mapping = NULL; -} - -static void perf_buffer_free_work(struct work_struct *work) -{ - struct perf_buffer *buffer; - void *base; - int i, nr; - - buffer = container_of(work, struct perf_buffer, work); - nr = 1 << page_order(buffer); - - base = buffer->user_page; - for (i = 0; i < nr + 1; i++) - perf_mmap_unmark_page(base + (i * PAGE_SIZE)); - - vfree(base); - kfree(buffer); -} - -static void perf_buffer_free(struct perf_buffer *buffer) -{ - schedule_work(&buffer->work); -} - -static struct perf_buffer * -perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags) -{ - struct perf_buffer *buffer; - unsigned long size; - void *all_buf; - - size = sizeof(struct perf_buffer); - size += sizeof(void *); - - buffer = kzalloc(size, GFP_KERNEL); - if (!buffer) - goto fail; - - INIT_WORK(&buffer->work, perf_buffer_free_work); - - all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); - if (!all_buf) - goto fail_all_buf; - - buffer->user_page = all_buf; - buffer->data_pages[0] = all_buf + PAGE_SIZE; - buffer->page_order = ilog2(nr_pages); - buffer->nr_pages = 1; - - perf_buffer_init(buffer, watermark, flags); - - return buffer; - -fail_all_buf: - kfree(buffer); - -fail: - return NULL; -} - -#endif - -static unsigned long perf_data_size(struct perf_buffer *buffer) -{ - return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer)); -} - -static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct perf_event *event = vma->vm_file->private_data; - struct perf_buffer *buffer; - int ret = VM_FAULT_SIGBUS; - - if (vmf->flags & FAULT_FLAG_MKWRITE) { - if (vmf->pgoff == 0) - ret = 0; - return ret; - } - - rcu_read_lock(); - buffer = rcu_dereference(event->buffer); - if (!buffer) - goto unlock; - - if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) - goto unlock; - - vmf->page = perf_mmap_to_page(buffer, vmf->pgoff); - if (!vmf->page) - goto unlock; - - get_page(vmf->page); - vmf->page->mapping = vma->vm_file->f_mapping; - vmf->page->index = vmf->pgoff; - - ret = 0; -unlock: - rcu_read_unlock(); - - return ret; -} - -static void perf_buffer_free_rcu(struct rcu_head *rcu_head) -{ - struct perf_buffer *buffer; - - buffer = container_of(rcu_head, struct perf_buffer, rcu_head); - perf_buffer_free(buffer); -} - -static struct perf_buffer *perf_buffer_get(struct perf_event *event) -{ - struct perf_buffer *buffer; - - rcu_read_lock(); - buffer = rcu_dereference(event->buffer); - if (buffer) { - if (!atomic_inc_not_zero(&buffer->refcount)) - buffer = NULL; - } - rcu_read_unlock(); - - return buffer; -} - -static void perf_buffer_put(struct perf_buffer *buffer) -{ - if (!atomic_dec_and_test(&buffer->refcount)) - return; - - call_rcu(&buffer->rcu_head, perf_buffer_free_rcu); -} - -static void perf_mmap_open(struct vm_area_struct *vma) -{ - struct perf_event *event = vma->vm_file->private_data; - - atomic_inc(&event->mmap_count); -} - -static void perf_mmap_close(struct vm_area_struct *vma) -{ - struct perf_event *event = vma->vm_file->private_data; - - if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { - unsigned long size = perf_data_size(event->buffer); - struct user_struct *user = event->mmap_user; - struct perf_buffer *buffer = event->buffer; - - atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); - vma->vm_mm->locked_vm -= event->mmap_locked; - rcu_assign_pointer(event->buffer, NULL); - mutex_unlock(&event->mmap_mutex); - - perf_buffer_put(buffer); - free_uid(user); - } -} - -static const struct vm_operations_struct perf_mmap_vmops = { - .open = perf_mmap_open, - .close = perf_mmap_close, - .fault = perf_mmap_fault, - .page_mkwrite = perf_mmap_fault, -}; - -static int perf_mmap(struct file *file, struct vm_area_struct *vma) -{ - struct perf_event *event = file->private_data; - unsigned long user_locked, user_lock_limit; - struct user_struct *user = current_user(); - unsigned long locked, lock_limit; - struct perf_buffer *buffer; - unsigned long vma_size; - unsigned long nr_pages; - long user_extra, extra; - int ret = 0, flags = 0; - - /* - * Don't allow mmap() of inherited per-task counters. This would - * create a performance issue due to all children writing to the - * same buffer. - */ - if (event->cpu == -1 && event->attr.inherit) - return -EINVAL; - - if (!(vma->vm_flags & VM_SHARED)) - return -EINVAL; - - vma_size = vma->vm_end - vma->vm_start; - nr_pages = (vma_size / PAGE_SIZE) - 1; - - /* - * If we have buffer pages ensure they're a power-of-two number, so we - * can do bitmasks instead of modulo. - */ - if (nr_pages != 0 && !is_power_of_2(nr_pages)) - return -EINVAL; - - if (vma_size != PAGE_SIZE * (1 + nr_pages)) - return -EINVAL; - - if (vma->vm_pgoff != 0) - return -EINVAL; - - WARN_ON_ONCE(event->ctx->parent_ctx); - mutex_lock(&event->mmap_mutex); - if (event->buffer) { - if (event->buffer->nr_pages == nr_pages) - atomic_inc(&event->buffer->refcount); - else - ret = -EINVAL; - goto unlock; - } - - user_extra = nr_pages + 1; - user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); - - /* - * Increase the limit linearly with more CPUs: - */ - user_lock_limit *= num_online_cpus(); - - user_locked = atomic_long_read(&user->locked_vm) + user_extra; - - extra = 0; - if (user_locked > user_lock_limit) - extra = user_locked - user_lock_limit; - - lock_limit = rlimit(RLIMIT_MEMLOCK); - lock_limit >>= PAGE_SHIFT; - locked = vma->vm_mm->locked_vm + extra; - - if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && - !capable(CAP_IPC_LOCK)) { - ret = -EPERM; - goto unlock; - } - - WARN_ON(event->buffer); - - if (vma->vm_flags & VM_WRITE) - flags |= PERF_BUFFER_WRITABLE; - - buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark, - event->cpu, flags); - if (!buffer) { - ret = -ENOMEM; - goto unlock; - } - rcu_assign_pointer(event->buffer, buffer); - - atomic_long_add(user_extra, &user->locked_vm); - event->mmap_locked = extra; - event->mmap_user = get_current_user(); - vma->vm_mm->locked_vm += event->mmap_locked; - -unlock: - if (!ret) - atomic_inc(&event->mmap_count); - mutex_unlock(&event->mmap_mutex); - - vma->vm_flags |= VM_RESERVED; - vma->vm_ops = &perf_mmap_vmops; - - return ret; -} - -static int perf_fasync(int fd, struct file *filp, int on) -{ - struct inode *inode = filp->f_path.dentry->d_inode; - struct perf_event *event = filp->private_data; - int retval; - - mutex_lock(&inode->i_mutex); - retval = fasync_helper(fd, filp, on, &event->fasync); - mutex_unlock(&inode->i_mutex); - - if (retval < 0) - return retval; - - return 0; -} - -static const struct file_operations perf_fops = { - .llseek = no_llseek, - .release = perf_release, - .read = perf_read, - .poll = perf_poll, - .unlocked_ioctl = perf_ioctl, - .compat_ioctl = perf_ioctl, - .mmap = perf_mmap, - .fasync = perf_fasync, -}; - -/* - * Perf event wakeup - * - * If there's data, ensure we set the poll() state and publish everything - * to user-space before waking everybody up. - */ - -void perf_event_wakeup(struct perf_event *event) -{ - wake_up_all(&event->waitq); - - if (event->pending_kill) { - kill_fasync(&event->fasync, SIGIO, event->pending_kill); - event->pending_kill = 0; - } -} - -static void perf_pending_event(struct irq_work *entry) -{ - struct perf_event *event = container_of(entry, - struct perf_event, pending); - - if (event->pending_disable) { - event->pending_disable = 0; - __perf_event_disable(event); - } - - if (event->pending_wakeup) { - event->pending_wakeup = 0; - perf_event_wakeup(event); - } -} - -/* - * We assume there is only KVM supporting the callbacks. - * Later on, we might change it to a list if there is - * another virtualization implementation supporting the callbacks. - */ -struct perf_guest_info_callbacks *perf_guest_cbs; - -int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) -{ - perf_guest_cbs = cbs; - return 0; -} -EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); - -int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) -{ - perf_guest_cbs = NULL; - return 0; -} -EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); - -/* - * Output - */ -static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail, - unsigned long offset, unsigned long head) -{ - unsigned long mask; - - if (!buffer->writable) - return true; - - mask = perf_data_size(buffer) - 1; - - offset = (offset - tail) & mask; - head = (head - tail) & mask; - - if ((int)(head - offset) < 0) - return false; - - return true; -} - -static void perf_output_wakeup(struct perf_output_handle *handle) -{ - atomic_set(&handle->buffer->poll, POLL_IN); - - if (handle->nmi) { - handle->event->pending_wakeup = 1; - irq_work_queue(&handle->event->pending); - } else - perf_event_wakeup(handle->event); -} - -/* - * We need to ensure a later event_id doesn't publish a head when a former - * event isn't done writing. However since we need to deal with NMIs we - * cannot fully serialize things. - * - * We only publish the head (and generate a wakeup) when the outer-most - * event completes. - */ -static void perf_output_get_handle(struct perf_output_handle *handle) -{ - struct perf_buffer *buffer = handle->buffer; - - preempt_disable(); - local_inc(&buffer->nest); - handle->wakeup = local_read(&buffer->wakeup); -} - -static void perf_output_put_handle(struct perf_output_handle *handle) -{ - struct perf_buffer *buffer = handle->buffer; - unsigned long head; - -again: - head = local_read(&buffer->head); - - /* - * IRQ/NMI can happen here, which means we can miss a head update. - */ - - if (!local_dec_and_test(&buffer->nest)) - goto out; - - /* - * Publish the known good head. Rely on the full barrier implied - * by atomic_dec_and_test() order the buffer->head read and this - * write. - */ - buffer->user_page->data_head = head; - - /* - * Now check if we missed an update, rely on the (compiler) - * barrier in atomic_dec_and_test() to re-read buffer->head. - */ - if (unlikely(head != local_read(&buffer->head))) { - local_inc(&buffer->nest); - goto again; - } - - if (handle->wakeup != local_read(&buffer->wakeup)) - perf_output_wakeup(handle); - -out: - preempt_enable(); -} - -__always_inline void perf_output_copy(struct perf_output_handle *handle, - const void *buf, unsigned int len) -{ - do { - unsigned long size = min_t(unsigned long, handle->size, len); - - memcpy(handle->addr, buf, size); - - len -= size; - handle->addr += size; - buf += size; - handle->size -= size; - if (!handle->size) { - struct perf_buffer *buffer = handle->buffer; - - handle->page++; - handle->page &= buffer->nr_pages - 1; - handle->addr = buffer->data_pages[handle->page]; - handle->size = PAGE_SIZE << page_order(buffer); - } - } while (len); -} - -static void __perf_event_header__init_id(struct perf_event_header *header, - struct perf_sample_data *data, - struct perf_event *event) -{ - u64 sample_type = event->attr.sample_type; - - data->type = sample_type; - header->size += event->id_header_size; - - if (sample_type & PERF_SAMPLE_TID) { - /* namespace issues */ - data->tid_entry.pid = perf_event_pid(event, current); - data->tid_entry.tid = perf_event_tid(event, current); - } - - if (sample_type & PERF_SAMPLE_TIME) - data->time = perf_clock(); - - if (sample_type & PERF_SAMPLE_ID) - data->id = primary_event_id(event); - - if (sample_type & PERF_SAMPLE_STREAM_ID) - data->stream_id = event->id; - - if (sample_type & PERF_SAMPLE_CPU) { - data->cpu_entry.cpu = raw_smp_processor_id(); - data->cpu_entry.reserved = 0; - } -} - -static void perf_event_header__init_id(struct perf_event_header *header, - struct perf_sample_data *data, - struct perf_event *event) -{ - if (event->attr.sample_id_all) - __perf_event_header__init_id(header, data, event); -} - -static void __perf_event__output_id_sample(struct perf_output_handle *handle, - struct perf_sample_data *data) -{ - u64 sample_type = data->type; - - if (sample_type & PERF_SAMPLE_TID) - perf_output_put(handle, data->tid_entry); - - if (sample_type & PERF_SAMPLE_TIME) - perf_output_put(handle, data->time); - - if (sample_type & PERF_SAMPLE_ID) - perf_output_put(handle, data->id); - - if (sample_type & PERF_SAMPLE_STREAM_ID) - perf_output_put(handle, data->stream_id); - - if (sample_type & PERF_SAMPLE_CPU) - perf_output_put(handle, data->cpu_entry); -} - -static void perf_event__output_id_sample(struct perf_event *event, - struct perf_output_handle *handle, - struct perf_sample_data *sample) -{ - if (event->attr.sample_id_all) - __perf_event__output_id_sample(handle, sample); -} - -int perf_output_begin(struct perf_output_handle *handle, - struct perf_event *event, unsigned int size, - int nmi, int sample) -{ - struct perf_buffer *buffer; - unsigned long tail, offset, head; - int have_lost; - struct perf_sample_data sample_data; - struct { - struct perf_event_header header; - u64 id; - u64 lost; - } lost_event; - - rcu_read_lock(); - /* - * For inherited events we send all the output towards the parent. - */ - if (event->parent) - event = event->parent; - - buffer = rcu_dereference(event->buffer); - if (!buffer) - goto out; - - handle->buffer = buffer; - handle->event = event; - handle->nmi = nmi; - handle->sample = sample; - - if (!buffer->nr_pages) - goto out; - - have_lost = local_read(&buffer->lost); - if (have_lost) { - lost_event.header.size = sizeof(lost_event); - perf_event_header__init_id(&lost_event.header, &sample_data, - event); - size += lost_event.header.size; - } - - perf_output_get_handle(handle); - - do { - /* - * Userspace could choose to issue a mb() before updating the - * tail pointer. So that all reads will be completed before the - * write is issued. - */ - tail = ACCESS_ONCE(buffer->user_page->data_tail); - smp_rmb(); - offset = head = local_read(&buffer->head); - head += size; - if (unlikely(!perf_output_space(buffer, tail, offset, head))) - goto fail; - } while (local_cmpxchg(&buffer->head, offset, head) != offset); - - if (head - local_read(&buffer->wakeup) > buffer->watermark) - local_add(buffer->watermark, &buffer->wakeup); - - handle->page = offset >> (PAGE_SHIFT + page_order(buffer)); - handle->page &= buffer->nr_pages - 1; - handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1); - handle->addr = buffer->data_pages[handle->page]; - handle->addr += handle->size; - handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size; - - if (have_lost) { - lost_event.header.type = PERF_RECORD_LOST; - lost_event.header.misc = 0; - lost_event.id = event->id; - lost_event.lost = local_xchg(&buffer->lost, 0); - - perf_output_put(handle, lost_event); - perf_event__output_id_sample(event, handle, &sample_data); - } - - return 0; - -fail: - local_inc(&buffer->lost); - perf_output_put_handle(handle); -out: - rcu_read_unlock(); - - return -ENOSPC; -} - -void perf_output_end(struct perf_output_handle *handle) -{ - struct perf_event *event = handle->event; - struct perf_buffer *buffer = handle->buffer; - - int wakeup_events = event->attr.wakeup_events; - - if (handle->sample && wakeup_events) { - int events = local_inc_return(&buffer->events); - if (events >= wakeup_events) { - local_sub(wakeup_events, &buffer->events); - local_inc(&buffer->wakeup); - } - } - - perf_output_put_handle(handle); - rcu_read_unlock(); -} - -static void perf_output_read_one(struct perf_output_handle *handle, - struct perf_event *event, - u64 enabled, u64 running) -{ - u64 read_format = event->attr.read_format; - u64 values[4]; - int n = 0; - - values[n++] = perf_event_count(event); - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - values[n++] = enabled + - atomic64_read(&event->child_total_time_enabled); - } - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - values[n++] = running + - atomic64_read(&event->child_total_time_running); - } - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_event_id(event); - - perf_output_copy(handle, values, n * sizeof(u64)); -} - -/* - * XXX PERF_FORMAT_GROUP vs inherited events seems difficult. - */ -static void perf_output_read_group(struct perf_output_handle *handle, - struct perf_event *event, - u64 enabled, u64 running) -{ - struct perf_event *leader = event->group_leader, *sub; - u64 read_format = event->attr.read_format; - u64 values[5]; - int n = 0; - - values[n++] = 1 + leader->nr_siblings; - - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - values[n++] = enabled; - - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - values[n++] = running; - - if (leader != event) - leader->pmu->read(leader); - - values[n++] = perf_event_count(leader); - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_event_id(leader); - - perf_output_copy(handle, values, n * sizeof(u64)); - - list_for_each_entry(sub, &leader->sibling_list, group_entry) { - n = 0; - - if (sub != event) - sub->pmu->read(sub); - - values[n++] = perf_event_count(sub); - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_event_id(sub); - - perf_output_copy(handle, values, n * sizeof(u64)); - } -} - -#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\ - PERF_FORMAT_TOTAL_TIME_RUNNING) - -static void perf_output_read(struct perf_output_handle *handle, - struct perf_event *event) -{ - u64 enabled = 0, running = 0, now, ctx_time; - u64 read_format = event->attr.read_format; - - /* - * compute total_time_enabled, total_time_running - * based on snapshot values taken when the event - * was last scheduled in. - * - * we cannot simply called update_context_time() - * because of locking issue as we are called in - * NMI context - */ - if (read_format & PERF_FORMAT_TOTAL_TIMES) { - now = perf_clock(); - ctx_time = event->shadow_ctx_time + now; - enabled = ctx_time - event->tstamp_enabled; - running = ctx_time - event->tstamp_running; - } - - if (event->attr.read_format & PERF_FORMAT_GROUP) - perf_output_read_group(handle, event, enabled, running); - else - perf_output_read_one(handle, event, enabled, running); -} - -void perf_output_sample(struct perf_output_handle *handle, - struct perf_event_header *header, - struct perf_sample_data *data, - struct perf_event *event) -{ - u64 sample_type = data->type; - - perf_output_put(handle, *header); - - if (sample_type & PERF_SAMPLE_IP) - perf_output_put(handle, data->ip); - - if (sample_type & PERF_SAMPLE_TID) - perf_output_put(handle, data->tid_entry); - - if (sample_type & PERF_SAMPLE_TIME) - perf_output_put(handle, data->time); - - if (sample_type & PERF_SAMPLE_ADDR) - perf_output_put(handle, data->addr); - - if (sample_type & PERF_SAMPLE_ID) - perf_output_put(handle, data->id); - - if (sample_type & PERF_SAMPLE_STREAM_ID) - perf_output_put(handle, data->stream_id); - - if (sample_type & PERF_SAMPLE_CPU) - perf_output_put(handle, data->cpu_entry); - - if (sample_type & PERF_SAMPLE_PERIOD) - perf_output_put(handle, data->period); - - if (sample_type & PERF_SAMPLE_READ) - perf_output_read(handle, event); - - if (sample_type & PERF_SAMPLE_CALLCHAIN) { - if (data->callchain) { - int size = 1; - - if (data->callchain) - size += data->callchain->nr; - - size *= sizeof(u64); - - perf_output_copy(handle, data->callchain, size); - } else { - u64 nr = 0; - perf_output_put(handle, nr); - } - } - - if (sample_type & PERF_SAMPLE_RAW) { - if (data->raw) { - perf_output_put(handle, data->raw->size); - perf_output_copy(handle, data->raw->data, - data->raw->size); - } else { - struct { - u32 size; - u32 data; - } raw = { - .size = sizeof(u32), - .data = 0, - }; - perf_output_put(handle, raw); - } - } -} - -void perf_prepare_sample(struct perf_event_header *header, - struct perf_sample_data *data, - struct perf_event *event, - struct pt_regs *regs) -{ - u64 sample_type = event->attr.sample_type; - - header->type = PERF_RECORD_SAMPLE; - header->size = sizeof(*header) + event->header_size; - - header->misc = 0; - header->misc |= perf_misc_flags(regs); - - __perf_event_header__init_id(header, data, event); - - if (sample_type & PERF_SAMPLE_IP) - data->ip = perf_instruction_pointer(regs); - - if (sample_type & PERF_SAMPLE_CALLCHAIN) { - int size = 1; - - data->callchain = perf_callchain(regs); - - if (data->callchain) - size += data->callchain->nr; - - header->size += size * sizeof(u64); - } - - if (sample_type & PERF_SAMPLE_RAW) { - int size = sizeof(u32); - - if (data->raw) - size += data->raw->size; - else - size += sizeof(u32); - - WARN_ON_ONCE(size & (sizeof(u64)-1)); - header->size += size; - } -} - -static void perf_event_output(struct perf_event *event, int nmi, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct perf_output_handle handle; - struct perf_event_header header; - - /* protect the callchain buffers */ - rcu_read_lock(); - - perf_prepare_sample(&header, data, event, regs); - - if (perf_output_begin(&handle, event, header.size, nmi, 1)) - goto exit; - - perf_output_sample(&handle, &header, data, event); - - perf_output_end(&handle); - -exit: - rcu_read_unlock(); -} - -/* - * read event_id - */ - -struct perf_read_event { - struct perf_event_header header; - - u32 pid; - u32 tid; -}; - -static void -perf_event_read_event(struct perf_event *event, - struct task_struct *task) -{ - struct perf_output_handle handle; - struct perf_sample_data sample; - struct perf_read_event read_event = { - .header = { - .type = PERF_RECORD_READ, - .misc = 0, - .size = sizeof(read_event) + event->read_size, - }, - .pid = perf_event_pid(event, task), - .tid = perf_event_tid(event, task), - }; - int ret; - - perf_event_header__init_id(&read_event.header, &sample, event); - ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); - if (ret) - return; - - perf_output_put(&handle, read_event); - perf_output_read(&handle, event); - perf_event__output_id_sample(event, &handle, &sample); - - perf_output_end(&handle); -} - -/* - * task tracking -- fork/exit - * - * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task - */ - -struct perf_task_event { - struct task_struct *task; - struct perf_event_context *task_ctx; - - struct { - struct perf_event_header header; - - u32 pid; - u32 ppid; - u32 tid; - u32 ptid; - u64 time; - } event_id; -}; - -static void perf_event_task_output(struct perf_event *event, - struct perf_task_event *task_event) -{ - struct perf_output_handle handle; - struct perf_sample_data sample; - struct task_struct *task = task_event->task; - int ret, size = task_event->event_id.header.size; - - perf_event_header__init_id(&task_event->event_id.header, &sample, event); - - ret = perf_output_begin(&handle, event, - task_event->event_id.header.size, 0, 0); - if (ret) - goto out; - - task_event->event_id.pid = perf_event_pid(event, task); - task_event->event_id.ppid = perf_event_pid(event, current); - - task_event->event_id.tid = perf_event_tid(event, task); - task_event->event_id.ptid = perf_event_tid(event, current); - - perf_output_put(&handle, task_event->event_id); - - perf_event__output_id_sample(event, &handle, &sample); - - perf_output_end(&handle); -out: - task_event->event_id.header.size = size; -} - -static int perf_event_task_match(struct perf_event *event) -{ - if (event->state < PERF_EVENT_STATE_INACTIVE) - return 0; - - if (!event_filter_match(event)) - return 0; - - if (event->attr.comm || event->attr.mmap || - event->attr.mmap_data || event->attr.task) - return 1; - - return 0; -} - -static void perf_event_task_ctx(struct perf_event_context *ctx, - struct perf_task_event *task_event) -{ - struct perf_event *event; - - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (perf_event_task_match(event)) - perf_event_task_output(event, task_event); - } -} - -static void perf_event_task_event(struct perf_task_event *task_event) -{ - struct perf_cpu_context *cpuctx; - struct perf_event_context *ctx; - struct pmu *pmu; - int ctxn; - - rcu_read_lock(); - list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->active_pmu != pmu) - goto next; - perf_event_task_ctx(&cpuctx->ctx, task_event); - - ctx = task_event->task_ctx; - if (!ctx) { - ctxn = pmu->task_ctx_nr; - if (ctxn < 0) - goto next; - ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); - } - if (ctx) - perf_event_task_ctx(ctx, task_event); -next: - put_cpu_ptr(pmu->pmu_cpu_context); - } - rcu_read_unlock(); -} - -static void perf_event_task(struct task_struct *task, - struct perf_event_context *task_ctx, - int new) -{ - struct perf_task_event task_event; - - if (!atomic_read(&nr_comm_events) && - !atomic_read(&nr_mmap_events) && - !atomic_read(&nr_task_events)) - return; - - task_event = (struct perf_task_event){ - .task = task, - .task_ctx = task_ctx, - .event_id = { - .header = { - .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT, - .misc = 0, - .size = sizeof(task_event.event_id), - }, - /* .pid */ - /* .ppid */ - /* .tid */ - /* .ptid */ - .time = perf_clock(), - }, - }; - - perf_event_task_event(&task_event); -} - -void perf_event_fork(struct task_struct *task) -{ - perf_event_task(task, NULL, 1); -} - -/* - * comm tracking - */ - -struct perf_comm_event { - struct task_struct *task; - char *comm; - int comm_size; - - struct { - struct perf_event_header header; - - u32 pid; - u32 tid; - } event_id; -}; - -static void perf_event_comm_output(struct perf_event *event, - struct perf_comm_event *comm_event) -{ - struct perf_output_handle handle; - struct perf_sample_data sample; - int size = comm_event->event_id.header.size; - int ret; - - perf_event_header__init_id(&comm_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, - comm_event->event_id.header.size, 0, 0); - - if (ret) - goto out; - - comm_event->event_id.pid = perf_event_pid(event, comm_event->task); - comm_event->event_id.tid = perf_event_tid(event, comm_event->task); - - perf_output_put(&handle, comm_event->event_id); - perf_output_copy(&handle, comm_event->comm, - comm_event->comm_size); - - perf_event__output_id_sample(event, &handle, &sample); - - perf_output_end(&handle); -out: - comm_event->event_id.header.size = size; -} - -static int perf_event_comm_match(struct perf_event *event) -{ - if (event->state < PERF_EVENT_STATE_INACTIVE) - return 0; - - if (!event_filter_match(event)) - return 0; - - if (event->attr.comm) - return 1; - - return 0; -} - -static void perf_event_comm_ctx(struct perf_event_context *ctx, - struct perf_comm_event *comm_event) -{ - struct perf_event *event; - - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (perf_event_comm_match(event)) - perf_event_comm_output(event, comm_event); - } -} - -static void perf_event_comm_event(struct perf_comm_event *comm_event) -{ - struct perf_cpu_context *cpuctx; - struct perf_event_context *ctx; - char comm[TASK_COMM_LEN]; - unsigned int size; - struct pmu *pmu; - int ctxn; - - memset(comm, 0, sizeof(comm)); - strlcpy(comm, comm_event->task->comm, sizeof(comm)); - size = ALIGN(strlen(comm)+1, sizeof(u64)); - - comm_event->comm = comm; - comm_event->comm_size = size; - - comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; - rcu_read_lock(); - list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->active_pmu != pmu) - goto next; - perf_event_comm_ctx(&cpuctx->ctx, comm_event); - - ctxn = pmu->task_ctx_nr; - if (ctxn < 0) - goto next; - - ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); - if (ctx) - perf_event_comm_ctx(ctx, comm_event); -next: - put_cpu_ptr(pmu->pmu_cpu_context); - } - rcu_read_unlock(); -} - -void perf_event_comm(struct task_struct *task) -{ - struct perf_comm_event comm_event; - struct perf_event_context *ctx; - int ctxn; - - for_each_task_context_nr(ctxn) { - ctx = task->perf_event_ctxp[ctxn]; - if (!ctx) - continue; - - perf_event_enable_on_exec(ctx); - } - - if (!atomic_read(&nr_comm_events)) - return; - - comm_event = (struct perf_comm_event){ - .task = task, - /* .comm */ - /* .comm_size */ - .event_id = { - .header = { - .type = PERF_RECORD_COMM, - .misc = 0, - /* .size */ - }, - /* .pid */ - /* .tid */ - }, - }; - - perf_event_comm_event(&comm_event); -} - -/* - * mmap tracking - */ - -struct perf_mmap_event { - struct vm_area_struct *vma; - - const char *file_name; - int file_size; - - struct { - struct perf_event_header header; - - u32 pid; - u32 tid; - u64 start; - u64 len; - u64 pgoff; - } event_id; -}; - -static void perf_event_mmap_output(struct perf_event *event, - struct perf_mmap_event *mmap_event) -{ - struct perf_output_handle handle; - struct perf_sample_data sample; - int size = mmap_event->event_id.header.size; - int ret; - - perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, - mmap_event->event_id.header.size, 0, 0); - if (ret) - goto out; - - mmap_event->event_id.pid = perf_event_pid(event, current); - mmap_event->event_id.tid = perf_event_tid(event, current); - - perf_output_put(&handle, mmap_event->event_id); - perf_output_copy(&handle, mmap_event->file_name, - mmap_event->file_size); - - perf_event__output_id_sample(event, &handle, &sample); - - perf_output_end(&handle); -out: - mmap_event->event_id.header.size = size; -} - -static int perf_event_mmap_match(struct perf_event *event, - struct perf_mmap_event *mmap_event, - int executable) -{ - if (event->state < PERF_EVENT_STATE_INACTIVE) - return 0; - - if (!event_filter_match(event)) - return 0; - - if ((!executable && event->attr.mmap_data) || - (executable && event->attr.mmap)) - return 1; - - return 0; -} - -static void perf_event_mmap_ctx(struct perf_event_context *ctx, - struct perf_mmap_event *mmap_event, - int executable) -{ - struct perf_event *event; - - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (perf_event_mmap_match(event, mmap_event, executable)) - perf_event_mmap_output(event, mmap_event); - } -} - -static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) -{ - struct perf_cpu_context *cpuctx; - struct perf_event_context *ctx; - struct vm_area_struct *vma = mmap_event->vma; - struct file *file = vma->vm_file; - unsigned int size; - char tmp[16]; - char *buf = NULL; - const char *name; - struct pmu *pmu; - int ctxn; - - memset(tmp, 0, sizeof(tmp)); - - if (file) { - /* - * d_path works from the end of the buffer backwards, so we - * need to add enough zero bytes after the string to handle - * the 64bit alignment we do later. - */ - buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL); - if (!buf) { - name = strncpy(tmp, "//enomem", sizeof(tmp)); - goto got_name; - } - name = d_path(&file->f_path, buf, PATH_MAX); - if (IS_ERR(name)) { - name = strncpy(tmp, "//toolong", sizeof(tmp)); - goto got_name; - } - } else { - if (arch_vma_name(mmap_event->vma)) { - name = strncpy(tmp, arch_vma_name(mmap_event->vma), - sizeof(tmp)); - goto got_name; - } - - if (!vma->vm_mm) { - name = strncpy(tmp, "[vdso]", sizeof(tmp)); - goto got_name; - } else if (vma->vm_start <= vma->vm_mm->start_brk && - vma->vm_end >= vma->vm_mm->brk) { - name = strncpy(tmp, "[heap]", sizeof(tmp)); - goto got_name; - } else if (vma->vm_start <= vma->vm_mm->start_stack && - vma->vm_end >= vma->vm_mm->start_stack) { - name = strncpy(tmp, "[stack]", sizeof(tmp)); - goto got_name; - } - - name = strncpy(tmp, "//anon", sizeof(tmp)); - goto got_name; - } - -got_name: - size = ALIGN(strlen(name)+1, sizeof(u64)); - - mmap_event->file_name = name; - mmap_event->file_size = size; - - mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; - - rcu_read_lock(); - list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->active_pmu != pmu) - goto next; - perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, - vma->vm_flags & VM_EXEC); - - ctxn = pmu->task_ctx_nr; - if (ctxn < 0) - goto next; - - ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); - if (ctx) { - perf_event_mmap_ctx(ctx, mmap_event, - vma->vm_flags & VM_EXEC); - } -next: - put_cpu_ptr(pmu->pmu_cpu_context); - } - rcu_read_unlock(); - - kfree(buf); -} - -void perf_event_mmap(struct vm_area_struct *vma) -{ - struct perf_mmap_event mmap_event; - - if (!atomic_read(&nr_mmap_events)) - return; - - mmap_event = (struct perf_mmap_event){ - .vma = vma, - /* .file_name */ - /* .file_size */ - .event_id = { - .header = { - .type = PERF_RECORD_MMAP, - .misc = PERF_RECORD_MISC_USER, - /* .size */ - }, - /* .pid */ - /* .tid */ - .start = vma->vm_start, - .len = vma->vm_end - vma->vm_start, - .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT, - }, - }; - - perf_event_mmap_event(&mmap_event); -} - -/* - * IRQ throttle logging - */ - -static void perf_log_throttle(struct perf_event *event, int enable) -{ - struct perf_output_handle handle; - struct perf_sample_data sample; - int ret; - - struct { - struct perf_event_header header; - u64 time; - u64 id; - u64 stream_id; - } throttle_event = { - .header = { - .type = PERF_RECORD_THROTTLE, - .misc = 0, - .size = sizeof(throttle_event), - }, - .time = perf_clock(), - .id = primary_event_id(event), - .stream_id = event->id, - }; - - if (enable) - throttle_event.header.type = PERF_RECORD_UNTHROTTLE; - - perf_event_header__init_id(&throttle_event.header, &sample, event); - - ret = perf_output_begin(&handle, event, - throttle_event.header.size, 1, 0); - if (ret) - return; - - perf_output_put(&handle, throttle_event); - perf_event__output_id_sample(event, &handle, &sample); - perf_output_end(&handle); -} - -/* - * Generic event overflow handling, sampling. - */ - -static int __perf_event_overflow(struct perf_event *event, int nmi, - int throttle, struct perf_sample_data *data, - struct pt_regs *regs) -{ - int events = atomic_read(&event->event_limit); - struct hw_perf_event *hwc = &event->hw; - int ret = 0; - - /* - * Non-sampling counters might still use the PMI to fold short - * hardware counters, ignore those. - */ - if (unlikely(!is_sampling_event(event))) - return 0; - - if (unlikely(hwc->interrupts >= max_samples_per_tick)) { - if (throttle) { - hwc->interrupts = MAX_INTERRUPTS; - perf_log_throttle(event, 0); - ret = 1; - } - } else - hwc->interrupts++; - - if (event->attr.freq) { - u64 now = perf_clock(); - s64 delta = now - hwc->freq_time_stamp; - - hwc->freq_time_stamp = now; - - if (delta > 0 && delta < 2*TICK_NSEC) - perf_adjust_period(event, delta, hwc->last_period); - } - - /* - * XXX event_limit might not quite work as expected on inherited - * events - */ - - event->pending_kill = POLL_IN; - if (events && atomic_dec_and_test(&event->event_limit)) { - ret = 1; - event->pending_kill = POLL_HUP; - if (nmi) { - event->pending_disable = 1; - irq_work_queue(&event->pending); - } else - perf_event_disable(event); - } - - if (event->overflow_handler) - event->overflow_handler(event, nmi, data, regs); - else - perf_event_output(event, nmi, data, regs); - - return ret; -} - -int perf_event_overflow(struct perf_event *event, int nmi, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - return __perf_event_overflow(event, nmi, 1, data, regs); -} - -/* - * Generic software event infrastructure - */ - -struct swevent_htable { - struct swevent_hlist *swevent_hlist; - struct mutex hlist_mutex; - int hlist_refcount; - - /* Recursion avoidance in each contexts */ - int recursion[PERF_NR_CONTEXTS]; -}; - -static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); - -/* - * We directly increment event->count and keep a second value in - * event->hw.period_left to count intervals. This period event - * is kept in the range [-sample_period, 0] so that we can use the - * sign as trigger. - */ - -static u64 perf_swevent_set_period(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - u64 period = hwc->last_period; - u64 nr, offset; - s64 old, val; - - hwc->last_period = hwc->sample_period; - -again: - old = val = local64_read(&hwc->period_left); - if (val < 0) - return 0; - - nr = div64_u64(period + val, period); - offset = nr * period; - val -= offset; - if (local64_cmpxchg(&hwc->period_left, old, val) != old) - goto again; - - return nr; -} - -static void perf_swevent_overflow(struct perf_event *event, u64 overflow, - int nmi, struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct hw_perf_event *hwc = &event->hw; - int throttle = 0; - - data->period = event->hw.last_period; - if (!overflow) - overflow = perf_swevent_set_period(event); - - if (hwc->interrupts == MAX_INTERRUPTS) - return; - - for (; overflow; overflow--) { - if (__perf_event_overflow(event, nmi, throttle, - data, regs)) { - /* - * We inhibit the overflow from happening when - * hwc->interrupts == MAX_INTERRUPTS. - */ - break; - } - throttle = 1; - } -} - -static void perf_swevent_event(struct perf_event *event, u64 nr, - int nmi, struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct hw_perf_event *hwc = &event->hw; - - local64_add(nr, &event->count); - - if (!regs) - return; - - if (!is_sampling_event(event)) - return; - - if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) - return perf_swevent_overflow(event, 1, nmi, data, regs); - - if (local64_add_negative(nr, &hwc->period_left)) - return; - - perf_swevent_overflow(event, 0, nmi, data, regs); -} - -static int perf_exclude_event(struct perf_event *event, - struct pt_regs *regs) -{ - if (event->hw.state & PERF_HES_STOPPED) - return 1; - - if (regs) { - if (event->attr.exclude_user && user_mode(regs)) - return 1; - - if (event->attr.exclude_kernel && !user_mode(regs)) - return 1; - } - - return 0; -} - -static int perf_swevent_match(struct perf_event *event, - enum perf_type_id type, - u32 event_id, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - if (event->attr.type != type) - return 0; - - if (event->attr.config != event_id) - return 0; - - if (perf_exclude_event(event, regs)) - return 0; - - return 1; -} - -static inline u64 swevent_hash(u64 type, u32 event_id) -{ - u64 val = event_id | (type << 32); - - return hash_64(val, SWEVENT_HLIST_BITS); -} - -static inline struct hlist_head * -__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id) -{ - u64 hash = swevent_hash(type, event_id); - - return &hlist->heads[hash]; -} - -/* For the read side: events when they trigger */ -static inline struct hlist_head * -find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id) -{ - struct swevent_hlist *hlist; - - hlist = rcu_dereference(swhash->swevent_hlist); - if (!hlist) - return NULL; - - return __find_swevent_head(hlist, type, event_id); -} - -/* For the event head insertion and removal in the hlist */ -static inline struct hlist_head * -find_swevent_head(struct swevent_htable *swhash, struct perf_event *event) -{ - struct swevent_hlist *hlist; - u32 event_id = event->attr.config; - u64 type = event->attr.type; - - /* - * Event scheduling is always serialized against hlist allocation - * and release. Which makes the protected version suitable here. - * The context lock guarantees that. - */ - hlist = rcu_dereference_protected(swhash->swevent_hlist, - lockdep_is_held(&event->ctx->lock)); - if (!hlist) - return NULL; - - return __find_swevent_head(hlist, type, event_id); -} - -static void do_perf_sw_event(enum perf_type_id type, u32 event_id, - u64 nr, int nmi, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); - struct perf_event *event; - struct hlist_node *node; - struct hlist_head *head; - - rcu_read_lock(); - head = find_swevent_head_rcu(swhash, type, event_id); - if (!head) - goto end; - - hlist_for_each_entry_rcu(event, node, head, hlist_entry) { - if (perf_swevent_match(event, type, event_id, data, regs)) - perf_swevent_event(event, nr, nmi, data, regs); - } -end: - rcu_read_unlock(); -} - -int perf_swevent_get_recursion_context(void) -{ - struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); - - return get_recursion_context(swhash->recursion); -} -EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); - -inline void perf_swevent_put_recursion_context(int rctx) -{ - struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); - - put_recursion_context(swhash->recursion, rctx); -} - -void __perf_sw_event(u32 event_id, u64 nr, int nmi, - struct pt_regs *regs, u64 addr) -{ - struct perf_sample_data data; - int rctx; - - preempt_disable_notrace(); - rctx = perf_swevent_get_recursion_context(); - if (rctx < 0) - return; - - perf_sample_data_init(&data, addr); - - do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); - - perf_swevent_put_recursion_context(rctx); - preempt_enable_notrace(); -} - -static void perf_swevent_read(struct perf_event *event) -{ -} - -static int perf_swevent_add(struct perf_event *event, int flags) -{ - struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); - struct hw_perf_event *hwc = &event->hw; - struct hlist_head *head; - - if (is_sampling_event(event)) { - hwc->last_period = hwc->sample_period; - perf_swevent_set_period(event); - } - - hwc->state = !(flags & PERF_EF_START); - - head = find_swevent_head(swhash, event); - if (WARN_ON_ONCE(!head)) - return -EINVAL; - - hlist_add_head_rcu(&event->hlist_entry, head); - - return 0; -} - -static void perf_swevent_del(struct perf_event *event, int flags) -{ - hlist_del_rcu(&event->hlist_entry); -} - -static void perf_swevent_start(struct perf_event *event, int flags) -{ - event->hw.state = 0; -} - -static void perf_swevent_stop(struct perf_event *event, int flags) -{ - event->hw.state = PERF_HES_STOPPED; -} - -/* Deref the hlist from the update side */ -static inline struct swevent_hlist * -swevent_hlist_deref(struct swevent_htable *swhash) -{ - return rcu_dereference_protected(swhash->swevent_hlist, - lockdep_is_held(&swhash->hlist_mutex)); -} - -static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) -{ - struct swevent_hlist *hlist; - - hlist = container_of(rcu_head, struct swevent_hlist, rcu_head); - kfree(hlist); -} - -static void swevent_hlist_release(struct swevent_htable *swhash) -{ - struct swevent_hlist *hlist = swevent_hlist_deref(swhash); - - if (!hlist) - return; - - rcu_assign_pointer(swhash->swevent_hlist, NULL); - call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); -} - -static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) -{ - struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); - - mutex_lock(&swhash->hlist_mutex); - - if (!--swhash->hlist_refcount) - swevent_hlist_release(swhash); - - mutex_unlock(&swhash->hlist_mutex); -} - -static void swevent_hlist_put(struct perf_event *event) -{ - int cpu; - - if (event->cpu != -1) { - swevent_hlist_put_cpu(event, event->cpu); - return; - } - - for_each_possible_cpu(cpu) - swevent_hlist_put_cpu(event, cpu); -} - -static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) -{ - struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); - int err = 0; - - mutex_lock(&swhash->hlist_mutex); - - if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) { - struct swevent_hlist *hlist; - - hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); - if (!hlist) { - err = -ENOMEM; - goto exit; - } - rcu_assign_pointer(swhash->swevent_hlist, hlist); - } - swhash->hlist_refcount++; -exit: - mutex_unlock(&swhash->hlist_mutex); - - return err; -} - -static int swevent_hlist_get(struct perf_event *event) -{ - int err; - int cpu, failed_cpu; - - if (event->cpu != -1) - return swevent_hlist_get_cpu(event, event->cpu); - - get_online_cpus(); - for_each_possible_cpu(cpu) { - err = swevent_hlist_get_cpu(event, cpu); - if (err) { - failed_cpu = cpu; - goto fail; - } - } - put_online_cpus(); - - return 0; -fail: - for_each_possible_cpu(cpu) { - if (cpu == failed_cpu) - break; - swevent_hlist_put_cpu(event, cpu); - } - - put_online_cpus(); - return err; -} - -struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; - -static void sw_perf_event_destroy(struct perf_event *event) -{ - u64 event_id = event->attr.config; - - WARN_ON(event->parent); - - jump_label_dec(&perf_swevent_enabled[event_id]); - swevent_hlist_put(event); -} - -static int perf_swevent_init(struct perf_event *event) -{ - int event_id = event->attr.config; - - if (event->attr.type != PERF_TYPE_SOFTWARE) - return -ENOENT; - - switch (event_id) { - case PERF_COUNT_SW_CPU_CLOCK: - case PERF_COUNT_SW_TASK_CLOCK: - return -ENOENT; - - default: - break; - } - - if (event_id >= PERF_COUNT_SW_MAX) - return -ENOENT; - - if (!event->parent) { - int err; - - err = swevent_hlist_get(event); - if (err) - return err; - - jump_label_inc(&perf_swevent_enabled[event_id]); - event->destroy = sw_perf_event_destroy; - } - - return 0; -} - -static struct pmu perf_swevent = { - .task_ctx_nr = perf_sw_context, - - .event_init = perf_swevent_init, - .add = perf_swevent_add, - .del = perf_swevent_del, - .start = perf_swevent_start, - .stop = perf_swevent_stop, - .read = perf_swevent_read, -}; - -#ifdef CONFIG_EVENT_TRACING - -static int perf_tp_filter_match(struct perf_event *event, - struct perf_sample_data *data) -{ - void *record = data->raw->data; - - if (likely(!event->filter) || filter_match_preds(event->filter, record)) - return 1; - return 0; -} - -static int perf_tp_event_match(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - if (event->hw.state & PERF_HES_STOPPED) - return 0; - /* - * All tracepoints are from kernel-space. - */ - if (event->attr.exclude_kernel) - return 0; - - if (!perf_tp_filter_match(event, data)) - return 0; - - return 1; -} - -void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, - struct pt_regs *regs, struct hlist_head *head, int rctx) -{ - struct perf_sample_data data; - struct perf_event *event; - struct hlist_node *node; - - struct perf_raw_record raw = { - .size = entry_size, - .data = record, - }; - - perf_sample_data_init(&data, addr); - data.raw = &raw; - - hlist_for_each_entry_rcu(event, node, head, hlist_entry) { - if (perf_tp_event_match(event, &data, regs)) - perf_swevent_event(event, count, 1, &data, regs); - } - - perf_swevent_put_recursion_context(rctx); -} -EXPORT_SYMBOL_GPL(perf_tp_event); - -static void tp_perf_event_destroy(struct perf_event *event) -{ - perf_trace_destroy(event); -} - -static int perf_tp_event_init(struct perf_event *event) -{ - int err; - - if (event->attr.type != PERF_TYPE_TRACEPOINT) - return -ENOENT; - - err = perf_trace_init(event); - if (err) - return err; - - event->destroy = tp_perf_event_destroy; - - return 0; -} - -static struct pmu perf_tracepoint = { - .task_ctx_nr = perf_sw_context, - - .event_init = perf_tp_event_init, - .add = perf_trace_add, - .del = perf_trace_del, - .start = perf_swevent_start, - .stop = perf_swevent_stop, - .read = perf_swevent_read, -}; - -static inline void perf_tp_register(void) -{ - perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); -} - -static int perf_event_set_filter(struct perf_event *event, void __user *arg) -{ - char *filter_str; - int ret; - - if (event->attr.type != PERF_TYPE_TRACEPOINT) - return -EINVAL; - - filter_str = strndup_user(arg, PAGE_SIZE); - if (IS_ERR(filter_str)) - return PTR_ERR(filter_str); - - ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); - - kfree(filter_str); - return ret; -} - -static void perf_event_free_filter(struct perf_event *event) -{ - ftrace_profile_free_filter(event); -} - -#else - -static inline void perf_tp_register(void) -{ -} - -static int perf_event_set_filter(struct perf_event *event, void __user *arg) -{ - return -ENOENT; -} - -static void perf_event_free_filter(struct perf_event *event) -{ -} - -#endif /* CONFIG_EVENT_TRACING */ - -#ifdef CONFIG_HAVE_HW_BREAKPOINT -void perf_bp_event(struct perf_event *bp, void *data) -{ - struct perf_sample_data sample; - struct pt_regs *regs = data; - - perf_sample_data_init(&sample, bp->attr.bp_addr); - - if (!bp->hw.state && !perf_exclude_event(bp, regs)) - perf_swevent_event(bp, 1, 1, &sample, regs); -} -#endif - -/* - * hrtimer based swevent callback - */ - -static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) -{ - enum hrtimer_restart ret = HRTIMER_RESTART; - struct perf_sample_data data; - struct pt_regs *regs; - struct perf_event *event; - u64 period; - - event = container_of(hrtimer, struct perf_event, hw.hrtimer); - - if (event->state != PERF_EVENT_STATE_ACTIVE) - return HRTIMER_NORESTART; - - event->pmu->read(event); - - perf_sample_data_init(&data, 0); - data.period = event->hw.last_period; - regs = get_irq_regs(); - - if (regs && !perf_exclude_event(event, regs)) { - if (!(event->attr.exclude_idle && current->pid == 0)) - if (perf_event_overflow(event, 0, &data, regs)) - ret = HRTIMER_NORESTART; - } - - period = max_t(u64, 10000, event->hw.sample_period); - hrtimer_forward_now(hrtimer, ns_to_ktime(period)); - - return ret; -} - -static void perf_swevent_start_hrtimer(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - s64 period; - - if (!is_sampling_event(event)) - return; - - period = local64_read(&hwc->period_left); - if (period) { - if (period < 0) - period = 10000; - - local64_set(&hwc->period_left, 0); - } else { - period = max_t(u64, 10000, hwc->sample_period); - } - __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(period), 0, - HRTIMER_MODE_REL_PINNED, 0); -} - -static void perf_swevent_cancel_hrtimer(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - if (is_sampling_event(event)) { - ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); - local64_set(&hwc->period_left, ktime_to_ns(remaining)); - - hrtimer_cancel(&hwc->hrtimer); - } -} - -static void perf_swevent_init_hrtimer(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - if (!is_sampling_event(event)) - return; - - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hwc->hrtimer.function = perf_swevent_hrtimer; - - /* - * Since hrtimers have a fixed rate, we can do a static freq->period - * mapping and avoid the whole period adjust feedback stuff. - */ - if (event->attr.freq) { - long freq = event->attr.sample_freq; - - event->attr.sample_period = NSEC_PER_SEC / freq; - hwc->sample_period = event->attr.sample_period; - local64_set(&hwc->period_left, hwc->sample_period); - event->attr.freq = 0; - } -} - -/* - * Software event: cpu wall time clock - */ - -static void cpu_clock_event_update(struct perf_event *event) -{ - s64 prev; - u64 now; - - now = local_clock(); - prev = local64_xchg(&event->hw.prev_count, now); - local64_add(now - prev, &event->count); -} - -static void cpu_clock_event_start(struct perf_event *event, int flags) -{ - local64_set(&event->hw.prev_count, local_clock()); - perf_swevent_start_hrtimer(event); -} - -static void cpu_clock_event_stop(struct perf_event *event, int flags) -{ - perf_swevent_cancel_hrtimer(event); - cpu_clock_event_update(event); -} - -static int cpu_clock_event_add(struct perf_event *event, int flags) -{ - if (flags & PERF_EF_START) - cpu_clock_event_start(event, flags); - - return 0; -} - -static void cpu_clock_event_del(struct perf_event *event, int flags) -{ - cpu_clock_event_stop(event, flags); -} - -static void cpu_clock_event_read(struct perf_event *event) -{ - cpu_clock_event_update(event); -} - -static int cpu_clock_event_init(struct perf_event *event) -{ - if (event->attr.type != PERF_TYPE_SOFTWARE) - return -ENOENT; - - if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) - return -ENOENT; - - perf_swevent_init_hrtimer(event); - - return 0; -} - -static struct pmu perf_cpu_clock = { - .task_ctx_nr = perf_sw_context, - - .event_init = cpu_clock_event_init, - .add = cpu_clock_event_add, - .del = cpu_clock_event_del, - .start = cpu_clock_event_start, - .stop = cpu_clock_event_stop, - .read = cpu_clock_event_read, -}; - -/* - * Software event: task time clock - */ - -static void task_clock_event_update(struct perf_event *event, u64 now) -{ - u64 prev; - s64 delta; - - prev = local64_xchg(&event->hw.prev_count, now); - delta = now - prev; - local64_add(delta, &event->count); -} - -static void task_clock_event_start(struct perf_event *event, int flags) -{ - local64_set(&event->hw.prev_count, event->ctx->time); - perf_swevent_start_hrtimer(event); -} - -static void task_clock_event_stop(struct perf_event *event, int flags) -{ - perf_swevent_cancel_hrtimer(event); - task_clock_event_update(event, event->ctx->time); -} - -static int task_clock_event_add(struct perf_event *event, int flags) -{ - if (flags & PERF_EF_START) - task_clock_event_start(event, flags); - - return 0; -} - -static void task_clock_event_del(struct perf_event *event, int flags) -{ - task_clock_event_stop(event, PERF_EF_UPDATE); -} - -static void task_clock_event_read(struct perf_event *event) -{ - u64 now = perf_clock(); - u64 delta = now - event->ctx->timestamp; - u64 time = event->ctx->time + delta; - - task_clock_event_update(event, time); -} - -static int task_clock_event_init(struct perf_event *event) -{ - if (event->attr.type != PERF_TYPE_SOFTWARE) - return -ENOENT; - - if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) - return -ENOENT; - - perf_swevent_init_hrtimer(event); - - return 0; -} - -static struct pmu perf_task_clock = { - .task_ctx_nr = perf_sw_context, - - .event_init = task_clock_event_init, - .add = task_clock_event_add, - .del = task_clock_event_del, - .start = task_clock_event_start, - .stop = task_clock_event_stop, - .read = task_clock_event_read, -}; - -static void perf_pmu_nop_void(struct pmu *pmu) -{ -} - -static int perf_pmu_nop_int(struct pmu *pmu) -{ - return 0; -} - -static void perf_pmu_start_txn(struct pmu *pmu) -{ - perf_pmu_disable(pmu); -} - -static int perf_pmu_commit_txn(struct pmu *pmu) -{ - perf_pmu_enable(pmu); - return 0; -} - -static void perf_pmu_cancel_txn(struct pmu *pmu) -{ - perf_pmu_enable(pmu); -} - -/* - * Ensures all contexts with the same task_ctx_nr have the same - * pmu_cpu_context too. - */ -static void *find_pmu_context(int ctxn) -{ - struct pmu *pmu; - - if (ctxn < 0) - return NULL; - - list_for_each_entry(pmu, &pmus, entry) { - if (pmu->task_ctx_nr == ctxn) - return pmu->pmu_cpu_context; - } - - return NULL; -} - -static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu) -{ - int cpu; - - for_each_possible_cpu(cpu) { - struct perf_cpu_context *cpuctx; - - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - - if (cpuctx->active_pmu == old_pmu) - cpuctx->active_pmu = pmu; - } -} - -static void free_pmu_context(struct pmu *pmu) -{ - struct pmu *i; - - mutex_lock(&pmus_lock); - /* - * Like a real lame refcount. - */ - list_for_each_entry(i, &pmus, entry) { - if (i->pmu_cpu_context == pmu->pmu_cpu_context) { - update_pmu_context(i, pmu); - goto out; - } - } - - free_percpu(pmu->pmu_cpu_context); -out: - mutex_unlock(&pmus_lock); -} -static struct idr pmu_idr; - -static ssize_t -type_show(struct device *dev, struct device_attribute *attr, char *page) -{ - struct pmu *pmu = dev_get_drvdata(dev); - - return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); -} - -static struct device_attribute pmu_dev_attrs[] = { - __ATTR_RO(type), - __ATTR_NULL, -}; - -static int pmu_bus_running; -static struct bus_type pmu_bus = { - .name = "event_source", - .dev_attrs = pmu_dev_attrs, -}; - -static void pmu_dev_release(struct device *dev) -{ - kfree(dev); -} - -static int pmu_dev_alloc(struct pmu *pmu) -{ - int ret = -ENOMEM; - - pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL); - if (!pmu->dev) - goto out; - - device_initialize(pmu->dev); - ret = dev_set_name(pmu->dev, "%s", pmu->name); - if (ret) - goto free_dev; - - dev_set_drvdata(pmu->dev, pmu); - pmu->dev->bus = &pmu_bus; - pmu->dev->release = pmu_dev_release; - ret = device_add(pmu->dev); - if (ret) - goto free_dev; - -out: - return ret; - -free_dev: - put_device(pmu->dev); - goto out; -} - -static struct lock_class_key cpuctx_mutex; - -int perf_pmu_register(struct pmu *pmu, char *name, int type) -{ - int cpu, ret; - - mutex_lock(&pmus_lock); - ret = -ENOMEM; - pmu->pmu_disable_count = alloc_percpu(int); - if (!pmu->pmu_disable_count) - goto unlock; - - pmu->type = -1; - if (!name) - goto skip_type; - pmu->name = name; - - if (type < 0) { - int err = idr_pre_get(&pmu_idr, GFP_KERNEL); - if (!err) - goto free_pdc; - - err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type); - if (err) { - ret = err; - goto free_pdc; - } - } - pmu->type = type; - - if (pmu_bus_running) { - ret = pmu_dev_alloc(pmu); - if (ret) - goto free_idr; - } - -skip_type: - pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); - if (pmu->pmu_cpu_context) - goto got_cpu_context; - - pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); - if (!pmu->pmu_cpu_context) - goto free_dev; - - for_each_possible_cpu(cpu) { - struct perf_cpu_context *cpuctx; - - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - __perf_event_init_context(&cpuctx->ctx); - lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); - cpuctx->ctx.type = cpu_context; - cpuctx->ctx.pmu = pmu; - cpuctx->jiffies_interval = 1; - INIT_LIST_HEAD(&cpuctx->rotation_list); - cpuctx->active_pmu = pmu; - } - -got_cpu_context: - if (!pmu->start_txn) { - if (pmu->pmu_enable) { - /* - * If we have pmu_enable/pmu_disable calls, install - * transaction stubs that use that to try and batch - * hardware accesses. - */ - pmu->start_txn = perf_pmu_start_txn; - pmu->commit_txn = perf_pmu_commit_txn; - pmu->cancel_txn = perf_pmu_cancel_txn; - } else { - pmu->start_txn = perf_pmu_nop_void; - pmu->commit_txn = perf_pmu_nop_int; - pmu->cancel_txn = perf_pmu_nop_void; - } - } - - if (!pmu->pmu_enable) { - pmu->pmu_enable = perf_pmu_nop_void; - pmu->pmu_disable = perf_pmu_nop_void; - } - - list_add_rcu(&pmu->entry, &pmus); - ret = 0; -unlock: - mutex_unlock(&pmus_lock); - - return ret; - -free_dev: - device_del(pmu->dev); - put_device(pmu->dev); - -free_idr: - if (pmu->type >= PERF_TYPE_MAX) - idr_remove(&pmu_idr, pmu->type); - -free_pdc: - free_percpu(pmu->pmu_disable_count); - goto unlock; -} - -void perf_pmu_unregister(struct pmu *pmu) -{ - mutex_lock(&pmus_lock); - list_del_rcu(&pmu->entry); - mutex_unlock(&pmus_lock); - - /* - * We dereference the pmu list under both SRCU and regular RCU, so - * synchronize against both of those. - */ - synchronize_srcu(&pmus_srcu); - synchronize_rcu(); - - free_percpu(pmu->pmu_disable_count); - if (pmu->type >= PERF_TYPE_MAX) - idr_remove(&pmu_idr, pmu->type); - device_del(pmu->dev); - put_device(pmu->dev); - free_pmu_context(pmu); -} - -struct pmu *perf_init_event(struct perf_event *event) -{ - struct pmu *pmu = NULL; - int idx; - int ret; - - idx = srcu_read_lock(&pmus_srcu); - - rcu_read_lock(); - pmu = idr_find(&pmu_idr, event->attr.type); - rcu_read_unlock(); - if (pmu) { - ret = pmu->event_init(event); - if (ret) - pmu = ERR_PTR(ret); - goto unlock; - } - - list_for_each_entry_rcu(pmu, &pmus, entry) { - ret = pmu->event_init(event); - if (!ret) - goto unlock; - - if (ret != -ENOENT) { - pmu = ERR_PTR(ret); - goto unlock; - } - } - pmu = ERR_PTR(-ENOENT); -unlock: - srcu_read_unlock(&pmus_srcu, idx); - - return pmu; -} - -/* - * Allocate and initialize a event structure - */ -static struct perf_event * -perf_event_alloc(struct perf_event_attr *attr, int cpu, - struct task_struct *task, - struct perf_event *group_leader, - struct perf_event *parent_event, - perf_overflow_handler_t overflow_handler) -{ - struct pmu *pmu; - struct perf_event *event; - struct hw_perf_event *hwc; - long err; - - if ((unsigned)cpu >= nr_cpu_ids) { - if (!task || cpu != -1) - return ERR_PTR(-EINVAL); - } - - event = kzalloc(sizeof(*event), GFP_KERNEL); - if (!event) - return ERR_PTR(-ENOMEM); - - /* - * Single events are their own group leaders, with an - * empty sibling list: - */ - if (!group_leader) - group_leader = event; - - mutex_init(&event->child_mutex); - INIT_LIST_HEAD(&event->child_list); - - INIT_LIST_HEAD(&event->group_entry); - INIT_LIST_HEAD(&event->event_entry); - INIT_LIST_HEAD(&event->sibling_list); - init_waitqueue_head(&event->waitq); - init_irq_work(&event->pending, perf_pending_event); - - mutex_init(&event->mmap_mutex); - - event->cpu = cpu; - event->attr = *attr; - event->group_leader = group_leader; - event->pmu = NULL; - event->oncpu = -1; - - event->parent = parent_event; - - event->ns = get_pid_ns(current->nsproxy->pid_ns); - event->id = atomic64_inc_return(&perf_event_id); - - event->state = PERF_EVENT_STATE_INACTIVE; - - if (task) { - event->attach_state = PERF_ATTACH_TASK; -#ifdef CONFIG_HAVE_HW_BREAKPOINT - /* - * hw_breakpoint is a bit difficult here.. - */ - if (attr->type == PERF_TYPE_BREAKPOINT) - event->hw.bp_target = task; -#endif - } - - if (!overflow_handler && parent_event) - overflow_handler = parent_event->overflow_handler; - - event->overflow_handler = overflow_handler; - - if (attr->disabled) - event->state = PERF_EVENT_STATE_OFF; - - pmu = NULL; - - hwc = &event->hw; - hwc->sample_period = attr->sample_period; - if (attr->freq && attr->sample_freq) - hwc->sample_period = 1; - hwc->last_period = hwc->sample_period; - - local64_set(&hwc->period_left, hwc->sample_period); - - /* - * we currently do not support PERF_FORMAT_GROUP on inherited events - */ - if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) - goto done; - - pmu = perf_init_event(event); - -done: - err = 0; - if (!pmu) - err = -EINVAL; - else if (IS_ERR(pmu)) - err = PTR_ERR(pmu); - - if (err) { - if (event->ns) - put_pid_ns(event->ns); - kfree(event); - return ERR_PTR(err); - } - - event->pmu = pmu; - - if (!event->parent) { - if (event->attach_state & PERF_ATTACH_TASK) - jump_label_inc(&perf_sched_events); - if (event->attr.mmap || event->attr.mmap_data) - atomic_inc(&nr_mmap_events); - if (event->attr.comm) - atomic_inc(&nr_comm_events); - if (event->attr.task) - atomic_inc(&nr_task_events); - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { - err = get_callchain_buffers(); - if (err) { - free_event(event); - return ERR_PTR(err); - } - } - } - - return event; -} - -static int perf_copy_attr(struct perf_event_attr __user *uattr, - struct perf_event_attr *attr) -{ - u32 size; - int ret; - - if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0)) - return -EFAULT; - - /* - * zero the full structure, so that a short copy will be nice. - */ - memset(attr, 0, sizeof(*attr)); - - ret = get_user(size, &uattr->size); - if (ret) - return ret; - - if (size > PAGE_SIZE) /* silly large */ - goto err_size; - - if (!size) /* abi compat */ - size = PERF_ATTR_SIZE_VER0; - - if (size < PERF_ATTR_SIZE_VER0) - goto err_size; - - /* - * If we're handed a bigger struct than we know of, - * ensure all the unknown bits are 0 - i.e. new - * user-space does not rely on any kernel feature - * extensions we dont know about yet. - */ - if (size > sizeof(*attr)) { - unsigned char __user *addr; - unsigned char __user *end; - unsigned char val; - - addr = (void __user *)uattr + sizeof(*attr); - end = (void __user *)uattr + size; - - for (; addr < end; addr++) { - ret = get_user(val, addr); - if (ret) - return ret; - if (val) - goto err_size; - } - size = sizeof(*attr); - } - - ret = copy_from_user(attr, uattr, size); - if (ret) - return -EFAULT; - - /* - * If the type exists, the corresponding creation will verify - * the attr->config. - */ - if (attr->type >= PERF_TYPE_MAX) - return -EINVAL; - - if (attr->__reserved_1) - return -EINVAL; - - if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) - return -EINVAL; - - if (attr->read_format & ~(PERF_FORMAT_MAX-1)) - return -EINVAL; - -out: - return ret; - -err_size: - put_user(sizeof(*attr), &uattr->size); - ret = -E2BIG; - goto out; -} - -static int -perf_event_set_output(struct perf_event *event, struct perf_event *output_event) -{ - struct perf_buffer *buffer = NULL, *old_buffer = NULL; - int ret = -EINVAL; - - if (!output_event) - goto set; - - /* don't allow circular references */ - if (event == output_event) - goto out; - - /* - * Don't allow cross-cpu buffers - */ - if (output_event->cpu != event->cpu) - goto out; - - /* - * If its not a per-cpu buffer, it must be the same task. - */ - if (output_event->cpu == -1 && output_event->ctx != event->ctx) - goto out; - -set: - mutex_lock(&event->mmap_mutex); - /* Can't redirect output if we've got an active mmap() */ - if (atomic_read(&event->mmap_count)) - goto unlock; - - if (output_event) { - /* get the buffer we want to redirect to */ - buffer = perf_buffer_get(output_event); - if (!buffer) - goto unlock; - } - - old_buffer = event->buffer; - rcu_assign_pointer(event->buffer, buffer); - ret = 0; -unlock: - mutex_unlock(&event->mmap_mutex); - - if (old_buffer) - perf_buffer_put(old_buffer); -out: - return ret; -} - -/** - * sys_perf_event_open - open a performance event, associate it to a task/cpu - * - * @attr_uptr: event_id type attributes for monitoring/sampling - * @pid: target pid - * @cpu: target cpu - * @group_fd: group leader event fd - */ -SYSCALL_DEFINE5(perf_event_open, - struct perf_event_attr __user *, attr_uptr, - pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) -{ - struct perf_event *group_leader = NULL, *output_event = NULL; - struct perf_event *event, *sibling; - struct perf_event_attr attr; - struct perf_event_context *ctx; - struct file *event_file = NULL; - struct file *group_file = NULL; - struct task_struct *task = NULL; - struct pmu *pmu; - int event_fd; - int move_group = 0; - int fput_needed = 0; - int err; - - /* for future expandability... */ - if (flags & ~PERF_FLAG_ALL) - return -EINVAL; - - err = perf_copy_attr(attr_uptr, &attr); - if (err) - return err; - - if (!attr.exclude_kernel) { - if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) - return -EACCES; - } - - if (attr.freq) { - if (attr.sample_freq > sysctl_perf_event_sample_rate) - return -EINVAL; - } - - /* - * In cgroup mode, the pid argument is used to pass the fd - * opened to the cgroup directory in cgroupfs. The cpu argument - * designates the cpu on which to monitor threads from that - * cgroup. - */ - if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) - return -EINVAL; - - event_fd = get_unused_fd_flags(O_RDWR); - if (event_fd < 0) - return event_fd; - - if (group_fd != -1) { - group_leader = perf_fget_light(group_fd, &fput_needed); - if (IS_ERR(group_leader)) { - err = PTR_ERR(group_leader); - goto err_fd; - } - group_file = group_leader->filp; - if (flags & PERF_FLAG_FD_OUTPUT) - output_event = group_leader; - if (flags & PERF_FLAG_FD_NO_GROUP) - group_leader = NULL; - } - - if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) { - task = find_lively_task_by_vpid(pid); - if (IS_ERR(task)) { - err = PTR_ERR(task); - goto err_group_fd; - } - } - - event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL); - if (IS_ERR(event)) { - err = PTR_ERR(event); - goto err_task; - } - - if (flags & PERF_FLAG_PID_CGROUP) { - err = perf_cgroup_connect(pid, event, &attr, group_leader); - if (err) - goto err_alloc; - /* - * one more event: - * - that has cgroup constraint on event->cpu - * - that may need work on context switch - */ - atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); - jump_label_inc(&perf_sched_events); - } - - /* - * Special case software events and allow them to be part of - * any hardware group. - */ - pmu = event->pmu; - - if (group_leader && - (is_software_event(event) != is_software_event(group_leader))) { - if (is_software_event(event)) { - /* - * If event and group_leader are not both a software - * event, and event is, then group leader is not. - * - * Allow the addition of software events to !software - * groups, this is safe because software events never - * fail to schedule. - */ - pmu = group_leader->pmu; - } else if (is_software_event(group_leader) && - (group_leader->group_flags & PERF_GROUP_SOFTWARE)) { - /* - * In case the group is a pure software group, and we - * try to add a hardware event, move the whole group to - * the hardware context. - */ - move_group = 1; - } - } - - /* - * Get the target context (task or percpu): - */ - ctx = find_get_context(pmu, task, cpu); - if (IS_ERR(ctx)) { - err = PTR_ERR(ctx); - goto err_alloc; - } - - if (task) { - put_task_struct(task); - task = NULL; - } - - /* - * Look up the group leader (we will attach this event to it): - */ - if (group_leader) { - err = -EINVAL; - - /* - * Do not allow a recursive hierarchy (this new sibling - * becoming part of another group-sibling): - */ - if (group_leader->group_leader != group_leader) - goto err_context; - /* - * Do not allow to attach to a group in a different - * task or CPU context: - */ - if (move_group) { - if (group_leader->ctx->type != ctx->type) - goto err_context; - } else { - if (group_leader->ctx != ctx) - goto err_context; - } - - /* - * Only a group leader can be exclusive or pinned - */ - if (attr.exclusive || attr.pinned) - goto err_context; - } - - if (output_event) { - err = perf_event_set_output(event, output_event); - if (err) - goto err_context; - } - - event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); - if (IS_ERR(event_file)) { - err = PTR_ERR(event_file); - goto err_context; - } - - if (move_group) { - struct perf_event_context *gctx = group_leader->ctx; - - mutex_lock(&gctx->mutex); - perf_remove_from_context(group_leader); - list_for_each_entry(sibling, &group_leader->sibling_list, - group_entry) { - perf_remove_from_context(sibling); - put_ctx(gctx); - } - mutex_unlock(&gctx->mutex); - put_ctx(gctx); - } - - event->filp = event_file; - WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); - - if (move_group) { - perf_install_in_context(ctx, group_leader, cpu); - get_ctx(ctx); - list_for_each_entry(sibling, &group_leader->sibling_list, - group_entry) { - perf_install_in_context(ctx, sibling, cpu); - get_ctx(ctx); - } - } - - perf_install_in_context(ctx, event, cpu); - ++ctx->generation; - perf_unpin_context(ctx); - mutex_unlock(&ctx->mutex); - - event->owner = current; - - mutex_lock(¤t->perf_event_mutex); - list_add_tail(&event->owner_entry, ¤t->perf_event_list); - mutex_unlock(¤t->perf_event_mutex); - - /* - * Precalculate sample_data sizes - */ - perf_event__header_size(event); - perf_event__id_header_size(event); - - /* - * Drop the reference on the group_event after placing the - * new event on the sibling_list. This ensures destruction - * of the group leader will find the pointer to itself in - * perf_group_detach(). - */ - fput_light(group_file, fput_needed); - fd_install(event_fd, event_file); - return event_fd; - -err_context: - perf_unpin_context(ctx); - put_ctx(ctx); -err_alloc: - free_event(event); -err_task: - if (task) - put_task_struct(task); -err_group_fd: - fput_light(group_file, fput_needed); -err_fd: - put_unused_fd(event_fd); - return err; -} - -/** - * perf_event_create_kernel_counter - * - * @attr: attributes of the counter to create - * @cpu: cpu in which the counter is bound - * @task: task to profile (NULL for percpu) - */ -struct perf_event * -perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, - struct task_struct *task, - perf_overflow_handler_t overflow_handler) -{ - struct perf_event_context *ctx; - struct perf_event *event; - int err; - - /* - * Get the target context (task or percpu): - */ - - event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler); - if (IS_ERR(event)) { - err = PTR_ERR(event); - goto err; - } - - ctx = find_get_context(event->pmu, task, cpu); - if (IS_ERR(ctx)) { - err = PTR_ERR(ctx); - goto err_free; - } - - event->filp = NULL; - WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); - perf_install_in_context(ctx, event, cpu); - ++ctx->generation; - perf_unpin_context(ctx); - mutex_unlock(&ctx->mutex); - - return event; - -err_free: - free_event(event); -err: - return ERR_PTR(err); -} -EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); - -static void sync_child_event(struct perf_event *child_event, - struct task_struct *child) -{ - struct perf_event *parent_event = child_event->parent; - u64 child_val; - - if (child_event->attr.inherit_stat) - perf_event_read_event(child_event, child); - - child_val = perf_event_count(child_event); - - /* - * Add back the child's count to the parent's count: - */ - atomic64_add(child_val, &parent_event->child_count); - atomic64_add(child_event->total_time_enabled, - &parent_event->child_total_time_enabled); - atomic64_add(child_event->total_time_running, - &parent_event->child_total_time_running); - - /* - * Remove this event from the parent's list - */ - WARN_ON_ONCE(parent_event->ctx->parent_ctx); - mutex_lock(&parent_event->child_mutex); - list_del_init(&child_event->child_list); - mutex_unlock(&parent_event->child_mutex); - - /* - * Release the parent event, if this was the last - * reference to it. - */ - fput(parent_event->filp); -} - -static void -__perf_event_exit_task(struct perf_event *child_event, - struct perf_event_context *child_ctx, - struct task_struct *child) -{ - if (child_event->parent) { - raw_spin_lock_irq(&child_ctx->lock); - perf_group_detach(child_event); - raw_spin_unlock_irq(&child_ctx->lock); - } - - perf_remove_from_context(child_event); - - /* - * It can happen that the parent exits first, and has events - * that are still around due to the child reference. These - * events need to be zapped. - */ - if (child_event->parent) { - sync_child_event(child_event, child); - free_event(child_event); - } -} - -static void perf_event_exit_task_context(struct task_struct *child, int ctxn) -{ - struct perf_event *child_event, *tmp; - struct perf_event_context *child_ctx; - unsigned long flags; - - if (likely(!child->perf_event_ctxp[ctxn])) { - perf_event_task(child, NULL, 0); - return; - } - - local_irq_save(flags); - /* - * We can't reschedule here because interrupts are disabled, - * and either child is current or it is a task that can't be - * scheduled, so we are now safe from rescheduling changing - * our context. - */ - child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]); - task_ctx_sched_out(child_ctx, EVENT_ALL); - - /* - * Take the context lock here so that if find_get_context is - * reading child->perf_event_ctxp, we wait until it has - * incremented the context's refcount before we do put_ctx below. - */ - raw_spin_lock(&child_ctx->lock); - child->perf_event_ctxp[ctxn] = NULL; - /* - * If this context is a clone; unclone it so it can't get - * swapped to another process while we're removing all - * the events from it. - */ - unclone_ctx(child_ctx); - update_context_time(child_ctx); - raw_spin_unlock_irqrestore(&child_ctx->lock, flags); - - /* - * Report the task dead after unscheduling the events so that we - * won't get any samples after PERF_RECORD_EXIT. We can however still - * get a few PERF_RECORD_READ events. - */ - perf_event_task(child, child_ctx, 0); - - /* - * We can recurse on the same lock type through: - * - * __perf_event_exit_task() - * sync_child_event() - * fput(parent_event->filp) - * perf_release() - * mutex_lock(&ctx->mutex) - * - * But since its the parent context it won't be the same instance. - */ - mutex_lock(&child_ctx->mutex); - -again: - list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups, - group_entry) - __perf_event_exit_task(child_event, child_ctx, child); - - list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups, - group_entry) - __perf_event_exit_task(child_event, child_ctx, child); - - /* - * If the last event was a group event, it will have appended all - * its siblings to the list, but we obtained 'tmp' before that which - * will still point to the list head terminating the iteration. - */ - if (!list_empty(&child_ctx->pinned_groups) || - !list_empty(&child_ctx->flexible_groups)) - goto again; - - mutex_unlock(&child_ctx->mutex); - - put_ctx(child_ctx); -} - -/* - * When a child task exits, feed back event values to parent events. - */ -void perf_event_exit_task(struct task_struct *child) -{ - struct perf_event *event, *tmp; - int ctxn; - - mutex_lock(&child->perf_event_mutex); - list_for_each_entry_safe(event, tmp, &child->perf_event_list, - owner_entry) { - list_del_init(&event->owner_entry); - - /* - * Ensure the list deletion is visible before we clear - * the owner, closes a race against perf_release() where - * we need to serialize on the owner->perf_event_mutex. - */ - smp_wmb(); - event->owner = NULL; - } - mutex_unlock(&child->perf_event_mutex); - - for_each_task_context_nr(ctxn) - perf_event_exit_task_context(child, ctxn); -} - -static void perf_free_event(struct perf_event *event, - struct perf_event_context *ctx) -{ - struct perf_event *parent = event->parent; - - if (WARN_ON_ONCE(!parent)) - return; - - mutex_lock(&parent->child_mutex); - list_del_init(&event->child_list); - mutex_unlock(&parent->child_mutex); - - fput(parent->filp); - - perf_group_detach(event); - list_del_event(event, ctx); - free_event(event); -} - -/* - * free an unexposed, unused context as created by inheritance by - * perf_event_init_task below, used by fork() in case of fail. - */ -void perf_event_free_task(struct task_struct *task) -{ - struct perf_event_context *ctx; - struct perf_event *event, *tmp; - int ctxn; - - for_each_task_context_nr(ctxn) { - ctx = task->perf_event_ctxp[ctxn]; - if (!ctx) - continue; - - mutex_lock(&ctx->mutex); -again: - list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, - group_entry) - perf_free_event(event, ctx); - - list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, - group_entry) - perf_free_event(event, ctx); - - if (!list_empty(&ctx->pinned_groups) || - !list_empty(&ctx->flexible_groups)) - goto again; - - mutex_unlock(&ctx->mutex); - - put_ctx(ctx); - } -} - -void perf_event_delayed_put(struct task_struct *task) -{ - int ctxn; - - for_each_task_context_nr(ctxn) - WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); -} - -/* - * inherit a event from parent task to child task: - */ -static struct perf_event * -inherit_event(struct perf_event *parent_event, - struct task_struct *parent, - struct perf_event_context *parent_ctx, - struct task_struct *child, - struct perf_event *group_leader, - struct perf_event_context *child_ctx) -{ - struct perf_event *child_event; - unsigned long flags; - - /* - * Instead of creating recursive hierarchies of events, - * we link inherited events back to the original parent, - * which has a filp for sure, which we use as the reference - * count: - */ - if (parent_event->parent) - parent_event = parent_event->parent; - - child_event = perf_event_alloc(&parent_event->attr, - parent_event->cpu, - child, - group_leader, parent_event, - NULL); - if (IS_ERR(child_event)) - return child_event; - get_ctx(child_ctx); - - /* - * Make the child state follow the state of the parent event, - * not its attr.disabled bit. We hold the parent's mutex, - * so we won't race with perf_event_{en, dis}able_family. - */ - if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) - child_event->state = PERF_EVENT_STATE_INACTIVE; - else - child_event->state = PERF_EVENT_STATE_OFF; - - if (parent_event->attr.freq) { - u64 sample_period = parent_event->hw.sample_period; - struct hw_perf_event *hwc = &child_event->hw; - - hwc->sample_period = sample_period; - hwc->last_period = sample_period; - - local64_set(&hwc->period_left, sample_period); - } - - child_event->ctx = child_ctx; - child_event->overflow_handler = parent_event->overflow_handler; - - /* - * Precalculate sample_data sizes - */ - perf_event__header_size(child_event); - perf_event__id_header_size(child_event); - - /* - * Link it up in the child's context: - */ - raw_spin_lock_irqsave(&child_ctx->lock, flags); - add_event_to_ctx(child_event, child_ctx); - raw_spin_unlock_irqrestore(&child_ctx->lock, flags); - - /* - * Get a reference to the parent filp - we will fput it - * when the child event exits. This is safe to do because - * we are in the parent and we know that the filp still - * exists and has a nonzero count: - */ - atomic_long_inc(&parent_event->filp->f_count); - - /* - * Link this into the parent event's child list - */ - WARN_ON_ONCE(parent_event->ctx->parent_ctx); - mutex_lock(&parent_event->child_mutex); - list_add_tail(&child_event->child_list, &parent_event->child_list); - mutex_unlock(&parent_event->child_mutex); - - return child_event; -} - -static int inherit_group(struct perf_event *parent_event, - struct task_struct *parent, - struct perf_event_context *parent_ctx, - struct task_struct *child, - struct perf_event_context *child_ctx) -{ - struct perf_event *leader; - struct perf_event *sub; - struct perf_event *child_ctr; - - leader = inherit_event(parent_event, parent, parent_ctx, - child, NULL, child_ctx); - if (IS_ERR(leader)) - return PTR_ERR(leader); - list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { - child_ctr = inherit_event(sub, parent, parent_ctx, - child, leader, child_ctx); - if (IS_ERR(child_ctr)) - return PTR_ERR(child_ctr); - } - return 0; -} - -static int -inherit_task_group(struct perf_event *event, struct task_struct *parent, - struct perf_event_context *parent_ctx, - struct task_struct *child, int ctxn, - int *inherited_all) -{ - int ret; - struct perf_event_context *child_ctx; - - if (!event->attr.inherit) { - *inherited_all = 0; - return 0; - } - - child_ctx = child->perf_event_ctxp[ctxn]; - if (!child_ctx) { - /* - * This is executed from the parent task context, so - * inherit events that have been marked for cloning. - * First allocate and initialize a context for the - * child. - */ - - child_ctx = alloc_perf_context(event->pmu, child); - if (!child_ctx) - return -ENOMEM; - - child->perf_event_ctxp[ctxn] = child_ctx; - } - - ret = inherit_group(event, parent, parent_ctx, - child, child_ctx); - - if (ret) - *inherited_all = 0; - - return ret; -} - -/* - * Initialize the perf_event context in task_struct - */ -int perf_event_init_context(struct task_struct *child, int ctxn) -{ - struct perf_event_context *child_ctx, *parent_ctx; - struct perf_event_context *cloned_ctx; - struct perf_event *event; - struct task_struct *parent = current; - int inherited_all = 1; - unsigned long flags; - int ret = 0; - - if (likely(!parent->perf_event_ctxp[ctxn])) - return 0; - - /* - * If the parent's context is a clone, pin it so it won't get - * swapped under us. - */ - parent_ctx = perf_pin_task_context(parent, ctxn); - - /* - * No need to check if parent_ctx != NULL here; since we saw - * it non-NULL earlier, the only reason for it to become NULL - * is if we exit, and since we're currently in the middle of - * a fork we can't be exiting at the same time. - */ - - /* - * Lock the parent list. No need to lock the child - not PID - * hashed yet and not running, so nobody can access it. - */ - mutex_lock(&parent_ctx->mutex); - - /* - * We dont have to disable NMIs - we are only looking at - * the list, not manipulating it: - */ - list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { - ret = inherit_task_group(event, parent, parent_ctx, - child, ctxn, &inherited_all); - if (ret) - break; - } - - /* - * We can't hold ctx->lock when iterating the ->flexible_group list due - * to allocations, but we need to prevent rotation because - * rotate_ctx() will change the list from interrupt context. - */ - raw_spin_lock_irqsave(&parent_ctx->lock, flags); - parent_ctx->rotate_disable = 1; - raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); - - list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { - ret = inherit_task_group(event, parent, parent_ctx, - child, ctxn, &inherited_all); - if (ret) - break; - } - - raw_spin_lock_irqsave(&parent_ctx->lock, flags); - parent_ctx->rotate_disable = 0; - - child_ctx = child->perf_event_ctxp[ctxn]; - - if (child_ctx && inherited_all) { - /* - * Mark the child context as a clone of the parent - * context, or of whatever the parent is a clone of. - * - * Note that if the parent is a clone, the holding of - * parent_ctx->lock avoids it from being uncloned. - */ - cloned_ctx = parent_ctx->parent_ctx; - if (cloned_ctx) { - child_ctx->parent_ctx = cloned_ctx; - child_ctx->parent_gen = parent_ctx->parent_gen; - } else { - child_ctx->parent_ctx = parent_ctx; - child_ctx->parent_gen = parent_ctx->generation; - } - get_ctx(child_ctx->parent_ctx); - } - - raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); - mutex_unlock(&parent_ctx->mutex); - - perf_unpin_context(parent_ctx); - put_ctx(parent_ctx); - - return ret; -} - -/* - * Initialize the perf_event context in task_struct - */ -int perf_event_init_task(struct task_struct *child) -{ - int ctxn, ret; - - memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp)); - mutex_init(&child->perf_event_mutex); - INIT_LIST_HEAD(&child->perf_event_list); - - for_each_task_context_nr(ctxn) { - ret = perf_event_init_context(child, ctxn); - if (ret) - return ret; - } - - return 0; -} - -static void __init perf_event_init_all_cpus(void) -{ - struct swevent_htable *swhash; - int cpu; - - for_each_possible_cpu(cpu) { - swhash = &per_cpu(swevent_htable, cpu); - mutex_init(&swhash->hlist_mutex); - INIT_LIST_HEAD(&per_cpu(rotation_list, cpu)); - } -} - -static void __cpuinit perf_event_init_cpu(int cpu) -{ - struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); - - mutex_lock(&swhash->hlist_mutex); - if (swhash->hlist_refcount > 0) { - struct swevent_hlist *hlist; - - hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu)); - WARN_ON(!hlist); - rcu_assign_pointer(swhash->swevent_hlist, hlist); - } - mutex_unlock(&swhash->hlist_mutex); -} - -#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC -static void perf_pmu_rotate_stop(struct pmu *pmu) -{ - struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - - WARN_ON(!irqs_disabled()); - - list_del_init(&cpuctx->rotation_list); -} - -static void __perf_event_exit_context(void *__info) -{ - struct perf_event_context *ctx = __info; - struct perf_event *event, *tmp; - - perf_pmu_rotate_stop(ctx->pmu); - - list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) - __perf_remove_from_context(event); - list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) - __perf_remove_from_context(event); -} - -static void perf_event_exit_cpu_context(int cpu) -{ - struct perf_event_context *ctx; - struct pmu *pmu; - int idx; - - idx = srcu_read_lock(&pmus_srcu); - list_for_each_entry_rcu(pmu, &pmus, entry) { - ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx; - - mutex_lock(&ctx->mutex); - smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); - mutex_unlock(&ctx->mutex); - } - srcu_read_unlock(&pmus_srcu, idx); -} - -static void perf_event_exit_cpu(int cpu) -{ - struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); - - mutex_lock(&swhash->hlist_mutex); - swevent_hlist_release(swhash); - mutex_unlock(&swhash->hlist_mutex); - - perf_event_exit_cpu_context(cpu); -} -#else -static inline void perf_event_exit_cpu(int cpu) { } -#endif - -static int -perf_reboot(struct notifier_block *notifier, unsigned long val, void *v) -{ - int cpu; - - for_each_online_cpu(cpu) - perf_event_exit_cpu(cpu); - - return NOTIFY_OK; -} - -/* - * Run the perf reboot notifier at the very last possible moment so that - * the generic watchdog code runs as long as possible. - */ -static struct notifier_block perf_reboot_notifier = { - .notifier_call = perf_reboot, - .priority = INT_MIN, -}; - -static int __cpuinit -perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) -{ - unsigned int cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - - case CPU_UP_PREPARE: - case CPU_DOWN_FAILED: - perf_event_init_cpu(cpu); - break; - - case CPU_UP_CANCELED: - case CPU_DOWN_PREPARE: - perf_event_exit_cpu(cpu); - break; - - default: - break; - } - - return NOTIFY_OK; -} - -void __init perf_event_init(void) -{ - int ret; - - idr_init(&pmu_idr); - - perf_event_init_all_cpus(); - init_srcu_struct(&pmus_srcu); - perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE); - perf_pmu_register(&perf_cpu_clock, NULL, -1); - perf_pmu_register(&perf_task_clock, NULL, -1); - perf_tp_register(); - perf_cpu_notifier(perf_cpu_notify); - register_reboot_notifier(&perf_reboot_notifier); - - ret = init_hw_breakpoint(); - WARN(ret, "hw_breakpoint initialization failed with: %d", ret); -} - -static int __init perf_event_sysfs_init(void) -{ - struct pmu *pmu; - int ret; - - mutex_lock(&pmus_lock); - - ret = bus_register(&pmu_bus); - if (ret) - goto unlock; - - list_for_each_entry(pmu, &pmus, entry) { - if (!pmu->name || pmu->type < 0) - continue; - - ret = pmu_dev_alloc(pmu); - WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret); - } - pmu_bus_running = 1; - ret = 0; - -unlock: - mutex_unlock(&pmus_lock); - - return ret; -} -device_initcall(perf_event_sysfs_init); - -#ifdef CONFIG_CGROUP_PERF -static struct cgroup_subsys_state *perf_cgroup_create( - struct cgroup_subsys *ss, struct cgroup *cont) -{ - struct perf_cgroup *jc; - - jc = kzalloc(sizeof(*jc), GFP_KERNEL); - if (!jc) - return ERR_PTR(-ENOMEM); - - jc->info = alloc_percpu(struct perf_cgroup_info); - if (!jc->info) { - kfree(jc); - return ERR_PTR(-ENOMEM); - } - - return &jc->css; -} - -static void perf_cgroup_destroy(struct cgroup_subsys *ss, - struct cgroup *cont) -{ - struct perf_cgroup *jc; - jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), - struct perf_cgroup, css); - free_percpu(jc->info); - kfree(jc); -} - -static int __perf_cgroup_move(void *info) -{ - struct task_struct *task = info; - perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN); - return 0; -} - -static void perf_cgroup_move(struct task_struct *task) -{ - task_function_call(task, __perf_cgroup_move, task); -} - -static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cgrp, struct task_struct *task, - bool threadgroup) -{ - perf_cgroup_move(task); - if (threadgroup) { - struct task_struct *c; - rcu_read_lock(); - list_for_each_entry_rcu(c, &task->thread_group, thread_group) { - perf_cgroup_move(c); - } - rcu_read_unlock(); - } -} - -static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cgrp, struct task_struct *task) -{ - /* - * cgroup_exit() is called in the copy_process() failure path. - * Ignore this case since the task hasn't ran yet, this avoids - * trying to poke a half freed task state from generic code. - */ - if (!(task->flags & PF_EXITING)) - return; - - perf_cgroup_move(task); -} - -struct cgroup_subsys perf_subsys = { - .name = "perf_event", - .subsys_id = perf_subsys_id, - .create = perf_cgroup_create, - .destroy = perf_cgroup_destroy, - .exit = perf_cgroup_exit, - .attach = perf_cgroup_attach, -}; -#endif /* CONFIG_CGROUP_PERF */ -- cgit v1.2.3 From 48dbb6dc86ca5d1b2224937d774c7ba98bc3a485 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 3 May 2011 15:26:43 +0200 Subject: hw breakpoints: Move to kernel/events/ As part of the events sybsystem unification, relocate hw_breakpoint.c into its new destination. Cc: Frederic Weisbecker Signed-off-by: Borislav Petkov --- kernel/Makefile | 1 - kernel/events/Makefile | 3 +- kernel/events/hw_breakpoint.c | 659 ++++++++++++++++++++++++++++++++++++++++++ kernel/hw_breakpoint.c | 659 ------------------------------------------ 4 files changed, 661 insertions(+), 661 deletions(-) create mode 100644 kernel/events/hw_breakpoint.c delete mode 100644 kernel/hw_breakpoint.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 79815306474f..e9cf19155b46 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -105,7 +105,6 @@ obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_PERF_EVENTS) += events/ -obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o obj-$(CONFIG_PADATA) += padata.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o diff --git a/kernel/events/Makefile b/kernel/events/Makefile index 26c00e4570e5..1ce23d3d8394 100644 --- a/kernel/events/Makefile +++ b/kernel/events/Makefile @@ -2,4 +2,5 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_core.o = -pg endif -obj-y += core.o +obj-y := core.o +obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c new file mode 100644 index 000000000000..086adf25a55e --- /dev/null +++ b/kernel/events/hw_breakpoint.c @@ -0,0 +1,659 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) 2007 Alan Stern + * Copyright (C) IBM Corporation, 2009 + * Copyright (C) 2009, Frederic Weisbecker + * + * Thanks to Ingo Molnar for his many suggestions. + * + * Authors: Alan Stern + * K.Prasad + * Frederic Weisbecker + */ + +/* + * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility, + * using the CPU's debug registers. + * This file contains the arch-independent routines. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + + +/* + * Constraints data + */ + +/* Number of pinned cpu breakpoints in a cpu */ +static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned[TYPE_MAX]); + +/* Number of pinned task breakpoints in a cpu */ +static DEFINE_PER_CPU(unsigned int *, nr_task_bp_pinned[TYPE_MAX]); + +/* Number of non-pinned cpu/task breakpoints in a cpu */ +static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]); + +static int nr_slots[TYPE_MAX]; + +/* Keep track of the breakpoints attached to tasks */ +static LIST_HEAD(bp_task_head); + +static int constraints_initialized; + +/* Gather the number of total pinned and un-pinned bp in a cpuset */ +struct bp_busy_slots { + unsigned int pinned; + unsigned int flexible; +}; + +/* Serialize accesses to the above constraints */ +static DEFINE_MUTEX(nr_bp_mutex); + +__weak int hw_breakpoint_weight(struct perf_event *bp) +{ + return 1; +} + +static inline enum bp_type_idx find_slot_idx(struct perf_event *bp) +{ + if (bp->attr.bp_type & HW_BREAKPOINT_RW) + return TYPE_DATA; + + return TYPE_INST; +} + +/* + * Report the maximum number of pinned breakpoints a task + * have in this cpu + */ +static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) +{ + int i; + unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); + + for (i = nr_slots[type] - 1; i >= 0; i--) { + if (tsk_pinned[i] > 0) + return i + 1; + } + + return 0; +} + +/* + * Count the number of breakpoints of the same type and same task. + * The given event must be not on the list. + */ +static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type) +{ + struct task_struct *tsk = bp->hw.bp_target; + struct perf_event *iter; + int count = 0; + + list_for_each_entry(iter, &bp_task_head, hw.bp_list) { + if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type) + count += hw_breakpoint_weight(iter); + } + + return count; +} + +/* + * Report the number of pinned/un-pinned breakpoints we have in + * a given cpu (cpu > -1) or in all of them (cpu = -1). + */ +static void +fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, + enum bp_type_idx type) +{ + int cpu = bp->cpu; + struct task_struct *tsk = bp->hw.bp_target; + + if (cpu >= 0) { + slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu); + if (!tsk) + slots->pinned += max_task_bp_pinned(cpu, type); + else + slots->pinned += task_bp_pinned(bp, type); + slots->flexible = per_cpu(nr_bp_flexible[type], cpu); + + return; + } + + for_each_online_cpu(cpu) { + unsigned int nr; + + nr = per_cpu(nr_cpu_bp_pinned[type], cpu); + if (!tsk) + nr += max_task_bp_pinned(cpu, type); + else + nr += task_bp_pinned(bp, type); + + if (nr > slots->pinned) + slots->pinned = nr; + + nr = per_cpu(nr_bp_flexible[type], cpu); + + if (nr > slots->flexible) + slots->flexible = nr; + } +} + +/* + * For now, continue to consider flexible as pinned, until we can + * ensure no flexible event can ever be scheduled before a pinned event + * in a same cpu. + */ +static void +fetch_this_slot(struct bp_busy_slots *slots, int weight) +{ + slots->pinned += weight; +} + +/* + * Add a pinned breakpoint for the given task in our constraint table + */ +static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable, + enum bp_type_idx type, int weight) +{ + unsigned int *tsk_pinned; + int old_count = 0; + int old_idx = 0; + int idx = 0; + + old_count = task_bp_pinned(bp, type); + old_idx = old_count - 1; + idx = old_idx + weight; + + /* tsk_pinned[n] is the number of tasks having n breakpoints */ + tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); + if (enable) { + tsk_pinned[idx]++; + if (old_count > 0) + tsk_pinned[old_idx]--; + } else { + tsk_pinned[idx]--; + if (old_count > 0) + tsk_pinned[old_idx]++; + } +} + +/* + * Add/remove the given breakpoint in our constraint table + */ +static void +toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, + int weight) +{ + int cpu = bp->cpu; + struct task_struct *tsk = bp->hw.bp_target; + + /* Pinned counter cpu profiling */ + if (!tsk) { + + if (enable) + per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight; + else + per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight; + return; + } + + /* Pinned counter task profiling */ + + if (!enable) + list_del(&bp->hw.bp_list); + + if (cpu >= 0) { + toggle_bp_task_slot(bp, cpu, enable, type, weight); + } else { + for_each_online_cpu(cpu) + toggle_bp_task_slot(bp, cpu, enable, type, weight); + } + + if (enable) + list_add_tail(&bp->hw.bp_list, &bp_task_head); +} + +/* + * Function to perform processor-specific cleanup during unregistration + */ +__weak void arch_unregister_hw_breakpoint(struct perf_event *bp) +{ + /* + * A weak stub function here for those archs that don't define + * it inside arch/.../kernel/hw_breakpoint.c + */ +} + +/* + * Contraints to check before allowing this new breakpoint counter: + * + * == Non-pinned counter == (Considered as pinned for now) + * + * - If attached to a single cpu, check: + * + * (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu) + * + max(per_cpu(nr_task_bp_pinned, cpu)))) < HBP_NUM + * + * -> If there are already non-pinned counters in this cpu, it means + * there is already a free slot for them. + * Otherwise, we check that the maximum number of per task + * breakpoints (for this cpu) plus the number of per cpu breakpoint + * (for this cpu) doesn't cover every registers. + * + * - If attached to every cpus, check: + * + * (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *)) + * + max(per_cpu(nr_task_bp_pinned, *)))) < HBP_NUM + * + * -> This is roughly the same, except we check the number of per cpu + * bp for every cpu and we keep the max one. Same for the per tasks + * breakpoints. + * + * + * == Pinned counter == + * + * - If attached to a single cpu, check: + * + * ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu) + * + max(per_cpu(nr_task_bp_pinned, cpu))) < HBP_NUM + * + * -> Same checks as before. But now the nr_bp_flexible, if any, must keep + * one register at least (or they will never be fed). + * + * - If attached to every cpus, check: + * + * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) + * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM + */ +static int __reserve_bp_slot(struct perf_event *bp) +{ + struct bp_busy_slots slots = {0}; + enum bp_type_idx type; + int weight; + + /* We couldn't initialize breakpoint constraints on boot */ + if (!constraints_initialized) + return -ENOMEM; + + /* Basic checks */ + if (bp->attr.bp_type == HW_BREAKPOINT_EMPTY || + bp->attr.bp_type == HW_BREAKPOINT_INVALID) + return -EINVAL; + + type = find_slot_idx(bp); + weight = hw_breakpoint_weight(bp); + + fetch_bp_busy_slots(&slots, bp, type); + /* + * Simulate the addition of this breakpoint to the constraints + * and see the result. + */ + fetch_this_slot(&slots, weight); + + /* Flexible counters need to keep at least one slot */ + if (slots.pinned + (!!slots.flexible) > nr_slots[type]) + return -ENOSPC; + + toggle_bp_slot(bp, true, type, weight); + + return 0; +} + +int reserve_bp_slot(struct perf_event *bp) +{ + int ret; + + mutex_lock(&nr_bp_mutex); + + ret = __reserve_bp_slot(bp); + + mutex_unlock(&nr_bp_mutex); + + return ret; +} + +static void __release_bp_slot(struct perf_event *bp) +{ + enum bp_type_idx type; + int weight; + + type = find_slot_idx(bp); + weight = hw_breakpoint_weight(bp); + toggle_bp_slot(bp, false, type, weight); +} + +void release_bp_slot(struct perf_event *bp) +{ + mutex_lock(&nr_bp_mutex); + + arch_unregister_hw_breakpoint(bp); + __release_bp_slot(bp); + + mutex_unlock(&nr_bp_mutex); +} + +/* + * Allow the kernel debugger to reserve breakpoint slots without + * taking a lock using the dbg_* variant of for the reserve and + * release breakpoint slots. + */ +int dbg_reserve_bp_slot(struct perf_event *bp) +{ + if (mutex_is_locked(&nr_bp_mutex)) + return -1; + + return __reserve_bp_slot(bp); +} + +int dbg_release_bp_slot(struct perf_event *bp) +{ + if (mutex_is_locked(&nr_bp_mutex)) + return -1; + + __release_bp_slot(bp); + + return 0; +} + +static int validate_hw_breakpoint(struct perf_event *bp) +{ + int ret; + + ret = arch_validate_hwbkpt_settings(bp); + if (ret) + return ret; + + if (arch_check_bp_in_kernelspace(bp)) { + if (bp->attr.exclude_kernel) + return -EINVAL; + /* + * Don't let unprivileged users set a breakpoint in the trap + * path to avoid trap recursion attacks. + */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + } + + return 0; +} + +int register_perf_hw_breakpoint(struct perf_event *bp) +{ + int ret; + + ret = reserve_bp_slot(bp); + if (ret) + return ret; + + ret = validate_hw_breakpoint(bp); + + /* if arch_validate_hwbkpt_settings() fails then release bp slot */ + if (ret) + release_bp_slot(bp); + + return ret; +} + +/** + * register_user_hw_breakpoint - register a hardware breakpoint for user space + * @attr: breakpoint attributes + * @triggered: callback to trigger when we hit the breakpoint + * @tsk: pointer to 'task_struct' of the process to which the address belongs + */ +struct perf_event * +register_user_hw_breakpoint(struct perf_event_attr *attr, + perf_overflow_handler_t triggered, + struct task_struct *tsk) +{ + return perf_event_create_kernel_counter(attr, -1, tsk, triggered); +} +EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); + +/** + * modify_user_hw_breakpoint - modify a user-space hardware breakpoint + * @bp: the breakpoint structure to modify + * @attr: new breakpoint attributes + * @triggered: callback to trigger when we hit the breakpoint + * @tsk: pointer to 'task_struct' of the process to which the address belongs + */ +int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) +{ + u64 old_addr = bp->attr.bp_addr; + u64 old_len = bp->attr.bp_len; + int old_type = bp->attr.bp_type; + int err = 0; + + perf_event_disable(bp); + + bp->attr.bp_addr = attr->bp_addr; + bp->attr.bp_type = attr->bp_type; + bp->attr.bp_len = attr->bp_len; + + if (attr->disabled) + goto end; + + err = validate_hw_breakpoint(bp); + if (!err) + perf_event_enable(bp); + + if (err) { + bp->attr.bp_addr = old_addr; + bp->attr.bp_type = old_type; + bp->attr.bp_len = old_len; + if (!bp->attr.disabled) + perf_event_enable(bp); + + return err; + } + +end: + bp->attr.disabled = attr->disabled; + + return 0; +} +EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); + +/** + * unregister_hw_breakpoint - unregister a user-space hardware breakpoint + * @bp: the breakpoint structure to unregister + */ +void unregister_hw_breakpoint(struct perf_event *bp) +{ + if (!bp) + return; + perf_event_release_kernel(bp); +} +EXPORT_SYMBOL_GPL(unregister_hw_breakpoint); + +/** + * register_wide_hw_breakpoint - register a wide breakpoint in the kernel + * @attr: breakpoint attributes + * @triggered: callback to trigger when we hit the breakpoint + * + * @return a set of per_cpu pointers to perf events + */ +struct perf_event * __percpu * +register_wide_hw_breakpoint(struct perf_event_attr *attr, + perf_overflow_handler_t triggered) +{ + struct perf_event * __percpu *cpu_events, **pevent, *bp; + long err; + int cpu; + + cpu_events = alloc_percpu(typeof(*cpu_events)); + if (!cpu_events) + return (void __percpu __force *)ERR_PTR(-ENOMEM); + + get_online_cpus(); + for_each_online_cpu(cpu) { + pevent = per_cpu_ptr(cpu_events, cpu); + bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered); + + *pevent = bp; + + if (IS_ERR(bp)) { + err = PTR_ERR(bp); + goto fail; + } + } + put_online_cpus(); + + return cpu_events; + +fail: + for_each_online_cpu(cpu) { + pevent = per_cpu_ptr(cpu_events, cpu); + if (IS_ERR(*pevent)) + break; + unregister_hw_breakpoint(*pevent); + } + put_online_cpus(); + + free_percpu(cpu_events); + return (void __percpu __force *)ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); + +/** + * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel + * @cpu_events: the per cpu set of events to unregister + */ +void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) +{ + int cpu; + struct perf_event **pevent; + + for_each_possible_cpu(cpu) { + pevent = per_cpu_ptr(cpu_events, cpu); + unregister_hw_breakpoint(*pevent); + } + free_percpu(cpu_events); +} +EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint); + +static struct notifier_block hw_breakpoint_exceptions_nb = { + .notifier_call = hw_breakpoint_exceptions_notify, + /* we need to be notified first */ + .priority = 0x7fffffff +}; + +static void bp_perf_event_destroy(struct perf_event *event) +{ + release_bp_slot(event); +} + +static int hw_breakpoint_event_init(struct perf_event *bp) +{ + int err; + + if (bp->attr.type != PERF_TYPE_BREAKPOINT) + return -ENOENT; + + err = register_perf_hw_breakpoint(bp); + if (err) + return err; + + bp->destroy = bp_perf_event_destroy; + + return 0; +} + +static int hw_breakpoint_add(struct perf_event *bp, int flags) +{ + if (!(flags & PERF_EF_START)) + bp->hw.state = PERF_HES_STOPPED; + + return arch_install_hw_breakpoint(bp); +} + +static void hw_breakpoint_del(struct perf_event *bp, int flags) +{ + arch_uninstall_hw_breakpoint(bp); +} + +static void hw_breakpoint_start(struct perf_event *bp, int flags) +{ + bp->hw.state = 0; +} + +static void hw_breakpoint_stop(struct perf_event *bp, int flags) +{ + bp->hw.state = PERF_HES_STOPPED; +} + +static struct pmu perf_breakpoint = { + .task_ctx_nr = perf_sw_context, /* could eventually get its own */ + + .event_init = hw_breakpoint_event_init, + .add = hw_breakpoint_add, + .del = hw_breakpoint_del, + .start = hw_breakpoint_start, + .stop = hw_breakpoint_stop, + .read = hw_breakpoint_pmu_read, +}; + +int __init init_hw_breakpoint(void) +{ + unsigned int **task_bp_pinned; + int cpu, err_cpu; + int i; + + for (i = 0; i < TYPE_MAX; i++) + nr_slots[i] = hw_breakpoint_slots(i); + + for_each_possible_cpu(cpu) { + for (i = 0; i < TYPE_MAX; i++) { + task_bp_pinned = &per_cpu(nr_task_bp_pinned[i], cpu); + *task_bp_pinned = kzalloc(sizeof(int) * nr_slots[i], + GFP_KERNEL); + if (!*task_bp_pinned) + goto err_alloc; + } + } + + constraints_initialized = 1; + + perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT); + + return register_die_notifier(&hw_breakpoint_exceptions_nb); + + err_alloc: + for_each_possible_cpu(err_cpu) { + if (err_cpu == cpu) + break; + for (i = 0; i < TYPE_MAX; i++) + kfree(per_cpu(nr_task_bp_pinned[i], cpu)); + } + + return -ENOMEM; +} + + diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c deleted file mode 100644 index 086adf25a55e..000000000000 --- a/kernel/hw_breakpoint.c +++ /dev/null @@ -1,659 +0,0 @@ -/* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright (C) 2007 Alan Stern - * Copyright (C) IBM Corporation, 2009 - * Copyright (C) 2009, Frederic Weisbecker - * - * Thanks to Ingo Molnar for his many suggestions. - * - * Authors: Alan Stern - * K.Prasad - * Frederic Weisbecker - */ - -/* - * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility, - * using the CPU's debug registers. - * This file contains the arch-independent routines. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - - -/* - * Constraints data - */ - -/* Number of pinned cpu breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned[TYPE_MAX]); - -/* Number of pinned task breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int *, nr_task_bp_pinned[TYPE_MAX]); - -/* Number of non-pinned cpu/task breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]); - -static int nr_slots[TYPE_MAX]; - -/* Keep track of the breakpoints attached to tasks */ -static LIST_HEAD(bp_task_head); - -static int constraints_initialized; - -/* Gather the number of total pinned and un-pinned bp in a cpuset */ -struct bp_busy_slots { - unsigned int pinned; - unsigned int flexible; -}; - -/* Serialize accesses to the above constraints */ -static DEFINE_MUTEX(nr_bp_mutex); - -__weak int hw_breakpoint_weight(struct perf_event *bp) -{ - return 1; -} - -static inline enum bp_type_idx find_slot_idx(struct perf_event *bp) -{ - if (bp->attr.bp_type & HW_BREAKPOINT_RW) - return TYPE_DATA; - - return TYPE_INST; -} - -/* - * Report the maximum number of pinned breakpoints a task - * have in this cpu - */ -static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) -{ - int i; - unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); - - for (i = nr_slots[type] - 1; i >= 0; i--) { - if (tsk_pinned[i] > 0) - return i + 1; - } - - return 0; -} - -/* - * Count the number of breakpoints of the same type and same task. - * The given event must be not on the list. - */ -static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type) -{ - struct task_struct *tsk = bp->hw.bp_target; - struct perf_event *iter; - int count = 0; - - list_for_each_entry(iter, &bp_task_head, hw.bp_list) { - if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type) - count += hw_breakpoint_weight(iter); - } - - return count; -} - -/* - * Report the number of pinned/un-pinned breakpoints we have in - * a given cpu (cpu > -1) or in all of them (cpu = -1). - */ -static void -fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, - enum bp_type_idx type) -{ - int cpu = bp->cpu; - struct task_struct *tsk = bp->hw.bp_target; - - if (cpu >= 0) { - slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu); - if (!tsk) - slots->pinned += max_task_bp_pinned(cpu, type); - else - slots->pinned += task_bp_pinned(bp, type); - slots->flexible = per_cpu(nr_bp_flexible[type], cpu); - - return; - } - - for_each_online_cpu(cpu) { - unsigned int nr; - - nr = per_cpu(nr_cpu_bp_pinned[type], cpu); - if (!tsk) - nr += max_task_bp_pinned(cpu, type); - else - nr += task_bp_pinned(bp, type); - - if (nr > slots->pinned) - slots->pinned = nr; - - nr = per_cpu(nr_bp_flexible[type], cpu); - - if (nr > slots->flexible) - slots->flexible = nr; - } -} - -/* - * For now, continue to consider flexible as pinned, until we can - * ensure no flexible event can ever be scheduled before a pinned event - * in a same cpu. - */ -static void -fetch_this_slot(struct bp_busy_slots *slots, int weight) -{ - slots->pinned += weight; -} - -/* - * Add a pinned breakpoint for the given task in our constraint table - */ -static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable, - enum bp_type_idx type, int weight) -{ - unsigned int *tsk_pinned; - int old_count = 0; - int old_idx = 0; - int idx = 0; - - old_count = task_bp_pinned(bp, type); - old_idx = old_count - 1; - idx = old_idx + weight; - - /* tsk_pinned[n] is the number of tasks having n breakpoints */ - tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); - if (enable) { - tsk_pinned[idx]++; - if (old_count > 0) - tsk_pinned[old_idx]--; - } else { - tsk_pinned[idx]--; - if (old_count > 0) - tsk_pinned[old_idx]++; - } -} - -/* - * Add/remove the given breakpoint in our constraint table - */ -static void -toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, - int weight) -{ - int cpu = bp->cpu; - struct task_struct *tsk = bp->hw.bp_target; - - /* Pinned counter cpu profiling */ - if (!tsk) { - - if (enable) - per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight; - else - per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight; - return; - } - - /* Pinned counter task profiling */ - - if (!enable) - list_del(&bp->hw.bp_list); - - if (cpu >= 0) { - toggle_bp_task_slot(bp, cpu, enable, type, weight); - } else { - for_each_online_cpu(cpu) - toggle_bp_task_slot(bp, cpu, enable, type, weight); - } - - if (enable) - list_add_tail(&bp->hw.bp_list, &bp_task_head); -} - -/* - * Function to perform processor-specific cleanup during unregistration - */ -__weak void arch_unregister_hw_breakpoint(struct perf_event *bp) -{ - /* - * A weak stub function here for those archs that don't define - * it inside arch/.../kernel/hw_breakpoint.c - */ -} - -/* - * Contraints to check before allowing this new breakpoint counter: - * - * == Non-pinned counter == (Considered as pinned for now) - * - * - If attached to a single cpu, check: - * - * (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu) - * + max(per_cpu(nr_task_bp_pinned, cpu)))) < HBP_NUM - * - * -> If there are already non-pinned counters in this cpu, it means - * there is already a free slot for them. - * Otherwise, we check that the maximum number of per task - * breakpoints (for this cpu) plus the number of per cpu breakpoint - * (for this cpu) doesn't cover every registers. - * - * - If attached to every cpus, check: - * - * (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *)) - * + max(per_cpu(nr_task_bp_pinned, *)))) < HBP_NUM - * - * -> This is roughly the same, except we check the number of per cpu - * bp for every cpu and we keep the max one. Same for the per tasks - * breakpoints. - * - * - * == Pinned counter == - * - * - If attached to a single cpu, check: - * - * ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu) - * + max(per_cpu(nr_task_bp_pinned, cpu))) < HBP_NUM - * - * -> Same checks as before. But now the nr_bp_flexible, if any, must keep - * one register at least (or they will never be fed). - * - * - If attached to every cpus, check: - * - * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) - * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM - */ -static int __reserve_bp_slot(struct perf_event *bp) -{ - struct bp_busy_slots slots = {0}; - enum bp_type_idx type; - int weight; - - /* We couldn't initialize breakpoint constraints on boot */ - if (!constraints_initialized) - return -ENOMEM; - - /* Basic checks */ - if (bp->attr.bp_type == HW_BREAKPOINT_EMPTY || - bp->attr.bp_type == HW_BREAKPOINT_INVALID) - return -EINVAL; - - type = find_slot_idx(bp); - weight = hw_breakpoint_weight(bp); - - fetch_bp_busy_slots(&slots, bp, type); - /* - * Simulate the addition of this breakpoint to the constraints - * and see the result. - */ - fetch_this_slot(&slots, weight); - - /* Flexible counters need to keep at least one slot */ - if (slots.pinned + (!!slots.flexible) > nr_slots[type]) - return -ENOSPC; - - toggle_bp_slot(bp, true, type, weight); - - return 0; -} - -int reserve_bp_slot(struct perf_event *bp) -{ - int ret; - - mutex_lock(&nr_bp_mutex); - - ret = __reserve_bp_slot(bp); - - mutex_unlock(&nr_bp_mutex); - - return ret; -} - -static void __release_bp_slot(struct perf_event *bp) -{ - enum bp_type_idx type; - int weight; - - type = find_slot_idx(bp); - weight = hw_breakpoint_weight(bp); - toggle_bp_slot(bp, false, type, weight); -} - -void release_bp_slot(struct perf_event *bp) -{ - mutex_lock(&nr_bp_mutex); - - arch_unregister_hw_breakpoint(bp); - __release_bp_slot(bp); - - mutex_unlock(&nr_bp_mutex); -} - -/* - * Allow the kernel debugger to reserve breakpoint slots without - * taking a lock using the dbg_* variant of for the reserve and - * release breakpoint slots. - */ -int dbg_reserve_bp_slot(struct perf_event *bp) -{ - if (mutex_is_locked(&nr_bp_mutex)) - return -1; - - return __reserve_bp_slot(bp); -} - -int dbg_release_bp_slot(struct perf_event *bp) -{ - if (mutex_is_locked(&nr_bp_mutex)) - return -1; - - __release_bp_slot(bp); - - return 0; -} - -static int validate_hw_breakpoint(struct perf_event *bp) -{ - int ret; - - ret = arch_validate_hwbkpt_settings(bp); - if (ret) - return ret; - - if (arch_check_bp_in_kernelspace(bp)) { - if (bp->attr.exclude_kernel) - return -EINVAL; - /* - * Don't let unprivileged users set a breakpoint in the trap - * path to avoid trap recursion attacks. - */ - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - } - - return 0; -} - -int register_perf_hw_breakpoint(struct perf_event *bp) -{ - int ret; - - ret = reserve_bp_slot(bp); - if (ret) - return ret; - - ret = validate_hw_breakpoint(bp); - - /* if arch_validate_hwbkpt_settings() fails then release bp slot */ - if (ret) - release_bp_slot(bp); - - return ret; -} - -/** - * register_user_hw_breakpoint - register a hardware breakpoint for user space - * @attr: breakpoint attributes - * @triggered: callback to trigger when we hit the breakpoint - * @tsk: pointer to 'task_struct' of the process to which the address belongs - */ -struct perf_event * -register_user_hw_breakpoint(struct perf_event_attr *attr, - perf_overflow_handler_t triggered, - struct task_struct *tsk) -{ - return perf_event_create_kernel_counter(attr, -1, tsk, triggered); -} -EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); - -/** - * modify_user_hw_breakpoint - modify a user-space hardware breakpoint - * @bp: the breakpoint structure to modify - * @attr: new breakpoint attributes - * @triggered: callback to trigger when we hit the breakpoint - * @tsk: pointer to 'task_struct' of the process to which the address belongs - */ -int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) -{ - u64 old_addr = bp->attr.bp_addr; - u64 old_len = bp->attr.bp_len; - int old_type = bp->attr.bp_type; - int err = 0; - - perf_event_disable(bp); - - bp->attr.bp_addr = attr->bp_addr; - bp->attr.bp_type = attr->bp_type; - bp->attr.bp_len = attr->bp_len; - - if (attr->disabled) - goto end; - - err = validate_hw_breakpoint(bp); - if (!err) - perf_event_enable(bp); - - if (err) { - bp->attr.bp_addr = old_addr; - bp->attr.bp_type = old_type; - bp->attr.bp_len = old_len; - if (!bp->attr.disabled) - perf_event_enable(bp); - - return err; - } - -end: - bp->attr.disabled = attr->disabled; - - return 0; -} -EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); - -/** - * unregister_hw_breakpoint - unregister a user-space hardware breakpoint - * @bp: the breakpoint structure to unregister - */ -void unregister_hw_breakpoint(struct perf_event *bp) -{ - if (!bp) - return; - perf_event_release_kernel(bp); -} -EXPORT_SYMBOL_GPL(unregister_hw_breakpoint); - -/** - * register_wide_hw_breakpoint - register a wide breakpoint in the kernel - * @attr: breakpoint attributes - * @triggered: callback to trigger when we hit the breakpoint - * - * @return a set of per_cpu pointers to perf events - */ -struct perf_event * __percpu * -register_wide_hw_breakpoint(struct perf_event_attr *attr, - perf_overflow_handler_t triggered) -{ - struct perf_event * __percpu *cpu_events, **pevent, *bp; - long err; - int cpu; - - cpu_events = alloc_percpu(typeof(*cpu_events)); - if (!cpu_events) - return (void __percpu __force *)ERR_PTR(-ENOMEM); - - get_online_cpus(); - for_each_online_cpu(cpu) { - pevent = per_cpu_ptr(cpu_events, cpu); - bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered); - - *pevent = bp; - - if (IS_ERR(bp)) { - err = PTR_ERR(bp); - goto fail; - } - } - put_online_cpus(); - - return cpu_events; - -fail: - for_each_online_cpu(cpu) { - pevent = per_cpu_ptr(cpu_events, cpu); - if (IS_ERR(*pevent)) - break; - unregister_hw_breakpoint(*pevent); - } - put_online_cpus(); - - free_percpu(cpu_events); - return (void __percpu __force *)ERR_PTR(err); -} -EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); - -/** - * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel - * @cpu_events: the per cpu set of events to unregister - */ -void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) -{ - int cpu; - struct perf_event **pevent; - - for_each_possible_cpu(cpu) { - pevent = per_cpu_ptr(cpu_events, cpu); - unregister_hw_breakpoint(*pevent); - } - free_percpu(cpu_events); -} -EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint); - -static struct notifier_block hw_breakpoint_exceptions_nb = { - .notifier_call = hw_breakpoint_exceptions_notify, - /* we need to be notified first */ - .priority = 0x7fffffff -}; - -static void bp_perf_event_destroy(struct perf_event *event) -{ - release_bp_slot(event); -} - -static int hw_breakpoint_event_init(struct perf_event *bp) -{ - int err; - - if (bp->attr.type != PERF_TYPE_BREAKPOINT) - return -ENOENT; - - err = register_perf_hw_breakpoint(bp); - if (err) - return err; - - bp->destroy = bp_perf_event_destroy; - - return 0; -} - -static int hw_breakpoint_add(struct perf_event *bp, int flags) -{ - if (!(flags & PERF_EF_START)) - bp->hw.state = PERF_HES_STOPPED; - - return arch_install_hw_breakpoint(bp); -} - -static void hw_breakpoint_del(struct perf_event *bp, int flags) -{ - arch_uninstall_hw_breakpoint(bp); -} - -static void hw_breakpoint_start(struct perf_event *bp, int flags) -{ - bp->hw.state = 0; -} - -static void hw_breakpoint_stop(struct perf_event *bp, int flags) -{ - bp->hw.state = PERF_HES_STOPPED; -} - -static struct pmu perf_breakpoint = { - .task_ctx_nr = perf_sw_context, /* could eventually get its own */ - - .event_init = hw_breakpoint_event_init, - .add = hw_breakpoint_add, - .del = hw_breakpoint_del, - .start = hw_breakpoint_start, - .stop = hw_breakpoint_stop, - .read = hw_breakpoint_pmu_read, -}; - -int __init init_hw_breakpoint(void) -{ - unsigned int **task_bp_pinned; - int cpu, err_cpu; - int i; - - for (i = 0; i < TYPE_MAX; i++) - nr_slots[i] = hw_breakpoint_slots(i); - - for_each_possible_cpu(cpu) { - for (i = 0; i < TYPE_MAX; i++) { - task_bp_pinned = &per_cpu(nr_task_bp_pinned[i], cpu); - *task_bp_pinned = kzalloc(sizeof(int) * nr_slots[i], - GFP_KERNEL); - if (!*task_bp_pinned) - goto err_alloc; - } - } - - constraints_initialized = 1; - - perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT); - - return register_die_notifier(&hw_breakpoint_exceptions_nb); - - err_alloc: - for_each_possible_cpu(err_cpu) { - if (err_cpu == cpu) - break; - for (i = 0; i < TYPE_MAX; i++) - kfree(per_cpu(nr_task_bp_pinned[i], cpu)); - } - - return -ENOMEM; -} - - -- cgit v1.2.3 From e7e7ee2eab2080248084d71fe0a115ab745eb2aa Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 4 May 2011 08:42:29 +0200 Subject: perf events: Clean up definitions and initializers, update copyrights Fix a few inconsistent style bits that were added over the past few months. Cc: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-yv4hwf9yhnzoada8pcpb3a97@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 96 +++++++++++++++++++++------------------------- kernel/events/core.c | 40 +++++++++---------- 2 files changed, 64 insertions(+), 72 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 9eec53d97370..207c16976a17 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -2,8 +2,8 @@ * Performance events: * * Copyright (C) 2008-2009, Thomas Gleixner - * Copyright (C) 2008-2009, Red Hat, Inc., Ingo Molnar - * Copyright (C) 2008-2009, Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008-2011, Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008-2011, Red Hat, Inc., Peter Zijlstra * * Data type definitions, declarations, prototypes. * @@ -468,9 +468,9 @@ enum perf_callchain_context { PERF_CONTEXT_MAX = (__u64)-4095, }; -#define PERF_FLAG_FD_NO_GROUP (1U << 0) -#define PERF_FLAG_FD_OUTPUT (1U << 1) -#define PERF_FLAG_PID_CGROUP (1U << 2) /* pid=cgroup id, per-cpu mode only */ +#define PERF_FLAG_FD_NO_GROUP (1U << 0) +#define PERF_FLAG_FD_OUTPUT (1U << 1) +#define PERF_FLAG_PID_CGROUP (1U << 2) /* pid=cgroup id, per-cpu mode only */ #ifdef __KERNEL__ /* @@ -484,9 +484,9 @@ enum perf_callchain_context { #endif struct perf_guest_info_callbacks { - int (*is_in_guest) (void); - int (*is_user_mode) (void); - unsigned long (*get_guest_ip) (void); + int (*is_in_guest)(void); + int (*is_user_mode)(void); + unsigned long (*get_guest_ip)(void); }; #ifdef CONFIG_HAVE_HW_BREAKPOINT @@ -652,19 +652,19 @@ struct pmu { * Start the transaction, after this ->add() doesn't need to * do schedulability tests. */ - void (*start_txn) (struct pmu *pmu); /* optional */ + void (*start_txn) (struct pmu *pmu); /* optional */ /* * If ->start_txn() disabled the ->add() schedulability test * then ->commit_txn() is required to perform one. On success * the transaction is closed. On error the transaction is kept * open until ->cancel_txn() is called. */ - int (*commit_txn) (struct pmu *pmu); /* optional */ + int (*commit_txn) (struct pmu *pmu); /* optional */ /* * Will cancel the transaction, assumes ->del() is called * for each successful ->add() during the transaction. */ - void (*cancel_txn) (struct pmu *pmu); /* optional */ + void (*cancel_txn) (struct pmu *pmu); /* optional */ }; /** @@ -712,15 +712,15 @@ typedef void (*perf_overflow_handler_t)(struct perf_event *, int, struct pt_regs *regs); enum perf_group_flag { - PERF_GROUP_SOFTWARE = 0x1, + PERF_GROUP_SOFTWARE = 0x1, }; -#define SWEVENT_HLIST_BITS 8 -#define SWEVENT_HLIST_SIZE (1 << SWEVENT_HLIST_BITS) +#define SWEVENT_HLIST_BITS 8 +#define SWEVENT_HLIST_SIZE (1 << SWEVENT_HLIST_BITS) struct swevent_hlist { - struct hlist_head heads[SWEVENT_HLIST_SIZE]; - struct rcu_head rcu_head; + struct hlist_head heads[SWEVENT_HLIST_SIZE]; + struct rcu_head rcu_head; }; #define PERF_ATTACH_CONTEXT 0x01 @@ -733,13 +733,13 @@ struct swevent_hlist { * This is a per-cpu dynamically allocated data structure. */ struct perf_cgroup_info { - u64 time; - u64 timestamp; + u64 time; + u64 timestamp; }; struct perf_cgroup { - struct cgroup_subsys_state css; - struct perf_cgroup_info *info; /* timing info, one per cpu */ + struct cgroup_subsys_state css; + struct perf_cgroup_info *info; /* timing info, one per cpu */ }; #endif @@ -923,7 +923,7 @@ struct perf_event_context { /* * Number of contexts where an event can trigger: - * task, softirq, hardirq, nmi. + * task, softirq, hardirq, nmi. */ #define PERF_NR_CONTEXTS 4 @@ -1001,8 +1001,7 @@ struct perf_sample_data { struct perf_raw_record *raw; }; -static inline -void perf_sample_data_init(struct perf_sample_data *data, u64 addr) +static inline void perf_sample_data_init(struct perf_sample_data *data, u64 addr) { data->addr = addr; data->raw = NULL; @@ -1039,8 +1038,7 @@ extern struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; extern void __perf_sw_event(u32, u64, int, struct pt_regs *, u64); #ifndef perf_arch_fetch_caller_regs -static inline void -perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip) { } +static inline void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip) { } #endif /* @@ -1080,8 +1078,7 @@ static inline void perf_event_task_sched_in(struct task_struct *task) __perf_event_task_sched_in(task); } -static inline -void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next) +static inline void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next) { perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); @@ -1099,14 +1096,10 @@ extern void perf_event_fork(struct task_struct *tsk); /* Callchains */ DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry); -extern void perf_callchain_user(struct perf_callchain_entry *entry, - struct pt_regs *regs); -extern void perf_callchain_kernel(struct perf_callchain_entry *entry, - struct pt_regs *regs); - +extern void perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs); +extern void perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs); -static inline void -perf_callchain_store(struct perf_callchain_entry *entry, u64 ip) +static inline void perf_callchain_store(struct perf_callchain_entry *entry, u64 ip) { if (entry->nr < PERF_MAX_STACK_DEPTH) entry->ip[entry->nr++] = ip; @@ -1142,9 +1135,9 @@ extern void perf_tp_event(u64 addr, u64 count, void *record, extern void perf_bp_event(struct perf_event *event, void *data); #ifndef perf_misc_flags -#define perf_misc_flags(regs) (user_mode(regs) ? PERF_RECORD_MISC_USER : \ - PERF_RECORD_MISC_KERNEL) -#define perf_instruction_pointer(regs) instruction_pointer(regs) +# define perf_misc_flags(regs) \ + (user_mode(regs) ? PERF_RECORD_MISC_USER : PERF_RECORD_MISC_KERNEL) +# define perf_instruction_pointer(regs) instruction_pointer(regs) #endif extern int perf_output_begin(struct perf_output_handle *handle, @@ -1179,9 +1172,9 @@ static inline void perf_bp_event(struct perf_event *event, void *data) { } static inline int perf_register_guest_info_callbacks -(struct perf_guest_info_callbacks *callbacks) { return 0; } +(struct perf_guest_info_callbacks *callbacks) { return 0; } static inline int perf_unregister_guest_info_callbacks -(struct perf_guest_info_callbacks *callbacks) { return 0; } +(struct perf_guest_info_callbacks *callbacks) { return 0; } static inline void perf_event_mmap(struct vm_area_struct *vma) { } static inline void perf_event_comm(struct task_struct *tsk) { } @@ -1194,23 +1187,22 @@ static inline void perf_event_disable(struct perf_event *event) { } static inline void perf_event_task_tick(void) { } #endif -#define perf_output_put(handle, x) \ - perf_output_copy((handle), &(x), sizeof(x)) +#define perf_output_put(handle, x) perf_output_copy((handle), &(x), sizeof(x)) /* * This has to have a higher priority than migration_notifier in sched.c. */ -#define perf_cpu_notifier(fn) \ -do { \ - static struct notifier_block fn##_nb __cpuinitdata = \ - { .notifier_call = fn, .priority = CPU_PRI_PERF }; \ - fn(&fn##_nb, (unsigned long)CPU_UP_PREPARE, \ - (void *)(unsigned long)smp_processor_id()); \ - fn(&fn##_nb, (unsigned long)CPU_STARTING, \ - (void *)(unsigned long)smp_processor_id()); \ - fn(&fn##_nb, (unsigned long)CPU_ONLINE, \ - (void *)(unsigned long)smp_processor_id()); \ - register_cpu_notifier(&fn##_nb); \ +#define perf_cpu_notifier(fn) \ +do { \ + static struct notifier_block fn##_nb __cpuinitdata = \ + { .notifier_call = fn, .priority = CPU_PRI_PERF }; \ + fn(&fn##_nb, (unsigned long)CPU_UP_PREPARE, \ + (void *)(unsigned long)smp_processor_id()); \ + fn(&fn##_nb, (unsigned long)CPU_STARTING, \ + (void *)(unsigned long)smp_processor_id()); \ + fn(&fn##_nb, (unsigned long)CPU_ONLINE, \ + (void *)(unsigned long)smp_processor_id()); \ + register_cpu_notifier(&fn##_nb); \ } while (0) #endif /* __KERNEL__ */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 440bc485bbff..0fc34a370ba4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2,8 +2,8 @@ * Performance events core code: * * Copyright (C) 2008 Thomas Gleixner - * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. * * For licensing details see kernel-base/COPYING @@ -39,10 +39,10 @@ #include struct remote_function_call { - struct task_struct *p; - int (*func)(void *info); - void *info; - int ret; + struct task_struct *p; + int (*func)(void *info); + void *info; + int ret; }; static void remote_function(void *data) @@ -76,10 +76,10 @@ static int task_function_call(struct task_struct *p, int (*func) (void *info), void *info) { struct remote_function_call data = { - .p = p, - .func = func, - .info = info, - .ret = -ESRCH, /* No such (running) process */ + .p = p, + .func = func, + .info = info, + .ret = -ESRCH, /* No such (running) process */ }; if (task_curr(p)) @@ -100,10 +100,10 @@ task_function_call(struct task_struct *p, int (*func) (void *info), void *info) static int cpu_function_call(int cpu, int (*func) (void *info), void *info) { struct remote_function_call data = { - .p = NULL, - .func = func, - .info = info, - .ret = -ENXIO, /* No such CPU */ + .p = NULL, + .func = func, + .info = info, + .ret = -ENXIO, /* No such CPU */ }; smp_call_function_single(cpu, remote_function, &data, 1); @@ -7445,11 +7445,11 @@ static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, } struct cgroup_subsys perf_subsys = { - .name = "perf_event", - .subsys_id = perf_subsys_id, - .create = perf_cgroup_create, - .destroy = perf_cgroup_destroy, - .exit = perf_cgroup_exit, - .attach = perf_cgroup_attach, + .name = "perf_event", + .subsys_id = perf_subsys_id, + .create = perf_cgroup_create, + .destroy = perf_cgroup_destroy, + .exit = perf_cgroup_exit, + .attach = perf_cgroup_attach, }; #endif /* CONFIG_CGROUP_PERF */ -- cgit v1.2.3 From b448c4e3ae6d20108dba1d7833f2c0d3dbad87ce Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 29 Apr 2011 15:12:32 -0400 Subject: ftrace: Replace FTRACE_FL_NOTRACE flag with a hash of ignored functions To prepare for the accounting system that will allow multiple users of the function tracer, having the FTRACE_FL_NOTRACE as a flag in the dyn_trace record does not make sense. All ftrace_ops will soon have a hash of functions they should trace and not trace. By making a global hash of functions not to trace makes this easier for the transition. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 1 - kernel/trace/ftrace.c | 176 +++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 150 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 32047449b309..fe0a90a09243 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -149,7 +149,6 @@ enum { FTRACE_FL_FREE = (1 << 0), FTRACE_FL_FILTER = (1 << 1), FTRACE_FL_ENABLED = (1 << 2), - FTRACE_FL_NOTRACE = (1 << 3), }; struct dyn_ftrace { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d3406346ced6..04c002a491fb 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -57,6 +57,7 @@ /* hash bits for specific function selection */ #define FTRACE_HASH_BITS 7 #define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS) +#define FTRACE_HASH_MAX_BITS 10 /* ftrace_enabled is a method to turn ftrace on or off */ int ftrace_enabled __read_mostly; @@ -865,6 +866,22 @@ enum { FTRACE_START_FUNC_RET = (1 << 3), FTRACE_STOP_FUNC_RET = (1 << 4), }; +struct ftrace_func_entry { + struct hlist_node hlist; + unsigned long ip; +}; + +struct ftrace_hash { + unsigned long size_bits; + struct hlist_head *buckets; + unsigned long count; +}; + +static struct hlist_head notrace_buckets[1 << FTRACE_HASH_MAX_BITS]; +static struct ftrace_hash notrace_hash = { + .size_bits = FTRACE_HASH_MAX_BITS, + .buckets = notrace_buckets, +}; static int ftrace_filtered; @@ -889,6 +906,79 @@ static struct ftrace_page *ftrace_pages; static struct dyn_ftrace *ftrace_free_records; +static struct ftrace_func_entry * +ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) +{ + unsigned long key; + struct ftrace_func_entry *entry; + struct hlist_head *hhd; + struct hlist_node *n; + + if (!hash->count) + return NULL; + + if (hash->size_bits > 0) + key = hash_long(ip, hash->size_bits); + else + key = 0; + + hhd = &hash->buckets[key]; + + hlist_for_each_entry_rcu(entry, n, hhd, hlist) { + if (entry->ip == ip) + return entry; + } + return NULL; +} + +static int add_hash_entry(struct ftrace_hash *hash, unsigned long ip) +{ + struct ftrace_func_entry *entry; + struct hlist_head *hhd; + unsigned long key; + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return -ENOMEM; + + if (hash->size_bits) + key = hash_long(ip, hash->size_bits); + else + key = 0; + + entry->ip = ip; + hhd = &hash->buckets[key]; + hlist_add_head(&entry->hlist, hhd); + hash->count++; + + return 0; +} + +static void +remove_hash_entry(struct ftrace_hash *hash, + struct ftrace_func_entry *entry) +{ + hlist_del(&entry->hlist); + kfree(entry); + hash->count--; +} + +static void ftrace_hash_clear(struct ftrace_hash *hash) +{ + struct hlist_head *hhd; + struct hlist_node *tp, *tn; + struct ftrace_func_entry *entry; + int size = 1 << hash->size_bits; + int i; + + for (i = 0; i < size; i++) { + hhd = &hash->buckets[i]; + hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) + remove_hash_entry(hash, entry); + } + FTRACE_WARN_ON(hash->count); +} + /* * This is a double for. Do not use 'break' to break out of the loop, * you must use a goto. @@ -1032,7 +1122,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) * If we want to enable it and filtering is on, enable it only if * it's filtered */ - if (enable && !(rec->flags & FTRACE_FL_NOTRACE)) { + if (enable && !ftrace_lookup_ip(¬race_hash, rec->ip)) { if (!ftrace_filtered || (rec->flags & FTRACE_FL_FILTER)) flag = FTRACE_FL_ENABLED; } @@ -1465,7 +1555,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) !(rec->flags & FTRACE_FL_FILTER)) || ((iter->flags & FTRACE_ITER_NOTRACE) && - !(rec->flags & FTRACE_FL_NOTRACE))) { + !ftrace_lookup_ip(¬race_hash, rec->ip))) { rec = NULL; goto retry; } @@ -1609,14 +1699,15 @@ static void ftrace_filter_reset(int enable) { struct ftrace_page *pg; struct dyn_ftrace *rec; - unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; mutex_lock(&ftrace_lock); - if (enable) + if (enable) { ftrace_filtered = 0; - do_for_each_ftrace_rec(pg, rec) { - rec->flags &= ~type; - } while_for_each_ftrace_rec(); + do_for_each_ftrace_rec(pg, rec) { + rec->flags &= ~FTRACE_FL_FILTER; + } while_for_each_ftrace_rec(); + } else + ftrace_hash_clear(¬race_hash); mutex_unlock(&ftrace_lock); } @@ -1716,13 +1807,36 @@ static int ftrace_match(char *str, char *regex, int len, int type) return matched; } -static void -update_record(struct dyn_ftrace *rec, unsigned long flag, int not) +static int +update_record(struct dyn_ftrace *rec, int enable, int not) { - if (not) - rec->flags &= ~flag; - else - rec->flags |= flag; + struct ftrace_func_entry *entry; + struct ftrace_hash *hash = ¬race_hash; + int ret = 0; + + if (enable) { + if (not) + rec->flags &= ~FTRACE_FL_FILTER; + else + rec->flags |= FTRACE_FL_FILTER; + } else { + if (not) { + /* Do nothing if it doesn't exist */ + entry = ftrace_lookup_ip(hash, rec->ip); + if (!entry) + return 0; + + remove_hash_entry(hash, entry); + } else { + /* Do nothing if it exists */ + entry = ftrace_lookup_ip(hash, rec->ip); + if (entry) + return 0; + + ret = add_hash_entry(hash, rec->ip); + } + } + return ret; } static int @@ -1754,16 +1868,14 @@ static int match_records(char *buff, int len, char *mod, int enable, int not) struct dyn_ftrace *rec; int type = MATCH_FULL; char *search = buff; - unsigned long flag; int found = 0; + int ret; if (len) { type = filter_parse_regex(buff, len, &search, ¬); search_len = strlen(search); } - flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; - mutex_lock(&ftrace_lock); if (unlikely(ftrace_disabled)) @@ -1772,7 +1884,11 @@ static int match_records(char *buff, int len, char *mod, int enable, int not) do_for_each_ftrace_rec(pg, rec) { if (ftrace_match_record(rec, mod, search, search_len, type)) { - update_record(rec, flag, not); + ret = update_record(rec, enable, not); + if (ret < 0) { + found = ret; + goto out_unlock; + } found = 1; } /* @@ -1821,6 +1937,7 @@ static int ftrace_mod_callback(char *func, char *cmd, char *param, int enable) { char *mod; + int ret = -EINVAL; /* * cmd == 'mod' because we only registered this func @@ -1832,15 +1949,19 @@ ftrace_mod_callback(char *func, char *cmd, char *param, int enable) /* we must have a module name */ if (!param) - return -EINVAL; + return ret; mod = strsep(¶m, ":"); if (!strlen(mod)) - return -EINVAL; + return ret; - if (ftrace_match_module_records(func, mod, enable)) - return 0; - return -EINVAL; + ret = ftrace_match_module_records(func, mod, enable); + if (!ret) + ret = -EINVAL; + if (ret < 0) + return ret; + + return 0; } static struct ftrace_func_command ftrace_mod_cmd = { @@ -2132,14 +2253,17 @@ static int ftrace_process_regex(char *buff, int len, int enable) { char *func, *command, *next = buff; struct ftrace_func_command *p; - int ret = -EINVAL; + int ret; func = strsep(&next, ":"); if (!next) { - if (ftrace_match_records(func, len, enable)) - return 0; - return ret; + ret = ftrace_match_records(func, len, enable); + if (!ret) + ret = -EINVAL; + if (ret < 0) + return ret; + return 0; } /* command found */ -- cgit v1.2.3 From 1cf41dd79993389b012e4542ab502ce36ae7343f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 29 Apr 2011 20:59:51 -0400 Subject: ftrace: Use hash instead for FTRACE_FL_FILTER When multiple users are allowed to have their own set of functions to trace, having the FTRACE_FL_FILTER flag will not be enough to handle the accounting of those users. Each user will need their own set of functions. Replace the FTRACE_FL_FILTER with a filter_hash instead. This is temporary until the rest of the function filtering accounting gets in. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 3 +- kernel/trace/ftrace.c | 151 ++++++++++++++++++++++--------------------------- 2 files changed, 70 insertions(+), 84 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index fe0a90a09243..52fc5d4e6edd 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -147,8 +147,7 @@ extern int ftrace_text_reserved(void *start, void *end); enum { FTRACE_FL_FREE = (1 << 0), - FTRACE_FL_FILTER = (1 << 1), - FTRACE_FL_ENABLED = (1 << 2), + FTRACE_FL_ENABLED = (1 << 1), }; struct dyn_ftrace { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 04c002a491fb..222eca4c3022 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -883,7 +883,11 @@ static struct ftrace_hash notrace_hash = { .buckets = notrace_buckets, }; -static int ftrace_filtered; +static struct hlist_head filter_buckets[1 << FTRACE_HASH_MAX_BITS]; +static struct ftrace_hash filter_hash = { + .size_bits = FTRACE_HASH_MAX_BITS, + .buckets = filter_buckets, +}; static struct dyn_ftrace *ftrace_new_addrs; @@ -1123,7 +1127,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) * it's filtered */ if (enable && !ftrace_lookup_ip(¬race_hash, rec->ip)) { - if (!ftrace_filtered || (rec->flags & FTRACE_FL_FILTER)) + if (!filter_hash.count || ftrace_lookup_ip(&filter_hash, rec->ip)) flag = FTRACE_FL_ENABLED; } @@ -1430,6 +1434,7 @@ struct ftrace_iterator { struct dyn_ftrace *func; struct ftrace_func_probe *probe; struct trace_parser parser; + struct ftrace_hash *hash; int hidx; int idx; unsigned flags; @@ -1552,7 +1557,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) if ((rec->flags & FTRACE_FL_FREE) || ((iter->flags & FTRACE_ITER_FILTER) && - !(rec->flags & FTRACE_FL_FILTER)) || + !(ftrace_lookup_ip(&filter_hash, rec->ip))) || ((iter->flags & FTRACE_ITER_NOTRACE) && !ftrace_lookup_ip(¬race_hash, rec->ip))) { @@ -1598,7 +1603,7 @@ static void *t_start(struct seq_file *m, loff_t *pos) * off, we can short cut and just print out that all * functions are enabled. */ - if (iter->flags & FTRACE_ITER_FILTER && !ftrace_filtered) { + if (iter->flags & FTRACE_ITER_FILTER && !filter_hash.count) { if (*pos > 0) return t_hash_start(m, pos); iter->flags |= FTRACE_ITER_PRINTALL; @@ -1695,24 +1700,16 @@ ftrace_avail_open(struct inode *inode, struct file *file) return ret; } -static void ftrace_filter_reset(int enable) +static void ftrace_filter_reset(struct ftrace_hash *hash) { - struct ftrace_page *pg; - struct dyn_ftrace *rec; - mutex_lock(&ftrace_lock); - if (enable) { - ftrace_filtered = 0; - do_for_each_ftrace_rec(pg, rec) { - rec->flags &= ~FTRACE_FL_FILTER; - } while_for_each_ftrace_rec(); - } else - ftrace_hash_clear(¬race_hash); + ftrace_hash_clear(hash); mutex_unlock(&ftrace_lock); } static int -ftrace_regex_open(struct inode *inode, struct file *file, int enable) +ftrace_regex_open(struct ftrace_hash *hash, int flag, + struct inode *inode, struct file *file) { struct ftrace_iterator *iter; int ret = 0; @@ -1729,15 +1726,16 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable) return -ENOMEM; } + iter->hash = hash; + mutex_lock(&ftrace_regex_lock); if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) - ftrace_filter_reset(enable); + ftrace_filter_reset(hash); if (file->f_mode & FMODE_READ) { iter->pg = ftrace_pages_start; - iter->flags = enable ? FTRACE_ITER_FILTER : - FTRACE_ITER_NOTRACE; + iter->flags = flag; ret = seq_open(file, &show_ftrace_seq_ops); if (!ret) { @@ -1757,13 +1755,15 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable) static int ftrace_filter_open(struct inode *inode, struct file *file) { - return ftrace_regex_open(inode, file, 1); + return ftrace_regex_open(&filter_hash, FTRACE_ITER_FILTER, + inode, file); } static int ftrace_notrace_open(struct inode *inode, struct file *file) { - return ftrace_regex_open(inode, file, 0); + return ftrace_regex_open(¬race_hash, FTRACE_ITER_NOTRACE, + inode, file); } static loff_t @@ -1808,33 +1808,24 @@ static int ftrace_match(char *str, char *regex, int len, int type) } static int -update_record(struct dyn_ftrace *rec, int enable, int not) +enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int not) { struct ftrace_func_entry *entry; - struct ftrace_hash *hash = ¬race_hash; int ret = 0; - if (enable) { - if (not) - rec->flags &= ~FTRACE_FL_FILTER; - else - rec->flags |= FTRACE_FL_FILTER; - } else { - if (not) { - /* Do nothing if it doesn't exist */ - entry = ftrace_lookup_ip(hash, rec->ip); - if (!entry) - return 0; + entry = ftrace_lookup_ip(hash, rec->ip); + if (not) { + /* Do nothing if it doesn't exist */ + if (!entry) + return 0; - remove_hash_entry(hash, entry); - } else { - /* Do nothing if it exists */ - entry = ftrace_lookup_ip(hash, rec->ip); - if (entry) - return 0; + remove_hash_entry(hash, entry); + } else { + /* Do nothing if it exists */ + if (entry) + return 0; - ret = add_hash_entry(hash, rec->ip); - } + ret = add_hash_entry(hash, rec->ip); } return ret; } @@ -1861,7 +1852,9 @@ ftrace_match_record(struct dyn_ftrace *rec, char *mod, return ftrace_match(str, regex, len, type); } -static int match_records(char *buff, int len, char *mod, int enable, int not) +static int +match_records(struct ftrace_hash *hash, char *buff, + int len, char *mod, int not) { unsigned search_len = 0; struct ftrace_page *pg; @@ -1884,20 +1877,13 @@ static int match_records(char *buff, int len, char *mod, int enable, int not) do_for_each_ftrace_rec(pg, rec) { if (ftrace_match_record(rec, mod, search, search_len, type)) { - ret = update_record(rec, enable, not); + ret = enter_record(hash, rec, not); if (ret < 0) { found = ret; goto out_unlock; } found = 1; } - /* - * Only enable filtering if we have a function that - * is filtered on. - */ - if (enable && (rec->flags & FTRACE_FL_FILTER)) - ftrace_filtered = 1; - } while_for_each_ftrace_rec(); out_unlock: mutex_unlock(&ftrace_lock); @@ -1906,12 +1892,13 @@ static int match_records(char *buff, int len, char *mod, int enable, int not) } static int -ftrace_match_records(char *buff, int len, int enable) +ftrace_match_records(struct ftrace_hash *hash, char *buff, int len) { - return match_records(buff, len, NULL, enable, 0); + return match_records(hash, buff, len, NULL, 0); } -static int ftrace_match_module_records(char *buff, char *mod, int enable) +static int +ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod) { int not = 0; @@ -1925,7 +1912,7 @@ static int ftrace_match_module_records(char *buff, char *mod, int enable) not = 1; } - return match_records(buff, strlen(buff), mod, enable, not); + return match_records(hash, buff, strlen(buff), mod, not); } /* @@ -1936,6 +1923,7 @@ static int ftrace_match_module_records(char *buff, char *mod, int enable) static int ftrace_mod_callback(char *func, char *cmd, char *param, int enable) { + struct ftrace_hash *hash; char *mod; int ret = -EINVAL; @@ -1955,7 +1943,12 @@ ftrace_mod_callback(char *func, char *cmd, char *param, int enable) if (!strlen(mod)) return ret; - ret = ftrace_match_module_records(func, mod, enable); + if (enable) + hash = &filter_hash; + else + hash = ¬race_hash; + + ret = ftrace_match_module_records(hash, func, mod); if (!ret) ret = -EINVAL; if (ret < 0) @@ -2253,12 +2246,18 @@ static int ftrace_process_regex(char *buff, int len, int enable) { char *func, *command, *next = buff; struct ftrace_func_command *p; + struct ftrace_hash *hash; int ret; + if (enable) + hash = &filter_hash; + else + hash = ¬race_hash; + func = strsep(&next, ":"); if (!next) { - ret = ftrace_match_records(func, len, enable); + ret = ftrace_match_records(hash, func, len); if (!ret) ret = -EINVAL; if (ret < 0) @@ -2340,16 +2339,16 @@ ftrace_notrace_write(struct file *file, const char __user *ubuf, } static void -ftrace_set_regex(unsigned char *buf, int len, int reset, int enable) +ftrace_set_regex(struct ftrace_hash *hash, unsigned char *buf, int len, int reset) { if (unlikely(ftrace_disabled)) return; mutex_lock(&ftrace_regex_lock); if (reset) - ftrace_filter_reset(enable); + ftrace_filter_reset(hash); if (buf) - ftrace_match_records(buf, len, enable); + ftrace_match_records(hash, buf, len); mutex_unlock(&ftrace_regex_lock); } @@ -2364,7 +2363,7 @@ ftrace_set_regex(unsigned char *buf, int len, int reset, int enable) */ void ftrace_set_filter(unsigned char *buf, int len, int reset) { - ftrace_set_regex(buf, len, reset, 1); + ftrace_set_regex(&filter_hash, buf, len, reset); } /** @@ -2379,7 +2378,7 @@ void ftrace_set_filter(unsigned char *buf, int len, int reset) */ void ftrace_set_notrace(unsigned char *buf, int len, int reset) { - ftrace_set_regex(buf, len, reset, 0); + ftrace_set_regex(¬race_hash, buf, len, reset); } /* @@ -2431,22 +2430,22 @@ static void __init set_ftrace_early_graph(char *buf) } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -static void __init set_ftrace_early_filter(char *buf, int enable) +static void __init set_ftrace_early_filter(struct ftrace_hash *hash, char *buf) { char *func; while (buf) { func = strsep(&buf, ","); - ftrace_set_regex(func, strlen(func), 0, enable); + ftrace_set_regex(hash, func, strlen(func), 0); } } static void __init set_ftrace_early_filters(void) { if (ftrace_filter_buf[0]) - set_ftrace_early_filter(ftrace_filter_buf, 1); + set_ftrace_early_filter(&filter_hash, ftrace_filter_buf); if (ftrace_notrace_buf[0]) - set_ftrace_early_filter(ftrace_notrace_buf, 0); + set_ftrace_early_filter(¬race_hash, ftrace_notrace_buf); #ifdef CONFIG_FUNCTION_GRAPH_TRACER if (ftrace_graph_buf[0]) set_ftrace_early_graph(ftrace_graph_buf); @@ -2454,7 +2453,7 @@ static void __init set_ftrace_early_filters(void) } static int -ftrace_regex_release(struct inode *inode, struct file *file, int enable) +ftrace_regex_release(struct inode *inode, struct file *file) { struct seq_file *m = (struct seq_file *)file->private_data; struct ftrace_iterator *iter; @@ -2471,7 +2470,7 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) parser = &iter->parser; if (trace_parser_loaded(parser)) { parser->buffer[parser->idx] = 0; - ftrace_match_records(parser->buffer, parser->idx, enable); + ftrace_match_records(iter->hash, parser->buffer, parser->idx); } trace_parser_put(parser); @@ -2488,18 +2487,6 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) return 0; } -static int -ftrace_filter_release(struct inode *inode, struct file *file) -{ - return ftrace_regex_release(inode, file, 1); -} - -static int -ftrace_notrace_release(struct inode *inode, struct file *file) -{ - return ftrace_regex_release(inode, file, 0); -} - static const struct file_operations ftrace_avail_fops = { .open = ftrace_avail_open, .read = seq_read, @@ -2512,7 +2499,7 @@ static const struct file_operations ftrace_filter_fops = { .read = seq_read, .write = ftrace_filter_write, .llseek = ftrace_regex_lseek, - .release = ftrace_filter_release, + .release = ftrace_regex_release, }; static const struct file_operations ftrace_notrace_fops = { @@ -2520,7 +2507,7 @@ static const struct file_operations ftrace_notrace_fops = { .read = seq_read, .write = ftrace_notrace_write, .llseek = ftrace_regex_lseek, - .release = ftrace_notrace_release, + .release = ftrace_regex_release, }; #ifdef CONFIG_FUNCTION_GRAPH_TRACER -- cgit v1.2.3 From f45948e898e7bc76a73a468796d2ce80dd040058 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 2 May 2011 12:29:25 -0400 Subject: ftrace: Create a global_ops to hold the filter and notrace hashes Combine the filter and notrace hashes to be accessed by a single entity, the global_ops. The global_ops is a ftrace_ops structure that is passed to different functions that can read or modify the filtering of the function tracer. The ftrace_ops structure was modified to hold a filter and notrace hashes so that later patches may allow each ftrace_ops to have its own set of rules to what functions may be filtered. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 10 ++++++-- kernel/trace/ftrace.c | 65 +++++++++++++++++++++++++++++++++++--------------- 2 files changed, 54 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 52fc5d4e6edd..6658a51390fe 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -29,9 +29,15 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip); +struct ftrace_hash; + struct ftrace_ops { - ftrace_func_t func; - struct ftrace_ops *next; + ftrace_func_t func; + struct ftrace_ops *next; +#ifdef CONFIG_DYNAMIC_FTRACE + struct ftrace_hash *notrace_hash; + struct ftrace_hash *filter_hash; +#endif }; extern int function_trace_stop; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 222eca4c3022..a517a6c40645 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -889,6 +889,12 @@ static struct ftrace_hash filter_hash = { .buckets = filter_buckets, }; +struct ftrace_ops global_ops = { + .func = ftrace_stub, + .notrace_hash = ¬race_hash, + .filter_hash = &filter_hash, +}; + static struct dyn_ftrace *ftrace_new_addrs; static DEFINE_MUTEX(ftrace_regex_lock); @@ -1112,6 +1118,7 @@ int ftrace_text_reserved(void *start, void *end) static int __ftrace_replace_code(struct dyn_ftrace *rec, int enable) { + struct ftrace_ops *ops = &global_ops; unsigned long ftrace_addr; unsigned long flag = 0UL; @@ -1126,8 +1133,9 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) * If we want to enable it and filtering is on, enable it only if * it's filtered */ - if (enable && !ftrace_lookup_ip(¬race_hash, rec->ip)) { - if (!filter_hash.count || ftrace_lookup_ip(&filter_hash, rec->ip)) + if (enable && !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) { + if (!ops->filter_hash->count || + ftrace_lookup_ip(ops->filter_hash, rec->ip)) flag = FTRACE_FL_ENABLED; } @@ -1531,6 +1539,7 @@ static void * t_next(struct seq_file *m, void *v, loff_t *pos) { struct ftrace_iterator *iter = m->private; + struct ftrace_ops *ops = &global_ops; struct dyn_ftrace *rec = NULL; if (unlikely(ftrace_disabled)) @@ -1557,10 +1566,10 @@ t_next(struct seq_file *m, void *v, loff_t *pos) if ((rec->flags & FTRACE_FL_FREE) || ((iter->flags & FTRACE_ITER_FILTER) && - !(ftrace_lookup_ip(&filter_hash, rec->ip))) || + !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) || ((iter->flags & FTRACE_ITER_NOTRACE) && - !ftrace_lookup_ip(¬race_hash, rec->ip))) { + !ftrace_lookup_ip(ops->notrace_hash, rec->ip))) { rec = NULL; goto retry; } @@ -1584,6 +1593,7 @@ static void reset_iter_read(struct ftrace_iterator *iter) static void *t_start(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; + struct ftrace_ops *ops = &global_ops; void *p = NULL; loff_t l; @@ -1603,7 +1613,7 @@ static void *t_start(struct seq_file *m, loff_t *pos) * off, we can short cut and just print out that all * functions are enabled. */ - if (iter->flags & FTRACE_ITER_FILTER && !filter_hash.count) { + if (iter->flags & FTRACE_ITER_FILTER && !ops->filter_hash->count) { if (*pos > 0) return t_hash_start(m, pos); iter->flags |= FTRACE_ITER_PRINTALL; @@ -1708,10 +1718,11 @@ static void ftrace_filter_reset(struct ftrace_hash *hash) } static int -ftrace_regex_open(struct ftrace_hash *hash, int flag, +ftrace_regex_open(struct ftrace_ops *ops, int flag, struct inode *inode, struct file *file) { struct ftrace_iterator *iter; + struct ftrace_hash *hash; int ret = 0; if (unlikely(ftrace_disabled)) @@ -1726,6 +1737,11 @@ ftrace_regex_open(struct ftrace_hash *hash, int flag, return -ENOMEM; } + if (flag & FTRACE_ITER_NOTRACE) + hash = ops->notrace_hash; + else + hash = ops->filter_hash; + iter->hash = hash; mutex_lock(&ftrace_regex_lock); @@ -1755,14 +1771,14 @@ ftrace_regex_open(struct ftrace_hash *hash, int flag, static int ftrace_filter_open(struct inode *inode, struct file *file) { - return ftrace_regex_open(&filter_hash, FTRACE_ITER_FILTER, + return ftrace_regex_open(&global_ops, FTRACE_ITER_FILTER, inode, file); } static int ftrace_notrace_open(struct inode *inode, struct file *file) { - return ftrace_regex_open(¬race_hash, FTRACE_ITER_NOTRACE, + return ftrace_regex_open(&global_ops, FTRACE_ITER_NOTRACE, inode, file); } @@ -1923,6 +1939,7 @@ ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod) static int ftrace_mod_callback(char *func, char *cmd, char *param, int enable) { + struct ftrace_ops *ops = &global_ops; struct ftrace_hash *hash; char *mod; int ret = -EINVAL; @@ -1944,9 +1961,9 @@ ftrace_mod_callback(char *func, char *cmd, char *param, int enable) return ret; if (enable) - hash = &filter_hash; + hash = ops->filter_hash; else - hash = ¬race_hash; + hash = ops->notrace_hash; ret = ftrace_match_module_records(hash, func, mod); if (!ret) @@ -2245,14 +2262,15 @@ int unregister_ftrace_command(struct ftrace_func_command *cmd) static int ftrace_process_regex(char *buff, int len, int enable) { char *func, *command, *next = buff; + struct ftrace_ops *ops = &global_ops; struct ftrace_func_command *p; struct ftrace_hash *hash; int ret; if (enable) - hash = &filter_hash; + hash = ops->filter_hash; else - hash = ¬race_hash; + hash = ops->notrace_hash; func = strsep(&next, ":"); @@ -2339,11 +2357,19 @@ ftrace_notrace_write(struct file *file, const char __user *ubuf, } static void -ftrace_set_regex(struct ftrace_hash *hash, unsigned char *buf, int len, int reset) +ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, + int reset, int enable) { + struct ftrace_hash *hash; + if (unlikely(ftrace_disabled)) return; + if (enable) + hash = ops->filter_hash; + else + hash = ops->notrace_hash; + mutex_lock(&ftrace_regex_lock); if (reset) ftrace_filter_reset(hash); @@ -2363,7 +2389,7 @@ ftrace_set_regex(struct ftrace_hash *hash, unsigned char *buf, int len, int rese */ void ftrace_set_filter(unsigned char *buf, int len, int reset) { - ftrace_set_regex(&filter_hash, buf, len, reset); + ftrace_set_regex(&global_ops, buf, len, reset, 1); } /** @@ -2378,7 +2404,7 @@ void ftrace_set_filter(unsigned char *buf, int len, int reset) */ void ftrace_set_notrace(unsigned char *buf, int len, int reset) { - ftrace_set_regex(¬race_hash, buf, len, reset); + ftrace_set_regex(&global_ops, buf, len, reset, 0); } /* @@ -2430,22 +2456,23 @@ static void __init set_ftrace_early_graph(char *buf) } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -static void __init set_ftrace_early_filter(struct ftrace_hash *hash, char *buf) +static void __init +set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable) { char *func; while (buf) { func = strsep(&buf, ","); - ftrace_set_regex(hash, func, strlen(func), 0); + ftrace_set_regex(ops, func, strlen(func), 0, enable); } } static void __init set_ftrace_early_filters(void) { if (ftrace_filter_buf[0]) - set_ftrace_early_filter(&filter_hash, ftrace_filter_buf); + set_ftrace_early_filter(&global_ops, ftrace_filter_buf, 1); if (ftrace_notrace_buf[0]) - set_ftrace_early_filter(¬race_hash, ftrace_notrace_buf); + set_ftrace_early_filter(&global_ops, ftrace_notrace_buf, 0); #ifdef CONFIG_FUNCTION_GRAPH_TRACER if (ftrace_graph_buf[0]) set_ftrace_early_graph(ftrace_graph_buf); -- cgit v1.2.3 From 33dc9b1267d59cef46ff0bd6bc043190845dc919 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 2 May 2011 17:34:47 -0400 Subject: ftrace: Separate hash allocation and assignment When filtering, allocate a hash to insert the function records. After the filtering is complete, assign it to the ftrace_ops structure. This allows the ftrace_ops structure to have a much smaller array of hash buckets instead of wasting a lot of memory. A read only empty_hash is created to be the minimum size that any ftrace_ops can point to. When a new hash is created, it has the following steps: o Allocate a default hash. o Walk the function records assigning the filtered records to the hash o Allocate a new hash with the appropriate size buckets o Move the entries from the default hash to the new hash. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 275 ++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 233 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a517a6c40645..46f08264980b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -57,7 +57,8 @@ /* hash bits for specific function selection */ #define FTRACE_HASH_BITS 7 #define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS) -#define FTRACE_HASH_MAX_BITS 10 +#define FTRACE_HASH_DEFAULT_BITS 10 +#define FTRACE_HASH_MAX_BITS 12 /* ftrace_enabled is a method to turn ftrace on or off */ int ftrace_enabled __read_mostly; @@ -877,22 +878,22 @@ struct ftrace_hash { unsigned long count; }; -static struct hlist_head notrace_buckets[1 << FTRACE_HASH_MAX_BITS]; -static struct ftrace_hash notrace_hash = { - .size_bits = FTRACE_HASH_MAX_BITS, - .buckets = notrace_buckets, -}; - -static struct hlist_head filter_buckets[1 << FTRACE_HASH_MAX_BITS]; -static struct ftrace_hash filter_hash = { - .size_bits = FTRACE_HASH_MAX_BITS, - .buckets = filter_buckets, +/* + * We make these constant because no one should touch them, + * but they are used as the default "empty hash", to avoid allocating + * it all the time. These are in a read only section such that if + * anyone does try to modify it, it will cause an exception. + */ +static const struct hlist_head empty_buckets[1]; +static const struct ftrace_hash empty_hash = { + .buckets = (struct hlist_head *)empty_buckets, }; +#define EMPTY_HASH ((struct ftrace_hash *)&empty_hash) struct ftrace_ops global_ops = { .func = ftrace_stub, - .notrace_hash = ¬race_hash, - .filter_hash = &filter_hash, + .notrace_hash = EMPTY_HASH, + .filter_hash = EMPTY_HASH, }; static struct dyn_ftrace *ftrace_new_addrs; @@ -941,31 +942,38 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) return NULL; } -static int add_hash_entry(struct ftrace_hash *hash, unsigned long ip) +static void __add_hash_entry(struct ftrace_hash *hash, + struct ftrace_func_entry *entry) { - struct ftrace_func_entry *entry; struct hlist_head *hhd; unsigned long key; - entry = kmalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) - return -ENOMEM; - if (hash->size_bits) - key = hash_long(ip, hash->size_bits); + key = hash_long(entry->ip, hash->size_bits); else key = 0; - entry->ip = ip; hhd = &hash->buckets[key]; hlist_add_head(&entry->hlist, hhd); hash->count++; +} + +static int add_hash_entry(struct ftrace_hash *hash, unsigned long ip) +{ + struct ftrace_func_entry *entry; + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return -ENOMEM; + + entry->ip = ip; + __add_hash_entry(hash, entry); return 0; } static void -remove_hash_entry(struct ftrace_hash *hash, +free_hash_entry(struct ftrace_hash *hash, struct ftrace_func_entry *entry) { hlist_del(&entry->hlist); @@ -973,6 +981,14 @@ remove_hash_entry(struct ftrace_hash *hash, hash->count--; } +static void +remove_hash_entry(struct ftrace_hash *hash, + struct ftrace_func_entry *entry) +{ + hlist_del(&entry->hlist); + hash->count--; +} + static void ftrace_hash_clear(struct ftrace_hash *hash) { struct hlist_head *hhd; @@ -981,14 +997,156 @@ static void ftrace_hash_clear(struct ftrace_hash *hash) int size = 1 << hash->size_bits; int i; + if (!hash->count) + return; + for (i = 0; i < size; i++) { hhd = &hash->buckets[i]; hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) - remove_hash_entry(hash, entry); + free_hash_entry(hash, entry); } FTRACE_WARN_ON(hash->count); } +static void free_ftrace_hash(struct ftrace_hash *hash) +{ + if (!hash || hash == EMPTY_HASH) + return; + ftrace_hash_clear(hash); + kfree(hash->buckets); + kfree(hash); +} + +static struct ftrace_hash *alloc_ftrace_hash(int size_bits) +{ + struct ftrace_hash *hash; + int size; + + hash = kzalloc(sizeof(*hash), GFP_KERNEL); + if (!hash) + return NULL; + + size = 1 << size_bits; + hash->buckets = kzalloc(sizeof(*hash->buckets) * size, GFP_KERNEL); + + if (!hash->buckets) { + kfree(hash); + return NULL; + } + + hash->size_bits = size_bits; + + return hash; +} + +static struct ftrace_hash * +alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) +{ + struct ftrace_func_entry *entry; + struct ftrace_hash *new_hash; + struct hlist_node *tp; + int size; + int ret; + int i; + + new_hash = alloc_ftrace_hash(size_bits); + if (!new_hash) + return NULL; + + /* Empty hash? */ + if (!hash || !hash->count) + return new_hash; + + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, tp, &hash->buckets[i], hlist) { + ret = add_hash_entry(new_hash, entry->ip); + if (ret < 0) + goto free_hash; + } + } + + FTRACE_WARN_ON(new_hash->count != hash->count); + + return new_hash; + + free_hash: + free_ftrace_hash(new_hash); + return NULL; +} + +static int +ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) +{ + struct ftrace_func_entry *entry; + struct hlist_node *tp, *tn; + struct hlist_head *hhd; + struct ftrace_hash *hash = *dst; + unsigned long key; + int size = src->count; + int bits = 0; + int i; + + /* + * If the new source is empty, just free dst and assign it + * the empty_hash. + */ + if (!src->count) { + free_ftrace_hash(*dst); + *dst = EMPTY_HASH; + return 0; + } + + ftrace_hash_clear(hash); + + /* + * Make the hash size about 1/2 the # found + */ + for (size /= 2; size; size >>= 1) + bits++; + + /* Don't allocate too much */ + if (bits > FTRACE_HASH_MAX_BITS) + bits = FTRACE_HASH_MAX_BITS; + + /* We can't modify the empty_hash */ + if (hash == EMPTY_HASH) { + /* Create a new hash */ + *dst = alloc_ftrace_hash(bits); + if (!*dst) { + *dst = EMPTY_HASH; + return -ENOMEM; + } + hash = *dst; + } else { + size = 1 << bits; + + /* Use the old hash, but create new buckets */ + hhd = kzalloc(sizeof(*hhd) * size, GFP_KERNEL); + if (!hhd) + return -ENOMEM; + + kfree(hash->buckets); + hash->buckets = hhd; + hash->size_bits = bits; + } + + size = 1 << src->size_bits; + for (i = 0; i < size; i++) { + hhd = &src->buckets[i]; + hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) { + if (bits > 0) + key = hash_long(entry->ip, bits); + else + key = 0; + remove_hash_entry(src, entry); + __add_hash_entry(hash, entry); + } + } + + return 0; +} + /* * This is a double for. Do not use 'break' to break out of the loop, * you must use a goto. @@ -1443,6 +1601,7 @@ struct ftrace_iterator { struct ftrace_func_probe *probe; struct trace_parser parser; struct ftrace_hash *hash; + struct ftrace_ops *ops; int hidx; int idx; unsigned flags; @@ -1742,22 +1901,37 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, else hash = ops->filter_hash; - iter->hash = hash; + iter->ops = ops; + iter->flags = flag; + + if (file->f_mode & FMODE_WRITE) { + mutex_lock(&ftrace_lock); + iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash); + mutex_unlock(&ftrace_lock); + + if (!iter->hash) { + trace_parser_put(&iter->parser); + kfree(iter); + return -ENOMEM; + } + } mutex_lock(&ftrace_regex_lock); + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) - ftrace_filter_reset(hash); + ftrace_filter_reset(iter->hash); if (file->f_mode & FMODE_READ) { iter->pg = ftrace_pages_start; - iter->flags = flag; ret = seq_open(file, &show_ftrace_seq_ops); if (!ret) { struct seq_file *m = file->private_data; m->private = iter; } else { + /* Failed */ + free_ftrace_hash(iter->hash); trace_parser_put(&iter->parser); kfree(iter); } @@ -1835,7 +2009,7 @@ enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int not) if (!entry) return 0; - remove_hash_entry(hash, entry); + free_hash_entry(hash, entry); } else { /* Do nothing if it exists */ if (entry) @@ -2259,19 +2433,13 @@ int unregister_ftrace_command(struct ftrace_func_command *cmd) return ret; } -static int ftrace_process_regex(char *buff, int len, int enable) +static int ftrace_process_regex(struct ftrace_hash *hash, + char *buff, int len, int enable) { char *func, *command, *next = buff; - struct ftrace_ops *ops = &global_ops; struct ftrace_func_command *p; - struct ftrace_hash *hash; int ret; - if (enable) - hash = ops->filter_hash; - else - hash = ops->notrace_hash; - func = strsep(&next, ":"); if (!next) { @@ -2328,7 +2496,7 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, if (read >= 0 && trace_parser_loaded(parser) && !trace_parser_cont(parser)) { - ret = ftrace_process_regex(parser->buffer, + ret = ftrace_process_regex(iter->hash, parser->buffer, parser->idx, enable); trace_parser_clear(parser); if (ret) @@ -2356,26 +2524,40 @@ ftrace_notrace_write(struct file *file, const char __user *ubuf, return ftrace_regex_write(file, ubuf, cnt, ppos, 0); } -static void +static int ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, int reset, int enable) { + struct ftrace_hash **orig_hash; struct ftrace_hash *hash; + int ret; if (unlikely(ftrace_disabled)) - return; + return -ENODEV; if (enable) - hash = ops->filter_hash; + orig_hash = &ops->filter_hash; else - hash = ops->notrace_hash; + orig_hash = &ops->notrace_hash; + + hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); + if (!hash) + return -ENOMEM; mutex_lock(&ftrace_regex_lock); if (reset) ftrace_filter_reset(hash); if (buf) ftrace_match_records(hash, buf, len); + + mutex_lock(&ftrace_lock); + ret = ftrace_hash_move(orig_hash, hash); + mutex_unlock(&ftrace_lock); + mutex_unlock(&ftrace_regex_lock); + + free_ftrace_hash(hash); + return ret; } /** @@ -2484,7 +2666,9 @@ ftrace_regex_release(struct inode *inode, struct file *file) { struct seq_file *m = (struct seq_file *)file->private_data; struct ftrace_iterator *iter; + struct ftrace_hash **orig_hash; struct trace_parser *parser; + int ret; mutex_lock(&ftrace_regex_lock); if (file->f_mode & FMODE_READ) { @@ -2501,14 +2685,21 @@ ftrace_regex_release(struct inode *inode, struct file *file) } trace_parser_put(parser); - kfree(iter); if (file->f_mode & FMODE_WRITE) { + if (iter->flags & FTRACE_ITER_NOTRACE) + orig_hash = &iter->ops->notrace_hash; + else + orig_hash = &iter->ops->filter_hash; + mutex_lock(&ftrace_lock); - if (ftrace_start_up && ftrace_enabled) + ret = ftrace_hash_move(orig_hash, iter->hash); + if (!ret && ftrace_start_up && ftrace_enabled) ftrace_run_update_code(FTRACE_ENABLE_CALLS); mutex_unlock(&ftrace_lock); } + free_ftrace_hash(iter->hash); + kfree(iter); mutex_unlock(&ftrace_regex_lock); return 0; -- cgit v1.2.3 From ed926f9b35cda0988234c356e16a7cb30f4e5338 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 3 May 2011 13:25:24 -0400 Subject: ftrace: Use counters to enable functions to trace Every function has its own record that stores the instruction pointer and flags for the function to be traced. There are only two flags: enabled and free. The enabled flag states that tracing for the function has been enabled (actively traced), and the free flag states that the record no longer points to a function and can be used by new functions (loaded modules). These flags are now moved to the MSB of the flags (actually just the top 32bits). The rest of the bits (30 bits) are now used as a ref counter. Everytime a tracer register functions to trace, those functions will have its counter incremented. When tracing is enabled, to determine if a function should be traced, the counter is examined, and if it is non-zero it is set to trace. When a ftrace_ops is registered to trace functions, its hashes are examined. If the ftrace_ops filter_hash count is zero, then all functions are set to be traced, otherwise only the functions in the hash are to be traced. The exception to this is if a function is also in the ftrace_ops notrace_hash. Then that function's counter is not incremented for this ftrace_ops. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 8 ++- kernel/trace/ftrace.c | 158 ++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 148 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 6658a51390fe..ab1c46e70bb6 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -37,6 +37,7 @@ struct ftrace_ops { #ifdef CONFIG_DYNAMIC_FTRACE struct ftrace_hash *notrace_hash; struct ftrace_hash *filter_hash; + unsigned long flags; #endif }; @@ -152,10 +153,13 @@ extern void unregister_ftrace_function_probe_all(char *glob); extern int ftrace_text_reserved(void *start, void *end); enum { - FTRACE_FL_FREE = (1 << 0), - FTRACE_FL_ENABLED = (1 << 1), + FTRACE_FL_ENABLED = (1 << 30), + FTRACE_FL_FREE = (1 << 31), }; +#define FTRACE_FL_MASK (0x3UL << 30) +#define FTRACE_REF_MAX ((1 << 30) - 1) + struct dyn_ftrace { union { unsigned long ip; /* address of mcount call-site */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 46f08264980b..5dd332cc5aa8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -890,6 +890,10 @@ static const struct ftrace_hash empty_hash = { }; #define EMPTY_HASH ((struct ftrace_hash *)&empty_hash) +enum { + FTRACE_OPS_FL_ENABLED = 1, +}; + struct ftrace_ops global_ops = { .func = ftrace_stub, .notrace_hash = EMPTY_HASH, @@ -1161,6 +1165,105 @@ ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) } \ } +static void __ftrace_hash_rec_update(struct ftrace_ops *ops, + int filter_hash, + bool inc) +{ + struct ftrace_hash *hash; + struct ftrace_hash *other_hash; + struct ftrace_page *pg; + struct dyn_ftrace *rec; + int count = 0; + int all = 0; + + /* Only update if the ops has been registered */ + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return; + + /* + * In the filter_hash case: + * If the count is zero, we update all records. + * Otherwise we just update the items in the hash. + * + * In the notrace_hash case: + * We enable the update in the hash. + * As disabling notrace means enabling the tracing, + * and enabling notrace means disabling, the inc variable + * gets inversed. + */ + if (filter_hash) { + hash = ops->filter_hash; + other_hash = ops->notrace_hash; + if (!hash->count) + all = 1; + } else { + inc = !inc; + hash = ops->notrace_hash; + other_hash = ops->filter_hash; + /* + * If the notrace hash has no items, + * then there's nothing to do. + */ + if (!hash->count) + return; + } + + do_for_each_ftrace_rec(pg, rec) { + int in_other_hash = 0; + int in_hash = 0; + int match = 0; + + if (all) { + /* + * Only the filter_hash affects all records. + * Update if the record is not in the notrace hash. + */ + if (!ftrace_lookup_ip(other_hash, rec->ip)) + match = 1; + } else { + in_hash = !!ftrace_lookup_ip(hash, rec->ip); + in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip); + + /* + * + */ + if (filter_hash && in_hash && !in_other_hash) + match = 1; + else if (!filter_hash && in_hash && + (in_other_hash || !other_hash->count)) + match = 1; + } + if (!match) + continue; + + if (inc) { + rec->flags++; + if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX)) + return; + } else { + if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0)) + return; + rec->flags--; + } + count++; + /* Shortcut, if we handled all records, we are done. */ + if (!all && count == hash->count) + return; + } while_for_each_ftrace_rec(); +} + +static void ftrace_hash_rec_disable(struct ftrace_ops *ops, + int filter_hash) +{ + __ftrace_hash_rec_update(ops, filter_hash, 0); +} + +static void ftrace_hash_rec_enable(struct ftrace_ops *ops, + int filter_hash) +{ + __ftrace_hash_rec_update(ops, filter_hash, 1); +} + static void ftrace_free_rec(struct dyn_ftrace *rec) { rec->freelist = ftrace_free_records; @@ -1276,26 +1379,24 @@ int ftrace_text_reserved(void *start, void *end) static int __ftrace_replace_code(struct dyn_ftrace *rec, int enable) { - struct ftrace_ops *ops = &global_ops; unsigned long ftrace_addr; unsigned long flag = 0UL; ftrace_addr = (unsigned long)FTRACE_ADDR; /* - * If this record is not to be traced or we want to disable it, - * then disable it. + * If we are enabling tracing: * - * If we want to enable it and filtering is off, then enable it. + * If the record has a ref count, then we need to enable it + * because someone is using it. * - * If we want to enable it and filtering is on, enable it only if - * it's filtered + * Otherwise we make sure its disabled. + * + * If we are disabling tracing, then disable all records that + * are enabled. */ - if (enable && !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) { - if (!ops->filter_hash->count || - ftrace_lookup_ip(ops->filter_hash, rec->ip)) - flag = FTRACE_FL_ENABLED; - } + if (enable && (rec->flags & ~FTRACE_FL_MASK)) + flag = FTRACE_FL_ENABLED; /* If the state of this record hasn't changed, then do nothing */ if ((rec->flags & FTRACE_FL_ENABLED) == flag) @@ -1423,17 +1524,25 @@ static void ftrace_startup_enable(int command) static void ftrace_startup(int command) { + struct ftrace_ops *ops = &global_ops; + if (unlikely(ftrace_disabled)) return; ftrace_start_up++; command |= FTRACE_ENABLE_CALLS; + ops->flags |= FTRACE_OPS_FL_ENABLED; + if (ftrace_start_up == 1) + ftrace_hash_rec_enable(ops, 1); + ftrace_startup_enable(command); } static void ftrace_shutdown(int command) { + struct ftrace_ops *ops = &global_ops; + if (unlikely(ftrace_disabled)) return; @@ -1446,7 +1555,12 @@ static void ftrace_shutdown(int command) WARN_ON_ONCE(ftrace_start_up < 0); if (!ftrace_start_up) + ftrace_hash_rec_disable(ops, 1); + + if (!ftrace_start_up) { command |= FTRACE_DISABLE_CALLS; + ops->flags &= ~FTRACE_OPS_FL_ENABLED; + } if (saved_ftrace_func != ftrace_trace_function) { saved_ftrace_func = ftrace_trace_function; @@ -2668,6 +2782,7 @@ ftrace_regex_release(struct inode *inode, struct file *file) struct ftrace_iterator *iter; struct ftrace_hash **orig_hash; struct trace_parser *parser; + int filter_hash; int ret; mutex_lock(&ftrace_regex_lock); @@ -2687,15 +2802,26 @@ ftrace_regex_release(struct inode *inode, struct file *file) trace_parser_put(parser); if (file->f_mode & FMODE_WRITE) { - if (iter->flags & FTRACE_ITER_NOTRACE) - orig_hash = &iter->ops->notrace_hash; - else + filter_hash = !!(iter->flags & FTRACE_ITER_FILTER); + + if (filter_hash) orig_hash = &iter->ops->filter_hash; + else + orig_hash = &iter->ops->notrace_hash; mutex_lock(&ftrace_lock); + /* + * Remove the current set, update the hash and add + * them back. + */ + ftrace_hash_rec_disable(iter->ops, filter_hash); ret = ftrace_hash_move(orig_hash, iter->hash); - if (!ret && ftrace_start_up && ftrace_enabled) - ftrace_run_update_code(FTRACE_ENABLE_CALLS); + if (!ret) { + ftrace_hash_rec_enable(iter->ops, filter_hash); + if (iter->ops->flags & FTRACE_OPS_FL_ENABLED + && ftrace_enabled) + ftrace_run_update_code(FTRACE_ENABLE_CALLS); + } mutex_unlock(&ftrace_lock); } free_ftrace_hash(iter->hash); -- cgit v1.2.3 From 647bcd03d5b2fb44fd9c9ef1a4f50c2eee8f779a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 3 May 2011 14:39:21 -0400 Subject: ftrace: Add enabled_functions file Add the enabled_functions file that is used to show all the functions that have been enabled for tracing as well as their ref counts. This helps seeing if any function has been registered and what functions are being traced. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5dd332cc5aa8..065f1e61e103 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1703,6 +1703,7 @@ enum { FTRACE_ITER_NOTRACE = (1 << 1), FTRACE_ITER_PRINTALL = (1 << 2), FTRACE_ITER_HASH = (1 << 3), + FTRACE_ITER_ENABLED = (1 << 4), }; #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ @@ -1842,7 +1843,11 @@ t_next(struct seq_file *m, void *v, loff_t *pos) !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) || ((iter->flags & FTRACE_ITER_NOTRACE) && - !ftrace_lookup_ip(ops->notrace_hash, rec->ip))) { + !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) || + + ((iter->flags & FTRACE_ITER_ENABLED) && + !(rec->flags & ~FTRACE_FL_MASK))) { + rec = NULL; goto retry; } @@ -1944,7 +1949,11 @@ static int t_show(struct seq_file *m, void *v) if (!rec) return 0; - seq_printf(m, "%ps\n", (void *)rec->ip); + seq_printf(m, "%ps", (void *)rec->ip); + if (iter->flags & FTRACE_ITER_ENABLED) + seq_printf(m, " (%ld)", + rec->flags & ~FTRACE_FL_MASK); + seq_printf(m, "\n"); return 0; } @@ -1983,6 +1992,34 @@ ftrace_avail_open(struct inode *inode, struct file *file) return ret; } +static int +ftrace_enabled_open(struct inode *inode, struct file *file) +{ + struct ftrace_iterator *iter; + int ret; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + + iter->pg = ftrace_pages_start; + iter->flags = FTRACE_ITER_ENABLED; + + ret = seq_open(file, &show_ftrace_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + + m->private = iter; + } else { + kfree(iter); + } + + return ret; +} + static void ftrace_filter_reset(struct ftrace_hash *hash) { mutex_lock(&ftrace_lock); @@ -2838,6 +2875,13 @@ static const struct file_operations ftrace_avail_fops = { .release = seq_release_private, }; +static const struct file_operations ftrace_enabled_fops = { + .open = ftrace_enabled_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + static const struct file_operations ftrace_filter_fops = { .open = ftrace_filter_open, .read = seq_read, @@ -3069,6 +3113,9 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) trace_create_file("available_filter_functions", 0444, d_tracer, NULL, &ftrace_avail_fops); + trace_create_file("enabled_functions", 0444, + d_tracer, NULL, &ftrace_enabled_fops); + trace_create_file("set_ftrace_filter", 0644, d_tracer, NULL, &ftrace_filter_fops); -- cgit v1.2.3 From bd69c30b1d08032d97ab0dabd7a1eb7fb73ca2b2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 3 May 2011 21:55:54 -0400 Subject: ftrace: Add ops parameter to ftrace_startup/shutdown functions In order to allow different ops to enable different functions, the ftrace_startup() and ftrace_shutdown() functions need the ops parameter passed to them. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 065f1e61e103..8fef1d99bbbf 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1522,10 +1522,8 @@ static void ftrace_startup_enable(int command) ftrace_run_update_code(command); } -static void ftrace_startup(int command) +static void ftrace_startup(struct ftrace_ops *ops, int command) { - struct ftrace_ops *ops = &global_ops; - if (unlikely(ftrace_disabled)) return; @@ -1539,10 +1537,8 @@ static void ftrace_startup(int command) ftrace_startup_enable(command); } -static void ftrace_shutdown(int command) +static void ftrace_shutdown(struct ftrace_ops *ops, int command) { - struct ftrace_ops *ops = &global_ops; - if (unlikely(ftrace_disabled)) return; @@ -2362,7 +2358,7 @@ static void __enable_ftrace_function_probe(void) return; __register_ftrace_function(&trace_probe_ops); - ftrace_startup(0); + ftrace_startup(&global_ops, 0); ftrace_probe_registered = 1; } @@ -2381,7 +2377,7 @@ static void __disable_ftrace_function_probe(void) /* no more funcs left */ __unregister_ftrace_function(&trace_probe_ops); - ftrace_shutdown(0); + ftrace_shutdown(&global_ops, 0); ftrace_probe_registered = 0; } @@ -3267,6 +3263,10 @@ void __init ftrace_init(void) #else +struct ftrace_ops global_ops = { + .func = ftrace_stub, +}; + static int __init ftrace_nodyn_init(void) { ftrace_enabled = 1; @@ -3277,8 +3277,8 @@ device_initcall(ftrace_nodyn_init); static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } static inline void ftrace_startup_enable(int command) { } /* Keep as macros so we do not need to define the commands */ -# define ftrace_startup(command) do { } while (0) -# define ftrace_shutdown(command) do { } while (0) +# define ftrace_startup(ops, command) do { } while (0) +# define ftrace_shutdown(ops, command) do { } while (0) # define ftrace_startup_sysctl() do { } while (0) # define ftrace_shutdown_sysctl() do { } while (0) #endif /* CONFIG_DYNAMIC_FTRACE */ @@ -3583,7 +3583,7 @@ int register_ftrace_function(struct ftrace_ops *ops) goto out_unlock; ret = __register_ftrace_function(ops); - ftrace_startup(0); + ftrace_startup(&global_ops, 0); out_unlock: mutex_unlock(&ftrace_lock); @@ -3602,7 +3602,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops) mutex_lock(&ftrace_lock); ret = __unregister_ftrace_function(ops); - ftrace_shutdown(0); + ftrace_shutdown(&global_ops, 0); mutex_unlock(&ftrace_lock); return ret; @@ -3825,7 +3825,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, ftrace_graph_return = retfunc; ftrace_graph_entry = entryfunc; - ftrace_startup(FTRACE_START_FUNC_RET); + ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); out: mutex_unlock(&ftrace_lock); @@ -3842,7 +3842,7 @@ void unregister_ftrace_graph(void) ftrace_graph_active--; ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = ftrace_graph_entry_stub; - ftrace_shutdown(FTRACE_STOP_FUNC_RET); + ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); unregister_pm_notifier(&ftrace_suspend_notifier); unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); -- cgit v1.2.3 From 2b499381bc50ede01b3d8eab164ca2fad00655f0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 3 May 2011 22:49:52 -0400 Subject: ftrace: Have global_ops store the functions that are to be traced This is a step towards each ops structure defining its own set of functions to trace. As the current code with pid's and such are specific to the global_ops, it is restructured to be used with the global ops. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 69 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 53 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8fef1d99bbbf..dcce0bf9c84d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -91,6 +91,7 @@ static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; +static struct ftrace_ops global_ops; /* * Traverse the ftrace_list, invoking all entries. The reason that we @@ -153,7 +154,7 @@ static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip) } #endif -static void update_ftrace_function(void) +static void update_global_ops(void) { ftrace_func_t func; @@ -173,6 +174,18 @@ static void update_ftrace_function(void) set_ftrace_pid_function(func); func = ftrace_pid_func; } + + global_ops.func = func; +} + +static void update_ftrace_function(void) +{ + ftrace_func_t func; + + update_global_ops(); + + func = global_ops.func; + #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST ftrace_trace_function = func; #else @@ -181,24 +194,19 @@ static void update_ftrace_function(void) #endif } -static int __register_ftrace_function(struct ftrace_ops *ops) +static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) { - ops->next = ftrace_list; + ops->next = *list; /* * We are entering ops into the ftrace_list but another * CPU might be walking that list. We need to make sure * the ops->next pointer is valid before another CPU sees * the ops pointer included into the ftrace_list. */ - rcu_assign_pointer(ftrace_list, ops); - - if (ftrace_enabled) - update_ftrace_function(); - - return 0; + rcu_assign_pointer(*list, ops); } -static int __unregister_ftrace_function(struct ftrace_ops *ops) +static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) { struct ftrace_ops **p; @@ -206,13 +214,12 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) * If we are removing the last function, then simply point * to the ftrace_stub. */ - if (ftrace_list == ops && ops->next == &ftrace_list_end) { - ftrace_trace_function = ftrace_stub; - ftrace_list = &ftrace_list_end; + if (*list == ops && ops->next == &ftrace_list_end) { + *list = &ftrace_list_end; return 0; } - for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next) + for (p = list; *p != &ftrace_list_end; p = &(*p)->next) if (*p == ops) break; @@ -220,7 +227,37 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) return -1; *p = (*p)->next; + return 0; +} + +static int __register_ftrace_function(struct ftrace_ops *ops) +{ + if (ftrace_disabled) + return -ENODEV; + + if (FTRACE_WARN_ON(ops == &global_ops)) + return -EINVAL; + + add_ftrace_ops(&ftrace_list, ops); + if (ftrace_enabled) + update_ftrace_function(); + + return 0; +} +static int __unregister_ftrace_function(struct ftrace_ops *ops) +{ + int ret; + + if (ftrace_disabled) + return -ENODEV; + + if (FTRACE_WARN_ON(ops == &global_ops)) + return -EINVAL; + + ret = remove_ftrace_ops(&ftrace_list, ops); + if (ret < 0) + return ret; if (ftrace_enabled) update_ftrace_function(); @@ -894,7 +931,7 @@ enum { FTRACE_OPS_FL_ENABLED = 1, }; -struct ftrace_ops global_ops = { +static struct ftrace_ops global_ops = { .func = ftrace_stub, .notrace_hash = EMPTY_HASH, .filter_hash = EMPTY_HASH, @@ -3263,7 +3300,7 @@ void __init ftrace_init(void) #else -struct ftrace_ops global_ops = { +static struct ftrace_ops global_ops = { .func = ftrace_stub, }; -- cgit v1.2.3 From 07fd5515f3b5c20704707f63e7f4485b534508a8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 5 May 2011 18:03:47 -0400 Subject: ftrace: Free hash with call_rcu_sched() When a hash is modified and might be in use, we need to perform a schedule RCU operation on it, as the hashes will soon be used directly in the function tracer callback. Cc: Paul E. McKenney Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 55 ++++++++++++++++++++++++++------------------------- 1 file changed, 28 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index dcce0bf9c84d..92b6fdf49ae5 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -913,6 +913,7 @@ struct ftrace_hash { unsigned long size_bits; struct hlist_head *buckets; unsigned long count; + struct rcu_head rcu; }; /* @@ -1058,6 +1059,21 @@ static void free_ftrace_hash(struct ftrace_hash *hash) kfree(hash); } +static void __free_ftrace_hash_rcu(struct rcu_head *rcu) +{ + struct ftrace_hash *hash; + + hash = container_of(rcu, struct ftrace_hash, rcu); + free_ftrace_hash(hash); +} + +static void free_ftrace_hash_rcu(struct ftrace_hash *hash) +{ + if (!hash || hash == EMPTY_HASH) + return; + call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu); +} + static struct ftrace_hash *alloc_ftrace_hash(int size_bits) { struct ftrace_hash *hash; @@ -1122,7 +1138,8 @@ ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) struct ftrace_func_entry *entry; struct hlist_node *tp, *tn; struct hlist_head *hhd; - struct ftrace_hash *hash = *dst; + struct ftrace_hash *old_hash; + struct ftrace_hash *new_hash; unsigned long key; int size = src->count; int bits = 0; @@ -1133,13 +1150,11 @@ ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) * the empty_hash. */ if (!src->count) { - free_ftrace_hash(*dst); - *dst = EMPTY_HASH; + free_ftrace_hash_rcu(*dst); + rcu_assign_pointer(*dst, EMPTY_HASH); return 0; } - ftrace_hash_clear(hash); - /* * Make the hash size about 1/2 the # found */ @@ -1150,27 +1165,9 @@ ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) if (bits > FTRACE_HASH_MAX_BITS) bits = FTRACE_HASH_MAX_BITS; - /* We can't modify the empty_hash */ - if (hash == EMPTY_HASH) { - /* Create a new hash */ - *dst = alloc_ftrace_hash(bits); - if (!*dst) { - *dst = EMPTY_HASH; - return -ENOMEM; - } - hash = *dst; - } else { - size = 1 << bits; - - /* Use the old hash, but create new buckets */ - hhd = kzalloc(sizeof(*hhd) * size, GFP_KERNEL); - if (!hhd) - return -ENOMEM; - - kfree(hash->buckets); - hash->buckets = hhd; - hash->size_bits = bits; - } + new_hash = alloc_ftrace_hash(bits); + if (!new_hash) + return -ENOMEM; size = 1 << src->size_bits; for (i = 0; i < size; i++) { @@ -1181,10 +1178,14 @@ ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) else key = 0; remove_hash_entry(src, entry); - __add_hash_entry(hash, entry); + __add_hash_entry(new_hash, entry); } } + old_hash = *dst; + rcu_assign_pointer(*dst, new_hash); + free_ftrace_hash_rcu(old_hash); + return 0; } -- cgit v1.2.3 From b848914ce39589d89ee0078a6d1ef452b464729e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 4 May 2011 09:27:52 -0400 Subject: ftrace: Implement separate user function filtering ftrace_ops that are registered to trace functions can now be agnostic to each other in respect to what functions they trace. Each ops has their own hash of the functions they want to trace and a hash to what they do not want to trace. A empty hash for the functions they want to trace denotes all functions should be traced that are not in the notrace hash. Cc: Paul E. McKenney Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 7 +- kernel/trace/ftrace.c | 193 ++++++++++++++++++++++++++++++-------- kernel/trace/trace_functions.c | 2 + kernel/trace/trace_irqsoff.c | 1 + kernel/trace/trace_sched_wakeup.c | 1 + kernel/trace/trace_stack.c | 1 + 6 files changed, 166 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index ab1c46e70bb6..4609c0ece79a 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -31,13 +31,18 @@ typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip); struct ftrace_hash; +enum { + FTRACE_OPS_FL_ENABLED = 1 << 0, + FTRACE_OPS_FL_GLOBAL = 1 << 1, +}; + struct ftrace_ops { ftrace_func_t func; struct ftrace_ops *next; + unsigned long flags; #ifdef CONFIG_DYNAMIC_FTRACE struct ftrace_hash *notrace_hash; struct ftrace_hash *filter_hash; - unsigned long flags; #endif }; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 92b6fdf49ae5..6c7e1df39b57 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -87,24 +87,29 @@ static struct ftrace_ops ftrace_list_end __read_mostly = .func = ftrace_stub, }; -static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; +static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; +static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; static struct ftrace_ops global_ops; +static void +ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip); + /* - * Traverse the ftrace_list, invoking all entries. The reason that we + * Traverse the ftrace_global_list, invoking all entries. The reason that we * can use rcu_dereference_raw() is that elements removed from this list * are simply leaked, so there is no need to interact with a grace-period * mechanism. The rcu_dereference_raw() calls are needed to handle - * concurrent insertions into the ftrace_list. + * concurrent insertions into the ftrace_global_list. * * Silly Alpha and silly pointer-speculation compiler optimizations! */ -static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) +static void ftrace_global_list_func(unsigned long ip, + unsigned long parent_ip) { - struct ftrace_ops *op = rcu_dereference_raw(ftrace_list); /*see above*/ + struct ftrace_ops *op = rcu_dereference_raw(ftrace_global_list); /*see above*/ while (op != &ftrace_list_end) { op->func(ip, parent_ip); @@ -163,11 +168,11 @@ static void update_global_ops(void) * function directly. Otherwise, we need to iterate over the * registered callers. */ - if (ftrace_list == &ftrace_list_end || - ftrace_list->next == &ftrace_list_end) - func = ftrace_list->func; + if (ftrace_global_list == &ftrace_list_end || + ftrace_global_list->next == &ftrace_list_end) + func = ftrace_global_list->func; else - func = ftrace_list_func; + func = ftrace_global_list_func; /* If we filter on pids, update to use the pid function */ if (!list_empty(&ftrace_pids)) { @@ -184,7 +189,11 @@ static void update_ftrace_function(void) update_global_ops(); - func = global_ops.func; + if (ftrace_ops_list == &ftrace_list_end || + ftrace_ops_list->next == &ftrace_list_end) + func = ftrace_ops_list->func; + else + func = ftrace_ops_list_func; #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST ftrace_trace_function = func; @@ -198,10 +207,10 @@ static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) { ops->next = *list; /* - * We are entering ops into the ftrace_list but another + * We are entering ops into the list but another * CPU might be walking that list. We need to make sure * the ops->next pointer is valid before another CPU sees - * the ops pointer included into the ftrace_list. + * the ops pointer included into the list. */ rcu_assign_pointer(*list, ops); } @@ -238,7 +247,18 @@ static int __register_ftrace_function(struct ftrace_ops *ops) if (FTRACE_WARN_ON(ops == &global_ops)) return -EINVAL; - add_ftrace_ops(&ftrace_list, ops); + if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) + return -EBUSY; + + if (ops->flags & FTRACE_OPS_FL_GLOBAL) { + int first = ftrace_global_list == &ftrace_list_end; + add_ftrace_ops(&ftrace_global_list, ops); + ops->flags |= FTRACE_OPS_FL_ENABLED; + if (first) + add_ftrace_ops(&ftrace_ops_list, &global_ops); + } else + add_ftrace_ops(&ftrace_ops_list, ops); + if (ftrace_enabled) update_ftrace_function(); @@ -252,12 +272,24 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) if (ftrace_disabled) return -ENODEV; + if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) + return -EBUSY; + if (FTRACE_WARN_ON(ops == &global_ops)) return -EINVAL; - ret = remove_ftrace_ops(&ftrace_list, ops); + if (ops->flags & FTRACE_OPS_FL_GLOBAL) { + ret = remove_ftrace_ops(&ftrace_global_list, ops); + if (!ret && ftrace_global_list == &ftrace_list_end) + ret = remove_ftrace_ops(&ftrace_ops_list, &global_ops); + if (!ret) + ops->flags &= ~FTRACE_OPS_FL_ENABLED; + } else + ret = remove_ftrace_ops(&ftrace_ops_list, ops); + if (ret < 0) return ret; + if (ftrace_enabled) update_ftrace_function(); @@ -928,10 +960,6 @@ static const struct ftrace_hash empty_hash = { }; #define EMPTY_HASH ((struct ftrace_hash *)&empty_hash) -enum { - FTRACE_OPS_FL_ENABLED = 1, -}; - static struct ftrace_ops global_ops = { .func = ftrace_stub, .notrace_hash = EMPTY_HASH, @@ -1189,6 +1217,40 @@ ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) return 0; } +/* + * Test the hashes for this ops to see if we want to call + * the ops->func or not. + * + * It's a match if the ip is in the ops->filter_hash or + * the filter_hash does not exist or is empty, + * AND + * the ip is not in the ops->notrace_hash. + */ +static int +ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) +{ + struct ftrace_hash *filter_hash; + struct ftrace_hash *notrace_hash; + int ret; + + /* The hashes are freed with call_rcu_sched() */ + preempt_disable_notrace(); + + filter_hash = rcu_dereference_raw(ops->filter_hash); + notrace_hash = rcu_dereference_raw(ops->notrace_hash); + + if ((!filter_hash || !filter_hash->count || + ftrace_lookup_ip(filter_hash, ip)) && + (!notrace_hash || !notrace_hash->count || + !ftrace_lookup_ip(notrace_hash, ip))) + ret = 1; + else + ret = 0; + preempt_enable_notrace(); + + return ret; +} + /* * This is a double for. Do not use 'break' to break out of the loop, * you must use a goto. @@ -1232,7 +1294,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, if (filter_hash) { hash = ops->filter_hash; other_hash = ops->notrace_hash; - if (!hash->count) + if (!hash || !hash->count) all = 1; } else { inc = !inc; @@ -1242,7 +1304,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, * If the notrace hash has no items, * then there's nothing to do. */ - if (!hash->count) + if (hash && !hash->count) return; } @@ -1256,11 +1318,11 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, * Only the filter_hash affects all records. * Update if the record is not in the notrace hash. */ - if (!ftrace_lookup_ip(other_hash, rec->ip)) + if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip)) match = 1; } else { - in_hash = !!ftrace_lookup_ip(hash, rec->ip); - in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip); + in_hash = hash && !!ftrace_lookup_ip(hash, rec->ip); + in_other_hash = other_hash && !!ftrace_lookup_ip(other_hash, rec->ip); /* * @@ -1546,6 +1608,7 @@ static void ftrace_run_update_code(int command) static ftrace_func_t saved_ftrace_func; static int ftrace_start_up; +static int global_start_up; static void ftrace_startup_enable(int command) { @@ -1562,14 +1625,25 @@ static void ftrace_startup_enable(int command) static void ftrace_startup(struct ftrace_ops *ops, int command) { + bool hash_enable = true; + if (unlikely(ftrace_disabled)) return; ftrace_start_up++; command |= FTRACE_ENABLE_CALLS; + /* ops marked global share the filter hashes */ + if (ops->flags & FTRACE_OPS_FL_GLOBAL) { + ops = &global_ops; + /* Don't update hash if global is already set */ + if (global_start_up) + hash_enable = false; + global_start_up++; + } + ops->flags |= FTRACE_OPS_FL_ENABLED; - if (ftrace_start_up == 1) + if (hash_enable) ftrace_hash_rec_enable(ops, 1); ftrace_startup_enable(command); @@ -1577,6 +1651,8 @@ static void ftrace_startup(struct ftrace_ops *ops, int command) static void ftrace_shutdown(struct ftrace_ops *ops, int command) { + bool hash_disable = true; + if (unlikely(ftrace_disabled)) return; @@ -1588,13 +1664,25 @@ static void ftrace_shutdown(struct ftrace_ops *ops, int command) */ WARN_ON_ONCE(ftrace_start_up < 0); - if (!ftrace_start_up) + if (ops->flags & FTRACE_OPS_FL_GLOBAL) { + ops = &global_ops; + global_start_up--; + WARN_ON_ONCE(global_start_up < 0); + /* Don't update hash if global still has users */ + if (global_start_up) { + WARN_ON_ONCE(!ftrace_start_up); + hash_disable = false; + } + } + + if (hash_disable) ftrace_hash_rec_disable(ops, 1); - if (!ftrace_start_up) { - command |= FTRACE_DISABLE_CALLS; + if (ops != &global_ops || !global_start_up) ops->flags &= ~FTRACE_OPS_FL_ENABLED; - } + + if (!ftrace_start_up) + command |= FTRACE_DISABLE_CALLS; if (saved_ftrace_func != ftrace_trace_function) { saved_ftrace_func = ftrace_trace_function; @@ -2381,6 +2469,7 @@ static int ftrace_probe_registered; static void __enable_ftrace_function_probe(void) { + int ret; int i; if (ftrace_probe_registered) @@ -2395,13 +2484,16 @@ static void __enable_ftrace_function_probe(void) if (i == FTRACE_FUNC_HASHSIZE) return; - __register_ftrace_function(&trace_probe_ops); - ftrace_startup(&global_ops, 0); + ret = __register_ftrace_function(&trace_probe_ops); + if (!ret) + ftrace_startup(&trace_probe_ops, 0); + ftrace_probe_registered = 1; } static void __disable_ftrace_function_probe(void) { + int ret; int i; if (!ftrace_probe_registered) @@ -2414,8 +2506,10 @@ static void __disable_ftrace_function_probe(void) } /* no more funcs left */ - __unregister_ftrace_function(&trace_probe_ops); - ftrace_shutdown(&global_ops, 0); + ret = __unregister_ftrace_function(&trace_probe_ops); + if (!ret) + ftrace_shutdown(&trace_probe_ops, 0); + ftrace_probe_registered = 0; } @@ -3319,8 +3413,28 @@ static inline void ftrace_startup_enable(int command) { } # define ftrace_shutdown(ops, command) do { } while (0) # define ftrace_startup_sysctl() do { } while (0) # define ftrace_shutdown_sysctl() do { } while (0) + +static inline int +ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) +{ + return 1; +} + #endif /* CONFIG_DYNAMIC_FTRACE */ +static void +ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip) +{ + /* see comment above ftrace_global_list_func */ + struct ftrace_ops *op = rcu_dereference_raw(ftrace_ops_list); + + while (op != &ftrace_list_end) { + if (ftrace_ops_test(op, ip)) + op->func(ip, parent_ip); + op = rcu_dereference_raw(op->next); + }; +} + static void clear_ftrace_swapper(void) { struct task_struct *p; @@ -3621,7 +3735,9 @@ int register_ftrace_function(struct ftrace_ops *ops) goto out_unlock; ret = __register_ftrace_function(ops); - ftrace_startup(&global_ops, 0); + if (!ret) + ftrace_startup(ops, 0); + out_unlock: mutex_unlock(&ftrace_lock); @@ -3640,7 +3756,8 @@ int unregister_ftrace_function(struct ftrace_ops *ops) mutex_lock(&ftrace_lock); ret = __unregister_ftrace_function(ops); - ftrace_shutdown(&global_ops, 0); + if (!ret) + ftrace_shutdown(ops, 0); mutex_unlock(&ftrace_lock); return ret; @@ -3670,11 +3787,11 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, ftrace_startup_sysctl(); /* we are starting ftrace again */ - if (ftrace_list != &ftrace_list_end) { - if (ftrace_list->next == &ftrace_list_end) - ftrace_trace_function = ftrace_list->func; + if (ftrace_ops_list != &ftrace_list_end) { + if (ftrace_ops_list->next == &ftrace_list_end) + ftrace_trace_function = ftrace_ops_list->func; else - ftrace_trace_function = ftrace_list_func; + ftrace_trace_function = ftrace_ops_list_func; } } else { diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 16aee4d44e8f..8d0e1cc4e974 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -149,11 +149,13 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip) static struct ftrace_ops trace_ops __read_mostly = { .func = function_trace_call, + .flags = FTRACE_OPS_FL_GLOBAL, }; static struct ftrace_ops trace_stack_ops __read_mostly = { .func = function_stack_trace_call, + .flags = FTRACE_OPS_FL_GLOBAL, }; /* Our two options */ diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index a4969b47afc1..c77424be284d 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -153,6 +153,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) static struct ftrace_ops trace_ops __read_mostly = { .func = irqsoff_tracer_call, + .flags = FTRACE_OPS_FL_GLOBAL, }; #endif /* CONFIG_FUNCTION_TRACER */ diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 7319559ed59f..f029dd4fd2ca 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -129,6 +129,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) static struct ftrace_ops trace_ops __read_mostly = { .func = wakeup_tracer_call, + .flags = FTRACE_OPS_FL_GLOBAL, }; #endif /* CONFIG_FUNCTION_TRACER */ diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 4c5dead0c239..b0b53b8e4c25 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -133,6 +133,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip) static struct ftrace_ops trace_ops __read_mostly = { .func = stack_trace_call, + .flags = FTRACE_OPS_FL_GLOBAL, }; static ssize_t -- cgit v1.2.3 From cdbe61bfe70440939e457fb4a8d0995eaaed17de Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 5 May 2011 21:14:55 -0400 Subject: ftrace: Allow dynamically allocated function tracers Now that functions may be selected individually, it only makes sense that we should allow dynamically allocated trace structures to be traced. This will allow perf to allocate a ftrace_ops structure at runtime and use it to pick and choose which functions that structure will trace. Note, a dynamically allocated ftrace_ops will always be called indirectly instead of being called directly from the mcount in entry.S. This is because there's no safe way to prevent mcount from being preempted before calling the function, unless we modify every entry.S to do so (not likely). Thus, dynamically allocated functions will now be called by the ftrace_ops_list_func() that loops through the ops that are allocated if there are more than one op allocated at a time. This loop is protected with a preempt_disable. To determine if an ftrace_ops structure is allocated or not, a new util function was added to the kernel/extable.c called core_kernel_data(), which returns 1 if the address is between _sdata and _edata. Cc: Paul E. McKenney Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 1 + include/linux/kernel.h | 1 + kernel/extable.c | 8 ++++++++ kernel/trace/ftrace.c | 37 ++++++++++++++++++++++++++++++------- 4 files changed, 40 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 4609c0ece79a..caba694a62b6 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -34,6 +34,7 @@ struct ftrace_hash; enum { FTRACE_OPS_FL_ENABLED = 1 << 0, FTRACE_OPS_FL_GLOBAL = 1 << 1, + FTRACE_OPS_FL_DYNAMIC = 1 << 2, }; struct ftrace_ops { diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 00cec4dc0ae2..f37ba716ef8b 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -283,6 +283,7 @@ extern char *get_options(const char *str, int nints, int *ints); extern unsigned long long memparse(const char *ptr, char **retptr); extern int core_kernel_text(unsigned long addr); +extern int core_kernel_data(unsigned long addr); extern int __kernel_text_address(unsigned long addr); extern int kernel_text_address(unsigned long addr); extern int func_ptr_is_kernel_text(void *ptr); diff --git a/kernel/extable.c b/kernel/extable.c index 7f8f263f8524..c2d625fcda77 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -72,6 +72,14 @@ int core_kernel_text(unsigned long addr) return 0; } +int core_kernel_data(unsigned long addr) +{ + if (addr >= (unsigned long)_sdata && + addr < (unsigned long)_edata) + return 1; + return 0; +} + int __kernel_text_address(unsigned long addr) { if (core_kernel_text(addr)) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6c7e1df39b57..5b3ee04e39d9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -189,8 +189,14 @@ static void update_ftrace_function(void) update_global_ops(); + /* + * If we are at the end of the list and this ops is + * not dynamic, then have the mcount trampoline call + * the function directly + */ if (ftrace_ops_list == &ftrace_list_end || - ftrace_ops_list->next == &ftrace_list_end) + (ftrace_ops_list->next == &ftrace_list_end && + !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC))) func = ftrace_ops_list->func; else func = ftrace_ops_list_func; @@ -250,6 +256,9 @@ static int __register_ftrace_function(struct ftrace_ops *ops) if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) return -EBUSY; + if (!core_kernel_data((unsigned long)ops)) + ops->flags |= FTRACE_OPS_FL_DYNAMIC; + if (ops->flags & FTRACE_OPS_FL_GLOBAL) { int first = ftrace_global_list == &ftrace_list_end; add_ftrace_ops(&ftrace_global_list, ops); @@ -293,6 +302,13 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) if (ftrace_enabled) update_ftrace_function(); + /* + * Dynamic ops may be freed, we must make sure that all + * callers are done before leaving this function. + */ + if (ops->flags & FTRACE_OPS_FL_DYNAMIC) + synchronize_sched(); + return 0; } @@ -1225,6 +1241,9 @@ ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) * the filter_hash does not exist or is empty, * AND * the ip is not in the ops->notrace_hash. + * + * This needs to be called with preemption disabled as + * the hashes are freed with call_rcu_sched(). */ static int ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) @@ -1233,9 +1252,6 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) struct ftrace_hash *notrace_hash; int ret; - /* The hashes are freed with call_rcu_sched() */ - preempt_disable_notrace(); - filter_hash = rcu_dereference_raw(ops->filter_hash); notrace_hash = rcu_dereference_raw(ops->notrace_hash); @@ -1246,7 +1262,6 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) ret = 1; else ret = 0; - preempt_enable_notrace(); return ret; } @@ -3425,14 +3440,20 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip) { - /* see comment above ftrace_global_list_func */ - struct ftrace_ops *op = rcu_dereference_raw(ftrace_ops_list); + struct ftrace_ops *op; + /* + * Some of the ops may be dynamically allocated, + * they must be freed after a synchronize_sched(). + */ + preempt_disable_notrace(); + op = rcu_dereference_raw(ftrace_ops_list); while (op != &ftrace_list_end) { if (ftrace_ops_test(op, ip)) op->func(ip, parent_ip); op = rcu_dereference_raw(op->next); }; + preempt_enable_notrace(); } static void clear_ftrace_swapper(void) @@ -3743,6 +3764,7 @@ int register_ftrace_function(struct ftrace_ops *ops) mutex_unlock(&ftrace_lock); return ret; } +EXPORT_SYMBOL_GPL(register_ftrace_function); /** * unregister_ftrace_function - unregister a function for profiling. @@ -3762,6 +3784,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops) return ret; } +EXPORT_SYMBOL_GPL(unregister_ftrace_function); int ftrace_enable_sysctl(struct ctl_table *table, int write, -- cgit v1.2.3 From 936e074b286ae779f134312178dbab139ee7ea52 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 5 May 2011 22:54:01 -0400 Subject: ftrace: Modify ftrace_set_filter/notrace to take ops Since users of the function tracer can now pick and choose which functions they want to trace agnostically from other users of the function tracer, we need to pass the ops struct to the ftrace_set_filter() functions. The functions ftrace_set_global_filter() and ftrace_set_global_notrace() is added to keep the old filter functions which are used to modify the generic function tracers. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 7 ++++++- kernel/trace/ftrace.c | 46 +++++++++++++++++++++++++++++++++++++++++-- kernel/trace/trace_selftest.c | 4 ++-- 3 files changed, 52 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index caba694a62b6..9d88e1cb5dbb 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -179,7 +179,12 @@ struct dyn_ftrace { }; int ftrace_force_update(void); -void ftrace_set_filter(unsigned char *buf, int len, int reset); +void ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, + int len, int reset); +void ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, + int len, int reset); +void ftrace_set_global_filter(unsigned char *buf, int len, int reset); +void ftrace_set_global_notrace(unsigned char *buf, int len, int reset); int register_ftrace_command(struct ftrace_func_command *cmd); int unregister_ftrace_command(struct ftrace_func_command *cmd); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5b3ee04e39d9..d017c2c82c44 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2826,6 +2826,10 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, struct ftrace_hash *hash; int ret; + /* All global ops uses the global ops filters */ + if (ops->flags & FTRACE_OPS_FL_GLOBAL) + ops = &global_ops; + if (unlikely(ftrace_disabled)) return -ENODEV; @@ -2856,6 +2860,41 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, /** * ftrace_set_filter - set a function to filter on in ftrace + * @ops - the ops to set the filter with + * @buf - the string that holds the function filter text. + * @len - the length of the string. + * @reset - non zero to reset all filters before applying this filter. + * + * Filters denote which functions should be enabled when tracing is enabled. + * If @buf is NULL and reset is set, all functions will be enabled for tracing. + */ +void ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, + int len, int reset) +{ + ftrace_set_regex(ops, buf, len, reset, 1); +} +EXPORT_SYMBOL_GPL(ftrace_set_filter); + +/** + * ftrace_set_notrace - set a function to not trace in ftrace + * @ops - the ops to set the notrace filter with + * @buf - the string that holds the function notrace text. + * @len - the length of the string. + * @reset - non zero to reset all filters before applying this filter. + * + * Notrace Filters denote which functions should not be enabled when tracing + * is enabled. If @buf is NULL and reset is set, all functions will be enabled + * for tracing. + */ +void ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, + int len, int reset) +{ + ftrace_set_regex(ops, buf, len, reset, 0); +} +EXPORT_SYMBOL_GPL(ftrace_set_notrace); +/** + * ftrace_set_filter - set a function to filter on in ftrace + * @ops - the ops to set the filter with * @buf - the string that holds the function filter text. * @len - the length of the string. * @reset - non zero to reset all filters before applying this filter. @@ -2863,13 +2902,15 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, * Filters denote which functions should be enabled when tracing is enabled. * If @buf is NULL and reset is set, all functions will be enabled for tracing. */ -void ftrace_set_filter(unsigned char *buf, int len, int reset) +void ftrace_set_global_filter(unsigned char *buf, int len, int reset) { ftrace_set_regex(&global_ops, buf, len, reset, 1); } +EXPORT_SYMBOL_GPL(ftrace_set_global_filter); /** * ftrace_set_notrace - set a function to not trace in ftrace + * @ops - the ops to set the notrace filter with * @buf - the string that holds the function notrace text. * @len - the length of the string. * @reset - non zero to reset all filters before applying this filter. @@ -2878,10 +2919,11 @@ void ftrace_set_filter(unsigned char *buf, int len, int reset) * is enabled. If @buf is NULL and reset is set, all functions will be enabled * for tracing. */ -void ftrace_set_notrace(unsigned char *buf, int len, int reset) +void ftrace_set_global_notrace(unsigned char *buf, int len, int reset) { ftrace_set_regex(&global_ops, buf, len, reset, 0); } +EXPORT_SYMBOL_GPL(ftrace_set_global_notrace); /* * command line interface to allow users to set filters on boot up. diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 659732eba07c..0fa2db305b7c 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -131,7 +131,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); /* filter only on our function */ - ftrace_set_filter(func_name, strlen(func_name), 1); + ftrace_set_global_filter(func_name, strlen(func_name), 1); /* enable tracing */ ret = tracer_init(trace, tr); @@ -181,7 +181,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, tracer_enabled = save_tracer_enabled; /* Enable tracing on all functions again */ - ftrace_set_filter(NULL, 0, 1); + ftrace_set_global_filter(NULL, 0, 1); return ret; } -- cgit v1.2.3 From 95950c2ecb31314ef827428e43ff771cf3b037e5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 6 May 2011 00:08:51 -0400 Subject: ftrace: Add self-tests for multiple function trace users Add some basic sanity tests for multiple users of the function tracer at startup. Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 2 + kernel/trace/trace_selftest.c | 210 +++++++++++++++++++++++++++++++++- kernel/trace/trace_selftest_dynamic.c | 6 + 3 files changed, 217 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5e9dfc6286dd..6b69c4bd306f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -419,6 +419,8 @@ extern void trace_find_cmdline(int pid, char comm[]); extern unsigned long ftrace_update_tot_cnt; #define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func extern int DYN_FTRACE_TEST_NAME(void); +#define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2 +extern int DYN_FTRACE_TEST_NAME2(void); #endif extern int ring_buffer_expanded; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 0fa2db305b7c..288541f977fb 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -101,6 +101,206 @@ static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret) #ifdef CONFIG_DYNAMIC_FTRACE +static int trace_selftest_test_probe1_cnt; +static void trace_selftest_test_probe1_func(unsigned long ip, + unsigned long pip) +{ + trace_selftest_test_probe1_cnt++; +} + +static int trace_selftest_test_probe2_cnt; +static void trace_selftest_test_probe2_func(unsigned long ip, + unsigned long pip) +{ + trace_selftest_test_probe2_cnt++; +} + +static int trace_selftest_test_probe3_cnt; +static void trace_selftest_test_probe3_func(unsigned long ip, + unsigned long pip) +{ + trace_selftest_test_probe3_cnt++; +} + +static int trace_selftest_test_global_cnt; +static void trace_selftest_test_global_func(unsigned long ip, + unsigned long pip) +{ + trace_selftest_test_global_cnt++; +} + +static int trace_selftest_test_dyn_cnt; +static void trace_selftest_test_dyn_func(unsigned long ip, + unsigned long pip) +{ + trace_selftest_test_dyn_cnt++; +} + +static struct ftrace_ops test_probe1 = { + .func = trace_selftest_test_probe1_func, +}; + +static struct ftrace_ops test_probe2 = { + .func = trace_selftest_test_probe2_func, +}; + +static struct ftrace_ops test_probe3 = { + .func = trace_selftest_test_probe3_func, +}; + +static struct ftrace_ops test_global = { + .func = trace_selftest_test_global_func, + .flags = FTRACE_OPS_FL_GLOBAL, +}; + +static void print_counts(void) +{ + printk("(%d %d %d %d %d) ", + trace_selftest_test_probe1_cnt, + trace_selftest_test_probe2_cnt, + trace_selftest_test_probe3_cnt, + trace_selftest_test_global_cnt, + trace_selftest_test_dyn_cnt); +} + +static void reset_counts(void) +{ + trace_selftest_test_probe1_cnt = 0; + trace_selftest_test_probe2_cnt = 0; + trace_selftest_test_probe3_cnt = 0; + trace_selftest_test_global_cnt = 0; + trace_selftest_test_dyn_cnt = 0; +} + +static int trace_selftest_ops(int cnt) +{ + int save_ftrace_enabled = ftrace_enabled; + struct ftrace_ops *dyn_ops; + char *func1_name; + char *func2_name; + int len1; + int len2; + int ret = -1; + + printk(KERN_CONT "PASSED\n"); + pr_info("Testing dynamic ftrace ops #%d: ", cnt); + + ftrace_enabled = 1; + reset_counts(); + + /* Handle PPC64 '.' name */ + func1_name = "*" __stringify(DYN_FTRACE_TEST_NAME); + func2_name = "*" __stringify(DYN_FTRACE_TEST_NAME2); + len1 = strlen(func1_name); + len2 = strlen(func2_name); + + /* + * Probe 1 will trace function 1. + * Probe 2 will trace function 2. + * Probe 3 will trace functions 1 and 2. + */ + ftrace_set_filter(&test_probe1, func1_name, len1, 1); + ftrace_set_filter(&test_probe2, func2_name, len2, 1); + ftrace_set_filter(&test_probe3, func1_name, len1, 1); + ftrace_set_filter(&test_probe3, func2_name, len2, 0); + + register_ftrace_function(&test_probe1); + register_ftrace_function(&test_probe2); + register_ftrace_function(&test_probe3); + register_ftrace_function(&test_global); + + DYN_FTRACE_TEST_NAME(); + + print_counts(); + + if (trace_selftest_test_probe1_cnt != 1) + goto out; + if (trace_selftest_test_probe2_cnt != 0) + goto out; + if (trace_selftest_test_probe3_cnt != 1) + goto out; + if (trace_selftest_test_global_cnt == 0) + goto out; + + DYN_FTRACE_TEST_NAME2(); + + print_counts(); + + if (trace_selftest_test_probe1_cnt != 1) + goto out; + if (trace_selftest_test_probe2_cnt != 1) + goto out; + if (trace_selftest_test_probe3_cnt != 2) + goto out; + + /* Add a dynamic probe */ + dyn_ops = kzalloc(sizeof(*dyn_ops), GFP_KERNEL); + if (!dyn_ops) { + printk("MEMORY ERROR "); + goto out; + } + + dyn_ops->func = trace_selftest_test_dyn_func; + + register_ftrace_function(dyn_ops); + + trace_selftest_test_global_cnt = 0; + + DYN_FTRACE_TEST_NAME(); + + print_counts(); + + if (trace_selftest_test_probe1_cnt != 2) + goto out_free; + if (trace_selftest_test_probe2_cnt != 1) + goto out_free; + if (trace_selftest_test_probe3_cnt != 3) + goto out_free; + if (trace_selftest_test_global_cnt == 0) + goto out; + if (trace_selftest_test_dyn_cnt == 0) + goto out_free; + + DYN_FTRACE_TEST_NAME2(); + + print_counts(); + + if (trace_selftest_test_probe1_cnt != 2) + goto out_free; + if (trace_selftest_test_probe2_cnt != 2) + goto out_free; + if (trace_selftest_test_probe3_cnt != 4) + goto out_free; + + ret = 0; + out_free: + unregister_ftrace_function(dyn_ops); + kfree(dyn_ops); + + out: + /* Purposely unregister in the same order */ + unregister_ftrace_function(&test_probe1); + unregister_ftrace_function(&test_probe2); + unregister_ftrace_function(&test_probe3); + unregister_ftrace_function(&test_global); + + /* Make sure everything is off */ + reset_counts(); + DYN_FTRACE_TEST_NAME(); + DYN_FTRACE_TEST_NAME(); + + if (trace_selftest_test_probe1_cnt || + trace_selftest_test_probe2_cnt || + trace_selftest_test_probe3_cnt || + trace_selftest_test_global_cnt || + trace_selftest_test_dyn_cnt) + ret = -1; + + ftrace_enabled = save_ftrace_enabled; + + return ret; +} + /* Test dynamic code modification and ftrace filters */ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, struct trace_array *tr, @@ -166,16 +366,20 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, /* check the trace buffer */ ret = trace_test_buffer(tr, &count); - trace->reset(tr); tracing_start(); /* we should only have one item */ if (!ret && count != 1) { + trace->reset(tr); printk(KERN_CONT ".. filter failed count=%ld ..", count); ret = -1; goto out; } + /* Test the ops with global tracing running */ + ret = trace_selftest_ops(1); + trace->reset(tr); + out: ftrace_enabled = save_ftrace_enabled; tracer_enabled = save_tracer_enabled; @@ -183,6 +387,10 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, /* Enable tracing on all functions again */ ftrace_set_global_filter(NULL, 0, 1); + /* Test the ops with global tracing off */ + if (!ret) + ret = trace_selftest_ops(2); + return ret; } #else diff --git a/kernel/trace/trace_selftest_dynamic.c b/kernel/trace/trace_selftest_dynamic.c index 54dd77cce5bf..b4c475a0a48b 100644 --- a/kernel/trace/trace_selftest_dynamic.c +++ b/kernel/trace/trace_selftest_dynamic.c @@ -5,3 +5,9 @@ int DYN_FTRACE_TEST_NAME(void) /* used to call mcount */ return 0; } + +int DYN_FTRACE_TEST_NAME2(void) +{ + /* used to call mcount */ + return 0; +} -- cgit v1.2.3