From e870e9a1240bcef1157ffaaf71dac63362e71904 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 2 Jul 2010 11:07:32 +0800 Subject: tracing: Allow to disable cmdline recording We found that even enabling a single trace event that will rarely be triggered can add big overhead to context switch. (lmbench context switch test) ------------------------------------------------- 2p/0K 2p/16K 2p/64K 8p/16K 8p/64K 16p/16K 16p/64K ctxsw ctxsw ctxsw ctxsw ctxsw ctxsw ctxsw ------ ------ ------ ------ ------ ------- ------- 2.19 2.3 2.21 2.56 2.13 2.54 2.07 2.39 2.51 2.35 2.75 2.27 2.81 2.24 The overhead is 6% ~ 11%. It's because when a trace event is enabled 3 tracepoints (sched_switch, sched_wakeup, sched_wakeup_new) will be activated to map pid to cmdname. We'd like to avoid this overhead, so add a trace option '(no)record-cmd' to allow to disable cmdline recording. Signed-off-by: Li Zefan LKML-Reference: <4C2D57F4.2050204@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 01df7ca4ead7..2b7b1395b4d5 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -152,11 +152,13 @@ extern int ftrace_event_reg(struct ftrace_event_call *event, enum { TRACE_EVENT_FL_ENABLED_BIT, TRACE_EVENT_FL_FILTERED_BIT, + TRACE_EVENT_FL_RECORDED_CMD_BIT, }; enum { - TRACE_EVENT_FL_ENABLED = (1 << TRACE_EVENT_FL_ENABLED_BIT), - TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT), + TRACE_EVENT_FL_ENABLED = (1 << TRACE_EVENT_FL_ENABLED_BIT), + TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT), + TRACE_EVENT_FL_RECORDED_CMD = (1 << TRACE_EVENT_FL_RECORDED_CMD_BIT), }; struct ftrace_event_call { @@ -174,6 +176,7 @@ struct ftrace_event_call { * 32 bit flags: * bit 1: enabled * bit 2: filter_active + * bit 3: enabled cmd record * * Changes to flags must hold the event_mutex. * -- cgit v1.2.3 From bc289ae98b75d93228d24f521ef02a076e506e94 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 3 Jun 2010 18:26:24 +0800 Subject: tracing: Reduce latency and remove percpu trace_seq __print_flags() and __print_symbolic() use percpu trace_seq: 1) Its memory is allocated at compile time, it wastes memory if we don't use tracing. 2) It is percpu data and it wastes more memory for multi-cpus system. 3) It disables preemption when it executes its core routine "trace_seq_printf(s, "%s: ", #call);" and introduces latency. So we move this trace_seq to struct trace_iterator. Signed-off-by: Lai Jiangshan LKML-Reference: <4C078350.7090106@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 5 +++-- include/trace/ftrace.h | 12 +++--------- kernel/trace/trace_output.c | 3 --- 3 files changed, 6 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 2b7b1395b4d5..02b8b24f8f51 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -11,8 +11,6 @@ struct trace_array; struct tracer; struct dentry; -DECLARE_PER_CPU(struct trace_seq, ftrace_event_seq); - struct trace_print_flags { unsigned long mask; const char *name; @@ -58,6 +56,9 @@ struct trace_iterator { struct ring_buffer_iter *buffer_iter[NR_CPUS]; unsigned long iter_flags; + /* trace_seq for __print_flags() and __print_symbolic() etc. */ + struct trace_seq tmp_seq; + /* The below is zeroed out in pipe_read */ struct trace_seq seq; struct trace_entry *ent; diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 55c1fd1bbc3d..fb783d94fc54 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -145,7 +145,7 @@ * struct trace_seq *s = &iter->seq; * struct ftrace_raw_ *field; <-- defined in stage 1 * struct trace_entry *entry; - * struct trace_seq *p; + * struct trace_seq *p = &iter->tmp_seq; * int ret; * * entry = iter->ent; @@ -157,12 +157,10 @@ * * field = (typeof(field))entry; * - * p = &get_cpu_var(ftrace_event_seq); * trace_seq_init(p); * ret = trace_seq_printf(s, "%s: ", ); * if (ret) * ret = trace_seq_printf(s, "\n"); - * put_cpu(); * if (!ret) * return TRACE_TYPE_PARTIAL_LINE; * @@ -216,7 +214,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags, \ struct trace_seq *s = &iter->seq; \ struct ftrace_raw_##call *field; \ struct trace_entry *entry; \ - struct trace_seq *p; \ + struct trace_seq *p = &iter->tmp_seq; \ int ret; \ \ event = container_of(trace_event, struct ftrace_event_call, \ @@ -231,12 +229,10 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags, \ \ field = (typeof(field))entry; \ \ - p = &get_cpu_var(ftrace_event_seq); \ trace_seq_init(p); \ ret = trace_seq_printf(s, "%s: ", event->name); \ if (ret) \ ret = trace_seq_printf(s, print); \ - put_cpu(); \ if (!ret) \ return TRACE_TYPE_PARTIAL_LINE; \ \ @@ -255,7 +251,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags, \ struct trace_seq *s = &iter->seq; \ struct ftrace_raw_##template *field; \ struct trace_entry *entry; \ - struct trace_seq *p; \ + struct trace_seq *p = &iter->tmp_seq; \ int ret; \ \ entry = iter->ent; \ @@ -267,12 +263,10 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags, \ \ field = (typeof(field))entry; \ \ - p = &get_cpu_var(ftrace_event_seq); \ trace_seq_init(p); \ ret = trace_seq_printf(s, "%s: ", #call); \ if (ret) \ ret = trace_seq_printf(s, print); \ - put_cpu(); \ if (!ret) \ return TRACE_TYPE_PARTIAL_LINE; \ \ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 57c1b4596470..1ba64d3cc567 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -16,9 +16,6 @@ DECLARE_RWSEM(trace_event_mutex); -DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq); -EXPORT_PER_CPU_SYMBOL(ftrace_event_seq); - static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; static int next_event_type = __TRACE_LAST_TYPE + 1; -- cgit v1.2.3 From 9849ed4d72251d273524efb8b70be0be9aecb1df Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Tue, 20 Jul 2010 03:13:35 -0400 Subject: tracing/documentation: Document dynamic ftracer internals Add more details to the dynamic function tracing design implementation. Signed-off-by: Mike Frysinger LKML-Reference: <1279610015-10250-1-git-send-email-vapier@gentoo.org> Signed-off-by: Steven Rostedt --- Documentation/trace/ftrace-design.txt | 153 ++++++++++++++++++++++++++++++++-- include/linux/ftrace.h | 5 ++ 2 files changed, 153 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/Documentation/trace/ftrace-design.txt b/Documentation/trace/ftrace-design.txt index f1f81afee8a0..dc52bd442c92 100644 --- a/Documentation/trace/ftrace-design.txt +++ b/Documentation/trace/ftrace-design.txt @@ -13,6 +13,9 @@ Note that this focuses on architecture implementation details only. If you want more explanation of a feature in terms of common code, review the common ftrace.txt file. +Ideally, everyone who wishes to retain performance while supporting tracing in +their kernel should make it all the way to dynamic ftrace support. + Prerequisites ------------- @@ -215,7 +218,7 @@ An arch may pass in a unique value (frame pointer) to both the entering and exiting of a function. On exit, the value is compared and if it does not match, then it will panic the kernel. This is largely a sanity check for bad code generation with gcc. If gcc for your port sanely updates the frame -pointer under different opitmization levels, then ignore this option. +pointer under different optimization levels, then ignore this option. However, adding support for it isn't terribly difficult. In your assembly code that calls prepare_ftrace_return(), pass the frame pointer as the 3rd argument. @@ -234,7 +237,7 @@ If you can't trace NMI functions, then skip this option. HAVE_SYSCALL_TRACEPOINTS ---------------------- +------------------------ You need very few things to get the syscalls tracing in an arch. @@ -250,12 +253,152 @@ You need very few things to get the syscalls tracing in an arch. HAVE_FTRACE_MCOUNT_RECORD ------------------------- -See scripts/recordmcount.pl for more info. +See scripts/recordmcount.pl for more info. Just fill in the arch-specific +details for how to locate the addresses of mcount call sites via objdump. +This option doesn't make much sense without also implementing dynamic ftrace. + +HAVE_DYNAMIC_FTRACE +------------------- + +You will first need HAVE_FTRACE_MCOUNT_RECORD and HAVE_FUNCTION_TRACER, so +scroll your reader back up if you got over eager. + +Once those are out of the way, you will need to implement: + - asm/ftrace.h: + - MCOUNT_ADDR + - ftrace_call_adjust() + - struct dyn_arch_ftrace{} + - asm code: + - mcount() (new stub) + - ftrace_caller() + - ftrace_call() + - ftrace_stub() + - C code: + - ftrace_dyn_arch_init() + - ftrace_make_nop() + - ftrace_make_call() + - ftrace_update_ftrace_func() + +First you will need to fill out some arch details in your asm/ftrace.h. + +Define MCOUNT_ADDR as the address of your mcount symbol similar to: + #define MCOUNT_ADDR ((unsigned long)mcount) +Since no one else will have a decl for that function, you will need to: + extern void mcount(void); + +You will also need the helper function ftrace_call_adjust(). Most people +will be able to stub it out like so: + static inline unsigned long ftrace_call_adjust(unsigned long addr) + { + return addr; + }
+Lastly you will need the custom dyn_arch_ftrace structure. If you need +some extra state when runtime patching arbitrary call sites, this is the +place. For now though, create an empty struct: + struct dyn_arch_ftrace { + /* No extra data needed */ + }; + +With the header out of the way, we can fill out the assembly code. While we +did already create a mcount() function earlier, dynamic ftrace only wants a +stub function. This is because the mcount() will only be used during boot +and then all references to it will be patched out never to return. Instead, +the guts of the old mcount() will be used to create a new ftrace_caller() +function. Because the two are hard to merge, it will most likely be a lot +easier to have two separate definitions split up by #ifdefs. Same goes for +the ftrace_stub() as that will now be inlined in ftrace_caller(). + +Before we get confused anymore, let's check out some pseudo code so you can +implement your own stuff in assembly: -HAVE_DYNAMIC_FTRACE ---------------------- +void mcount(void) +{ + return; +} + +void ftrace_caller(void) +{ + /* implement HAVE_FUNCTION_TRACE_MCOUNT_TEST if you desire */ + + /* save all state needed by the ABI (see paragraph above) */ + + unsigned long frompc = ...; + unsigned long selfpc = - MCOUNT_INSN_SIZE; + +ftrace_call: + ftrace_stub(frompc, selfpc); + + /* restore all state needed by the ABI */ + +ftrace_stub: + return; +} + +This might look a little odd at first, but keep in mind that we will be runtime +patching multiple things. First, only functions that we actually want to trace +will be patched to call ftrace_caller(). Second, since we only have one tracer +active at a time, we will patch the ftrace_caller() function itself to call the +specific tracer in question. That is the point of the ftrace_call label. + +With that in mind, let's move on to the C code that will actually be doing the +runtime patching. You'll need a little knowledge of your arch's opcodes in +order to make it through the next section. + +Every arch has an init callback function. If you need to do something early on +to initialize some state, this is the time to do that. Otherwise, this simple +function below should be sufficient for most people: + +int __init ftrace_dyn_arch_init(void *data) +{ + /* return value is done indirectly via data */ + *(unsigned long *)data = 0; + + return 0; +} + +There are two functions that are used to do runtime patching of arbitrary +functions. The first is used to turn the mcount call site into a nop (which +is what helps us retain runtime performance when not tracing). The second is +used to turn the mcount call site into a call to an arbitrary location (but +typically that is ftracer_caller()). See the general function definition in +linux/ftrace.h for the functions: + ftrace_make_nop() + ftrace_make_call() +The rec->ip value is the address of the mcount call site that was collected +by the scripts/recordmcount.pl during build time. + +The last function is used to do runtime patching of the active tracer. This +will be modifying the assembly code at the location of the ftrace_call symbol +inside of the ftrace_caller() function. So you should have sufficient padding +at that location to support the new function calls you'll be inserting. Some +people will be using a "call" type instruction while others will be using a +"branch" type instruction. Specifically, the function is: + ftrace_update_ftrace_func() + + +HAVE_DYNAMIC_FTRACE + HAVE_FUNCTION_GRAPH_TRACER +------------------------------------------------ + +The function grapher needs a few tweaks in order to work with dynamic ftrace. +Basically, you will need to: + - update: + - ftrace_caller() + - ftrace_graph_call() + - ftrace_graph_caller() + - implement: + - ftrace_enable_ftrace_graph_caller() + - ftrace_disable_ftrace_graph_caller()
+Quick notes: + - add a nop stub after the ftrace_call location named ftrace_graph_call; + stub needs to be large enough to support a call to ftrace_graph_caller() + - update ftrace_graph_caller() to work with being called by the new + ftrace_caller() since some semantics may have changed + - ftrace_enable_ftrace_graph_caller() will runtime patch the + ftrace_graph_call location with a call to ftrace_graph_caller() + - ftrace_disable_ftrace_graph_caller() will runtime patch the + ftrace_graph_call location with nops diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 41e46330d9be..dcd6a7c3a435 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1,3 +1,8 @@ +/* + * Ftrace header. For implementation details beyond the random comments + * scattered below, see: Documentation/trace/ftrace-design.txt + */ + #ifndef _LINUX_FTRACE_H #define _LINUX_FTRACE_H -- cgit v1.2.3