From e48c178814b4a33f84f62d01f5a601ebd57fbba8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 6 Jul 2016 09:18:30 +0200 Subject: perf/core: Optimize perf_pmu_sched_task() For perf record -b, which requires the pmu::sched_task callback the current code is rather expensive: 7.68% sched-pipe [kernel.vmlinux] [k] perf_pmu_sched_task 5.95% sched-pipe [kernel.vmlinux] [k] __switch_to 5.20% sched-pipe [kernel.vmlinux] [k] __intel_pmu_disable_all 3.95% sched-pipe perf [.] worker_thread The problem is that it will iterate all registered PMUs, most of which will not have anything to do. Avoid this by keeping an explicit list of PMUs that have requested the callback. The perf_sched_cb_{inc,dec}() functions already take the required pmu argument, and now that these functions are no longer called from NMI context we can use them to manage a list. With this patch applied the function doesn't show up in the top 4 anymore (it dropped to 18th place). 6.67% sched-pipe [kernel.vmlinux] [k] __switch_to 6.18% sched-pipe [kernel.vmlinux] [k] __intel_pmu_disable_all 3.92% sched-pipe [kernel.vmlinux] [k] switch_mm_irqs_off 3.71% sched-pipe perf [.] 
worker_thread Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 2b6b43cc0dd5..529c41fa73c8 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -774,6 +774,9 @@ struct perf_cpu_context { #ifdef CONFIG_CGROUP_PERF struct perf_cgroup *cgrp; #endif + + struct list_head sched_cb_entry; + int sched_cb_usage; }; struct perf_output_handle { -- cgit v1.2.3 From 29dd3288705f26cc27663e79061209dabce2d5b9 Mon Sep 17 00:00:00 2001 From: Madhavan Srinivasan Date: Wed, 17 Aug 2016 15:06:08 +0530 Subject: bitmap.h, perf/core: Fix the mask in perf_output_sample_regs() When decoding the perf_regs mask in perf_output_sample_regs(), we loop through the mask using find_first_bit and find_next_bit functions. While the existing code works fine in most cases, the logic is broken for big-endian 32-bit kernels. When reading a u64 mask using (u32 *)(&val)[0], find_*_bit() assumes that it gets the lower 32 bits of u64, but instead it gets the upper 32 bits - which is wrong. The fix is to swap the words of the u64 to handle this case. This is _not_ a regular endianness swap. 
Suggested-by: Yury Norov Signed-off-by: Madhavan Srinivasan Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Yury Norov Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Jiri Olsa Cc: Linus Torvalds Cc: Michael Ellerman Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/1471426568-31051-2-git-send-email-maddy@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- include/linux/bitmap.h | 18 ++++++++++++++++++ kernel/events/core.c | 5 +++-- 2 files changed, 21 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 598bc999f4c2..3b77588a9360 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -339,6 +339,24 @@ static inline int bitmap_parse(const char *buf, unsigned int buflen, return __bitmap_parse(buf, buflen, 0, maskp, nmaskbits); } +/* + * bitmap_from_u64 - Check and swap words within u64. + * @mask: source bitmap + * @dst: destination bitmap + * + * In 32-bit Big Endian kernel, when using (u32 *)(&val)[*] + * to read u64 mask, we will get the wrong word. + * That is "(u32 *)(&val)[0]" gets the upper 32 bits, + * but we expect the lower 32-bits of u64. 
+ */ +static inline void bitmap_from_u64(unsigned long *dst, u64 mask) +{ + dst[0] = mask & ULONG_MAX; + + if (sizeof(mask) > sizeof(unsigned long)) + dst[1] = mask >> 32; +} + #endif /* __ASSEMBLY__ */ #endif /* __LINUX_BITMAP_H */ diff --git a/kernel/events/core.c b/kernel/events/core.c index ca4fde5ed268..849919c2f3d7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5340,9 +5340,10 @@ perf_output_sample_regs(struct perf_output_handle *handle, struct pt_regs *regs, u64 mask) { int bit; + DECLARE_BITMAP(_mask, 64); - for_each_set_bit(bit, (const unsigned long *) &mask, - sizeof(mask) * BITS_PER_BYTE) { + bitmap_from_u64(_mask, mask); + for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) { u64 val; val = perf_reg_value(regs, bit); -- cgit v1.2.3 From 4ff6a8debf48a7bf48e93c01da720785070d3a25 Mon Sep 17 00:00:00 2001 From: David Carrillo-Cisneros Date: Wed, 17 Aug 2016 13:55:05 -0700 Subject: perf/core: Generalize event->group_flags Currently, PERF_GROUP_SOFTWARE is used in the group_flags field of a group's leader to indicate that is_software_event(event) is true for all events in a group. This is the only usage of event->group_flags. This pattern of setting a group-level flag when all events in the group share a property is useful for the flag introduced in the next patch and for future CQM/CMT flags. So this patch generalizes group_flags to work as an aggregate of event level flags. PERF_GROUP_SOFTWARE denotes an immutable event property. All other flags that I intend to add are also determinable at event initialization. To better convey the above, this patch renames event's group_flags to group_caps and PERF_GROUP_SOFTWARE to PERF_EV_CAP_SOFTWARE. Individual event flags are stored in the new event->event_caps. Since the cap flags do not change after event initialization, there is no need to serialize event_caps. 
This new field is used when events are added to a context, similarly to how PERF_GROUP_SOFTWARE and is_software_event() worked. Lastly, for consistency, updates is_software_event() to rely on event_caps instead of the context index. Signed-off-by: David Carrillo-Cisneros Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Paul Turner Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vegard Nossum Cc: Vince Weaver Link: http://lkml.kernel.org/r/1471467307-61171-3-git-send-email-davidcc@google.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 18 +++++++++++++----- kernel/events/core.c | 16 ++++++++-------- 2 files changed, 21 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 529c41fa73c8..6f7459f72dfd 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -510,9 +510,12 @@ typedef void (*perf_overflow_handler_t)(struct perf_event *, struct perf_sample_data *, struct pt_regs *regs); -enum perf_group_flag { - PERF_GROUP_SOFTWARE = 0x1, -}; +/* + * Event capabilities. For event_caps and groups caps. + * + * PERF_EV_CAP_SOFTWARE: Is a software event. + */ +#define PERF_EV_CAP_SOFTWARE BIT(0) #define SWEVENT_HLIST_BITS 8 #define SWEVENT_HLIST_SIZE (1 << SWEVENT_HLIST_BITS) @@ -568,7 +571,12 @@ struct perf_event { struct hlist_node hlist_entry; struct list_head active_entry; int nr_siblings; - int group_flags; + + /* Not serialized. Only written during event initialization. */ + int event_caps; + /* The cumulative AND of all event_caps for events in this group. 
*/ + int group_caps; + struct perf_event *group_leader; struct pmu *pmu; void *pmu_private; @@ -988,7 +996,7 @@ static inline bool is_sampling_event(struct perf_event *event) */ static inline int is_software_event(struct perf_event *event) { - return event->pmu->task_ctx_nr == perf_sw_context; + return event->event_caps & PERF_EV_CAP_SOFTWARE; } extern struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; diff --git a/kernel/events/core.c b/kernel/events/core.c index 849919c2f3d7..8c42a5ae9030 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1475,8 +1475,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) if (event->group_leader == event) { struct list_head *list; - if (is_software_event(event)) - event->group_flags |= PERF_GROUP_SOFTWARE; + event->group_caps = event->event_caps; list = ctx_group_list(event, ctx); list_add_tail(&event->group_entry, list); @@ -1630,9 +1629,7 @@ static void perf_group_attach(struct perf_event *event) WARN_ON_ONCE(group_leader->ctx != event->ctx); - if (group_leader->group_flags & PERF_GROUP_SOFTWARE && - !is_software_event(event)) - group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; + group_leader->group_caps &= event->event_caps; list_add_tail(&event->group_entry, &group_leader->sibling_list); group_leader->nr_siblings++; @@ -1723,7 +1720,7 @@ static void perf_group_detach(struct perf_event *event) sibling->group_leader = sibling; /* Inherit group flags from the previous leader */ - sibling->group_flags = event->group_flags; + sibling->group_caps = event->group_caps; WARN_ON_ONCE(sibling->ctx != event->ctx); } @@ -2149,7 +2146,7 @@ static int group_can_go_on(struct perf_event *event, /* * Groups consisting entirely of software events can always go on. 
*/ - if (event->group_flags & PERF_GROUP_SOFTWARE) + if (event->group_caps & PERF_EV_CAP_SOFTWARE) return 1; /* * If an exclusive group is already on, no other hardware @@ -9490,6 +9487,9 @@ SYSCALL_DEFINE5(perf_event_open, goto err_alloc; } + if (pmu->task_ctx_nr == perf_sw_context) + event->event_caps |= PERF_EV_CAP_SOFTWARE; + if (group_leader && (is_software_event(event) != is_software_event(group_leader))) { if (is_software_event(event)) { @@ -9503,7 +9503,7 @@ SYSCALL_DEFINE5(perf_event_open, */ pmu = group_leader->pmu; } else if (is_software_event(group_leader) && - (group_leader->group_flags & PERF_GROUP_SOFTWARE)) { + (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) { /* * In case the group is a pure software group, and we * try to add a hardware event, move the whole group to -- cgit v1.2.3 From d6a2f9035bfc27d0e9d78b13635dda9fb017ac01 Mon Sep 17 00:00:00 2001 From: David Carrillo-Cisneros Date: Wed, 17 Aug 2016 13:55:06 -0700 Subject: perf/core: Introduce PMU_EV_CAP_READ_ACTIVE_PKG Introduce the flag PMU_EV_CAP_READ_ACTIVE_PKG, useful for uncore events, that allows a PMU to signal the generic perf code that an event is readable in the current CPU if the event is active in a CPU in the same package as the current CPU. This is an optimization that avoids an unnecessary IPI for the common case where uncore events are run and read in the same package but in different CPUs. As an example, the IPI removal speeds up perf_read() in my Haswell system as follows: - For event UNC_C_LLC_LOOKUP: From 260 us to 31 us. - For event RAPL's power/energy-cores/: From 255 us to 27 us. For the optimization to work, all events in the group must have it (similarly to PERF_EV_CAP_SOFTWARE). 
Signed-off-by: David Carrillo-Cisneros Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: David Carrillo-Cisneros Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Paul Turner Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vegard Nossum Cc: Vince Weaver Link: http://lkml.kernel.org/r/1471467307-61171-4-git-send-email-davidcc@google.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 3 +++ kernel/events/core.c | 25 +++++++++++++++++++++++-- 2 files changed, 26 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 6f7459f72dfd..5c5362584aba 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -514,8 +514,11 @@ typedef void (*perf_overflow_handler_t)(struct perf_event *, * Event capabilities. For event_caps and groups caps. * * PERF_EV_CAP_SOFTWARE: Is a software event. + * PERF_EV_CAP_READ_ACTIVE_PKG: A CPU event (or cgroup event) that can be read + * from any CPU in the package where it is active. 
*/ #define PERF_EV_CAP_SOFTWARE BIT(0) +#define PERF_EV_CAP_READ_ACTIVE_PKG BIT(1) #define SWEVENT_HLIST_BITS 8 #define SWEVENT_HLIST_SIZE (1 << SWEVENT_HLIST_BITS) diff --git a/kernel/events/core.c b/kernel/events/core.c index 8c42a5ae9030..3f07e6cfc1b6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3424,6 +3424,22 @@ struct perf_read_data { int ret; }; +static int find_cpu_to_read(struct perf_event *event, int local_cpu) +{ + int event_cpu = event->oncpu; + u16 local_pkg, event_pkg; + + if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) { + event_pkg = topology_physical_package_id(event_cpu); + local_pkg = topology_physical_package_id(local_cpu); + + if (event_pkg == local_pkg) + return local_cpu; + } + + return event_cpu; +} + /* * Cross CPU call to read the hardware event */ @@ -3545,7 +3561,7 @@ u64 perf_event_read_local(struct perf_event *event) static int perf_event_read(struct perf_event *event, bool group) { - int ret = 0; + int ret = 0, cpu_to_read, local_cpu; /* * If event is enabled and currently active on a CPU, update the @@ -3557,7 +3573,12 @@ static int perf_event_read(struct perf_event *event, bool group) .group = group, .ret = 0, }; - ret = smp_call_function_single(event->oncpu, __perf_event_read, &data, 1); + + local_cpu = get_cpu(); + cpu_to_read = find_cpu_to_read(event, local_cpu); + put_cpu(); + + ret = smp_call_function_single(cpu_to_read, __perf_event_read, &data, 1); /* The event must have been read from an online CPU: */ WARN_ON_ONCE(ret); ret = ret ? : data.ret; -- cgit v1.2.3