From 39bed6cbb842d8edf5a26b01122b391d36775b5e Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Fri, 23 Jan 2015 18:45:40 +0000 Subject: perf: Make perf_cgroup_from_task() global Move perf_cgroup_from_task() from kernel/events/ to include/linux/ along with the necessary struct definitions, so that it can be used by the PMU code. When the upcoming Intel Cache Monitoring PMU driver assigns monitoring IDs to perf events, it needs to be able to check whether any two monitoring events overlap (say, a cgroup and task event), which means we need to be able to lookup the cgroup associated with a task (if any). Signed-off-by: Matt Fleming Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Kanaka Juvva Cc: Linus Torvalds Cc: Paul Mackerras Cc: Vikas Shivappa Link: http://lkml.kernel.org/r/1422038748-21397-2-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 724d3720c9b1..cae4a9481777 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -53,6 +53,7 @@ struct perf_guest_info_callbacks { #include #include #include +#include #include struct perf_callchain_entry { @@ -547,6 +548,35 @@ struct perf_output_handle { int page; }; +#ifdef CONFIG_CGROUP_PERF + +/* + * perf_cgroup_info keeps track of time_enabled for a cgroup. + * This is a per-cpu dynamically allocated data structure. + */ +struct perf_cgroup_info { + u64 time; + u64 timestamp; +}; + +struct perf_cgroup { + struct cgroup_subsys_state css; + struct perf_cgroup_info __percpu *info; +}; + +/* + * Must ensure cgroup is pinned (css_get) before calling + * this function. In other words, we cannot call this function + * if there is no cgroup event for the current CPU context. + */ +static inline struct perf_cgroup * +perf_cgroup_from_task(struct task_struct *task) +{ + return container_of(task_css(task, perf_event_cgrp_id), + struct perf_cgroup, css); +} +#endif /* CONFIG_CGROUP_PERF */ + #ifdef CONFIG_PERF_EVENTS extern int perf_pmu_register(struct pmu *pmu, const char *name, int type); -- cgit v1.2.3 From eacd3ecc34472ce3751eedfc94e44c7cc6eb6305 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Fri, 23 Jan 2015 18:45:41 +0000 Subject: perf: Add ->count() function to read per-package counters For PMU drivers that record per-package counters, the ->count variable cannot be used to record an accurate aggregated value, since it's not possible to perform SMP cross-calls to cpus on other packages from the context in which we update ->count. Introduce a new optional ->count() accessor function that can be used to customize how values are collected. If a PMU driver doesn't provide a ->count() function, we fallback to the existing code. There is necessarily a window of staleness with this approach because the task that generated the counter value may not have been scheduled by the cpu recently. An alternative and more complex approach would be to use a hrtimer to periodically refresh the values from a more permissive scheduling context. So, we're trading off complexity for accuracy. Signed-off-by: Matt Fleming Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Kanaka Juvva Cc: Linus Torvalds Cc: Vikas Shivappa Link: http://lkml.kernel.org/r/1422038748-21397-3-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 10 ++++++++++ kernel/events/core.c | 5 ++++- 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index cae4a9481777..9fc9b0d31442 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -272,6 +272,11 @@ struct pmu { */ size_t task_ctx_size; + + /* + * Return the count value for a counter. + */ + u64 (*count) (struct perf_event *event); /*optional*/ }; /** @@ -770,6 +775,11 @@ static inline void perf_event_task_sched_out(struct task_struct *prev, __perf_event_task_sched_out(prev, next); } +static inline u64 __perf_event_count(struct perf_event *event) +{ + return local64_read(&event->count) + atomic64_read(&event->child_count); +} + extern void perf_event_mmap(struct vm_area_struct *vma); extern struct perf_guest_info_callbacks *perf_guest_cbs; extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); diff --git a/kernel/events/core.c b/kernel/events/core.c index 072de3143244..4e8dc596f101 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3194,7 +3194,10 @@ static void __perf_event_read(void *info) static inline u64 perf_event_count(struct perf_event *event) { - return local64_read(&event->count) + atomic64_read(&event->child_count); + if (event->pmu->count) + return event->pmu->count(event); + + return __perf_event_count(event); } static u64 perf_event_read(struct perf_event *event) -- cgit v1.2.3 From 4afbb24ce5e723c8a093a6674a3c33062175078a Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Fri, 23 Jan 2015 18:45:44 +0000 Subject: perf/x86/intel: Add Intel Cache QoS Monitoring support Future Intel Xeon processors support a Cache QoS Monitoring feature that allows tracking of the LLC occupancy for a task or task group, i.e. the amount of data in pulled into the LLC for the task (group). Currently the PMU only supports per-cpu events. We create an event for each cpu and read out all the LLC occupancy values. Because this results in duplicate values being written out to userspace, we also export a .per-pkg event file so that the perf tools only accumulate values for one cpu per package. Signed-off-by: Matt Fleming Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Kanaka Juvva Cc: Linus Torvalds Cc: Vikas Shivappa Link: http://lkml.kernel.org/r/1422038748-21397-6-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_cqm.c | 530 +++++++++++++++++++++++++++++ include/linux/perf_event.h | 7 + 2 files changed, 537 insertions(+) create mode 100644 arch/x86/kernel/cpu/perf_event_intel_cqm.c (limited to 'include') diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c new file mode 100644 index 000000000000..05b4cd26f426 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c @@ -0,0 +1,530 @@ +/* + * Intel Cache Quality-of-Service Monitoring (CQM) support. + * + * Based very, very heavily on work by Peter Zijlstra. + */ + +#include +#include +#include +#include "perf_event.h" + +#define MSR_IA32_PQR_ASSOC 0x0c8f +#define MSR_IA32_QM_CTR 0x0c8e +#define MSR_IA32_QM_EVTSEL 0x0c8d + +static unsigned int cqm_max_rmid = -1; +static unsigned int cqm_l3_scale; /* supposedly cacheline size */ + +struct intel_cqm_state { + raw_spinlock_t lock; + int rmid; + int cnt; +}; + +static DEFINE_PER_CPU(struct intel_cqm_state, cqm_state); + +/* + * Protects cache_cgroups. + */ +static DEFINE_MUTEX(cache_mutex); + +/* + * Groups of events that have the same target(s), one RMID per group. + */ +static LIST_HEAD(cache_groups); + +/* + * Mask of CPUs for reading CQM values. We only need one per-socket. + */ +static cpumask_t cqm_cpumask; + +#define RMID_VAL_ERROR (1ULL << 63) +#define RMID_VAL_UNAVAIL (1ULL << 62) + +#define QOS_L3_OCCUP_EVENT_ID (1 << 0) + +#define QOS_EVENT_MASK QOS_L3_OCCUP_EVENT_ID + +static u64 __rmid_read(unsigned long rmid) +{ + u64 val; + + /* + * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt, + * it just says that to increase confusion. + */ + wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid); + rdmsrl(MSR_IA32_QM_CTR, val); + + /* + * Aside from the ERROR and UNAVAIL bits, assume this thing returns + * the number of cachelines tagged with @rmid. + */ + return val; +} + +static unsigned long *cqm_rmid_bitmap; + +/* + * Returns < 0 on fail. + */ +static int __get_rmid(void) +{ + return bitmap_find_free_region(cqm_rmid_bitmap, cqm_max_rmid, 0); +} + +static void __put_rmid(int rmid) +{ + bitmap_release_region(cqm_rmid_bitmap, rmid, 0); +} + +static int intel_cqm_setup_rmid_cache(void) +{ + cqm_rmid_bitmap = kmalloc(sizeof(long) * BITS_TO_LONGS(cqm_max_rmid), GFP_KERNEL); + if (!cqm_rmid_bitmap) + return -ENOMEM; + + bitmap_zero(cqm_rmid_bitmap, cqm_max_rmid); + + /* + * RMID 0 is special and is always allocated. It's used for all + * tasks that are not monitored. + */ + bitmap_allocate_region(cqm_rmid_bitmap, 0, 0); + + return 0; +} + +/* + * Determine if @a and @b measure the same set of tasks. + */ +static bool __match_event(struct perf_event *a, struct perf_event *b) +{ + if ((a->attach_state & PERF_ATTACH_TASK) != + (b->attach_state & PERF_ATTACH_TASK)) + return false; + + /* not task */ + + return true; /* if not task, we're machine wide */ +} + +/* + * Determine if @a's tasks intersect with @b's tasks + */ +static bool __conflict_event(struct perf_event *a, struct perf_event *b) +{ + /* + * If one of them is not a task, same story as above with cgroups. + */ + if (!(a->attach_state & PERF_ATTACH_TASK) || + !(b->attach_state & PERF_ATTACH_TASK)) + return true; + + /* + * Must be non-overlapping. + */ + return false; +} + +/* + * Find a group and setup RMID. + * + * If we're part of a group, we use the group's RMID. + */ +static int intel_cqm_setup_event(struct perf_event *event, + struct perf_event **group) +{ + struct perf_event *iter; + int rmid; + + list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) { + if (__match_event(iter, event)) { + /* All tasks in a group share an RMID */ + event->hw.cqm_rmid = iter->hw.cqm_rmid; + *group = iter; + return 0; + } + + if (__conflict_event(iter, event)) + return -EBUSY; + } + + rmid = __get_rmid(); + if (rmid < 0) + return rmid; + + event->hw.cqm_rmid = rmid; + return 0; +} + +static void intel_cqm_event_read(struct perf_event *event) +{ + unsigned long rmid = event->hw.cqm_rmid; + u64 val; + + val = __rmid_read(rmid); + + /* + * Ignore this reading on error states and do not update the value. + */ + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) + return; + + local64_set(&event->count, val); +} + +static void intel_cqm_event_start(struct perf_event *event, int mode) +{ + struct intel_cqm_state *state = this_cpu_ptr(&cqm_state); + unsigned long rmid = event->hw.cqm_rmid; + unsigned long flags; + + if (!(event->hw.cqm_state & PERF_HES_STOPPED)) + return; + + event->hw.cqm_state &= ~PERF_HES_STOPPED; + + raw_spin_lock_irqsave(&state->lock, flags); + + if (state->cnt++) + WARN_ON_ONCE(state->rmid != rmid); + else + WARN_ON_ONCE(state->rmid); + + state->rmid = rmid; + wrmsrl(MSR_IA32_PQR_ASSOC, state->rmid); + + raw_spin_unlock_irqrestore(&state->lock, flags); +} + +static void intel_cqm_event_stop(struct perf_event *event, int mode) +{ + struct intel_cqm_state *state = this_cpu_ptr(&cqm_state); + unsigned long flags; + + if (event->hw.cqm_state & PERF_HES_STOPPED) + return; + + event->hw.cqm_state |= PERF_HES_STOPPED; + + raw_spin_lock_irqsave(&state->lock, flags); + intel_cqm_event_read(event); + + if (!--state->cnt) { + state->rmid = 0; + wrmsrl(MSR_IA32_PQR_ASSOC, 0); + } else { + WARN_ON_ONCE(!state->rmid); + } + + raw_spin_unlock_irqrestore(&state->lock, flags); +} + +static int intel_cqm_event_add(struct perf_event *event, int mode) +{ + int rmid; + + event->hw.cqm_state = PERF_HES_STOPPED; + rmid = event->hw.cqm_rmid; + WARN_ON_ONCE(!rmid); + + if (mode & PERF_EF_START) + intel_cqm_event_start(event, mode); + + return 0; +} + +static void intel_cqm_event_del(struct perf_event *event, int mode) +{ + intel_cqm_event_stop(event, mode); +} + +static void intel_cqm_event_destroy(struct perf_event *event) +{ + struct perf_event *group_other = NULL; + + mutex_lock(&cache_mutex); + + /* + * If there's another event in this group... + */ + if (!list_empty(&event->hw.cqm_group_entry)) { + group_other = list_first_entry(&event->hw.cqm_group_entry, + struct perf_event, + hw.cqm_group_entry); + list_del(&event->hw.cqm_group_entry); + } + + /* + * And we're the group leader.. + */ + if (!list_empty(&event->hw.cqm_groups_entry)) { + /* + * If there was a group_other, make that leader, otherwise + * destroy the group and return the RMID. + */ + if (group_other) { + list_replace(&event->hw.cqm_groups_entry, + &group_other->hw.cqm_groups_entry); + } else { + int rmid = event->hw.cqm_rmid; + + __put_rmid(rmid); + list_del(&event->hw.cqm_groups_entry); + } + } + + mutex_unlock(&cache_mutex); +} + +static struct pmu intel_cqm_pmu; + +/* + * XXX there's a bit of a problem in that we cannot simply do the one + * event per node as one would want, since that one event would one get + * scheduled on the one cpu. But we want to 'schedule' the RMID on all + * CPUs. + * + * This means we want events for each CPU, however, that generates a lot + * of duplicate values out to userspace -- this is not to be helped + * unless we want to change the core code in some way. Fore more info, + * see intel_cqm_event_read(). + */ +static int intel_cqm_event_init(struct perf_event *event) +{ + struct perf_event *group = NULL; + int err; + + if (event->attr.type != intel_cqm_pmu.type) + return -ENOENT; + + if (event->attr.config & ~QOS_EVENT_MASK) + return -EINVAL; + + if (event->cpu == -1) + return -EINVAL; + + /* unsupported modes and filters */ + if (event->attr.exclude_user || + event->attr.exclude_kernel || + event->attr.exclude_hv || + event->attr.exclude_idle || + event->attr.exclude_host || + event->attr.exclude_guest || + event->attr.sample_period) /* no sampling */ + return -EINVAL; + + INIT_LIST_HEAD(&event->hw.cqm_group_entry); + INIT_LIST_HEAD(&event->hw.cqm_groups_entry); + + event->destroy = intel_cqm_event_destroy; + + mutex_lock(&cache_mutex); + + err = intel_cqm_setup_event(event, &group); /* will also set rmid */ + if (err) + goto out; + + if (group) { + list_add_tail(&event->hw.cqm_group_entry, + &group->hw.cqm_group_entry); + } else { + list_add_tail(&event->hw.cqm_groups_entry, + &cache_groups); + } + +out: + mutex_unlock(&cache_mutex); + return err; +} + +EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01"); +EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1"); +EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes"); +EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL); +EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1"); + +static struct attribute *intel_cqm_events_attr[] = { + EVENT_PTR(intel_cqm_llc), + EVENT_PTR(intel_cqm_llc_pkg), + EVENT_PTR(intel_cqm_llc_unit), + EVENT_PTR(intel_cqm_llc_scale), + EVENT_PTR(intel_cqm_llc_snapshot), + NULL, +}; + +static struct attribute_group intel_cqm_events_group = { + .name = "events", + .attrs = intel_cqm_events_attr, +}; + +PMU_FORMAT_ATTR(event, "config:0-7"); +static struct attribute *intel_cqm_formats_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group intel_cqm_format_group = { + .name = "format", + .attrs = intel_cqm_formats_attr, +}; + +static const struct attribute_group *intel_cqm_attr_groups[] = { + &intel_cqm_events_group, + &intel_cqm_format_group, + NULL, +}; + +static struct pmu intel_cqm_pmu = { + .attr_groups = intel_cqm_attr_groups, + .task_ctx_nr = perf_sw_context, + .event_init = intel_cqm_event_init, + .add = intel_cqm_event_add, + .del = intel_cqm_event_del, + .start = intel_cqm_event_start, + .stop = intel_cqm_event_stop, + .read = intel_cqm_event_read, +}; + +static inline void cqm_pick_event_reader(int cpu) +{ + int phys_id = topology_physical_package_id(cpu); + int i; + + for_each_cpu(i, &cqm_cpumask) { + if (phys_id == topology_physical_package_id(i)) + return; /* already got reader for this socket */ + } + + cpumask_set_cpu(cpu, &cqm_cpumask); +} + +static void intel_cqm_cpu_prepare(unsigned int cpu) +{ + struct intel_cqm_state *state = &per_cpu(cqm_state, cpu); + struct cpuinfo_x86 *c = &cpu_data(cpu); + + raw_spin_lock_init(&state->lock); + state->rmid = 0; + state->cnt = 0; + + WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid); + WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale); +} + +static void intel_cqm_cpu_exit(unsigned int cpu) +{ + int phys_id = topology_physical_package_id(cpu); + int i; + + /* + * Is @cpu a designated cqm reader? + */ + if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask)) + return; + + for_each_online_cpu(i) { + if (i == cpu) + continue; + + if (phys_id == topology_physical_package_id(i)) { + cpumask_set_cpu(i, &cqm_cpumask); + break; + } + } +} + +static int intel_cqm_cpu_notifier(struct notifier_block *nb, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + intel_cqm_cpu_prepare(cpu); + break; + case CPU_DOWN_PREPARE: + intel_cqm_cpu_exit(cpu); + break; + case CPU_STARTING: + cqm_pick_event_reader(cpu); + break; + } + + return NOTIFY_OK; +} + +static const struct x86_cpu_id intel_cqm_match[] = { + { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC }, + {} +}; + +static int __init intel_cqm_init(void) +{ + char *str, scale[20]; + int i, cpu, ret; + + if (!x86_match_cpu(intel_cqm_match)) + return -ENODEV; + + cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale; + + /* + * It's possible that not all resources support the same number + * of RMIDs. Instead of making scheduling much more complicated + * (where we have to match a task's RMID to a cpu that supports + * that many RMIDs) just find the minimum RMIDs supported across + * all cpus. + * + * Also, check that the scales match on all cpus. + */ + cpu_notifier_register_begin(); + + for_each_online_cpu(cpu) { + struct cpuinfo_x86 *c = &cpu_data(cpu); + + if (c->x86_cache_max_rmid < cqm_max_rmid) + cqm_max_rmid = c->x86_cache_max_rmid; + + if (c->x86_cache_occ_scale != cqm_l3_scale) { + pr_err("Multiple LLC scale values, disabling\n"); + ret = -EINVAL; + goto out; + } + } + + snprintf(scale, sizeof(scale), "%u", cqm_l3_scale); + str = kstrdup(scale, GFP_KERNEL); + if (!str) { + ret = -ENOMEM; + goto out; + } + + event_attr_intel_cqm_llc_scale.event_str = str; + + ret = intel_cqm_setup_rmid_cache(); + if (ret) + goto out; + + for_each_online_cpu(i) { + intel_cqm_cpu_prepare(i); + cqm_pick_event_reader(i); + } + + __perf_cpu_notifier(intel_cqm_cpu_notifier); + + ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1); + + if (ret) + pr_err("Intel CQM perf registration failed: %d\n", ret); + else + pr_info("Intel CQM monitoring enabled\n"); + +out: + cpu_notifier_register_done(); + + return ret; +} +device_initcall(intel_cqm_init); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 9fc9b0d31442..ca5504c48f4f 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -123,6 +123,13 @@ struct hw_perf_event { /* for tp_event->class */ struct list_head tp_list; }; + struct { /* intel_cqm */ + int cqm_state; + int cqm_rmid; + struct list_head cqm_events_entry; + struct list_head cqm_groups_entry; + struct list_head cqm_group_entry; + }; #ifdef CONFIG_HAVE_HW_BREAKPOINT struct { /* breakpoint */ /* -- cgit v1.2.3 From bfe1fcd2688f557a6b6a88f59ea7619228728bd7 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Fri, 23 Jan 2015 18:45:46 +0000 Subject: perf/x86/intel: Support task events with Intel CQM Add support for task events as well as system-wide events. This change has a big impact on the way that we gather LLC occupancy values in intel_cqm_event_read(). Currently, for system-wide (per-cpu) events we defer processing to userspace which knows how to discard all but one cpu result per package. Things aren't so simple for task events because we need to do the value aggregation ourselves. To do this, we defer updating the LLC occupancy value in event->count from intel_cqm_event_read() and do an SMP cross-call to read values for all packages in intel_cqm_event_count(). We need to ensure that we only do this for one task event per cache group, otherwise we'll report duplicate values. If we're a system-wide event we want to fallback to the default perf_event_count() implementation. Refactor this into a common function so that we don't duplicate the code. Also, introduce PERF_TYPE_INTEL_CQM, since we need a way to track an event's task (if the event isn't per-cpu) inside of the Intel CQM PMU driver. This task information is only availble in the upper layers of the perf infrastructure. Other perf backends stash the target task in event->hw.*target so we need to do something similar. The task is used to determine whether events should share a cache group and an RMID. Signed-off-by: Matt Fleming Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Kanaka Juvva Cc: Linus Torvalds Cc: Vikas Shivappa Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/1422038748-21397-8-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_cqm.c | 195 +++++++++++++++++++++++++---- include/linux/perf_event.h | 1 + include/uapi/linux/perf_event.h | 1 + kernel/events/core.c | 2 + 4 files changed, 178 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c index b5d9d746dbc0..8003d87afd89 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c @@ -182,23 +182,124 @@ fail: /* * Determine if @a and @b measure the same set of tasks. + * + * If @a and @b measure the same set of tasks then we want to share a + * single RMID. */ static bool __match_event(struct perf_event *a, struct perf_event *b) { + /* Per-cpu and task events don't mix */ if ((a->attach_state & PERF_ATTACH_TASK) != (b->attach_state & PERF_ATTACH_TASK)) return false; - /* not task */ +#ifdef CONFIG_CGROUP_PERF + if (a->cgrp != b->cgrp) + return false; +#endif + + /* If not task event, we're machine wide */ + if (!(b->attach_state & PERF_ATTACH_TASK)) + return true; + + /* + * Events that target same task are placed into the same cache group. + */ + if (a->hw.cqm_target == b->hw.cqm_target) + return true; + + /* + * Are we an inherited event? + */ + if (b->parent == a) + return true; + + return false; +} + +#ifdef CONFIG_CGROUP_PERF +static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event) +{ + if (event->attach_state & PERF_ATTACH_TASK) + return perf_cgroup_from_task(event->hw.cqm_target); - return true; /* if not task, we're machine wide */ + return event->cgrp; } +#endif /* * Determine if @a's tasks intersect with @b's tasks + * + * There are combinations of events that we explicitly prohibit, + * + * PROHIBITS + * system-wide -> cgroup and task + * cgroup -> system-wide + * -> task in cgroup + * task -> system-wide + * -> task in cgroup + * + * Call this function before allocating an RMID. */ static bool __conflict_event(struct perf_event *a, struct perf_event *b) { +#ifdef CONFIG_CGROUP_PERF + /* + * We can have any number of cgroups but only one system-wide + * event at a time. + */ + if (a->cgrp && b->cgrp) { + struct perf_cgroup *ac = a->cgrp; + struct perf_cgroup *bc = b->cgrp; + + /* + * This condition should have been caught in + * __match_event() and we should be sharing an RMID. + */ + WARN_ON_ONCE(ac == bc); + + if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) || + cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup)) + return true; + + return false; + } + + if (a->cgrp || b->cgrp) { + struct perf_cgroup *ac, *bc; + + /* + * cgroup and system-wide events are mutually exclusive + */ + if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) || + (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK))) + return true; + + /* + * Ensure neither event is part of the other's cgroup + */ + ac = event_to_cgroup(a); + bc = event_to_cgroup(b); + if (ac == bc) + return true; + + /* + * Must have cgroup and non-intersecting task events. + */ + if (!ac || !bc) + return false; + + /* + * We have cgroup and task events, and the task belongs + * to a cgroup. Check for for overlap. + */ + if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) || + cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup)) + return true; + + return false; + } +#endif /* * If one of them is not a task, same story as above with cgroups. */ @@ -245,9 +346,16 @@ static int intel_cqm_setup_event(struct perf_event *event, static void intel_cqm_event_read(struct perf_event *event) { - unsigned long rmid = event->hw.cqm_rmid; + unsigned long rmid; u64 val; + /* + * Task events are handled by intel_cqm_event_count(). + */ + if (event->cpu == -1) + return; + + rmid = event->hw.cqm_rmid; val = __rmid_read(rmid); /* @@ -259,6 +367,63 @@ static void intel_cqm_event_read(struct perf_event *event) local64_set(&event->count, val); } +struct rmid_read { + unsigned int rmid; + atomic64_t value; +}; + +static void __intel_cqm_event_count(void *info) +{ + struct rmid_read *rr = info; + u64 val; + + val = __rmid_read(rr->rmid); + + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) + return; + + atomic64_add(val, &rr->value); +} + +static inline bool cqm_group_leader(struct perf_event *event) +{ + return !list_empty(&event->hw.cqm_groups_entry); +} + +static u64 intel_cqm_event_count(struct perf_event *event) +{ + struct rmid_read rr = { + .rmid = event->hw.cqm_rmid, + .value = ATOMIC64_INIT(0), + }; + + /* + * We only need to worry about task events. System-wide events + * are handled like usual, i.e. entirely with + * intel_cqm_event_read(). + */ + if (event->cpu != -1) + return __perf_event_count(event); + + /* + * Only the group leader gets to report values. This stops us + * reporting duplicate values to userspace, and gives us a clear + * rule for which task gets to report the values. + * + * Note that it is impossible to attribute these values to + * specific packages - we forfeit that ability when we create + * task events. + */ + if (!cqm_group_leader(event)) + return 0; + + on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1); + + local64_set(&event->count, atomic64_read(&rr.value)); + + return __perf_event_count(event); +} + static void intel_cqm_event_start(struct perf_event *event, int mode) { struct intel_cqm_state *state = this_cpu_ptr(&cqm_state); @@ -344,7 +509,7 @@ static void intel_cqm_event_destroy(struct perf_event *event) /* * And we're the group leader.. */ - if (!list_empty(&event->hw.cqm_groups_entry)) { + if (cqm_group_leader(event)) { /* * If there was a group_other, make that leader, otherwise * destroy the group and return the RMID. @@ -365,17 +530,6 @@ static void intel_cqm_event_destroy(struct perf_event *event) static struct pmu intel_cqm_pmu; -/* - * XXX there's a bit of a problem in that we cannot simply do the one - * event per node as one would want, since that one event would one get - * scheduled on the one cpu. But we want to 'schedule' the RMID on all - * CPUs. - * - * This means we want events for each CPU, however, that generates a lot - * of duplicate values out to userspace -- this is not to be helped - * unless we want to change the core code in some way. Fore more info, - * see intel_cqm_event_read(). - */ static int intel_cqm_event_init(struct perf_event *event) { struct perf_event *group = NULL; @@ -387,9 +541,6 @@ static int intel_cqm_event_init(struct perf_event *event) if (event->attr.config & ~QOS_EVENT_MASK) return -EINVAL; - if (event->cpu == -1) - return -EINVAL; - /* unsupported modes and filters */ if (event->attr.exclude_user || event->attr.exclude_kernel || @@ -407,7 +558,8 @@ static int intel_cqm_event_init(struct perf_event *event) mutex_lock(&cache_mutex); - err = intel_cqm_setup_event(event, &group); /* will also set rmid */ + /* Will also set rmid */ + err = intel_cqm_setup_event(event, &group); if (err) goto out; @@ -470,6 +622,7 @@ static struct pmu intel_cqm_pmu = { .start = intel_cqm_event_start, .stop = intel_cqm_event_stop, .read = intel_cqm_event_read, + .count = intel_cqm_event_count, }; static inline void cqm_pick_event_reader(int cpu) @@ -599,8 +752,8 @@ static int __init intel_cqm_init(void) __perf_cpu_notifier(intel_cqm_cpu_notifier); - ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1); - + ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", + PERF_TYPE_INTEL_CQM); if (ret) pr_err("Intel CQM perf registration failed: %d\n", ret); else diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ca5504c48f4f..dac4c2831d82 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -129,6 +129,7 @@ struct hw_perf_event { struct list_head cqm_events_entry; struct list_head cqm_groups_entry; struct list_head cqm_group_entry; + struct task_struct *cqm_target; }; #ifdef CONFIG_HAVE_HW_BREAKPOINT struct { /* breakpoint */ diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 1e3cd07cf76e..3c8b45de57ec 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -32,6 +32,7 @@ enum perf_type_id { PERF_TYPE_HW_CACHE = 3, PERF_TYPE_RAW = 4, PERF_TYPE_BREAKPOINT = 5, + PERF_TYPE_INTEL_CQM = 6, PERF_TYPE_MAX, /* non-ABI */ }; diff --git a/kernel/events/core.c b/kernel/events/core.c index 1fc3bae5904a..71109a045450 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7181,6 +7181,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, else if (attr->type == PERF_TYPE_BREAKPOINT) event->hw.bp_target = task; #endif + else if (attr->type == PERF_TYPE_INTEL_CQM) + event->hw.cqm_target = task; } if (!overflow_handler && parent_event) { -- cgit v1.2.3 From 50f16a8bf9d7a92c437ed1867d0f7e1dc6a9aca9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 5 Mar 2015 22:10:19 +0100 Subject: perf: Remove type specific target pointers The only reason CQM had to use a hard-coded pmu type was so it could use cqm_target in hw_perf_event. Do away with the {tp,bp,cqm}_target pointers and provide a non type specific one. This allows us to do away with that silly pmu type as well. Signed-off-by: Peter Zijlstra (Intel) Cc: Vince Weaver Cc: acme@kernel.org Cc: acme@redhat.com Cc: hpa@zytor.com Cc: jolsa@redhat.com Cc: kanaka.d.juvva@intel.com Cc: matt.fleming@intel.com Cc: tglx@linutronix.de Cc: torvalds@linux-foundation.org Cc: vikas.shivappa@linux.intel.com Link: http://lkml.kernel.org/r/20150305211019.GU21418@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- arch/arm/kernel/hw_breakpoint.c | 2 +- arch/arm64/kernel/hw_breakpoint.c | 2 +- arch/x86/kernel/cpu/perf_event_intel_cqm.c | 7 +++---- include/linux/perf_event.h | 4 +--- include/uapi/linux/perf_event.h | 1 - kernel/events/core.c | 14 ++++---------- kernel/events/hw_breakpoint.c | 8 ++++---- kernel/trace/trace_uprobe.c | 10 +++++----- 8 files changed, 19 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c index 7fc70ae21185..dc7d0a95bd36 100644 --- a/arch/arm/kernel/hw_breakpoint.c +++ b/arch/arm/kernel/hw_breakpoint.c @@ -648,7 +648,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp) * Per-cpu breakpoints are not supported by our stepping * mechanism. */ - if (!bp->hw.bp_target) + if (!bp->hw.target) return -EINVAL; /* diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index df1cf15377b4..d062f35911c2 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -527,7 +527,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp) * Disallow per-task kernel breakpoints since these would * complicate the stepping code. */ - if (info->ctrl.privilege == AARCH64_BREAKPOINT_EL1 && bp->hw.bp_target) + if (info->ctrl.privilege == AARCH64_BREAKPOINT_EL1 && bp->hw.target) return -EINVAL; return 0; diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c index 9a8ef8376fcd..e4d1b8b738fa 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c @@ -263,7 +263,7 @@ static bool __match_event(struct perf_event *a, struct perf_event *b) /* * Events that target same task are placed into the same cache group. */ - if (a->hw.cqm_target == b->hw.cqm_target) + if (a->hw.target == b->hw.target) return true; /* @@ -279,7 +279,7 @@ static bool __match_event(struct perf_event *a, struct perf_event *b) static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event) { if (event->attach_state & PERF_ATTACH_TASK) - return perf_cgroup_from_task(event->hw.cqm_target); + return perf_cgroup_from_task(event->hw.target); return event->cgrp; } @@ -1365,8 +1365,7 @@ static int __init intel_cqm_init(void) __perf_cpu_notifier(intel_cqm_cpu_notifier); - ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", - PERF_TYPE_INTEL_CQM); + ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1); if (ret) pr_err("Intel CQM perf registration failed: %d\n", ret); else diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index dac4c2831d82..5aa49d7bfd07 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -119,7 +119,6 @@ struct hw_perf_event { struct hrtimer hrtimer; }; struct { /* tracepoint */ - struct task_struct *tp_target; /* for tp_event->class */ struct list_head tp_list; }; @@ -129,7 +128,6 @@ struct hw_perf_event { struct list_head cqm_events_entry; struct list_head cqm_groups_entry; struct list_head cqm_group_entry; - struct task_struct *cqm_target; }; #ifdef CONFIG_HAVE_HW_BREAKPOINT struct { /* breakpoint */ @@ -138,12 +136,12 @@ struct hw_perf_event { * problem hw_breakpoint has with context * creation and event initalization. */ - struct task_struct *bp_target; struct arch_hw_breakpoint info; struct list_head bp_list; }; #endif }; + struct task_struct *target; int state; local64_t prev_count; u64 sample_period; diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 3c8b45de57ec..1e3cd07cf76e 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -32,7 +32,6 @@ enum perf_type_id { PERF_TYPE_HW_CACHE = 3, PERF_TYPE_RAW = 4, PERF_TYPE_BREAKPOINT = 5, - PERF_TYPE_INTEL_CQM = 6, PERF_TYPE_MAX, /* non-ABI */ }; diff --git a/kernel/events/core.c b/kernel/events/core.c index 71109a045450..525062b6fba1 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7171,18 +7171,12 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (task) { event->attach_state = PERF_ATTACH_TASK; - - if (attr->type == PERF_TYPE_TRACEPOINT) - event->hw.tp_target = task; -#ifdef CONFIG_HAVE_HW_BREAKPOINT /* - * hw_breakpoint is a bit difficult here.. + * XXX pmu::event_init needs to know what task to account to + * and we cannot use the ctx information because we need the + * pmu before we get a ctx. */ - else if (attr->type == PERF_TYPE_BREAKPOINT) - event->hw.bp_target = task; -#endif - else if (attr->type == PERF_TYPE_INTEL_CQM) - event->hw.cqm_target = task; + event->hw.target = task; } if (!overflow_handler && parent_event) { diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 9803a6600d49..92ce5f4ccc26 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -116,12 +116,12 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) */ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) { - struct task_struct *tsk = bp->hw.bp_target; + struct task_struct *tsk = bp->hw.target; struct perf_event *iter; int count = 0; list_for_each_entry(iter, &bp_task_head, hw.bp_list) { - if (iter->hw.bp_target == tsk && + if (iter->hw.target == tsk && find_slot_idx(iter) == type && (iter->cpu < 0 || cpu == iter->cpu)) count += hw_breakpoint_weight(iter); @@ -153,7 +153,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, int nr; nr = info->cpu_pinned; - if (!bp->hw.bp_target) + if (!bp->hw.target) nr += max_task_bp_pinned(cpu, type); else nr += task_bp_pinned(cpu, bp, type); @@ -210,7 +210,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, weight = -weight; /* Pinned counter cpu profiling */ - if (!bp->hw.bp_target) { + if (!bp->hw.target) { get_bp_info(bp->cpu, type)->cpu_pinned += weight; return; } diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index b11441321e7a..93fdc7791eaa 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1005,7 +1005,7 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) return true; list_for_each_entry(event, &filter->perf_events, hw.tp_list) { - if (event->hw.tp_target->mm == mm) + if (event->hw.target->mm == mm) return true; } @@ -1015,7 +1015,7 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) static inline bool uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event) { - return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm); + return __uprobe_perf_filter(&tu->filter, event->hw.target->mm); } static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) @@ -1023,10 +1023,10 @@ static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) bool done; write_lock(&tu->filter.rwlock); - if (event->hw.tp_target) { + if (event->hw.target) { list_del(&event->hw.tp_list); done = tu->filter.nr_systemwide || - (event->hw.tp_target->flags & PF_EXITING) || + (event->hw.target->flags & PF_EXITING) || uprobe_filter_event(tu, event); } else { tu->filter.nr_systemwide--; @@ -1046,7 +1046,7 @@ static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) int err; write_lock(&tu->filter.rwlock); - if (event->hw.tp_target) { + if (event->hw.target) { /* * event->parent != NULL means copy_process(), we can avoid * uprobe_apply(). current->mm must be probed and we can rely -- cgit v1.2.3