From 3a4ac121c2cacbf97d493fa3bc42ead88657abe4 Mon Sep 17 00:00:00 2001
From: CodyYao-oc
Date: Mon, 13 Apr 2020 11:14:29 +0800
Subject: x86/perf: Add hardware performance events support for Zhaoxin CPU.

Zhaoxin CPUs provide facilities for monitoring performance via a PMU
(Performance Monitor Unit), but so far this functionality has been unused.
Therefore, add support for the Zhaoxin PMU to make performance-related
hardware events available.

The PMU is mostly an Intel Architectural PerfMon-v2, with a novel erratum
for the ZXC line. It supports the following events:

  -----------------------------------------------------------------------------------------------------------------------------------
  Event                      | Event Select | Umask | Description
  -----------------------------------------------------------------------------------------------------------------------------------
  cpu-cycles                 |     82h      |  00h  | unhalted core clock
  instructions               |     00h      |  00h  | number of instructions at retirement
  cache-references           |     15h      |  05h  | number of fillq pushes at the current cycle
  cache-misses               |     1ah      |  05h  | number of l2 misses pushed by fillq
  branch-instructions        |     28h      |  00h  | counts the number of branch instructions retired
  branch-misses              |     29h      |  00h  | mispredicted branch instructions at retirement
  bus-cycles                 |     83h      |  00h  | unhalted bus clock
  stalled-cycles-frontend    |     01h      |  01h  | increments each cycle the # of uops issued by the RAT to RS
  stalled-cycles-backend     |     0fh      |  04h  | RS0/1/2/3/45 empty
  L1-dcache-loads            |     68h      |  05h  | number of retire/commit loads
  L1-dcache-load-misses      |     4bh      |  05h  | retired load uops whose data source followed an L1 miss
  L1-dcache-stores           |     69h      |  06h  | number of retire/commit stores, no LEA
  L1-dcache-store-misses     |     62h      |  05h  | cache lines in M state evicted out of L1D due to snoop HitM or dirty line replacement
  L1-icache-loads            |     00h      |  03h  | number of L1I cache accesses for valid normal fetch, including un-cacheable accesses
  L1-icache-load-misses      |     01h      |  03h  | number of L1I cache misses for valid normal fetch, including un-cacheable misses
  L1-icache-prefetches       |     0ah      |  03h  | number of prefetches
  L1-icache-prefetch-misses  |     0bh      |  03h  | number of prefetch misses
  dTLB-loads                 |     68h      |  05h  | number of retire/commit loads
  dTLB-load-misses           |     2ch      |  05h  | number of load operations that miss all level TLBs and cause a tablewalk
  dTLB-stores                |     69h      |  06h  | number of retire/commit stores, no LEA
  dTLB-store-misses          |     30h      |  05h  | number of store operations that miss all level TLBs and cause a tablewalk
  dTLB-prefetches            |     64h      |  05h  | number of hardware PTE prefetch requests dispatched out of the prefetch FIFO
  dTLB-prefetch-misses       |     65h      |  05h  | number of hardware PTE prefetch requests that miss the L1D data cache
  iTLB-load                  |     00h      |  00h  | actually counts instructions
  iTLB-load-misses           |     34h      |  05h  | number of code operations that miss all level TLBs and cause a tablewalk
  -----------------------------------------------------------------------------------------------------------------------------------
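For illustration only (not part of the patch): with this driver in place, any of the raw events
above can be requested through the generic perf interface by packing the event select into
config bits 0-7 and the umask into bits 8-15, matching the format attributes the driver exposes.
The sketch below is a hypothetical user-space example that counts the Zhaoxin cache-misses
event (event select 1ah, umask 05h) around a workload; it assumes a kernel with this patch
applied and permission to call perf_event_open().

	/* Hypothetical usage sketch, not part of this patch. */
	#include <linux/perf_event.h>
	#include <sys/syscall.h>
	#include <sys/ioctl.h>
	#include <unistd.h>
	#include <string.h>
	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		struct perf_event_attr attr;
		uint64_t count;
		int fd;

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = PERF_TYPE_RAW;
		attr.config = 0x051a;	/* umask 05h in bits 8-15, event 1ah in bits 0-7 */
		attr.disabled = 1;

		/* Count for the calling thread on any CPU. */
		fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
		if (fd < 0) {
			perror("perf_event_open");
			return 1;
		}

		ioctl(fd, PERF_EVENT_IOC_RESET, 0);
		ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
		/* ... workload under measurement ... */
		ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

		read(fd, &count, sizeof(count));
		printf("cache-misses: %llu\n", (unsigned long long)count);
		close(fd);
		return 0;
	}

The same event can typically be requested from the perf tool with a raw event specifier such
as -e r051a; that usage is standard perf tooling rather than anything introduced by this patch.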
----------------------------------------------------------------------------------------------------------------------------------- Reported-by: kbuild test robot Signed-off-by: CodyYao-oc Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1586747669-4827-1-git-send-email-CodyYao-oc@zhaoxin.com --- arch/x86/events/Makefile | 2 + arch/x86/events/core.c | 4 + arch/x86/events/perf_event.h | 10 + arch/x86/events/zhaoxin/Makefile | 2 + arch/x86/events/zhaoxin/core.c | 613 +++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/perfctr-watchdog.c | 8 + 6 files changed, 639 insertions(+) create mode 100644 arch/x86/events/zhaoxin/Makefile create mode 100644 arch/x86/events/zhaoxin/core.c (limited to 'arch/x86') diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index 9e07f554333f..6f1d1fde8b2d 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -3,3 +3,5 @@ obj-y += core.o probe.o obj-y += amd/ obj-$(CONFIG_X86_LOCAL_APIC) += msr.o obj-$(CONFIG_CPU_SUP_INTEL) += intel/ +obj-$(CONFIG_CPU_SUP_CENTAUR) += zhaoxin/ +obj-$(CONFIG_CPU_SUP_ZHAOXIN) += zhaoxin/ diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index a619763e96e1..9e63ee50b19a 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1839,6 +1839,10 @@ static int __init init_hw_perf_events(void) err = amd_pmu_init(); x86_pmu.name = "HYGON"; break; + case X86_VENDOR_ZHAOXIN: + case X86_VENDOR_CENTAUR: + err = zhaoxin_pmu_init(); + break; default: err = -ENOTSUPP; } diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index f1cd1ca1a77b..e17a3d8a47ed 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -618,6 +618,7 @@ struct x86_pmu { /* PMI handler bits */ unsigned int late_ack :1, + enabled_ack :1, counter_freezing :1; /* * sysfs attrs @@ -1133,3 +1134,12 @@ static inline int is_ht_workaround_enabled(void) return 0; } #endif /* CONFIG_CPU_SUP_INTEL */ + +#if ((defined CONFIG_CPU_SUP_CENTAUR) || (defined CONFIG_CPU_SUP_ZHAOXIN)) +int zhaoxin_pmu_init(void); +#else +static inline int zhaoxin_pmu_init(void) +{ + return 0; +} +#endif /*CONFIG_CPU_SUP_CENTAUR or CONFIG_CPU_SUP_ZHAOXIN*/ diff --git a/arch/x86/events/zhaoxin/Makefile b/arch/x86/events/zhaoxin/Makefile new file mode 100644 index 000000000000..642c1174d662 --- /dev/null +++ b/arch/x86/events/zhaoxin/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-y += core.o diff --git a/arch/x86/events/zhaoxin/core.c b/arch/x86/events/zhaoxin/core.c new file mode 100644 index 000000000000..898fa1ae9ceb --- /dev/null +++ b/arch/x86/events/zhaoxin/core.c @@ -0,0 +1,613 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Zhoaxin PMU; like Intel Architectural PerfMon-v2 + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "../perf_event.h" + +/* + * Zhaoxin PerfMon, used on zxc and later. 
+ */ +static u64 zx_pmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = { + + [PERF_COUNT_HW_CPU_CYCLES] = 0x0082, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0515, + [PERF_COUNT_HW_CACHE_MISSES] = 0x051a, + [PERF_COUNT_HW_BUS_CYCLES] = 0x0083, +}; + +static struct event_constraint zxc_event_constraints[] __read_mostly = { + + FIXED_EVENT_CONSTRAINT(0x0082, 1), /* unhalted core clock cycles */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint zxd_event_constraints[] __read_mostly = { + + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* retired instructions */ + FIXED_EVENT_CONSTRAINT(0x0082, 1), /* unhalted core clock cycles */ + FIXED_EVENT_CONSTRAINT(0x0083, 2), /* unhalted bus clock cycles */ + EVENT_CONSTRAINT_END +}; + +static __initconst const u64 zxd_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = { +[C(L1D)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0042, + [C(RESULT_MISS)] = 0x0538, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0x0043, + [C(RESULT_MISS)] = 0x0562, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(L1I)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0300, + [C(RESULT_MISS)] = 0x0301, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x030a, + [C(RESULT_MISS)] = 0x030b, + }, +}, +[C(LL)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(DTLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0042, + [C(RESULT_MISS)] = 0x052c, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0x0043, + [C(RESULT_MISS)] = 0x0530, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x0564, + [C(RESULT_MISS)] = 0x0565, + }, +}, +[C(ITLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x00c0, + [C(RESULT_MISS)] = 0x0534, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(BPU)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0700, + [C(RESULT_MISS)] = 0x0709, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(NODE)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +}; + +static __initconst const u64 zxe_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = { +[C(L1D)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0568, + [C(RESULT_MISS)] = 0x054b, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0x0669, + [C(RESULT_MISS)] = 0x0562, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(L1I)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0300, + [C(RESULT_MISS)] = 0x0301, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x030a, + [C(RESULT_MISS)] = 0x030b, + }, +}, +[C(LL)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0, + [C(RESULT_MISS)] = 0x0, + }, + [C(OP_WRITE)] = { + 
[C(RESULT_ACCESS)] = 0x0, + [C(RESULT_MISS)] = 0x0, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x0, + [C(RESULT_MISS)] = 0x0, + }, +}, +[C(DTLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0568, + [C(RESULT_MISS)] = 0x052c, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0x0669, + [C(RESULT_MISS)] = 0x0530, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x0564, + [C(RESULT_MISS)] = 0x0565, + }, +}, +[C(ITLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x00c0, + [C(RESULT_MISS)] = 0x0534, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(BPU)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0028, + [C(RESULT_MISS)] = 0x0029, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(NODE)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +}; + +static void zhaoxin_pmu_disable_all(void) +{ + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); +} + +static void zhaoxin_pmu_enable_all(int added) +{ + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); +} + +static inline u64 zhaoxin_pmu_get_status(void) +{ + u64 status; + + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + + return status; +} + +static inline void zhaoxin_pmu_ack_status(u64 ack) +{ + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); +} + +static inline void zxc_pmu_ack_status(u64 ack) +{ + /* + * ZXC needs global control enabled in order to clear status bits. + */ + zhaoxin_pmu_enable_all(0); + zhaoxin_pmu_ack_status(ack); + zhaoxin_pmu_disable_all(); +} + +static void zhaoxin_pmu_disable_fixed(struct hw_perf_event *hwc) +{ + int idx = hwc->idx - INTEL_PMC_IDX_FIXED; + u64 ctrl_val, mask; + + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + wrmsrl(hwc->config_base, ctrl_val); +} + +static void zhaoxin_pmu_disable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + zhaoxin_pmu_disable_fixed(hwc); + return; + } + + x86_pmu_disable_event(event); +} + +static void zhaoxin_pmu_enable_fixed(struct hw_perf_event *hwc) +{ + int idx = hwc->idx - INTEL_PMC_IDX_FIXED; + u64 ctrl_val, bits, mask; + + /* + * Enable IRQ generation (0x8), + * and enable ring-3 counting (0x2) and ring-0 counting (0x1) + * if requested: + */ + bits = 0x8ULL; + if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) + bits |= 0x2; + if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) + bits |= 0x1; + + bits <<= (idx * 4); + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + ctrl_val |= bits; + wrmsrl(hwc->config_base, ctrl_val); +} + +static void zhaoxin_pmu_enable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + zhaoxin_pmu_enable_fixed(hwc); + return; + } + + __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); +} + +/* + * This handler is triggered by the local APIC, so the APIC IRQ handling + * rules apply: + */ +static int zhaoxin_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_events *cpuc; + int handled = 0; + u64 status; + int bit; 
+ + cpuc = this_cpu_ptr(&cpu_hw_events); + apic_write(APIC_LVTPC, APIC_DM_NMI); + zhaoxin_pmu_disable_all(); + status = zhaoxin_pmu_get_status(); + if (!status) + goto done; + +again: + if (x86_pmu.enabled_ack) + zxc_pmu_ack_status(status); + else + zhaoxin_pmu_ack_status(status); + + inc_irq_stat(apic_perf_irqs); + + /* + * CondChgd bit 63 doesn't mean any overflow status. Ignore + * and clear the bit. + */ + if (__test_and_clear_bit(63, (unsigned long *)&status)) { + if (!status) + goto done; + } + + for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { + struct perf_event *event = cpuc->events[bit]; + + handled++; + + if (!test_bit(bit, cpuc->active_mask)) + continue; + + x86_perf_event_update(event); + perf_sample_data_init(&data, 0, event->hw.last_period); + + if (!x86_perf_event_set_period(event)) + continue; + + if (perf_event_overflow(event, &data, regs)) + x86_pmu_stop(event, 0); + } + + /* + * Repeat if there is more work to be done: + */ + status = zhaoxin_pmu_get_status(); + if (status) + goto again; + +done: + zhaoxin_pmu_enable_all(0); + return handled; +} + +static u64 zhaoxin_pmu_event_map(int hw_event) +{ + return zx_pmon_event_map[hw_event]; +} + +static struct event_constraint * +zhaoxin_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + struct event_constraint *c; + + if (x86_pmu.event_constraints) { + for_each_event_constraint(c, x86_pmu.event_constraints) { + if ((event->hw.config & c->cmask) == c->code) + return c; + } + } + + return &unconstrained; +} + +PMU_FORMAT_ATTR(event, "config:0-7"); +PMU_FORMAT_ATTR(umask, "config:8-15"); +PMU_FORMAT_ATTR(edge, "config:18"); +PMU_FORMAT_ATTR(inv, "config:23"); +PMU_FORMAT_ATTR(cmask, "config:24-31"); + +static struct attribute *zx_arch_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_cmask.attr, + NULL, +}; + +static ssize_t zhaoxin_event_sysfs_show(char *page, u64 config) +{ + u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT); + + return x86_event_sysfs_show(page, config, event); +} + +static const struct x86_pmu zhaoxin_pmu __initconst = { + .name = "zhaoxin", + .handle_irq = zhaoxin_pmu_handle_irq, + .disable_all = zhaoxin_pmu_disable_all, + .enable_all = zhaoxin_pmu_enable_all, + .enable = zhaoxin_pmu_enable_event, + .disable = zhaoxin_pmu_disable_event, + .hw_config = x86_pmu_hw_config, + .schedule_events = x86_schedule_events, + .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, + .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .event_map = zhaoxin_pmu_event_map, + .max_events = ARRAY_SIZE(zx_pmon_event_map), + .apic = 1, + /* + * For zxd/zxe, read/write operation for PMCx MSR is 48 bits. 
+ */ + .max_period = (1ULL << 47) - 1, + .get_event_constraints = zhaoxin_get_event_constraints, + + .format_attrs = zx_arch_formats_attr, + .events_sysfs_show = zhaoxin_event_sysfs_show, +}; + +static const struct { int id; char *name; } zx_arch_events_map[] __initconst = { + { PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" }, + { PERF_COUNT_HW_INSTRUCTIONS, "instructions" }, + { PERF_COUNT_HW_BUS_CYCLES, "bus cycles" }, + { PERF_COUNT_HW_CACHE_REFERENCES, "cache references" }, + { PERF_COUNT_HW_CACHE_MISSES, "cache misses" }, + { PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" }, + { PERF_COUNT_HW_BRANCH_MISSES, "branch misses" }, +}; + +static __init void zhaoxin_arch_events_quirk(void) +{ + int bit; + + /* disable event that reported as not presend by cpuid */ + for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(zx_arch_events_map)) { + zx_pmon_event_map[zx_arch_events_map[bit].id] = 0; + pr_warn("CPUID marked event: \'%s\' unavailable\n", + zx_arch_events_map[bit].name); + } +} + +__init int zhaoxin_pmu_init(void) +{ + union cpuid10_edx edx; + union cpuid10_eax eax; + union cpuid10_ebx ebx; + struct event_constraint *c; + unsigned int unused; + int version; + + pr_info("Welcome to zhaoxin pmu!\n"); + + /* + * Check whether the Architectural PerfMon supports + * hw_event or not. + */ + cpuid(10, &eax.full, &ebx.full, &unused, &edx.full); + + if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT - 1) + return -ENODEV; + + version = eax.split.version_id; + if (version != 2) + return -ENODEV; + + x86_pmu = zhaoxin_pmu; + pr_info("Version check pass!\n"); + + x86_pmu.version = version; + x86_pmu.num_counters = eax.split.num_counters; + x86_pmu.cntval_bits = eax.split.bit_width; + x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1; + x86_pmu.events_maskl = ebx.full; + x86_pmu.events_mask_len = eax.split.mask_length; + + x86_pmu.num_counters_fixed = edx.split.num_counters_fixed; + x86_add_quirk(zhaoxin_arch_events_quirk); + + switch (boot_cpu_data.x86) { + case 0x06: + if (boot_cpu_data.x86_model == 0x0f || boot_cpu_data.x86_model == 0x19) { + + x86_pmu.max_period = x86_pmu.cntval_mask >> 1; + + /* Clearing status works only if the global control is enable on zxc. 
*/ + x86_pmu.enabled_ack = 1; + + x86_pmu.event_constraints = zxc_event_constraints; + zx_pmon_event_map[PERF_COUNT_HW_INSTRUCTIONS] = 0; + zx_pmon_event_map[PERF_COUNT_HW_CACHE_REFERENCES] = 0; + zx_pmon_event_map[PERF_COUNT_HW_CACHE_MISSES] = 0; + zx_pmon_event_map[PERF_COUNT_HW_BUS_CYCLES] = 0; + + pr_cont("ZXC events, "); + break; + } + return -ENODEV; + + case 0x07: + zx_pmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = + X86_CONFIG(.event = 0x01, .umask = 0x01, .inv = 0x01, .cmask = 0x01); + + zx_pmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = + X86_CONFIG(.event = 0x0f, .umask = 0x04, .inv = 0, .cmask = 0); + + switch (boot_cpu_data.x86_model) { + case 0x1b: + memcpy(hw_cache_event_ids, zxd_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + x86_pmu.event_constraints = zxd_event_constraints; + + zx_pmon_event_map[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0700; + zx_pmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x0709; + + pr_cont("ZXD events, "); + break; + case 0x3b: + memcpy(hw_cache_event_ids, zxe_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + x86_pmu.event_constraints = zxd_event_constraints; + + zx_pmon_event_map[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0028; + zx_pmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x0029; + + pr_cont("ZXE events, "); + break; + default: + return -ENODEV; + } + break; + + default: + return -ENODEV; + } + + x86_pmu.intel_ctrl = (1 << (x86_pmu.num_counters)) - 1; + x86_pmu.intel_ctrl |= ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED; + + if (x86_pmu.event_constraints) { + for_each_event_constraint(c, x86_pmu.event_constraints) { + c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; + c->weight += x86_pmu.num_counters; + } + } + + return 0; +} + diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 9556930cd8c1..a5ee607a3b89 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -63,6 +63,10 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) case 15: return msr - MSR_P4_BPU_PERFCTR0; } + fallthrough; + case X86_VENDOR_ZHAOXIN: + case X86_VENDOR_CENTAUR: + return msr - MSR_ARCH_PERFMON_PERFCTR0; } return 0; } @@ -92,6 +96,10 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) case 15: return msr - MSR_P4_BSU_ESCR0; } + fallthrough; + case X86_VENDOR_ZHAOXIN: + case X86_VENDOR_CENTAUR: + return msr - MSR_ARCH_PERFMON_EVENTSEL0; } return 0; -- cgit v1.2.3 From 4bd30106ddb26d2304adc5bb7bd269825300440d Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Wed, 8 Apr 2020 19:52:16 -0400 Subject: perf/x86/intel/pt: Drop pointless NULL assignment. Only a few lines below this removed line is this: attrs = kzalloc(size, GFP_KERNEL); and since there is no code path where this could be avoided, the NULL assignment is a pointless relic of history and can be removed. 
Signed-off-by: Paul Gortmaker Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200408235216.108980-1-paul.gortmaker@windriver.com --- arch/x86/events/intel/pt.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 1db7a51d9792..e94af4a54d0d 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -226,8 +226,6 @@ static int __init pt_pmu_hw_init(void) pt_pmu.vmx = true; } - attrs = NULL; - for (i = 0; i < PT_CPUID_LEAVES; i++) { cpuid_count(20, i, &pt_pmu.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM], -- cgit v1.2.3 From f649fc2eefdef7a67698a3c584222c5c8c5a6785 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 7 May 2020 06:14:18 -0700 Subject: perf/x86/rapl: Add Ice Lake RAPL support Enable RAPL support for Intel Ice Lake X and Ice Lake D. For RAPL support, it is identical to Sky Lake X. Reported-by: Stephane Eranian Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1588857258-38213-1-git-send-email-kan.liang@linux.intel.com --- arch/x86/events/intel/rapl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index a5dbd25852cb..9e1e141d22d3 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -738,6 +738,8 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS, &model_hsw), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &model_hsx), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &model_hsx), X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &model_skl), {}, -- cgit v1.2.3 From 0813c40556fce1eeefb996e020cc5339e0b84137 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 1 May 2020 05:54:42 -0700 Subject: perf/x86/intel: Add more available bits for OFFCORE_RESPONSE of Intel Tremont The mask in the extra_regs for Intel Tremont need to be extended to allow more defined bits. "Outstanding Requests" (bit 63) is only available on MSR_OFFCORE_RSP0; Fixes: 6daeb8737f8a ("perf/x86/intel: Add Tremont core PMU support") Reported-by: Stephane Eranian Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20200501125442.7030-1-kan.liang@linux.intel.com --- arch/x86/events/intel/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 332954cccece..ca35c8b5ee10 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -1892,8 +1892,8 @@ static __initconst const u64 tnt_hw_cache_extra_regs static struct extra_reg intel_tnt_extra_regs[] __read_mostly = { /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ - INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffffff9fffull, RSP_0), - INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0xffffff9fffull, RSP_1), + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x800ff0ffffff9fffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0xff0ffffff9fffull, RSP_1), EVENT_EXTRA_END }; -- cgit v1.2.3 From 8ac7571a8cd3c11da24c3c3555f6e40e33049609 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. 
Silva" Date: Mon, 11 May 2020 15:09:11 -0500 Subject: perf/x86: Replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200511200911.GA13149@embeddedor --- arch/x86/events/intel/bts.c | 2 +- arch/x86/events/intel/uncore.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index 6a3b599ee0fe..731dd8d0dbb1 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -58,7 +58,7 @@ struct bts_buffer { local_t head; unsigned long end; void **data_pages; - struct bts_phys buf[0]; + struct bts_phys buf[]; }; static struct pmu bts_pmu; diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 0da4a4605536..b469ddd45515 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -130,7 +130,7 @@ struct intel_uncore_box { struct list_head list; struct list_head active_list; void __iomem *io_addr; - struct intel_uncore_extra_reg shared_regs[0]; + struct intel_uncore_extra_reg shared_regs[]; }; /* CFL uncore 8th cbox MSRs */ -- cgit v1.2.3 From fd3ae1e1587d64ef8cc8e361903d33625458073e Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 27 May 2020 15:46:55 -0700 Subject: perf/x86/rapl: Move RAPL support to common x86 code To prepare for support of both Intel and AMD RAPL. As per the AMD PPR, Fam17h support Package RAPL counters to monitor power usage. The RAPL counter operates as with Intel RAPL, and as such it is beneficial to share the code. No change in functionality. 
Signed-off-by: Stephane Eranian Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200527224659.206129-2-eranian@google.com --- arch/x86/events/Kconfig | 6 +- arch/x86/events/Makefile | 1 + arch/x86/events/intel/Makefile | 2 - arch/x86/events/intel/rapl.c | 802 ---------------------------------------- arch/x86/events/rapl.c | 805 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 809 insertions(+), 807 deletions(-) delete mode 100644 arch/x86/events/intel/rapl.c create mode 100644 arch/x86/events/rapl.c (limited to 'arch/x86') diff --git a/arch/x86/events/Kconfig b/arch/x86/events/Kconfig index 9a7a1446cb3a..4a809c6cbd2f 100644 --- a/arch/x86/events/Kconfig +++ b/arch/x86/events/Kconfig @@ -10,11 +10,11 @@ config PERF_EVENTS_INTEL_UNCORE available on NehalemEX and more modern processors. config PERF_EVENTS_INTEL_RAPL - tristate "Intel rapl performance events" - depends on PERF_EVENTS && CPU_SUP_INTEL && PCI + tristate "Intel/AMD rapl performance events" + depends on PERF_EVENTS && (CPU_SUP_INTEL || CPU_SUP_AMD) && PCI default y ---help--- - Include support for Intel rapl performance events for power + Include support for Intel and AMD rapl performance events for power monitoring on modern processors. config PERF_EVENTS_INTEL_CSTATE diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index 6f1d1fde8b2d..12c42eba77ec 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only obj-y += core.o probe.o +obj-$(PERF_EVENTS_INTEL_RAPL) += rapl.o obj-y += amd/ obj-$(CONFIG_X86_LOCAL_APIC) += msr.o obj-$(CONFIG_CPU_SUP_INTEL) += intel/ diff --git a/arch/x86/events/intel/Makefile b/arch/x86/events/intel/Makefile index 3468b0c1dc7c..e67a5886336c 100644 --- a/arch/x86/events/intel/Makefile +++ b/arch/x86/events/intel/Makefile @@ -2,8 +2,6 @@ obj-$(CONFIG_CPU_SUP_INTEL) += core.o bts.o obj-$(CONFIG_CPU_SUP_INTEL) += ds.o knc.o obj-$(CONFIG_CPU_SUP_INTEL) += lbr.o p4.o p6.o pt.o -obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL) += intel-rapl-perf.o -intel-rapl-perf-objs := rapl.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel-uncore.o intel-uncore-objs := uncore.o uncore_nhmex.o uncore_snb.o uncore_snbep.o obj-$(CONFIG_PERF_EVENTS_INTEL_CSTATE) += intel-cstate.o diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c deleted file mode 100644 index 9e1e141d22d3..000000000000 --- a/arch/x86/events/intel/rapl.c +++ /dev/null @@ -1,802 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Support Intel RAPL energy consumption counters - * Copyright (C) 2013 Google, Inc., Stephane Eranian - * - * Intel RAPL interface is specified in the IA-32 Manual Vol3b - * section 14.7.1 (September 2013) - * - * RAPL provides more controls than just reporting energy consumption - * however here we only expose the 3 energy consumption free running - * counters (pp0, pkg, dram). - * - * Each of those counters increments in a power unit defined by the - * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules - * but it can vary. 
- * - * Counter to rapl events mappings: - * - * pp0 counter: consumption of all physical cores (power plane 0) - * event: rapl_energy_cores - * perf code: 0x1 - * - * pkg counter: consumption of the whole processor package - * event: rapl_energy_pkg - * perf code: 0x2 - * - * dram counter: consumption of the dram domain (servers only) - * event: rapl_energy_dram - * perf code: 0x3 - * - * gpu counter: consumption of the builtin-gpu domain (client only) - * event: rapl_energy_gpu - * perf code: 0x4 - * - * psys counter: consumption of the builtin-psys domain (client only) - * event: rapl_energy_psys - * perf code: 0x5 - * - * We manage those counters as free running (read-only). They may be - * use simultaneously by other tools, such as turbostat. - * - * The events only support system-wide mode counting. There is no - * sampling support because it does not make sense and is not - * supported by the RAPL hardware. - * - * Because we want to avoid floating-point operations in the kernel, - * the events are all reported in fixed point arithmetic (32.32). - * Tools must adjust the counts to convert them to Watts using - * the duration of the measurement. Tools may use a function such as - * ldexp(raw_count, -32); - */ - -#define pr_fmt(fmt) "RAPL PMU: " fmt - -#include -#include -#include -#include -#include -#include -#include "../perf_event.h" -#include "../probe.h" - -MODULE_LICENSE("GPL"); - -/* - * RAPL energy status counters - */ -enum perf_rapl_events { - PERF_RAPL_PP0 = 0, /* all cores */ - PERF_RAPL_PKG, /* entire package */ - PERF_RAPL_RAM, /* DRAM */ - PERF_RAPL_PP1, /* gpu */ - PERF_RAPL_PSYS, /* psys */ - - PERF_RAPL_MAX, - NR_RAPL_DOMAINS = PERF_RAPL_MAX, -}; - -static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { - "pp0-core", - "package", - "dram", - "pp1-gpu", - "psys", -}; - -/* - * event code: LSB 8 bits, passed in attr->config - * any other bit is reserved - */ -#define RAPL_EVENT_MASK 0xFFULL - -#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format) \ -static ssize_t __rapl_##_var##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, \ - char *page) \ -{ \ - BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ - return sprintf(page, _format "\n"); \ -} \ -static struct kobj_attribute format_attr_##_var = \ - __ATTR(_name, 0444, __rapl_##_var##_show, NULL) - -#define RAPL_CNTR_WIDTH 32 - -#define RAPL_EVENT_ATTR_STR(_name, v, str) \ -static struct perf_pmu_events_attr event_attr_##v = { \ - .attr = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \ - .id = 0, \ - .event_str = str, \ -}; - -struct rapl_pmu { - raw_spinlock_t lock; - int n_active; - int cpu; - struct list_head active_list; - struct pmu *pmu; - ktime_t timer_interval; - struct hrtimer hrtimer; -}; - -struct rapl_pmus { - struct pmu pmu; - unsigned int maxdie; - struct rapl_pmu *pmus[]; -}; - -struct rapl_model { - unsigned long events; - bool apply_quirk; -}; - - /* 1/2^hw_unit Joule */ -static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; -static struct rapl_pmus *rapl_pmus; -static cpumask_t rapl_cpu_mask; -static unsigned int rapl_cntr_mask; -static u64 rapl_timer_ms; -static struct perf_msr rapl_msrs[]; - -static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu) -{ - unsigned int dieid = topology_logical_die_id(cpu); - - /* - * The unsigned check also catches the '-1' return value for non - * existent mappings in the topology map. - */ - return dieid < rapl_pmus->maxdie ? 
rapl_pmus->pmus[dieid] : NULL; -} - -static inline u64 rapl_read_counter(struct perf_event *event) -{ - u64 raw; - rdmsrl(event->hw.event_base, raw); - return raw; -} - -static inline u64 rapl_scale(u64 v, int cfg) -{ - if (cfg > NR_RAPL_DOMAINS) { - pr_warn("Invalid domain %d, failed to scale data\n", cfg); - return v; - } - /* - * scale delta to smallest unit (1/2^32) - * users must then scale back: count * 1/(1e9*2^32) to get Joules - * or use ldexp(count, -32). - * Watts = Joules/Time delta - */ - return v << (32 - rapl_hw_unit[cfg - 1]); -} - -static u64 rapl_event_update(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - u64 prev_raw_count, new_raw_count; - s64 delta, sdelta; - int shift = RAPL_CNTR_WIDTH; - -again: - prev_raw_count = local64_read(&hwc->prev_count); - rdmsrl(event->hw.event_base, new_raw_count); - - if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, - new_raw_count) != prev_raw_count) { - cpu_relax(); - goto again; - } - - /* - * Now we have the new raw value and have updated the prev - * timestamp already. We can now calculate the elapsed delta - * (event-)time and add that to the generic event. - * - * Careful, not all hw sign-extends above the physical width - * of the count. - */ - delta = (new_raw_count << shift) - (prev_raw_count << shift); - delta >>= shift; - - sdelta = rapl_scale(delta, event->hw.config); - - local64_add(sdelta, &event->count); - - return new_raw_count; -} - -static void rapl_start_hrtimer(struct rapl_pmu *pmu) -{ - hrtimer_start(&pmu->hrtimer, pmu->timer_interval, - HRTIMER_MODE_REL_PINNED); -} - -static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer) -{ - struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer); - struct perf_event *event; - unsigned long flags; - - if (!pmu->n_active) - return HRTIMER_NORESTART; - - raw_spin_lock_irqsave(&pmu->lock, flags); - - list_for_each_entry(event, &pmu->active_list, active_entry) - rapl_event_update(event); - - raw_spin_unlock_irqrestore(&pmu->lock, flags); - - hrtimer_forward_now(hrtimer, pmu->timer_interval); - - return HRTIMER_RESTART; -} - -static void rapl_hrtimer_init(struct rapl_pmu *pmu) -{ - struct hrtimer *hr = &pmu->hrtimer; - - hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hr->function = rapl_hrtimer_handle; -} - -static void __rapl_pmu_event_start(struct rapl_pmu *pmu, - struct perf_event *event) -{ - if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) - return; - - event->hw.state = 0; - - list_add_tail(&event->active_entry, &pmu->active_list); - - local64_set(&event->hw.prev_count, rapl_read_counter(event)); - - pmu->n_active++; - if (pmu->n_active == 1) - rapl_start_hrtimer(pmu); -} - -static void rapl_pmu_event_start(struct perf_event *event, int mode) -{ - struct rapl_pmu *pmu = event->pmu_private; - unsigned long flags; - - raw_spin_lock_irqsave(&pmu->lock, flags); - __rapl_pmu_event_start(pmu, event); - raw_spin_unlock_irqrestore(&pmu->lock, flags); -} - -static void rapl_pmu_event_stop(struct perf_event *event, int mode) -{ - struct rapl_pmu *pmu = event->pmu_private; - struct hw_perf_event *hwc = &event->hw; - unsigned long flags; - - raw_spin_lock_irqsave(&pmu->lock, flags); - - /* mark event as deactivated and stopped */ - if (!(hwc->state & PERF_HES_STOPPED)) { - WARN_ON_ONCE(pmu->n_active <= 0); - pmu->n_active--; - if (pmu->n_active == 0) - hrtimer_cancel(&pmu->hrtimer); - - list_del(&event->active_entry); - - WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); - hwc->state |= PERF_HES_STOPPED; - } - - /* 
check if update of sw counter is necessary */ - if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { - /* - * Drain the remaining delta count out of a event - * that we are disabling: - */ - rapl_event_update(event); - hwc->state |= PERF_HES_UPTODATE; - } - - raw_spin_unlock_irqrestore(&pmu->lock, flags); -} - -static int rapl_pmu_event_add(struct perf_event *event, int mode) -{ - struct rapl_pmu *pmu = event->pmu_private; - struct hw_perf_event *hwc = &event->hw; - unsigned long flags; - - raw_spin_lock_irqsave(&pmu->lock, flags); - - hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; - - if (mode & PERF_EF_START) - __rapl_pmu_event_start(pmu, event); - - raw_spin_unlock_irqrestore(&pmu->lock, flags); - - return 0; -} - -static void rapl_pmu_event_del(struct perf_event *event, int flags) -{ - rapl_pmu_event_stop(event, PERF_EF_UPDATE); -} - -static int rapl_pmu_event_init(struct perf_event *event) -{ - u64 cfg = event->attr.config & RAPL_EVENT_MASK; - int bit, ret = 0; - struct rapl_pmu *pmu; - - /* only look at RAPL events */ - if (event->attr.type != rapl_pmus->pmu.type) - return -ENOENT; - - /* check only supported bits are set */ - if (event->attr.config & ~RAPL_EVENT_MASK) - return -EINVAL; - - if (event->cpu < 0) - return -EINVAL; - - event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG; - - if (!cfg || cfg >= NR_RAPL_DOMAINS + 1) - return -EINVAL; - - cfg = array_index_nospec((long)cfg, NR_RAPL_DOMAINS + 1); - bit = cfg - 1; - - /* check event supported */ - if (!(rapl_cntr_mask & (1 << bit))) - return -EINVAL; - - /* unsupported modes and filters */ - if (event->attr.sample_period) /* no sampling */ - return -EINVAL; - - /* must be done before validate_group */ - pmu = cpu_to_rapl_pmu(event->cpu); - if (!pmu) - return -EINVAL; - event->cpu = pmu->cpu; - event->pmu_private = pmu; - event->hw.event_base = rapl_msrs[bit].msr; - event->hw.config = cfg; - event->hw.idx = bit; - - return ret; -} - -static void rapl_pmu_event_read(struct perf_event *event) -{ - rapl_event_update(event); -} - -static ssize_t rapl_get_attr_cpumask(struct device *dev, - struct device_attribute *attr, char *buf) -{ - return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask); -} - -static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL); - -static struct attribute *rapl_pmu_attrs[] = { - &dev_attr_cpumask.attr, - NULL, -}; - -static struct attribute_group rapl_pmu_attr_group = { - .attrs = rapl_pmu_attrs, -}; - -RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01"); -RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02"); -RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03"); -RAPL_EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04"); -RAPL_EVENT_ATTR_STR(energy-psys, rapl_psys, "event=0x05"); - -RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules"); -RAPL_EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules"); -RAPL_EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules"); -RAPL_EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules"); -RAPL_EVENT_ATTR_STR(energy-psys.unit, rapl_psys_unit, "Joules"); - -/* - * we compute in 0.23 nJ increments regardless of MSR - */ -RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10"); -RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10"); -RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10"); -RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10"); -RAPL_EVENT_ATTR_STR(energy-psys.scale, 
rapl_psys_scale, "2.3283064365386962890625e-10"); - -/* - * There are no default events, but we need to create - * "events" group (with empty attrs) before updating - * it with detected events. - */ -static struct attribute *attrs_empty[] = { - NULL, -}; - -static struct attribute_group rapl_pmu_events_group = { - .name = "events", - .attrs = attrs_empty, -}; - -DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7"); -static struct attribute *rapl_formats_attr[] = { - &format_attr_event.attr, - NULL, -}; - -static struct attribute_group rapl_pmu_format_group = { - .name = "format", - .attrs = rapl_formats_attr, -}; - -static const struct attribute_group *rapl_attr_groups[] = { - &rapl_pmu_attr_group, - &rapl_pmu_format_group, - &rapl_pmu_events_group, - NULL, -}; - -static struct attribute *rapl_events_cores[] = { - EVENT_PTR(rapl_cores), - EVENT_PTR(rapl_cores_unit), - EVENT_PTR(rapl_cores_scale), - NULL, -}; - -static struct attribute_group rapl_events_cores_group = { - .name = "events", - .attrs = rapl_events_cores, -}; - -static struct attribute *rapl_events_pkg[] = { - EVENT_PTR(rapl_pkg), - EVENT_PTR(rapl_pkg_unit), - EVENT_PTR(rapl_pkg_scale), - NULL, -}; - -static struct attribute_group rapl_events_pkg_group = { - .name = "events", - .attrs = rapl_events_pkg, -}; - -static struct attribute *rapl_events_ram[] = { - EVENT_PTR(rapl_ram), - EVENT_PTR(rapl_ram_unit), - EVENT_PTR(rapl_ram_scale), - NULL, -}; - -static struct attribute_group rapl_events_ram_group = { - .name = "events", - .attrs = rapl_events_ram, -}; - -static struct attribute *rapl_events_gpu[] = { - EVENT_PTR(rapl_gpu), - EVENT_PTR(rapl_gpu_unit), - EVENT_PTR(rapl_gpu_scale), - NULL, -}; - -static struct attribute_group rapl_events_gpu_group = { - .name = "events", - .attrs = rapl_events_gpu, -}; - -static struct attribute *rapl_events_psys[] = { - EVENT_PTR(rapl_psys), - EVENT_PTR(rapl_psys_unit), - EVENT_PTR(rapl_psys_scale), - NULL, -}; - -static struct attribute_group rapl_events_psys_group = { - .name = "events", - .attrs = rapl_events_psys, -}; - -static bool test_msr(int idx, void *data) -{ - return test_bit(idx, (unsigned long *) data); -} - -static struct perf_msr rapl_msrs[] = { - [PERF_RAPL_PP0] = { MSR_PP0_ENERGY_STATUS, &rapl_events_cores_group, test_msr }, - [PERF_RAPL_PKG] = { MSR_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr }, - [PERF_RAPL_RAM] = { MSR_DRAM_ENERGY_STATUS, &rapl_events_ram_group, test_msr }, - [PERF_RAPL_PP1] = { MSR_PP1_ENERGY_STATUS, &rapl_events_gpu_group, test_msr }, - [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group, test_msr }, -}; - -static int rapl_cpu_offline(unsigned int cpu) -{ - struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); - int target; - - /* Check if exiting cpu is used for collecting rapl events */ - if (!cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask)) - return 0; - - pmu->cpu = -1; - /* Find a new cpu to collect rapl events */ - target = cpumask_any_but(topology_die_cpumask(cpu), cpu); - - /* Migrate rapl events to the new target */ - if (target < nr_cpu_ids) { - cpumask_set_cpu(target, &rapl_cpu_mask); - pmu->cpu = target; - perf_pmu_migrate_context(pmu->pmu, cpu, target); - } - return 0; -} - -static int rapl_cpu_online(unsigned int cpu) -{ - struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); - int target; - - if (!pmu) { - pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); - if (!pmu) - return -ENOMEM; - - raw_spin_lock_init(&pmu->lock); - INIT_LIST_HEAD(&pmu->active_list); - pmu->pmu = &rapl_pmus->pmu; - pmu->timer_interval = 
ms_to_ktime(rapl_timer_ms); - rapl_hrtimer_init(pmu); - - rapl_pmus->pmus[topology_logical_die_id(cpu)] = pmu; - } - - /* - * Check if there is an online cpu in the package which collects rapl - * events already. - */ - target = cpumask_any_and(&rapl_cpu_mask, topology_die_cpumask(cpu)); - if (target < nr_cpu_ids) - return 0; - - cpumask_set_cpu(cpu, &rapl_cpu_mask); - pmu->cpu = cpu; - return 0; -} - -static int rapl_check_hw_unit(bool apply_quirk) -{ - u64 msr_rapl_power_unit_bits; - int i; - - /* protect rdmsrl() to handle virtualization */ - if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits)) - return -1; - for (i = 0; i < NR_RAPL_DOMAINS; i++) - rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; - - /* - * DRAM domain on HSW server and KNL has fixed energy unit which can be - * different than the unit from power unit MSR. See - * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2 - * of 2. Datasheet, September 2014, Reference Number: 330784-001 " - */ - if (apply_quirk) - rapl_hw_unit[PERF_RAPL_RAM] = 16; - - /* - * Calculate the timer rate: - * Use reference of 200W for scaling the timeout to avoid counter - * overflows. 200W = 200 Joules/sec - * Divide interval by 2 to avoid lockstep (2 * 100) - * if hw unit is 32, then we use 2 ms 1/200/2 - */ - rapl_timer_ms = 2; - if (rapl_hw_unit[0] < 32) { - rapl_timer_ms = (1000 / (2 * 100)); - rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1)); - } - return 0; -} - -static void __init rapl_advertise(void) -{ - int i; - - pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n", - hweight32(rapl_cntr_mask), rapl_timer_ms); - - for (i = 0; i < NR_RAPL_DOMAINS; i++) { - if (rapl_cntr_mask & (1 << i)) { - pr_info("hw unit of domain %s 2^-%d Joules\n", - rapl_domain_names[i], rapl_hw_unit[i]); - } - } -} - -static void cleanup_rapl_pmus(void) -{ - int i; - - for (i = 0; i < rapl_pmus->maxdie; i++) - kfree(rapl_pmus->pmus[i]); - kfree(rapl_pmus); -} - -static const struct attribute_group *rapl_attr_update[] = { - &rapl_events_cores_group, - &rapl_events_pkg_group, - &rapl_events_ram_group, - &rapl_events_gpu_group, - &rapl_events_gpu_group, - NULL, -}; - -static int __init init_rapl_pmus(void) -{ - int maxdie = topology_max_packages() * topology_max_die_per_package(); - size_t size; - - size = sizeof(*rapl_pmus) + maxdie * sizeof(struct rapl_pmu *); - rapl_pmus = kzalloc(size, GFP_KERNEL); - if (!rapl_pmus) - return -ENOMEM; - - rapl_pmus->maxdie = maxdie; - rapl_pmus->pmu.attr_groups = rapl_attr_groups; - rapl_pmus->pmu.attr_update = rapl_attr_update; - rapl_pmus->pmu.task_ctx_nr = perf_invalid_context; - rapl_pmus->pmu.event_init = rapl_pmu_event_init; - rapl_pmus->pmu.add = rapl_pmu_event_add; - rapl_pmus->pmu.del = rapl_pmu_event_del; - rapl_pmus->pmu.start = rapl_pmu_event_start; - rapl_pmus->pmu.stop = rapl_pmu_event_stop; - rapl_pmus->pmu.read = rapl_pmu_event_read; - rapl_pmus->pmu.module = THIS_MODULE; - rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE; - return 0; -} - -static struct rapl_model model_snb = { - .events = BIT(PERF_RAPL_PP0) | - BIT(PERF_RAPL_PKG) | - BIT(PERF_RAPL_PP1), - .apply_quirk = false, -}; - -static struct rapl_model model_snbep = { - .events = BIT(PERF_RAPL_PP0) | - BIT(PERF_RAPL_PKG) | - BIT(PERF_RAPL_RAM), - .apply_quirk = false, -}; - -static struct rapl_model model_hsw = { - .events = BIT(PERF_RAPL_PP0) | - BIT(PERF_RAPL_PKG) | - BIT(PERF_RAPL_RAM) | - BIT(PERF_RAPL_PP1), - .apply_quirk = false, -}; - -static struct rapl_model model_hsx = { - 
.events = BIT(PERF_RAPL_PP0) | - BIT(PERF_RAPL_PKG) | - BIT(PERF_RAPL_RAM), - .apply_quirk = true, -}; - -static struct rapl_model model_knl = { - .events = BIT(PERF_RAPL_PKG) | - BIT(PERF_RAPL_RAM), - .apply_quirk = true, -}; - -static struct rapl_model model_skl = { - .events = BIT(PERF_RAPL_PP0) | - BIT(PERF_RAPL_PKG) | - BIT(PERF_RAPL_RAM) | - BIT(PERF_RAPL_PP1) | - BIT(PERF_RAPL_PSYS), - .apply_quirk = false, -}; - -static const struct x86_cpu_id rapl_model_match[] __initconst = { - X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE, &model_snb), - X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X, &model_snbep), - X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE, &model_snb), - X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X, &model_snbep), - X86_MATCH_INTEL_FAM6_MODEL(HASWELL, &model_hsw), - X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, &model_hsx), - X86_MATCH_INTEL_FAM6_MODEL(HASWELL_L, &model_hsw), - X86_MATCH_INTEL_FAM6_MODEL(HASWELL_G, &model_hsw), - X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, &model_hsw), - X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_G, &model_hsw), - X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, &model_hsx), - X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D, &model_hsx), - X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &model_knl), - X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &model_knl), - X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_L, &model_skl), - X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE, &model_skl), - X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &model_hsx), - X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L, &model_skl), - X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE, &model_skl), - X86_MATCH_INTEL_FAM6_MODEL(CANNONLAKE_L, &model_skl), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT, &model_hsw), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_D, &model_hsw), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS, &model_hsw), - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &model_skl), - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &model_skl), - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &model_hsx), - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &model_hsx), - X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &model_skl), - X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &model_skl), - {}, -}; -MODULE_DEVICE_TABLE(x86cpu, rapl_model_match); - -static int __init rapl_pmu_init(void) -{ - const struct x86_cpu_id *id; - struct rapl_model *rm; - int ret; - - id = x86_match_cpu(rapl_model_match); - if (!id) - return -ENODEV; - - rm = (struct rapl_model *) id->driver_data; - rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX, - false, (void *) &rm->events); - - ret = rapl_check_hw_unit(rm->apply_quirk); - if (ret) - return ret; - - ret = init_rapl_pmus(); - if (ret) - return ret; - - /* - * Install callbacks. Core will call them for each online cpu. 
- */ - ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_RAPL_ONLINE, - "perf/x86/rapl:online", - rapl_cpu_online, rapl_cpu_offline); - if (ret) - goto out; - - ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1); - if (ret) - goto out1; - - rapl_advertise(); - return 0; - -out1: - cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE); -out: - pr_warn("Initialization failed (%d), disabled\n", ret); - cleanup_rapl_pmus(); - return ret; -} -module_init(rapl_pmu_init); - -static void __exit intel_rapl_exit(void) -{ - cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE); - perf_pmu_unregister(&rapl_pmus->pmu); - cleanup_rapl_pmus(); -} -module_exit(intel_rapl_exit); diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c new file mode 100644 index 000000000000..3e6c01b8e996 --- /dev/null +++ b/arch/x86/events/rapl.c @@ -0,0 +1,805 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Support Intel/AMD RAPL energy consumption counters + * Copyright (C) 2013 Google, Inc., Stephane Eranian + * + * Intel RAPL interface is specified in the IA-32 Manual Vol3b + * section 14.7.1 (September 2013) + * + * AMD RAPL interface for Fam17h is described in the public PPR: + * https://bugzilla.kernel.org/show_bug.cgi?id=206537 + * + * RAPL provides more controls than just reporting energy consumption + * however here we only expose the 3 energy consumption free running + * counters (pp0, pkg, dram). + * + * Each of those counters increments in a power unit defined by the + * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules + * but it can vary. + * + * Counter to rapl events mappings: + * + * pp0 counter: consumption of all physical cores (power plane 0) + * event: rapl_energy_cores + * perf code: 0x1 + * + * pkg counter: consumption of the whole processor package + * event: rapl_energy_pkg + * perf code: 0x2 + * + * dram counter: consumption of the dram domain (servers only) + * event: rapl_energy_dram + * perf code: 0x3 + * + * gpu counter: consumption of the builtin-gpu domain (client only) + * event: rapl_energy_gpu + * perf code: 0x4 + * + * psys counter: consumption of the builtin-psys domain (client only) + * event: rapl_energy_psys + * perf code: 0x5 + * + * We manage those counters as free running (read-only). They may be + * use simultaneously by other tools, such as turbostat. + * + * The events only support system-wide mode counting. There is no + * sampling support because it does not make sense and is not + * supported by the RAPL hardware. + * + * Because we want to avoid floating-point operations in the kernel, + * the events are all reported in fixed point arithmetic (32.32). + * Tools must adjust the counts to convert them to Watts using + * the duration of the measurement. 
Tools may use a function such as + * ldexp(raw_count, -32); + */ + +#define pr_fmt(fmt) "RAPL PMU: " fmt + +#include +#include +#include +#include +#include +#include +#include "perf_event.h" +#include "probe.h" + +MODULE_LICENSE("GPL"); + +/* + * RAPL energy status counters + */ +enum perf_rapl_events { + PERF_RAPL_PP0 = 0, /* all cores */ + PERF_RAPL_PKG, /* entire package */ + PERF_RAPL_RAM, /* DRAM */ + PERF_RAPL_PP1, /* gpu */ + PERF_RAPL_PSYS, /* psys */ + + PERF_RAPL_MAX, + NR_RAPL_DOMAINS = PERF_RAPL_MAX, +}; + +static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { + "pp0-core", + "package", + "dram", + "pp1-gpu", + "psys", +}; + +/* + * event code: LSB 8 bits, passed in attr->config + * any other bit is reserved + */ +#define RAPL_EVENT_MASK 0xFFULL + +#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format) \ +static ssize_t __rapl_##_var##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + char *page) \ +{ \ + BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ + return sprintf(page, _format "\n"); \ +} \ +static struct kobj_attribute format_attr_##_var = \ + __ATTR(_name, 0444, __rapl_##_var##_show, NULL) + +#define RAPL_CNTR_WIDTH 32 + +#define RAPL_EVENT_ATTR_STR(_name, v, str) \ +static struct perf_pmu_events_attr event_attr_##v = { \ + .attr = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \ + .id = 0, \ + .event_str = str, \ +}; + +struct rapl_pmu { + raw_spinlock_t lock; + int n_active; + int cpu; + struct list_head active_list; + struct pmu *pmu; + ktime_t timer_interval; + struct hrtimer hrtimer; +}; + +struct rapl_pmus { + struct pmu pmu; + unsigned int maxdie; + struct rapl_pmu *pmus[]; +}; + +struct rapl_model { + unsigned long events; + bool apply_quirk; +}; + + /* 1/2^hw_unit Joule */ +static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; +static struct rapl_pmus *rapl_pmus; +static cpumask_t rapl_cpu_mask; +static unsigned int rapl_cntr_mask; +static u64 rapl_timer_ms; +static struct perf_msr rapl_msrs[]; + +static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu) +{ + unsigned int dieid = topology_logical_die_id(cpu); + + /* + * The unsigned check also catches the '-1' return value for non + * existent mappings in the topology map. + */ + return dieid < rapl_pmus->maxdie ? rapl_pmus->pmus[dieid] : NULL; +} + +static inline u64 rapl_read_counter(struct perf_event *event) +{ + u64 raw; + rdmsrl(event->hw.event_base, raw); + return raw; +} + +static inline u64 rapl_scale(u64 v, int cfg) +{ + if (cfg > NR_RAPL_DOMAINS) { + pr_warn("Invalid domain %d, failed to scale data\n", cfg); + return v; + } + /* + * scale delta to smallest unit (1/2^32) + * users must then scale back: count * 1/(1e9*2^32) to get Joules + * or use ldexp(count, -32). + * Watts = Joules/Time delta + */ + return v << (32 - rapl_hw_unit[cfg - 1]); +} + +static u64 rapl_event_update(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 prev_raw_count, new_raw_count; + s64 delta, sdelta; + int shift = RAPL_CNTR_WIDTH; + +again: + prev_raw_count = local64_read(&hwc->prev_count); + rdmsrl(event->hw.event_base, new_raw_count); + + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) { + cpu_relax(); + goto again; + } + + /* + * Now we have the new raw value and have updated the prev + * timestamp already. We can now calculate the elapsed delta + * (event-)time and add that to the generic event. + * + * Careful, not all hw sign-extends above the physical width + * of the count. 
+ */ + delta = (new_raw_count << shift) - (prev_raw_count << shift); + delta >>= shift; + + sdelta = rapl_scale(delta, event->hw.config); + + local64_add(sdelta, &event->count); + + return new_raw_count; +} + +static void rapl_start_hrtimer(struct rapl_pmu *pmu) +{ + hrtimer_start(&pmu->hrtimer, pmu->timer_interval, + HRTIMER_MODE_REL_PINNED); +} + +static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer) +{ + struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer); + struct perf_event *event; + unsigned long flags; + + if (!pmu->n_active) + return HRTIMER_NORESTART; + + raw_spin_lock_irqsave(&pmu->lock, flags); + + list_for_each_entry(event, &pmu->active_list, active_entry) + rapl_event_update(event); + + raw_spin_unlock_irqrestore(&pmu->lock, flags); + + hrtimer_forward_now(hrtimer, pmu->timer_interval); + + return HRTIMER_RESTART; +} + +static void rapl_hrtimer_init(struct rapl_pmu *pmu) +{ + struct hrtimer *hr = &pmu->hrtimer; + + hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hr->function = rapl_hrtimer_handle; +} + +static void __rapl_pmu_event_start(struct rapl_pmu *pmu, + struct perf_event *event) +{ + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + + event->hw.state = 0; + + list_add_tail(&event->active_entry, &pmu->active_list); + + local64_set(&event->hw.prev_count, rapl_read_counter(event)); + + pmu->n_active++; + if (pmu->n_active == 1) + rapl_start_hrtimer(pmu); +} + +static void rapl_pmu_event_start(struct perf_event *event, int mode) +{ + struct rapl_pmu *pmu = event->pmu_private; + unsigned long flags; + + raw_spin_lock_irqsave(&pmu->lock, flags); + __rapl_pmu_event_start(pmu, event); + raw_spin_unlock_irqrestore(&pmu->lock, flags); +} + +static void rapl_pmu_event_stop(struct perf_event *event, int mode) +{ + struct rapl_pmu *pmu = event->pmu_private; + struct hw_perf_event *hwc = &event->hw; + unsigned long flags; + + raw_spin_lock_irqsave(&pmu->lock, flags); + + /* mark event as deactivated and stopped */ + if (!(hwc->state & PERF_HES_STOPPED)) { + WARN_ON_ONCE(pmu->n_active <= 0); + pmu->n_active--; + if (pmu->n_active == 0) + hrtimer_cancel(&pmu->hrtimer); + + list_del(&event->active_entry); + + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + } + + /* check if update of sw counter is necessary */ + if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + /* + * Drain the remaining delta count out of a event + * that we are disabling: + */ + rapl_event_update(event); + hwc->state |= PERF_HES_UPTODATE; + } + + raw_spin_unlock_irqrestore(&pmu->lock, flags); +} + +static int rapl_pmu_event_add(struct perf_event *event, int mode) +{ + struct rapl_pmu *pmu = event->pmu_private; + struct hw_perf_event *hwc = &event->hw; + unsigned long flags; + + raw_spin_lock_irqsave(&pmu->lock, flags); + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + if (mode & PERF_EF_START) + __rapl_pmu_event_start(pmu, event); + + raw_spin_unlock_irqrestore(&pmu->lock, flags); + + return 0; +} + +static void rapl_pmu_event_del(struct perf_event *event, int flags) +{ + rapl_pmu_event_stop(event, PERF_EF_UPDATE); +} + +static int rapl_pmu_event_init(struct perf_event *event) +{ + u64 cfg = event->attr.config & RAPL_EVENT_MASK; + int bit, ret = 0; + struct rapl_pmu *pmu; + + /* only look at RAPL events */ + if (event->attr.type != rapl_pmus->pmu.type) + return -ENOENT; + + /* check only supported bits are set */ + if (event->attr.config & ~RAPL_EVENT_MASK) + return -EINVAL; + + if (event->cpu 
< 0) + return -EINVAL; + + event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG; + + if (!cfg || cfg >= NR_RAPL_DOMAINS + 1) + return -EINVAL; + + cfg = array_index_nospec((long)cfg, NR_RAPL_DOMAINS + 1); + bit = cfg - 1; + + /* check event supported */ + if (!(rapl_cntr_mask & (1 << bit))) + return -EINVAL; + + /* unsupported modes and filters */ + if (event->attr.sample_period) /* no sampling */ + return -EINVAL; + + /* must be done before validate_group */ + pmu = cpu_to_rapl_pmu(event->cpu); + if (!pmu) + return -EINVAL; + event->cpu = pmu->cpu; + event->pmu_private = pmu; + event->hw.event_base = rapl_msrs[bit].msr; + event->hw.config = cfg; + event->hw.idx = bit; + + return ret; +} + +static void rapl_pmu_event_read(struct perf_event *event) +{ + rapl_event_update(event); +} + +static ssize_t rapl_get_attr_cpumask(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask); +} + +static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL); + +static struct attribute *rapl_pmu_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group rapl_pmu_attr_group = { + .attrs = rapl_pmu_attrs, +}; + +RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01"); +RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02"); +RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03"); +RAPL_EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04"); +RAPL_EVENT_ATTR_STR(energy-psys, rapl_psys, "event=0x05"); + +RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules"); +RAPL_EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules"); +RAPL_EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules"); +RAPL_EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules"); +RAPL_EVENT_ATTR_STR(energy-psys.unit, rapl_psys_unit, "Joules"); + +/* + * we compute in 0.23 nJ increments regardless of MSR + */ +RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10"); +RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10"); +RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10"); +RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10"); +RAPL_EVENT_ATTR_STR(energy-psys.scale, rapl_psys_scale, "2.3283064365386962890625e-10"); + +/* + * There are no default events, but we need to create + * "events" group (with empty attrs) before updating + * it with detected events. 
+ */ +static struct attribute *attrs_empty[] = { + NULL, +}; + +static struct attribute_group rapl_pmu_events_group = { + .name = "events", + .attrs = attrs_empty, +}; + +DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7"); +static struct attribute *rapl_formats_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group rapl_pmu_format_group = { + .name = "format", + .attrs = rapl_formats_attr, +}; + +static const struct attribute_group *rapl_attr_groups[] = { + &rapl_pmu_attr_group, + &rapl_pmu_format_group, + &rapl_pmu_events_group, + NULL, +}; + +static struct attribute *rapl_events_cores[] = { + EVENT_PTR(rapl_cores), + EVENT_PTR(rapl_cores_unit), + EVENT_PTR(rapl_cores_scale), + NULL, +}; + +static struct attribute_group rapl_events_cores_group = { + .name = "events", + .attrs = rapl_events_cores, +}; + +static struct attribute *rapl_events_pkg[] = { + EVENT_PTR(rapl_pkg), + EVENT_PTR(rapl_pkg_unit), + EVENT_PTR(rapl_pkg_scale), + NULL, +}; + +static struct attribute_group rapl_events_pkg_group = { + .name = "events", + .attrs = rapl_events_pkg, +}; + +static struct attribute *rapl_events_ram[] = { + EVENT_PTR(rapl_ram), + EVENT_PTR(rapl_ram_unit), + EVENT_PTR(rapl_ram_scale), + NULL, +}; + +static struct attribute_group rapl_events_ram_group = { + .name = "events", + .attrs = rapl_events_ram, +}; + +static struct attribute *rapl_events_gpu[] = { + EVENT_PTR(rapl_gpu), + EVENT_PTR(rapl_gpu_unit), + EVENT_PTR(rapl_gpu_scale), + NULL, +}; + +static struct attribute_group rapl_events_gpu_group = { + .name = "events", + .attrs = rapl_events_gpu, +}; + +static struct attribute *rapl_events_psys[] = { + EVENT_PTR(rapl_psys), + EVENT_PTR(rapl_psys_unit), + EVENT_PTR(rapl_psys_scale), + NULL, +}; + +static struct attribute_group rapl_events_psys_group = { + .name = "events", + .attrs = rapl_events_psys, +}; + +static bool test_msr(int idx, void *data) +{ + return test_bit(idx, (unsigned long *) data); +} + +static struct perf_msr rapl_msrs[] = { + [PERF_RAPL_PP0] = { MSR_PP0_ENERGY_STATUS, &rapl_events_cores_group, test_msr }, + [PERF_RAPL_PKG] = { MSR_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr }, + [PERF_RAPL_RAM] = { MSR_DRAM_ENERGY_STATUS, &rapl_events_ram_group, test_msr }, + [PERF_RAPL_PP1] = { MSR_PP1_ENERGY_STATUS, &rapl_events_gpu_group, test_msr }, + [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group, test_msr }, +}; + +static int rapl_cpu_offline(unsigned int cpu) +{ + struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); + int target; + + /* Check if exiting cpu is used for collecting rapl events */ + if (!cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask)) + return 0; + + pmu->cpu = -1; + /* Find a new cpu to collect rapl events */ + target = cpumask_any_but(topology_die_cpumask(cpu), cpu); + + /* Migrate rapl events to the new target */ + if (target < nr_cpu_ids) { + cpumask_set_cpu(target, &rapl_cpu_mask); + pmu->cpu = target; + perf_pmu_migrate_context(pmu->pmu, cpu, target); + } + return 0; +} + +static int rapl_cpu_online(unsigned int cpu) +{ + struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); + int target; + + if (!pmu) { + pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); + if (!pmu) + return -ENOMEM; + + raw_spin_lock_init(&pmu->lock); + INIT_LIST_HEAD(&pmu->active_list); + pmu->pmu = &rapl_pmus->pmu; + pmu->timer_interval = ms_to_ktime(rapl_timer_ms); + rapl_hrtimer_init(pmu); + + rapl_pmus->pmus[topology_logical_die_id(cpu)] = pmu; + } + + /* + * Check if there is an online cpu in the package which collects rapl + * 
events already. + */ + target = cpumask_any_and(&rapl_cpu_mask, topology_die_cpumask(cpu)); + if (target < nr_cpu_ids) + return 0; + + cpumask_set_cpu(cpu, &rapl_cpu_mask); + pmu->cpu = cpu; + return 0; +} + +static int rapl_check_hw_unit(bool apply_quirk) +{ + u64 msr_rapl_power_unit_bits; + int i; + + /* protect rdmsrl() to handle virtualization */ + if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits)) + return -1; + for (i = 0; i < NR_RAPL_DOMAINS; i++) + rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; + + /* + * DRAM domain on HSW server and KNL has fixed energy unit which can be + * different than the unit from power unit MSR. See + * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2 + * of 2. Datasheet, September 2014, Reference Number: 330784-001 " + */ + if (apply_quirk) + rapl_hw_unit[PERF_RAPL_RAM] = 16; + + /* + * Calculate the timer rate: + * Use reference of 200W for scaling the timeout to avoid counter + * overflows. 200W = 200 Joules/sec + * Divide interval by 2 to avoid lockstep (2 * 100) + * if hw unit is 32, then we use 2 ms 1/200/2 + */ + rapl_timer_ms = 2; + if (rapl_hw_unit[0] < 32) { + rapl_timer_ms = (1000 / (2 * 100)); + rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1)); + } + return 0; +} + +static void __init rapl_advertise(void) +{ + int i; + + pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n", + hweight32(rapl_cntr_mask), rapl_timer_ms); + + for (i = 0; i < NR_RAPL_DOMAINS; i++) { + if (rapl_cntr_mask & (1 << i)) { + pr_info("hw unit of domain %s 2^-%d Joules\n", + rapl_domain_names[i], rapl_hw_unit[i]); + } + } +} + +static void cleanup_rapl_pmus(void) +{ + int i; + + for (i = 0; i < rapl_pmus->maxdie; i++) + kfree(rapl_pmus->pmus[i]); + kfree(rapl_pmus); +} + +static const struct attribute_group *rapl_attr_update[] = { + &rapl_events_cores_group, + &rapl_events_pkg_group, + &rapl_events_ram_group, + &rapl_events_gpu_group, + &rapl_events_gpu_group, + NULL, +}; + +static int __init init_rapl_pmus(void) +{ + int maxdie = topology_max_packages() * topology_max_die_per_package(); + size_t size; + + size = sizeof(*rapl_pmus) + maxdie * sizeof(struct rapl_pmu *); + rapl_pmus = kzalloc(size, GFP_KERNEL); + if (!rapl_pmus) + return -ENOMEM; + + rapl_pmus->maxdie = maxdie; + rapl_pmus->pmu.attr_groups = rapl_attr_groups; + rapl_pmus->pmu.attr_update = rapl_attr_update; + rapl_pmus->pmu.task_ctx_nr = perf_invalid_context; + rapl_pmus->pmu.event_init = rapl_pmu_event_init; + rapl_pmus->pmu.add = rapl_pmu_event_add; + rapl_pmus->pmu.del = rapl_pmu_event_del; + rapl_pmus->pmu.start = rapl_pmu_event_start; + rapl_pmus->pmu.stop = rapl_pmu_event_stop; + rapl_pmus->pmu.read = rapl_pmu_event_read; + rapl_pmus->pmu.module = THIS_MODULE; + rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE; + return 0; +} + +static struct rapl_model model_snb = { + .events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_PP1), + .apply_quirk = false, +}; + +static struct rapl_model model_snbep = { + .events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM), + .apply_quirk = false, +}; + +static struct rapl_model model_hsw = { + .events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM) | + BIT(PERF_RAPL_PP1), + .apply_quirk = false, +}; + +static struct rapl_model model_hsx = { + .events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM), + .apply_quirk = true, +}; + +static struct rapl_model model_knl = { + .events = BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM), + 
.apply_quirk = true, +}; + +static struct rapl_model model_skl = { + .events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM) | + BIT(PERF_RAPL_PP1) | + BIT(PERF_RAPL_PSYS), + .apply_quirk = false, +}; + +static const struct x86_cpu_id rapl_model_match[] __initconst = { + X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE, &model_snb), + X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X, &model_snbep), + X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE, &model_snb), + X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X, &model_snbep), + X86_MATCH_INTEL_FAM6_MODEL(HASWELL, &model_hsw), + X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, &model_hsx), + X86_MATCH_INTEL_FAM6_MODEL(HASWELL_L, &model_hsw), + X86_MATCH_INTEL_FAM6_MODEL(HASWELL_G, &model_hsw), + X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, &model_hsw), + X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_G, &model_hsw), + X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, &model_hsx), + X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D, &model_hsx), + X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &model_knl), + X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &model_knl), + X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_L, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &model_hsx), + X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(CANNONLAKE_L, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT, &model_hsw), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_D, &model_hsw), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS, &model_hsw), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &model_hsx), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &model_hsx), + X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &model_skl), + {}, +}; +MODULE_DEVICE_TABLE(x86cpu, rapl_model_match); + +static int __init rapl_pmu_init(void) +{ + const struct x86_cpu_id *id; + struct rapl_model *rm; + int ret; + + id = x86_match_cpu(rapl_model_match); + if (!id) + return -ENODEV; + + rm = (struct rapl_model *) id->driver_data; + rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX, + false, (void *) &rm->events); + + ret = rapl_check_hw_unit(rm->apply_quirk); + if (ret) + return ret; + + ret = init_rapl_pmus(); + if (ret) + return ret; + + /* + * Install callbacks. Core will call them for each online cpu. + */ + ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_RAPL_ONLINE, + "perf/x86/rapl:online", + rapl_cpu_online, rapl_cpu_offline); + if (ret) + goto out; + + ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1); + if (ret) + goto out1; + + rapl_advertise(); + return 0; + +out1: + cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE); +out: + pr_warn("Initialization failed (%d), disabled\n", ret); + cleanup_rapl_pmus(); + return ret; +} +module_init(rapl_pmu_init); + +static void __exit intel_rapl_exit(void) +{ + cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE); + perf_pmu_unregister(&rapl_pmus->pmu); + cleanup_rapl_pmus(); +} +module_exit(intel_rapl_exit); -- cgit v1.2.3 From 5c95c68949880035b68e5c48fdf4899ec0989631 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 27 May 2020 15:46:56 -0700 Subject: perf/x86/rapl: Refactor to share the RAPL code between Intel and AMD CPUs This patch modifies the rapl_model struct to include architecture specific knowledge in this previously Intel specific structure, and in particular it adds the MSR for POWER_UNIT and the rapl_msrs array. 
No functional changes. Signed-off-by: Stephane Eranian Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200527224659.206129-3-eranian@google.com --- arch/x86/events/rapl.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index 3e6c01b8e996..f29935e6a9df 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -131,7 +131,9 @@ struct rapl_pmus { }; struct rapl_model { + struct perf_msr *rapl_msrs; unsigned long events; + unsigned int msr_power_unit; bool apply_quirk; }; @@ -141,7 +143,7 @@ static struct rapl_pmus *rapl_pmus; static cpumask_t rapl_cpu_mask; static unsigned int rapl_cntr_mask; static u64 rapl_timer_ms; -static struct perf_msr rapl_msrs[]; +static struct perf_msr *rapl_msrs; static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu) { @@ -516,7 +518,7 @@ static bool test_msr(int idx, void *data) return test_bit(idx, (unsigned long *) data); } -static struct perf_msr rapl_msrs[] = { +static struct perf_msr intel_rapl_msrs[] = { [PERF_RAPL_PP0] = { MSR_PP0_ENERGY_STATUS, &rapl_events_cores_group, test_msr }, [PERF_RAPL_PKG] = { MSR_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr }, [PERF_RAPL_RAM] = { MSR_DRAM_ENERGY_STATUS, &rapl_events_ram_group, test_msr }, @@ -578,13 +580,13 @@ static int rapl_cpu_online(unsigned int cpu) return 0; } -static int rapl_check_hw_unit(bool apply_quirk) +static int rapl_check_hw_unit(struct rapl_model *rm) { u64 msr_rapl_power_unit_bits; int i; /* protect rdmsrl() to handle virtualization */ - if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits)) + if (rdmsrl_safe(rm->msr_power_unit, &msr_rapl_power_unit_bits)) return -1; for (i = 0; i < NR_RAPL_DOMAINS; i++) rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; @@ -595,7 +597,7 @@ static int rapl_check_hw_unit(bool apply_quirk) * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2 * of 2. 
Datasheet, September 2014, Reference Number: 330784-001 " */ - if (apply_quirk) + if (rm->apply_quirk) rapl_hw_unit[PERF_RAPL_RAM] = 16; /* @@ -676,6 +678,8 @@ static struct rapl_model model_snb = { BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_PP1), .apply_quirk = false, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_msrs, }; static struct rapl_model model_snbep = { @@ -683,6 +687,8 @@ static struct rapl_model model_snbep = { BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM), .apply_quirk = false, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_msrs, }; static struct rapl_model model_hsw = { @@ -691,6 +697,8 @@ static struct rapl_model model_hsw = { BIT(PERF_RAPL_RAM) | BIT(PERF_RAPL_PP1), .apply_quirk = false, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_msrs, }; static struct rapl_model model_hsx = { @@ -698,12 +706,16 @@ static struct rapl_model model_hsx = { BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM), .apply_quirk = true, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_msrs, }; static struct rapl_model model_knl = { .events = BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM), .apply_quirk = true, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_msrs, }; static struct rapl_model model_skl = { @@ -713,6 +725,8 @@ static struct rapl_model model_skl = { BIT(PERF_RAPL_PP1) | BIT(PERF_RAPL_PSYS), .apply_quirk = false, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_msrs, }; static const struct x86_cpu_id rapl_model_match[] __initconst = { @@ -760,10 +774,13 @@ static int __init rapl_pmu_init(void) return -ENODEV; rm = (struct rapl_model *) id->driver_data; + + rapl_msrs = rm->rapl_msrs; + rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX, false, (void *) &rm->events); - ret = rapl_check_hw_unit(rm->apply_quirk); + ret = rapl_check_hw_unit(rm); if (ret) return ret; -- cgit v1.2.3 From 2a3e3f73a23b4ff2c0065d3a42edc18ad94b7851 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 27 May 2020 15:46:57 -0700 Subject: perf/x86/rapl: Flip logic on default events visibility This patch modifies the default visibility of the attribute_group for each RAPL event. By default if the grp.is_visible field is NULL, sysfs considers that it must display the attribute group. If the field is not NULL (callback function), then the return value of the callback determines the visibility (0 = not visible). The RAPL attribute groups had the field set to NULL, meaning that unless they failed the probing from perf_msr_probe(), they would be visible. We want to avoid having to specify attribute groups that are not supported by the HW in the rapl_msrs[] array, they don't have an MSR address to begin with. Therefore, we intialize the visible field of all RAPL attribute groups to a callback that returns 0. If the RAPL msr goes through probing and succeeds the is_visible field will be set back to NULL (visible). If the probing fails the field is set to a callback that return 0 (not visible). 
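[Editor's note, not part of the patch: a minimal sketch of the "hidden by default, unhide on successful probe" pattern this change describes. All demo_* names are hypothetical; sysfs treats a group as visible when .is_visible is NULL or returns a non-zero mode, which is the behaviour the patch relies on.]

#include <linux/kobject.h>
#include <linux/sysfs.h>

static struct attribute *demo_event_attrs[] = {
	NULL,					/* filled in elsewhere; assumed */
};

static umode_t demo_not_visible(struct kobject *kobj,
				struct attribute *attr, int i)
{
	return 0;				/* 0 == hide the whole group */
}

static struct attribute_group demo_events_group = {
	.name		= "events",
	.attrs		= demo_event_attrs,
	.is_visible	= demo_not_visible,	/* hidden until probed */
};

/* called once the corresponding RAPL MSR has been probed successfully */
static void demo_probe_succeeded(struct attribute_group *grp)
{
	grp->is_visible = NULL;			/* NULL means "always visible" */
}

[End of editor's sketch; with this flip, groups that never get an MSR entry in rapl_msrs[] simply stay hidden instead of needing a dummy entry.]
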
Signed-off-by: Stephane Eranian Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200527224659.206129-4-eranian@google.com --- arch/x86/events/rapl.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index f29935e6a9df..8d17af4b1ca9 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -460,9 +460,16 @@ static struct attribute *rapl_events_cores[] = { NULL, }; +static umode_t +rapl_not_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return 0; +} + static struct attribute_group rapl_events_cores_group = { .name = "events", .attrs = rapl_events_cores, + .is_visible = rapl_not_visible, }; static struct attribute *rapl_events_pkg[] = { @@ -475,6 +482,7 @@ static struct attribute *rapl_events_pkg[] = { static struct attribute_group rapl_events_pkg_group = { .name = "events", .attrs = rapl_events_pkg, + .is_visible = rapl_not_visible, }; static struct attribute *rapl_events_ram[] = { @@ -487,6 +495,7 @@ static struct attribute *rapl_events_ram[] = { static struct attribute_group rapl_events_ram_group = { .name = "events", .attrs = rapl_events_ram, + .is_visible = rapl_not_visible, }; static struct attribute *rapl_events_gpu[] = { @@ -499,6 +508,7 @@ static struct attribute *rapl_events_gpu[] = { static struct attribute_group rapl_events_gpu_group = { .name = "events", .attrs = rapl_events_gpu, + .is_visible = rapl_not_visible, }; static struct attribute *rapl_events_psys[] = { @@ -511,6 +521,7 @@ static struct attribute *rapl_events_psys[] = { static struct attribute_group rapl_events_psys_group = { .name = "events", .attrs = rapl_events_psys, + .is_visible = rapl_not_visible, }; static bool test_msr(int idx, void *data) -- cgit v1.2.3 From 4c953f879460bf65ea3c119354026b126fe8ee57 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 27 May 2020 15:46:58 -0700 Subject: perf/x86/rapl: Make perf_probe_msr() more robust and flexible This patch modifies perf_probe_msr() by allowing passing of struct perf_msr array where some entries are not populated, i.e., they have either an msr address of 0 or no attribute_group pointer. This helps with certain call paths, e.g., RAPL. In case the grp is NULL, the default sysfs visibility rule applies which is to make the group visible. Without the patch, you would get a kernel crash with a NULL group. Signed-off-by: Stephane Eranian Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200527224659.206129-5-eranian@google.com --- arch/x86/events/probe.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/events/probe.c b/arch/x86/events/probe.c index c2ede2f3b277..136a1e847254 100644 --- a/arch/x86/events/probe.c +++ b/arch/x86/events/probe.c @@ -10,6 +10,11 @@ not_visible(struct kobject *kobj, struct attribute *attr, int i) return 0; } +/* + * Accepts msr[] array with non populated entries as long as either + * msr[i].msr is 0 or msr[i].grp is NULL. Note that the default sysfs + * visibility is visible when group->is_visible callback is set. 
+ */ unsigned long perf_msr_probe(struct perf_msr *msr, int cnt, bool zero, void *data) { @@ -24,8 +29,16 @@ perf_msr_probe(struct perf_msr *msr, int cnt, bool zero, void *data) if (!msr[bit].no_check) { struct attribute_group *grp = msr[bit].grp; + /* skip entry with no group */ + if (!grp) + continue; + grp->is_visible = not_visible; + /* skip unpopulated entry */ + if (!msr[bit].msr) + continue; + if (msr[bit].test && !msr[bit].test(bit, data)) continue; /* Virt sucks; you cannot tell if a R/O MSR is present :/ */ -- cgit v1.2.3 From 5cde265384cad739b162cf08afba6da8857778bd Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 27 May 2020 15:46:59 -0700 Subject: perf/x86/rapl: Add AMD Fam17h RAPL support This patch enables AMD Fam17h RAPL support for the Package level metric. The support is as per AMD Fam17h Model31h (Zen2) and model 00-ffh (Zen1) PPR. The same output is available via the energy-pkg pseudo event: $ perf stat -a -I 1000 --per-socket -e power/energy-pkg/ Signed-off-by: Stephane Eranian Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200527224659.206129-6-eranian@google.com --- arch/x86/events/rapl.c | 18 ++++++++++++++++++ arch/x86/include/asm/msr-index.h | 3 +++ 2 files changed, 21 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index 8d17af4b1ca9..0f2bf59f4354 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -537,6 +537,16 @@ static struct perf_msr intel_rapl_msrs[] = { [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group, test_msr }, }; +/* + * Force to PERF_RAPL_MAX size due to: + * - perf_msr_probe(PERF_RAPL_MAX) + * - want to use same event codes across both architectures + */ +static struct perf_msr amd_rapl_msrs[PERF_RAPL_MAX] = { + [PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr }, +}; + + static int rapl_cpu_offline(unsigned int cpu) { struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); @@ -740,6 +750,13 @@ static struct rapl_model model_skl = { .rapl_msrs = intel_rapl_msrs, }; +static struct rapl_model model_amd_fam17h = { + .events = BIT(PERF_RAPL_PKG), + .apply_quirk = false, + .msr_power_unit = MSR_AMD_RAPL_POWER_UNIT, + .rapl_msrs = amd_rapl_msrs, +}; + static const struct x86_cpu_id rapl_model_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE, &model_snb), X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X, &model_snbep), @@ -770,6 +787,7 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &model_hsx), X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &model_skl), + X86_MATCH_VENDOR_FAM(AMD, 0x17, &model_amd_fam17h), {}, }; MODULE_DEVICE_TABLE(x86cpu, rapl_model_match); diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 12c9684d59ba..ef452b817f44 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -301,6 +301,9 @@ #define MSR_PP1_ENERGY_STATUS 0x00000641 #define MSR_PP1_POLICY 0x00000642 +#define MSR_AMD_PKG_ENERGY_STATUS 0xc001029b +#define MSR_AMD_RAPL_POWER_UNIT 0xc0010299 + /* Config TDP MSRs */ #define MSR_CONFIG_TDP_NOMINAL 0x00000648 #define MSR_CONFIG_TDP_LEVEL_1 0x00000649 -- cgit v1.2.3
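
[Editor's usage note, not part of the patches: the driver reports energy in 32.32 fixed point, as the header comment in rapl.c and the energy-*.scale sysfs strings indicate. A minimal userspace sketch of the conversion, assuming a raw counter delta and measurement interval have already been obtained through the perf interface (both values below are made up for illustration):]

#include <math.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* hypothetical numbers: raw 32.32 delta read over a 1 second interval */
	uint64_t raw_delta  = 21474836480ULL;	/* example: 5 J worth of counts */
	double interval_sec = 1.0;

	double joules = ldexp((double)raw_delta, -32);	/* raw * 2^-32 */
	double watts  = joules / interval_sec;

	printf("energy: %.3f J, power: %.3f W\n", joules, watts);
	return 0;
}

[The scale string 2.3283064365386962890625e-10 advertised in sysfs is exactly 2^-32, so tools such as perf stat (for example the power/energy-pkg/ invocation shown in the AMD Fam17h patch above) perform this conversion automatically.]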