From da916e96e2dedcb2d40de77a7def833d315b81a6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 25 Oct 2024 10:21:41 +0200 Subject: perf: Make perf_pmu_unregister() useable Previously it was only safe to call perf_pmu_unregister() if there were no active events of that pmu around -- which was impossible to guarantee since it races all sorts against perf_init_event(). Rework the whole thing by: - keeping track of all events for a given pmu - 'hiding' the pmu from perf_init_event() - waiting for the appropriate (s)rcu grace periods such that all prior references to the PMU will be completed - detaching all still existing events of that pmu (see first point) and moving them to a new REVOKED state. - actually freeing the pmu data. Where notably the new REVOKED state must inhibit all event actions from reaching code that wants to use event->pmu. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ravi Bangoria Link: https://lkml.kernel.org/r/20250307193723.525402029@infradead.org --- include/linux/perf_event.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 0069ba6866a4..7f49a58b271d 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -325,6 +325,9 @@ struct perf_output_handle; struct pmu { struct list_head entry; + spinlock_t events_lock; + struct list_head events; + struct module *module; struct device *dev; struct device *parent; @@ -622,9 +625,10 @@ struct perf_addr_filter_range { * enum perf_event_state - the states of an event: */ enum perf_event_state { - PERF_EVENT_STATE_DEAD = -4, - PERF_EVENT_STATE_EXIT = -3, - PERF_EVENT_STATE_ERROR = -2, + PERF_EVENT_STATE_DEAD = -5, + PERF_EVENT_STATE_REVOKED = -4, /* pmu gone, must not touch */ + PERF_EVENT_STATE_EXIT = -3, /* task died, still inherit */ + PERF_EVENT_STATE_ERROR = -2, /* scheduling error, can enable */ PERF_EVENT_STATE_OFF = -1, PERF_EVENT_STATE_INACTIVE = 0, PERF_EVENT_STATE_ACTIVE = 1, @@ -865,6 +869,7 @@ struct perf_event { void *security; #endif struct list_head sb_list; + struct list_head pmu_list; /* * Certain events gets forwarded to another pmu internally by over- @@ -1155,7 +1160,7 @@ extern void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags); extern void perf_event_itrace_started(struct perf_event *event); extern int perf_pmu_register(struct pmu *pmu, const char *name, int type); -extern void perf_pmu_unregister(struct pmu *pmu); +extern int perf_pmu_unregister(struct pmu *pmu); extern void __perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task); @@ -1760,7 +1765,7 @@ static inline bool needs_branch_stack(struct perf_event *event) static inline bool has_aux(struct perf_event *event) { - return event->pmu->setup_aux; + return event->pmu && event->pmu->setup_aux; } static inline bool has_aux_action(struct perf_event *event) -- cgit v1.2.3 From 4dfe3232cc04325a09e96f6c7f9546ba6c0b132b Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 27 Mar 2025 12:52:13 -0700 Subject: perf/x86: Add dynamic constraint More and more features require a dynamic event constraint, e.g., branch counter logging, auto counter reload, Arch PEBS, etc. Add a generic flag, PMU_FL_DYN_CONSTRAINT, to indicate the case. It avoids keeping adding the individual flag in intel_cpuc_prepare(). Add a variable dyn_constraint in the struct hw_perf_event to track the dynamic constraint of the event. Apply it if it's updated. Apply the generic dynamic constraint for branch counter logging. Many features on and after V6 require dynamic constraint. So unconditionally set the flag for V6+. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Thomas Falcon Link: https://lkml.kernel.org/r/20250327195217.2683619-2-kan.liang@linux.intel.com --- arch/x86/events/core.c | 1 + arch/x86/events/intel/core.c | 21 +++++++++++++++------ arch/x86/events/intel/lbr.c | 2 +- arch/x86/events/perf_event.h | 1 + include/linux/perf_event.h | 1 + 5 files changed, 19 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 6866cc5acb0b..a0fe51e0c00f 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -674,6 +674,7 @@ static int __x86_pmu_event_init(struct perf_event *event) event->hw.idx = -1; event->hw.last_cpu = -1; event->hw.last_tag = ~0ULL; + event->hw.dyn_constraint = ~0ULL; /* mark unused */ event->hw.extra_reg.idx = EXTRA_REG_NONE; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 09d2d66c9f21..972492832d7f 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3730,10 +3730,9 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, if (cpuc->excl_cntrs) return intel_get_excl_constraints(cpuc, event, idx, c2); - /* Not all counters support the branch counter feature. */ - if (branch_sample_counters(event)) { + if (event->hw.dyn_constraint != ~0ULL) { c2 = dyn_constraint(cpuc, c2, idx); - c2->idxmsk64 &= x86_pmu.lbr_counters; + c2->idxmsk64 &= event->hw.dyn_constraint; c2->weight = hweight64(c2->idxmsk64); } @@ -4135,15 +4134,19 @@ static int intel_pmu_hw_config(struct perf_event *event) leader = event->group_leader; if (branch_sample_call_stack(leader)) return -EINVAL; - if (branch_sample_counters(leader)) + if (branch_sample_counters(leader)) { num++; + leader->hw.dyn_constraint &= x86_pmu.lbr_counters; + } leader->hw.flags |= PERF_X86_EVENT_BRANCH_COUNTERS; for_each_sibling_event(sibling, leader) { if (branch_sample_call_stack(sibling)) return -EINVAL; - if (branch_sample_counters(sibling)) + if (branch_sample_counters(sibling)) { num++; + sibling->hw.dyn_constraint &= x86_pmu.lbr_counters; + } } if (num > fls(x86_pmu.lbr_counters)) @@ -4943,7 +4946,7 @@ int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu) goto err; } - if (x86_pmu.flags & (PMU_FL_EXCL_CNTRS | PMU_FL_TFA | PMU_FL_BR_CNTR)) { + if (x86_pmu.flags & (PMU_FL_EXCL_CNTRS | PMU_FL_TFA | PMU_FL_DYN_CONSTRAINT)) { size_t sz = X86_PMC_IDX_MAX * sizeof(struct event_constraint); cpuc->constraint_list = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu)); @@ -6664,6 +6667,12 @@ __init int intel_pmu_init(void) pr_cont(" AnyThread deprecated, "); } + /* + * Many features on and after V6 require dynamic constraint, + * e.g., Arch PEBS, ACR. + */ + if (version >= 6) + x86_pmu.flags |= PMU_FL_DYN_CONSTRAINT; /* * Install the hw-cache-events table: */ diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index f44c3d866f24..05acd6449ceb 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -1618,7 +1618,7 @@ void __init intel_pmu_arch_lbr_init(void) x86_pmu.lbr_nr = lbr_nr; if (!!x86_pmu.lbr_counters) - x86_pmu.flags |= PMU_FL_BR_CNTR; + x86_pmu.flags |= PMU_FL_BR_CNTR | PMU_FL_DYN_CONSTRAINT; if (x86_pmu.lbr_mispred) static_branch_enable(&x86_lbr_mispred); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 2c0ce0e9545e..f5ba1658a889 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1042,6 +1042,7 @@ do { \ #define PMU_FL_MEM_LOADS_AUX 0x100 /* Require an auxiliary event for the complete memory info */ #define PMU_FL_RETIRE_LATENCY 0x200 /* Support Retire Latency in PEBS */ #define PMU_FL_BR_CNTR 0x400 /* Support branch counter logging */ +#define PMU_FL_DYN_CONSTRAINT 0x800 /* Needs dynamic constraint */ #define EVENT_VAR(_id) event_attr_##_id #define EVENT_PTR(_id) &event_attr_##_id.attr.attr diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 7f49a58b271d..54dad174ed7a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -158,6 +158,7 @@ struct hw_perf_event { struct { /* hardware */ u64 config; u64 last_tag; + u64 dyn_constraint; unsigned long config_base; unsigned long event_base; int event_base_rdpmc; -- cgit v1.2.3 From c9449c8506a5df5052ef4d17867699517b10b55a Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 27 Mar 2025 12:52:15 -0700 Subject: perf: Extend the bit width of the arch-specific flag The auto counter reload feature requires an event flag to indicate an auto counter reload group, which can only be scheduled on specific counters that enumerated in CPUID. However, the hw_perf_event.flags has run out on X86. Two solutions were considered to address the issue. - Currently, 20 bits are reserved for the architecture-specific flags. Only the bit 31 is used for the generic flag. There is still plenty of space left. Reserve 8 more bits for the arch-specific flags. - Add a new X86 specific hw_perf_event.flags1 to support more flags. The former is implemented. Enough room is still left in the global generic flag. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Thomas Falcon Link: https://lkml.kernel.org/r/20250327195217.2683619-4-kan.liang@linux.intel.com --- arch/x86/events/perf_event_flags.h | 41 +++++++++++++++++++------------------- include/linux/perf_event.h | 2 +- 2 files changed, 22 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/events/perf_event_flags.h b/arch/x86/events/perf_event_flags.h index 1d9e385649b5..70078334e4a3 100644 --- a/arch/x86/events/perf_event_flags.h +++ b/arch/x86/events/perf_event_flags.h @@ -2,23 +2,24 @@ /* * struct hw_perf_event.flags flags */ -PERF_ARCH(PEBS_LDLAT, 0x00001) /* ld+ldlat data address sampling */ -PERF_ARCH(PEBS_ST, 0x00002) /* st data address sampling */ -PERF_ARCH(PEBS_ST_HSW, 0x00004) /* haswell style datala, store */ -PERF_ARCH(PEBS_LD_HSW, 0x00008) /* haswell style datala, load */ -PERF_ARCH(PEBS_NA_HSW, 0x00010) /* haswell style datala, unknown */ -PERF_ARCH(EXCL, 0x00020) /* HT exclusivity on counter */ -PERF_ARCH(DYNAMIC, 0x00040) /* dynamic alloc'd constraint */ -PERF_ARCH(PEBS_CNTR, 0x00080) /* PEBS counters snapshot */ -PERF_ARCH(EXCL_ACCT, 0x00100) /* accounted EXCL event */ -PERF_ARCH(AUTO_RELOAD, 0x00200) /* use PEBS auto-reload */ -PERF_ARCH(LARGE_PEBS, 0x00400) /* use large PEBS */ -PERF_ARCH(PEBS_VIA_PT, 0x00800) /* use PT buffer for PEBS */ -PERF_ARCH(PAIR, 0x01000) /* Large Increment per Cycle */ -PERF_ARCH(LBR_SELECT, 0x02000) /* Save/Restore MSR_LBR_SELECT */ -PERF_ARCH(TOPDOWN, 0x04000) /* Count Topdown slots/metrics events */ -PERF_ARCH(PEBS_STLAT, 0x08000) /* st+stlat data address sampling */ -PERF_ARCH(AMD_BRS, 0x10000) /* AMD Branch Sampling */ -PERF_ARCH(PEBS_LAT_HYBRID, 0x20000) /* ld and st lat for hybrid */ -PERF_ARCH(NEEDS_BRANCH_STACK, 0x40000) /* require branch stack setup */ -PERF_ARCH(BRANCH_COUNTERS, 0x80000) /* logs the counters in the extra space of each branch */ +PERF_ARCH(PEBS_LDLAT, 0x0000001) /* ld+ldlat data address sampling */ +PERF_ARCH(PEBS_ST, 0x0000002) /* st data address sampling */ +PERF_ARCH(PEBS_ST_HSW, 0x0000004) /* haswell style datala, store */ +PERF_ARCH(PEBS_LD_HSW, 0x0000008) /* haswell style datala, load */ +PERF_ARCH(PEBS_NA_HSW, 0x0000010) /* haswell style datala, unknown */ +PERF_ARCH(EXCL, 0x0000020) /* HT exclusivity on counter */ +PERF_ARCH(DYNAMIC, 0x0000040) /* dynamic alloc'd constraint */ +PERF_ARCH(PEBS_CNTR, 0x0000080) /* PEBS counters snapshot */ +PERF_ARCH(EXCL_ACCT, 0x0000100) /* accounted EXCL event */ +PERF_ARCH(AUTO_RELOAD, 0x0000200) /* use PEBS auto-reload */ +PERF_ARCH(LARGE_PEBS, 0x0000400) /* use large PEBS */ +PERF_ARCH(PEBS_VIA_PT, 0x0000800) /* use PT buffer for PEBS */ +PERF_ARCH(PAIR, 0x0001000) /* Large Increment per Cycle */ +PERF_ARCH(LBR_SELECT, 0x0002000) /* Save/Restore MSR_LBR_SELECT */ +PERF_ARCH(TOPDOWN, 0x0004000) /* Count Topdown slots/metrics events */ +PERF_ARCH(PEBS_STLAT, 0x0008000) /* st+stlat data address sampling */ +PERF_ARCH(AMD_BRS, 0x0010000) /* AMD Branch Sampling */ +PERF_ARCH(PEBS_LAT_HYBRID, 0x0020000) /* ld and st lat for hybrid */ +PERF_ARCH(NEEDS_BRANCH_STACK, 0x0040000) /* require branch stack setup */ +PERF_ARCH(BRANCH_COUNTERS, 0x0080000) /* logs the counters in the extra space of each branch */ +PERF_ARCH(ACR, 0x0100000) /* Auto counter reload */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 54dad174ed7a..5c547329cf02 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -144,7 +144,7 @@ struct hw_perf_event_extra { * PERF_EVENT_FLAG_ARCH bits are reserved for architecture-specific * usage. */ -#define PERF_EVENT_FLAG_ARCH 0x000fffff +#define PERF_EVENT_FLAG_ARCH 0x0fffffff #define PERF_EVENT_FLAG_USER_READ_CNT 0x80000000 static_assert((PERF_EVENT_FLAG_USER_READ_CNT & PERF_EVENT_FLAG_ARCH) == 0); -- cgit v1.2.3 From ec980e4facef8110f6fce27e5b6344660117f01f Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 27 Mar 2025 12:52:17 -0700 Subject: perf/x86/intel: Support auto counter reload MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The relative rates among two or more events are useful for performance analysis, e.g., a high branch miss rate may indicate a performance issue. Usually, the samples with a relative rate that exceeds some threshold are more useful. However, the traditional sampling takes samples of events separately. To get the relative rates among two or more events, a high sample rate is required, which can bring high overhead. Many samples taken in the non-hotspot area are also dropped (useless) in the post-process. The auto counter reload (ACR) feature takes samples when the relative rate of two or more events exceeds some threshold, which provides the fine-grained information at a low cost. To support the feature, two sets of MSRs are introduced. For a given counter IA32_PMC_GPn_CTR/IA32_PMC_FXm_CTR, bit fields in the IA32_PMC_GPn_CFG_B/IA32_PMC_FXm_CFG_B MSR indicate which counter(s) can cause a reload of that counter. The reload value is stored in the IA32_PMC_GPn_CFG_C/IA32_PMC_FXm_CFG_C. The details can be found at Intel SDM (085), Volume 3, 21.9.11 Auto Counter Reload. In the hw_config(), an ACR event is specially configured, because the cause/reloadable counter mask has to be applied to the dyn_constraint. Besides the HW limit, e.g., not support perf metrics, PDist and etc, a SW limit is applied as well. ACR events in a group must be contiguous. It facilitates the later conversion from the event idx to the counter idx. Otherwise, the intel_pmu_acr_late_setup() has to traverse the whole event list again to find the "cause" event. Also, add a new flag PERF_X86_EVENT_ACR to indicate an ACR group, which is set to the group leader. The late setup() is also required for an ACR group. It's to convert the event idx to the counter idx, and saved it in hw.config1. The ACR configuration MSRs are only updated in the enable_event(). The disable_event() doesn't clear the ACR CFG register. Add acr_cfg_b/acr_cfg_c in the struct cpu_hw_events to cache the MSR values. It can avoid a MSR write if the value is not changed. Expose an acr_mask to the sysfs. The perf tool can utilize the new format to configure the relation of events in the group. The bit sequence of the acr_mask follows the events enabled order of the group. Example: Here is the snippet of the mispredict.c. Since the array has a random numbers, jumps are random and often mispredicted. The mispredicted rate depends on the compared value. For the Loop1, ~11% of all branches are mispredicted. For the Loop2, ~21% of all branches are mispredicted. main() { ... for (i = 0; i < N; i++) data[i] = rand() % 256; ... /* Loop 1 */ for (k = 0; k < 50; k++) for (i = 0; i < N; i++) if (data[i] >= 64) sum += data[i]; ... ... /* Loop 2 */ for (k = 0; k < 50; k++) for (i = 0; i < N; i++) if (data[i] >= 128) sum += data[i]; ... } Usually, a code with a high branch miss rate means a bad performance. To understand the branch miss rate of the codes, the traditional method usually samples both branches and branch-misses events. E.g., perf record -e "{cpu_atom/branch-misses/ppu, cpu_atom/branch-instructions/u}" -c 1000000 -- ./mispredict [ perf record: Woken up 4 times to write data ] [ perf record: Captured and wrote 0.925 MB perf.data (5106 samples) ] The 5106 samples are from both events and spread in both Loops. In the post-process stage, a user can know that the Loop 2 has a 21% branch miss rate. Then they can focus on the samples of branch-misses events for the Loop 2. With this patch, the user can generate the samples only when the branch miss rate > 20%. For example, perf record -e "{cpu_atom/branch-misses,period=200000,acr_mask=0x2/ppu, cpu_atom/branch-instructions,period=1000000,acr_mask=0x3/u}" -- ./mispredict (Two different periods are applied to branch-misses and branch-instructions. The ratio is set to 20%. If the branch-instructions is overflowed first, the branch-miss rate < 20%. No samples should be generated. All counters should be automatically reloaded. If the branch-misses is overflowed first, the branch-miss rate > 20%. A sample triggered by the branch-misses event should be generated. Just the counter of the branch-instructions should be automatically reloaded. The branch-misses event should only be automatically reloaded when the branch-instructions is overflowed. So the "cause" event is the branch-instructions event. The acr_mask is set to 0x2, since the event index in the group of branch-instructions is 1. The branch-instructions event is automatically reloaded no matter which events are overflowed. So the "cause" events are the branch-misses and the branch-instructions event. The acr_mask should be set to 0x3.) [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.098 MB perf.data (2498 samples) ] $perf report Percent │154: movl $0x0,-0x14(%rbp) │ ↓ jmp 1af │ for (i = j; i < N; i++) │15d: mov -0x10(%rbp),%eax │ mov %eax,-0x18(%rbp) │ ↓ jmp 1a2 │ if (data[i] >= 128) │165: mov -0x18(%rbp),%eax │ cltq │ lea 0x0(,%rax,4),%rdx │ mov -0x8(%rbp),%rax │ add %rdx,%rax │ mov (%rax),%eax │ ┌──cmp $0x7f,%eax 100.00 0.00 │ ├──jle 19e │ │sum += data[i]; The 2498 samples are all from the branch-misses events for the Loop 2. The number of samples and overhead is significantly reduced without losing any information. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Thomas Falcon Link: https://lkml.kernel.org/r/20250327195217.2683619-6-kan.liang@linux.intel.com --- arch/x86/events/core.c | 2 +- arch/x86/events/intel/core.c | 226 ++++++++++++++++++++++++++++++++++++++- arch/x86/events/perf_event.h | 10 ++ arch/x86/include/asm/msr-index.h | 4 + include/linux/perf_event.h | 1 + 5 files changed, 240 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index a0fe51e0c00f..f53ae1fd986f 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -755,7 +755,7 @@ void x86_pmu_enable_all(int added) } } -static inline int is_x86_event(struct perf_event *event) +int is_x86_event(struct perf_event *event) { int i; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 876678aa3895..3152a018c502 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2603,7 +2603,8 @@ static void intel_pmu_del_event(struct perf_event *event) intel_pmu_lbr_del(event); if (event->attr.precise_ip) intel_pmu_pebs_del(event); - if (is_pebs_counter_event_group(event)) + if (is_pebs_counter_event_group(event) || + is_acr_event_group(event)) this_cpu_ptr(&cpu_hw_events)->n_late_setup--; } @@ -2882,6 +2883,52 @@ static void intel_pmu_enable_fixed(struct perf_event *event) cpuc->fixed_ctrl_val |= bits; } +static void intel_pmu_config_acr(int idx, u64 mask, u32 reload) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int msr_b, msr_c; + + if (!mask && !cpuc->acr_cfg_b[idx]) + return; + + if (idx < INTEL_PMC_IDX_FIXED) { + msr_b = MSR_IA32_PMC_V6_GP0_CFG_B; + msr_c = MSR_IA32_PMC_V6_GP0_CFG_C; + } else { + msr_b = MSR_IA32_PMC_V6_FX0_CFG_B; + msr_c = MSR_IA32_PMC_V6_FX0_CFG_C; + idx -= INTEL_PMC_IDX_FIXED; + } + + if (cpuc->acr_cfg_b[idx] != mask) { + wrmsrl(msr_b + x86_pmu.addr_offset(idx, false), mask); + cpuc->acr_cfg_b[idx] = mask; + } + /* Only need to update the reload value when there is a valid config value. */ + if (mask && cpuc->acr_cfg_c[idx] != reload) { + wrmsrl(msr_c + x86_pmu.addr_offset(idx, false), reload); + cpuc->acr_cfg_c[idx] = reload; + } +} + +static void intel_pmu_enable_acr(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (!is_acr_event_group(event) || !event->attr.config2) { + /* + * The disable doesn't clear the ACR CFG register. + * Check and clear the ACR CFG register. + */ + intel_pmu_config_acr(hwc->idx, 0, 0); + return; + } + + intel_pmu_config_acr(hwc->idx, hwc->config1, -hwc->sample_period); +} + +DEFINE_STATIC_CALL_NULL(intel_pmu_enable_acr_event, intel_pmu_enable_acr); + static void intel_pmu_enable_event(struct perf_event *event) { u64 enable_mask = ARCH_PERFMON_EVENTSEL_ENABLE; @@ -2896,9 +2943,12 @@ static void intel_pmu_enable_event(struct perf_event *event) if (branch_sample_counters(event)) enable_mask |= ARCH_PERFMON_EVENTSEL_BR_CNTR; intel_set_masks(event, idx); + static_call_cond(intel_pmu_enable_acr_event)(event); __x86_pmu_enable_event(hwc, enable_mask); break; case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1: + static_call_cond(intel_pmu_enable_acr_event)(event); + fallthrough; case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END: intel_pmu_enable_fixed(event); break; @@ -2916,6 +2966,31 @@ static void intel_pmu_enable_event(struct perf_event *event) } } +static void intel_pmu_acr_late_setup(struct cpu_hw_events *cpuc) +{ + struct perf_event *event, *leader; + int i, j, idx; + + for (i = 0; i < cpuc->n_events; i++) { + leader = cpuc->event_list[i]; + if (!is_acr_event_group(leader)) + continue; + + /* The ACR events must be contiguous. */ + for (j = i; j < cpuc->n_events; j++) { + event = cpuc->event_list[j]; + if (event->group_leader != leader->group_leader) + break; + for_each_set_bit(idx, (unsigned long *)&event->attr.config2, X86_PMC_IDX_MAX) { + if (WARN_ON_ONCE(i + idx > cpuc->n_events)) + return; + __set_bit(cpuc->assign[i + idx], (unsigned long *)&event->hw.config1); + } + } + i = j - 1; + } +} + void intel_pmu_late_setup(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -2924,6 +2999,7 @@ void intel_pmu_late_setup(void) return; intel_pmu_pebs_late_setup(cpuc); + intel_pmu_acr_late_setup(cpuc); } static void intel_pmu_add_event(struct perf_event *event) @@ -2932,7 +3008,8 @@ static void intel_pmu_add_event(struct perf_event *event) intel_pmu_pebs_add(event); if (intel_pmu_needs_branch_stack(event)) intel_pmu_lbr_add(event); - if (is_pebs_counter_event_group(event)) + if (is_pebs_counter_event_group(event) || + is_acr_event_group(event)) this_cpu_ptr(&cpu_hw_events)->n_late_setup++; } @@ -4087,6 +4164,39 @@ end: return start; } +static inline bool intel_pmu_has_acr(struct pmu *pmu) +{ + return !!hybrid(pmu, acr_cause_mask64); +} + +static bool intel_pmu_is_acr_group(struct perf_event *event) +{ + /* The group leader has the ACR flag set */ + if (is_acr_event_group(event)) + return true; + + /* The acr_mask is set */ + if (event->attr.config2) + return true; + + return false; +} + +static inline void intel_pmu_set_acr_cntr_constr(struct perf_event *event, + u64 *cause_mask, int *num) +{ + event->hw.dyn_constraint &= hybrid(event->pmu, acr_cntr_mask64); + *cause_mask |= event->attr.config2; + *num += 1; +} + +static inline void intel_pmu_set_acr_caused_constr(struct perf_event *event, + int idx, u64 cause_mask) +{ + if (test_bit(idx, (unsigned long *)&cause_mask)) + event->hw.dyn_constraint &= hybrid(event->pmu, acr_cause_mask64); +} + static int intel_pmu_hw_config(struct perf_event *event) { int ret = x86_pmu_hw_config(event); @@ -4215,6 +4325,94 @@ static int intel_pmu_hw_config(struct perf_event *event) event->attr.precise_ip) event->group_leader->hw.flags |= PERF_X86_EVENT_PEBS_CNTR; + if (intel_pmu_has_acr(event->pmu) && intel_pmu_is_acr_group(event)) { + struct perf_event *sibling, *leader = event->group_leader; + struct pmu *pmu = event->pmu; + bool has_sw_event = false; + int num = 0, idx = 0; + u64 cause_mask = 0; + + /* Not support perf metrics */ + if (is_metric_event(event)) + return -EINVAL; + + /* Not support freq mode */ + if (event->attr.freq) + return -EINVAL; + + /* PDist is not supported */ + if (event->attr.config2 && event->attr.precise_ip > 2) + return -EINVAL; + + /* The reload value cannot exceeds the max period */ + if (event->attr.sample_period > x86_pmu.max_period) + return -EINVAL; + /* + * The counter-constraints of each event cannot be finalized + * unless the whole group is scanned. However, it's hard + * to know whether the event is the last one of the group. + * Recalculate the counter-constraints for each event when + * adding a new event. + * + * The group is traversed twice, which may be optimized later. + * In the first round, + * - Find all events which do reload when other events + * overflow and set the corresponding counter-constraints + * - Add all events, which can cause other events reload, + * in the cause_mask + * - Error out if the number of events exceeds the HW limit + * - The ACR events must be contiguous. + * Error out if there are non-X86 events between ACR events. + * This is not a HW limit, but a SW limit. + * With the assumption, the intel_pmu_acr_late_setup() can + * easily convert the event idx to counter idx without + * traversing the whole event list. + */ + if (!is_x86_event(leader)) + return -EINVAL; + + if (leader->attr.config2) + intel_pmu_set_acr_cntr_constr(leader, &cause_mask, &num); + + if (leader->nr_siblings) { + for_each_sibling_event(sibling, leader) { + if (!is_x86_event(sibling)) { + has_sw_event = true; + continue; + } + if (!sibling->attr.config2) + continue; + if (has_sw_event) + return -EINVAL; + intel_pmu_set_acr_cntr_constr(sibling, &cause_mask, &num); + } + } + if (leader != event && event->attr.config2) { + if (has_sw_event) + return -EINVAL; + intel_pmu_set_acr_cntr_constr(event, &cause_mask, &num); + } + + if (hweight64(cause_mask) > hweight64(hybrid(pmu, acr_cause_mask64)) || + num > hweight64(hybrid(event->pmu, acr_cntr_mask64))) + return -EINVAL; + /* + * In the second round, apply the counter-constraints for + * the events which can cause other events reload. + */ + intel_pmu_set_acr_caused_constr(leader, idx++, cause_mask); + + if (leader->nr_siblings) { + for_each_sibling_event(sibling, leader) + intel_pmu_set_acr_caused_constr(sibling, idx++, cause_mask); + } + + if (leader != event) + intel_pmu_set_acr_caused_constr(event, idx, cause_mask); + + leader->hw.flags |= PERF_X86_EVENT_ACR; + } + if ((event->attr.type == PERF_TYPE_HARDWARE) || (event->attr.type == PERF_TYPE_HW_CACHE)) return 0; @@ -6061,6 +6259,21 @@ td_is_visible(struct kobject *kobj, struct attribute *attr, int i) return attr->mode; } +PMU_FORMAT_ATTR(acr_mask, "config2:0-63"); + +static struct attribute *format_acr_attrs[] = { + &format_attr_acr_mask.attr, + NULL +}; + +static umode_t +acr_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + struct device *dev = kobj_to_dev(kobj); + + return intel_pmu_has_acr(dev_get_drvdata(dev)) ? attr->mode : 0; +} + static struct attribute_group group_events_td = { .name = "events", .is_visible = td_is_visible, @@ -6103,6 +6316,12 @@ static struct attribute_group group_format_evtsel_ext = { .is_visible = evtsel_ext_is_visible, }; +static struct attribute_group group_format_acr = { + .name = "format", + .attrs = format_acr_attrs, + .is_visible = acr_is_visible, +}; + static struct attribute_group group_default = { .attrs = intel_pmu_attrs, .is_visible = default_is_visible, @@ -6117,6 +6336,7 @@ static const struct attribute_group *attr_update[] = { &group_format_extra, &group_format_extra_skl, &group_format_evtsel_ext, + &group_format_acr, &group_default, NULL, }; @@ -6401,6 +6621,7 @@ static const struct attribute_group *hybrid_attr_update[] = { &group_caps_lbr, &hybrid_group_format_extra, &group_format_evtsel_ext, + &group_format_acr, &group_default, &hybrid_group_cpus, NULL, @@ -6593,6 +6814,7 @@ static __always_inline void intel_pmu_init_skt(struct pmu *pmu) intel_pmu_init_grt(pmu); hybrid(pmu, event_constraints) = intel_skt_event_constraints; hybrid(pmu, extra_regs) = intel_cmt_extra_regs; + static_call_update(intel_pmu_enable_acr_event, intel_pmu_enable_acr); } __init int intel_pmu_init(void) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index ab9af2e4da1b..46bbb503aca1 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -120,6 +120,11 @@ static inline bool is_pebs_counter_event_group(struct perf_event *event) return event->group_leader->hw.flags & PERF_X86_EVENT_PEBS_CNTR; } +static inline bool is_acr_event_group(struct perf_event *event) +{ + return event->group_leader->hw.flags & PERF_X86_EVENT_ACR; +} + struct amd_nb { int nb_id; /* NorthBridge id */ int refcnt; /* reference count */ @@ -287,6 +292,10 @@ struct cpu_hw_events { u64 fixed_ctrl_val; u64 active_fixed_ctrl_val; + /* Intel ACR configuration */ + u64 acr_cfg_b[X86_PMC_IDX_MAX]; + u64 acr_cfg_c[X86_PMC_IDX_MAX]; + /* * Intel LBR bits */ @@ -1103,6 +1112,7 @@ static struct perf_pmu_format_hybrid_attr format_attr_hybrid_##_name = {\ .pmu_type = _pmu, \ } +int is_x86_event(struct perf_event *event); struct pmu *x86_get_pmu(unsigned int cpu); extern struct x86_pmu x86_pmu __read_mostly; diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index e6134ef2263d..53da787b9326 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -594,7 +594,11 @@ /* V6 PMON MSR range */ #define MSR_IA32_PMC_V6_GP0_CTR 0x1900 #define MSR_IA32_PMC_V6_GP0_CFG_A 0x1901 +#define MSR_IA32_PMC_V6_GP0_CFG_B 0x1902 +#define MSR_IA32_PMC_V6_GP0_CFG_C 0x1903 #define MSR_IA32_PMC_V6_FX0_CTR 0x1980 +#define MSR_IA32_PMC_V6_FX0_CFG_B 0x1982 +#define MSR_IA32_PMC_V6_FX0_CFG_C 0x1983 #define MSR_IA32_PMC_V6_STEP 4 /* KeyID partitioning between MKTME and TDX */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 5c547329cf02..947ad12dfdbe 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -157,6 +157,7 @@ struct hw_perf_event { union { struct { /* hardware */ u64 config; + u64 config1; u64 last_tag; u64 dyn_constraint; unsigned long config_base; -- cgit v1.2.3 From f400565faa50737ac1d550d2c75128c0dad75765 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 24 Apr 2025 18:11:27 +0200 Subject: perf: Remove too early and redundant CPU hotplug handling The CPU hotplug handlers are called twice: at prepare and online stage. Their role is to: 1) Enable/disable a CPU context. This is irrelevant and even buggy at the prepare stage because the CPU is still offline. On early secondary CPU up, creating an event attached to that CPU might silently fail because the CPU context is observed as online but the context installation's IPI failure is ignored. 2) Update the scope cpumasks and re-migrate the events accordingly in the CPU down case. This is irrelevant at the prepare stage. 3) Remove the events attached to the context of the offlining CPU. It even uses an (unnecessary) IPI for it. This is also irrelevant at the prepare stage. Also none of the *_PREPARE and *_STARTING architecture perf related CPU hotplug callbacks rely on CPUHP_PERF_PREPARE. CPUHP_AP_PERF_ONLINE is enough and the right place to perform the work. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250424161128.29176-4-frederic@kernel.org --- include/linux/cpuhotplug.h | 1 - kernel/cpu.c | 5 ----- 2 files changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 1987400000b4..df366ee15456 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -60,7 +60,6 @@ enum cpuhp_state { /* PREPARE section invoked on a control CPU */ CPUHP_OFFLINE = 0, CPUHP_CREATE_THREADS, - CPUHP_PERF_PREPARE, CPUHP_PERF_X86_PREPARE, CPUHP_PERF_X86_AMD_UNCORE_PREP, CPUHP_PERF_POWER, diff --git a/kernel/cpu.c b/kernel/cpu.c index b08bb34b1718..a59e009e0be4 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2069,11 +2069,6 @@ static struct cpuhp_step cpuhp_hp_states[] = { .teardown.single = NULL, .cant_stop = true, }, - [CPUHP_PERF_PREPARE] = { - .name = "perf:prepare", - .startup.single = perf_event_init_cpu, - .teardown.single = perf_event_exit_cpu, - }, [CPUHP_RANDOM_PREPARE] = { .name = "random:prepare", .startup.single = random_prepare_cpu, -- cgit v1.2.3 From 18049c8cff9cc89daadc4df6975f7d9069638926 Mon Sep 17 00:00:00 2001 From: Yabin Cui Date: Thu, 8 May 2025 16:26:42 -0700 Subject: perf/aux: Allocate non-contiguous AUX pages by default perf always allocates contiguous AUX pages based on aux_watermark. However, this contiguous allocation doesn't benefit all PMUs. For instance, ARM SPE and TRBE operate with virtual pages, and Coresight ETR allocates a separate buffer. For these PMUs, allocating contiguous AUX pages unnecessarily exacerbates memory fragmentation. This fragmentation can prevent their use on long-running devices. This patch modifies the perf driver to be memory-friendly by default, by allocating non-contiguous AUX pages. For PMUs requiring contiguous pages (Intel BTS and some Intel PT), the existing PERF_PMU_CAP_AUX_NO_SG capability can be used. For PMUs that don't require but can benefit from contiguous pages (some Intel PT), a new capability, PERF_PMU_CAP_AUX_PREFER_LARGE, is added to maintain their existing behavior. Signed-off-by: Yabin Cui Signed-off-by: Ingo Molnar Reviewed-by: James Clark Reviewed-by: Anshuman Khandual Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Alexander Shishkin Cc: Mark Rutland Cc: Namhyung Kim Link: https://lore.kernel.org/r/20250508232642.148767-1-yabinc@google.com --- arch/x86/events/intel/pt.c | 2 ++ include/linux/perf_event.h | 1 + kernel/events/ring_buffer.c | 29 ++++++++++++++++++++--------- 3 files changed, 23 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index fa37565f6418..25ead919fc48 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -1863,6 +1863,8 @@ static __init int pt_init(void) if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) pt_pmu.pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG; + else + pt_pmu.pmu.capabilities = PERF_PMU_CAP_AUX_PREFER_LARGE; pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE | diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 947ad12dfdbe..a96c00e2ceca 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -303,6 +303,7 @@ struct perf_event_pmu_context; #define PERF_PMU_CAP_AUX_OUTPUT 0x0080 #define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100 #define PERF_PMU_CAP_AUX_PAUSE 0x0200 +#define PERF_PMU_CAP_AUX_PREFER_LARGE 0x0400 /** * pmu::scope diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 5130b119d0ae..d2aef87c7e9f 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -679,7 +679,15 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event, { bool overwrite = !(flags & RING_BUFFER_WRITABLE); int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu); - int ret = -ENOMEM, max_order; + bool use_contiguous_pages = event->pmu->capabilities & ( + PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_PREFER_LARGE); + /* + * Initialize max_order to 0 for page allocation. This allocates single + * pages to minimize memory fragmentation. This is overridden if the + * PMU needs or prefers contiguous pages (use_contiguous_pages = true). + */ + int max_order = 0; + int ret = -ENOMEM; if (!has_aux(event)) return -EOPNOTSUPP; @@ -689,8 +697,8 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event, if (!overwrite) { /* - * Watermark defaults to half the buffer, and so does the - * max_order, to aid PMU drivers in double buffering. + * Watermark defaults to half the buffer, to aid PMU drivers + * in double buffering. */ if (!watermark) watermark = min_t(unsigned long, @@ -698,16 +706,19 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event, (unsigned long)nr_pages << (PAGE_SHIFT - 1)); /* - * Use aux_watermark as the basis for chunking to - * help PMU drivers honor the watermark. + * If using contiguous pages, use aux_watermark as the basis + * for chunking to help PMU drivers honor the watermark. */ - max_order = get_order(watermark); + if (use_contiguous_pages) + max_order = get_order(watermark); } else { /* - * We need to start with the max_order that fits in nr_pages, - * not the other way around, hence ilog2() and not get_order. + * If using contiguous pages, we need to start with the + * max_order that fits in nr_pages, not the other way around, + * hence ilog2() and not get_order. */ - max_order = ilog2(nr_pages); + if (use_contiguous_pages) + max_order = ilog2(nr_pages); watermark = 0; } -- cgit v1.2.3 From e7d952cc39fca34386ec9f15f68cb2eaac01b5ae Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 24 May 2025 11:23:25 +0200 Subject: perf/headers: Clean up a bit Do a bit of readability spring cleaning: - Fix misaligned structure member in perf_addr_filter: the new struct perf_addr_filter::action member was too long, but when it was added it was not aligned properly. Align all fields to the customary column 41 alignment of most of the rest of the header. - Adjust the vertical alignment of the definition of other structures and definitions as well, so that the 'most of' in the previous paragraph changes to 'all of'. ;-) - Prettify the assignments in perf_clear_branch_entry_bitfields() - Move comments from CPP definitions to outside the macro - Move perf_guest_info_callbacks and related defines from the front of the header closer to where it's used within the header. - And more #endif markers for larger CPP blocks and standardize #if/#else/#endif blocks to the following nomenclature: #ifdef CONFIG_FOO ... #else /* !CONFIG_FOO: */ ... #endif /* !CONFIG_FOO */ - Standardize on consistently using the 'extern' storage class where appropriate, we had cases where method prototypes sometimes omitted the storage class: extern void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu); int perf_event_read_local(struct perf_event *event, u64 *value, u64 *enabled, u64 *running); extern u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running); Which is obviously a bit confusing and adds unnecessary noise. - s/__u64/u64 and similar cleanups: there's no point in using __u64 in non-UAPI headers, and doing so only adds unnecessary visual noise. - Harmonize all multi-parameter function prototypes along the following style: extern struct perf_event * perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, struct task_struct *task, perf_overflow_handler_t callback, void *context); - etc. Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Alexander Shishkin Cc: Mark Rutland Cc: Namhyung Kim Cc: Ian Rogers Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 282 +++++++++++++++++++++++++-------------------- 1 file changed, 155 insertions(+), 127 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index a96c00e2ceca..52dc7cfab0e0 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -26,18 +26,9 @@ # include #endif -#define PERF_GUEST_ACTIVE 0x01 -#define PERF_GUEST_USER 0x02 - -struct perf_guest_info_callbacks { - unsigned int (*state)(void); - unsigned long (*get_ip)(void); - unsigned int (*handle_intel_pt_intr)(void); -}; - #ifdef CONFIG_HAVE_HW_BREAKPOINT -#include -#include +# include +# include #endif #include @@ -62,19 +53,20 @@ struct perf_guest_info_callbacks { #include #include #include + #include struct perf_callchain_entry { - __u64 nr; - __u64 ip[]; /* /proc/sys/kernel/perf_event_max_stack */ + u64 nr; + u64 ip[]; /* /proc/sys/kernel/perf_event_max_stack */ }; struct perf_callchain_entry_ctx { - struct perf_callchain_entry *entry; - u32 max_stack; - u32 nr; - short contexts; - bool contexts_maxed; + struct perf_callchain_entry *entry; + u32 max_stack; + u32 nr; + short contexts; + bool contexts_maxed; }; typedef unsigned long (*perf_copy_f)(void *dst, const void *src, @@ -121,8 +113,8 @@ static __always_inline bool perf_raw_frag_last(const struct perf_raw_frag *frag) * already stored in age order, the hw_idx should be 0. */ struct perf_branch_stack { - __u64 nr; - __u64 hw_idx; + u64 nr; + u64 hw_idx; struct perf_branch_entry entries[]; }; @@ -132,10 +124,10 @@ struct task_struct; * extra PMU register associated with an event */ struct hw_perf_event_extra { - u64 config; /* register value */ - unsigned int reg; /* register address or index */ - int alloc; /* extra register already allocated */ - int idx; /* index in shared_regs->regs[] */ + u64 config; /* register value */ + unsigned int reg; /* register address or index */ + int alloc; /* extra register already allocated */ + int idx; /* index in shared_regs->regs[] */ }; /** @@ -144,8 +136,8 @@ struct hw_perf_event_extra { * PERF_EVENT_FLAG_ARCH bits are reserved for architecture-specific * usage. */ -#define PERF_EVENT_FLAG_ARCH 0x0fffffff -#define PERF_EVENT_FLAG_USER_READ_CNT 0x80000000 +#define PERF_EVENT_FLAG_ARCH 0x0fffffff +#define PERF_EVENT_FLAG_USER_READ_CNT 0x80000000 static_assert((PERF_EVENT_FLAG_USER_READ_CNT & PERF_EVENT_FLAG_ARCH) == 0); @@ -227,9 +219,14 @@ struct hw_perf_event { /* * hw_perf_event::state flags; used to track the PERF_EF_* state. */ -#define PERF_HES_STOPPED 0x01 /* the counter is stopped */ -#define PERF_HES_UPTODATE 0x02 /* event->count up-to-date */ -#define PERF_HES_ARCH 0x04 + +/* the counter is stopped */ +#define PERF_HES_STOPPED 0x01 + +/* event->count up-to-date */ +#define PERF_HES_UPTODATE 0x02 + +#define PERF_HES_ARCH 0x04 int state; @@ -278,7 +275,7 @@ struct hw_perf_event { */ u64 freq_time_stamp; u64 freq_count_stamp; -#endif +#endif /* CONFIG_PERF_EVENTS */ }; struct perf_event; @@ -287,29 +284,33 @@ struct perf_event_pmu_context; /* * Common implementation detail of pmu::{start,commit,cancel}_txn */ -#define PERF_PMU_TXN_ADD 0x1 /* txn to add/schedule event on PMU */ -#define PERF_PMU_TXN_READ 0x2 /* txn to read event group from PMU */ + +/* txn to add/schedule event on PMU */ +#define PERF_PMU_TXN_ADD 0x1 + +/* txn to read event group from PMU */ +#define PERF_PMU_TXN_READ 0x2 /** * pmu::capabilities flags */ -#define PERF_PMU_CAP_NO_INTERRUPT 0x0001 -#define PERF_PMU_CAP_NO_NMI 0x0002 -#define PERF_PMU_CAP_AUX_NO_SG 0x0004 -#define PERF_PMU_CAP_EXTENDED_REGS 0x0008 -#define PERF_PMU_CAP_EXCLUSIVE 0x0010 -#define PERF_PMU_CAP_ITRACE 0x0020 -#define PERF_PMU_CAP_NO_EXCLUDE 0x0040 -#define PERF_PMU_CAP_AUX_OUTPUT 0x0080 -#define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100 -#define PERF_PMU_CAP_AUX_PAUSE 0x0200 -#define PERF_PMU_CAP_AUX_PREFER_LARGE 0x0400 +#define PERF_PMU_CAP_NO_INTERRUPT 0x0001 +#define PERF_PMU_CAP_NO_NMI 0x0002 +#define PERF_PMU_CAP_AUX_NO_SG 0x0004 +#define PERF_PMU_CAP_EXTENDED_REGS 0x0008 +#define PERF_PMU_CAP_EXCLUSIVE 0x0010 +#define PERF_PMU_CAP_ITRACE 0x0020 +#define PERF_PMU_CAP_NO_EXCLUDE 0x0040 +#define PERF_PMU_CAP_AUX_OUTPUT 0x0080 +#define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100 +#define PERF_PMU_CAP_AUX_PAUSE 0x0200 +#define PERF_PMU_CAP_AUX_PREFER_LARGE 0x0400 /** * pmu::scope */ enum perf_pmu_scope { - PERF_PMU_SCOPE_NONE = 0, + PERF_PMU_SCOPE_NONE = 0, PERF_PMU_SCOPE_CORE, PERF_PMU_SCOPE_DIE, PERF_PMU_SCOPE_CLUSTER, @@ -393,11 +394,21 @@ struct pmu { * Flags for ->add()/->del()/ ->start()/->stop(). There are * matching hw_perf_event::state flags. */ -#define PERF_EF_START 0x01 /* start the counter when adding */ -#define PERF_EF_RELOAD 0x02 /* reload the counter when starting */ -#define PERF_EF_UPDATE 0x04 /* update the counter when stopping */ -#define PERF_EF_PAUSE 0x08 /* AUX area event, pause tracing */ -#define PERF_EF_RESUME 0x10 /* AUX area event, resume tracing */ + +/* start the counter when adding */ +#define PERF_EF_START 0x01 + +/* reload the counter when starting */ +#define PERF_EF_RELOAD 0x02 + +/* update the counter when stopping */ +#define PERF_EF_UPDATE 0x04 + +/* AUX area event, pause tracing */ +#define PERF_EF_PAUSE 0x08 + +/* AUX area event, resume tracing */ +#define PERF_EF_RESUME 0x10 /* * Adds/Removes a counter to/from the PMU, can be done inside a @@ -596,10 +607,10 @@ enum perf_addr_filter_action_t { * This is a hardware-agnostic filter configuration as specified by the user. */ struct perf_addr_filter { - struct list_head entry; - struct path path; - unsigned long offset; - unsigned long size; + struct list_head entry; + struct path path; + unsigned long offset; + unsigned long size; enum perf_addr_filter_action_t action; }; @@ -614,14 +625,14 @@ struct perf_addr_filter { * bundled together; see perf_event_addr_filters(). */ struct perf_addr_filters_head { - struct list_head list; - raw_spinlock_t lock; - unsigned int nr_file_filters; + struct list_head list; + raw_spinlock_t lock; + unsigned int nr_file_filters; }; struct perf_addr_filter_range { - unsigned long start; - unsigned long size; + unsigned long start; + unsigned long size; }; /** @@ -669,24 +680,24 @@ struct swevent_hlist { struct rcu_head rcu_head; }; -#define PERF_ATTACH_CONTEXT 0x0001 -#define PERF_ATTACH_GROUP 0x0002 -#define PERF_ATTACH_TASK 0x0004 -#define PERF_ATTACH_TASK_DATA 0x0008 -#define PERF_ATTACH_GLOBAL_DATA 0x0010 -#define PERF_ATTACH_SCHED_CB 0x0020 -#define PERF_ATTACH_CHILD 0x0040 -#define PERF_ATTACH_EXCLUSIVE 0x0080 -#define PERF_ATTACH_CALLCHAIN 0x0100 -#define PERF_ATTACH_ITRACE 0x0200 +#define PERF_ATTACH_CONTEXT 0x0001 +#define PERF_ATTACH_GROUP 0x0002 +#define PERF_ATTACH_TASK 0x0004 +#define PERF_ATTACH_TASK_DATA 0x0008 +#define PERF_ATTACH_GLOBAL_DATA 0x0010 +#define PERF_ATTACH_SCHED_CB 0x0020 +#define PERF_ATTACH_CHILD 0x0040 +#define PERF_ATTACH_EXCLUSIVE 0x0080 +#define PERF_ATTACH_CALLCHAIN 0x0100 +#define PERF_ATTACH_ITRACE 0x0200 struct bpf_prog; struct perf_cgroup; struct perf_buffer; struct pmu_event_list { - raw_spinlock_t lock; - struct list_head list; + raw_spinlock_t lock; + struct list_head list; }; /* @@ -696,12 +707,12 @@ struct pmu_event_list { * disabled is sufficient since it will hold-off the IPIs. */ #ifdef CONFIG_PROVE_LOCKING -#define lockdep_assert_event_ctx(event) \ +# define lockdep_assert_event_ctx(event) \ WARN_ON_ONCE(__lockdep_enabled && \ (this_cpu_read(hardirqs_enabled) && \ lockdep_is_held(&(event)->ctx->mutex) != LOCK_STATE_HELD)) #else -#define lockdep_assert_event_ctx(event) +# define lockdep_assert_event_ctx(event) #endif #define for_each_sibling_event(sibling, event) \ @@ -859,9 +870,9 @@ struct perf_event { #ifdef CONFIG_EVENT_TRACING struct trace_event_call *tp_event; struct event_filter *filter; -#ifdef CONFIG_FUNCTION_TRACER +# ifdef CONFIG_FUNCTION_TRACER struct ftrace_ops ftrace_ops; -#endif +# endif #endif #ifdef CONFIG_CGROUP_PERF @@ -880,7 +891,7 @@ struct perf_event { * of it. event->orig_type contains original 'type' requested by * user. */ - __u32 orig_type; + u32 orig_type; #endif /* CONFIG_PERF_EVENTS */ }; @@ -945,8 +956,8 @@ static inline bool perf_pmu_ctx_is_active(struct perf_event_pmu_context *epc) } struct perf_event_groups { - struct rb_root tree; - u64 index; + struct rb_root tree; + u64 index; }; @@ -1189,16 +1200,18 @@ extern void perf_pmu_resched(struct pmu *pmu); extern int perf_event_refresh(struct perf_event *event, int refresh); extern void perf_event_update_userpage(struct perf_event *event); extern int perf_event_release_kernel(struct perf_event *event); + extern struct perf_event * perf_event_create_kernel_counter(struct perf_event_attr *attr, - int cpu, - struct task_struct *task, - perf_overflow_handler_t callback, - void *context); + int cpu, + struct task_struct *task, + perf_overflow_handler_t callback, + void *context); + extern void perf_pmu_migrate_context(struct pmu *pmu, - int src_cpu, int dst_cpu); -int perf_event_read_local(struct perf_event *event, u64 *value, - u64 *enabled, u64 *running); + int src_cpu, int dst_cpu); +extern int perf_event_read_local(struct perf_event *event, u64 *value, + u64 *enabled, u64 *running); extern u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running); @@ -1415,14 +1428,14 @@ static inline u32 perf_sample_data_size(struct perf_sample_data *data, */ static inline void perf_clear_branch_entry_bitfields(struct perf_branch_entry *br) { - br->mispred = 0; - br->predicted = 0; - br->in_tx = 0; - br->abort = 0; - br->cycles = 0; - br->type = 0; - br->spec = PERF_BR_SPEC_NA; - br->reserved = 0; + br->mispred = 0; + br->predicted = 0; + br->in_tx = 0; + br->abort = 0; + br->cycles = 0; + br->type = 0; + br->spec = PERF_BR_SPEC_NA; + br->reserved = 0; } extern void perf_output_sample(struct perf_output_handle *handle, @@ -1611,7 +1624,17 @@ extern void perf_event_bpf_event(struct bpf_prog *prog, enum perf_bpf_event_type type, u16 flags); +#define PERF_GUEST_ACTIVE 0x01 +#define PERF_GUEST_USER 0x02 + +struct perf_guest_info_callbacks { + unsigned int (*state)(void); + unsigned long (*get_ip)(void); + unsigned int (*handle_intel_pt_intr)(void); +}; + #ifdef CONFIG_GUEST_PERF_EVENTS + extern struct perf_guest_info_callbacks __rcu *perf_guest_cbs; DECLARE_STATIC_CALL(__perf_guest_state, *perf_guest_cbs->state); @@ -1622,21 +1645,27 @@ static inline unsigned int perf_guest_state(void) { return static_call(__perf_guest_state)(); } + static inline unsigned long perf_guest_get_ip(void) { return static_call(__perf_guest_get_ip)(); } + static inline unsigned int perf_guest_handle_intel_pt_intr(void) { return static_call(__perf_guest_handle_intel_pt_intr)(); } + extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); extern void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); -#else + +#else /* !CONFIG_GUEST_PERF_EVENTS: */ + static inline unsigned int perf_guest_state(void) { return 0; } static inline unsigned long perf_guest_get_ip(void) { return 0; } static inline unsigned int perf_guest_handle_intel_pt_intr(void) { return 0; } -#endif /* CONFIG_GUEST_PERF_EVENTS */ + +#endif /* !CONFIG_GUEST_PERF_EVENTS */ extern void perf_event_exec(void); extern void perf_event_comm(struct task_struct *tsk, bool exec); @@ -1666,6 +1695,7 @@ static inline int perf_callchain_store_context(struct perf_callchain_entry_ctx * { if (ctx->contexts < sysctl_perf_event_max_contexts_per_stack) { struct perf_callchain_entry *entry = ctx->entry; + entry->ip[entry->nr++] = ip; ++ctx->contexts; return 0; @@ -1679,6 +1709,7 @@ static inline int perf_callchain_store(struct perf_callchain_entry_ctx *ctx, u64 { if (ctx->nr < ctx->max_stack && !ctx->contexts_maxed) { struct perf_callchain_entry *entry = ctx->entry; + entry->ip[entry->nr++] = ip; ++ctx->nr; return 0; @@ -1705,7 +1736,7 @@ static inline int perf_is_paranoid(void) return sysctl_perf_event_paranoid > -1; } -int perf_allow_kernel(void); +extern int perf_allow_kernel(void); static inline int perf_allow_cpu(void) { @@ -1827,7 +1858,7 @@ extern int perf_output_begin_backward(struct perf_output_handle *handle, extern void perf_output_end(struct perf_output_handle *handle); extern unsigned int perf_output_copy(struct perf_output_handle *handle, - const void *buf, unsigned int len); + const void *buf, unsigned int len); extern unsigned int perf_output_skip(struct perf_output_handle *handle, unsigned int len); extern long perf_output_copy_aux(struct perf_output_handle *aux_handle, @@ -1844,7 +1875,9 @@ extern void perf_event_task_tick(void); extern int perf_event_account_interrupt(struct perf_event *event); extern int perf_event_period(struct perf_event *event, u64 value); extern u64 perf_event_pause(struct perf_event *event, bool reset); + #else /* !CONFIG_PERF_EVENTS: */ + static inline void * perf_aux_output_begin(struct perf_output_handle *handle, struct perf_event *event) { return NULL; } @@ -1922,19 +1955,14 @@ static inline void perf_event_disable(struct perf_event *event) { } static inline int __perf_event_disable(void *info) { return -1; } static inline void perf_event_task_tick(void) { } static inline int perf_event_release_kernel(struct perf_event *event) { return 0; } -static inline int perf_event_period(struct perf_event *event, u64 value) -{ - return -EINVAL; -} -static inline u64 perf_event_pause(struct perf_event *event, bool reset) -{ - return 0; -} -static inline int perf_exclude_event(struct perf_event *event, struct pt_regs *regs) -{ - return 0; -} -#endif +static inline int +perf_event_period(struct perf_event *event, u64 value) { return -EINVAL; } +static inline u64 +perf_event_pause(struct perf_event *event, bool reset) { return 0; } +static inline int +perf_exclude_event(struct perf_event *event, struct pt_regs *regs) { return 0; } + +#endif /* !CONFIG_PERF_EVENTS */ #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) extern void perf_restore_debug_store(void); @@ -1942,31 +1970,31 @@ extern void perf_restore_debug_store(void); static inline void perf_restore_debug_store(void) { } #endif -#define perf_output_put(handle, x) perf_output_copy((handle), &(x), sizeof(x)) +#define perf_output_put(handle, x) perf_output_copy((handle), &(x), sizeof(x)) struct perf_pmu_events_attr { - struct device_attribute attr; - u64 id; - const char *event_str; + struct device_attribute attr; + u64 id; + const char *event_str; }; struct perf_pmu_events_ht_attr { - struct device_attribute attr; - u64 id; - const char *event_str_ht; - const char *event_str_noht; + struct device_attribute attr; + u64 id; + const char *event_str_ht; + const char *event_str_noht; }; struct perf_pmu_events_hybrid_attr { - struct device_attribute attr; - u64 id; - const char *event_str; - u64 pmu_type; + struct device_attribute attr; + u64 id; + const char *event_str; + u64 pmu_type; }; struct perf_pmu_format_hybrid_attr { - struct device_attribute attr; - u64 pmu_type; + struct device_attribute attr; + u64 pmu_type; }; ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr, @@ -2008,11 +2036,11 @@ static struct device_attribute format_attr_##_name = __ATTR_RO(_name) /* Performance counter hotplug functions */ #ifdef CONFIG_PERF_EVENTS -int perf_event_init_cpu(unsigned int cpu); -int perf_event_exit_cpu(unsigned int cpu); +extern int perf_event_init_cpu(unsigned int cpu); +extern int perf_event_exit_cpu(unsigned int cpu); #else -#define perf_event_init_cpu NULL -#define perf_event_exit_cpu NULL +# define perf_event_init_cpu NULL +# define perf_event_exit_cpu NULL #endif extern void arch_perf_update_userpage(struct perf_event *event, -- cgit v1.2.3