From da916e96e2dedcb2d40de77a7def833d315b81a6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 25 Oct 2024 10:21:41 +0200
Subject: perf: Make perf_pmu_unregister() useable

Previously it was only safe to call perf_pmu_unregister() if there
were no active events of that pmu around -- which was impossible to
guarantee since it races all sorts against perf_init_event().

Rework the whole thing by:

 - keeping track of all events for a given pmu

 - 'hiding' the pmu from perf_init_event()

 - waiting for the appropriate (s)rcu grace periods such that all
   prior references to the PMU will be completed

 - detaching all still existing events of that pmu (see first point)
   and moving them to a new REVOKED state.

 - actually freeing the pmu data.

Where notably the new REVOKED state must inhibit all event actions
from reaching code that wants to use event->pmu.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ravi Bangoria <ravi.bangoria@amd.com>
Link: https://lkml.kernel.org/r/20250307193723.525402029@infradead.org
---
 include/linux/perf_event.h | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 0069ba6866a4..7f49a58b271d 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -325,6 +325,9 @@ struct perf_output_handle;
 struct pmu {
 	struct list_head		entry;
 
+	spinlock_t			events_lock;
+	struct list_head		events;
+
 	struct module			*module;
 	struct device			*dev;
 	struct device			*parent;
@@ -622,9 +625,10 @@ struct perf_addr_filter_range {
  * enum perf_event_state - the states of an event:
  */
 enum perf_event_state {
-	PERF_EVENT_STATE_DEAD		= -4,
-	PERF_EVENT_STATE_EXIT		= -3,
-	PERF_EVENT_STATE_ERROR		= -2,
+	PERF_EVENT_STATE_DEAD		= -5,
+	PERF_EVENT_STATE_REVOKED	= -4, /* pmu gone, must not touch */
+	PERF_EVENT_STATE_EXIT		= -3, /* task died, still inherit */
+	PERF_EVENT_STATE_ERROR		= -2, /* scheduling error, can enable */
 	PERF_EVENT_STATE_OFF		= -1,
 	PERF_EVENT_STATE_INACTIVE	=  0,
 	PERF_EVENT_STATE_ACTIVE		=  1,
@@ -865,6 +869,7 @@ struct perf_event {
 	void *security;
 #endif
 	struct list_head		sb_list;
+	struct list_head		pmu_list;
 
 	/*
 	 * Certain events gets forwarded to another pmu internally by over-
@@ -1155,7 +1160,7 @@ extern void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags);
 extern void perf_event_itrace_started(struct perf_event *event);
 
 extern int perf_pmu_register(struct pmu *pmu, const char *name, int type);
-extern void perf_pmu_unregister(struct pmu *pmu);
+extern int perf_pmu_unregister(struct pmu *pmu);
 
 extern void __perf_event_task_sched_in(struct task_struct *prev,
 				       struct task_struct *task);
@@ -1760,7 +1765,7 @@ static inline bool needs_branch_stack(struct perf_event *event)
 
 static inline bool has_aux(struct perf_event *event)
 {
-	return event->pmu->setup_aux;
+	return event->pmu && event->pmu->setup_aux;
 }
 
 static inline bool has_aux_action(struct perf_event *event)
-- 
cgit v1.2.3


From 4dfe3232cc04325a09e96f6c7f9546ba6c0b132b Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Thu, 27 Mar 2025 12:52:13 -0700
Subject: perf/x86: Add dynamic constraint

More and more features require a dynamic event constraint, e.g., branch
counter logging, auto counter reload, Arch PEBS, etc.

Add a generic flag, PMU_FL_DYN_CONSTRAINT, to indicate the case. It
avoids having to keep adding individual flags in intel_cpuc_prepare().

Add a variable dyn_constraint in the struct hw_perf_event to track the
dynamic constraint of the event. Apply it if it's updated.

Apply the generic dynamic constraint for branch counter logging.
Many features on and after V6 require dynamic constraint. So
unconditionally set the flag for V6+.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Thomas Falcon <thomas.falcon@intel.com>
Link: https://lkml.kernel.org/r/20250327195217.2683619-2-kan.liang@linux.intel.com
---
 arch/x86/events/core.c       |  1 +
 arch/x86/events/intel/core.c | 21 +++++++++++++++------
 arch/x86/events/intel/lbr.c  |  2 +-
 arch/x86/events/perf_event.h |  1 +
 include/linux/perf_event.h   |  1 +
 5 files changed, 19 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 6866cc5acb0b..a0fe51e0c00f 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -674,6 +674,7 @@ static int __x86_pmu_event_init(struct perf_event *event)
 	event->hw.idx = -1;
 	event->hw.last_cpu = -1;
 	event->hw.last_tag = ~0ULL;
+	event->hw.dyn_constraint = ~0ULL;
 
 	/* mark unused */
 	event->hw.extra_reg.idx = EXTRA_REG_NONE;
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 09d2d66c9f21..972492832d7f 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3730,10 +3730,9 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
 	if (cpuc->excl_cntrs)
 		return intel_get_excl_constraints(cpuc, event, idx, c2);
 
-	/* Not all counters support the branch counter feature. */
-	if (branch_sample_counters(event)) {
+	if (event->hw.dyn_constraint != ~0ULL) {
 		c2 = dyn_constraint(cpuc, c2, idx);
-		c2->idxmsk64 &= x86_pmu.lbr_counters;
+		c2->idxmsk64 &= event->hw.dyn_constraint;
 		c2->weight = hweight64(c2->idxmsk64);
 	}
 
@@ -4135,15 +4134,19 @@ static int intel_pmu_hw_config(struct perf_event *event)
 		leader = event->group_leader;
 		if (branch_sample_call_stack(leader))
 			return -EINVAL;
-		if (branch_sample_counters(leader))
+		if (branch_sample_counters(leader)) {
 			num++;
+			leader->hw.dyn_constraint &= x86_pmu.lbr_counters;
+		}
 		leader->hw.flags |= PERF_X86_EVENT_BRANCH_COUNTERS;
 
 		for_each_sibling_event(sibling, leader) {
 			if (branch_sample_call_stack(sibling))
 				return -EINVAL;
-			if (branch_sample_counters(sibling))
+			if (branch_sample_counters(sibling)) {
 				num++;
+				sibling->hw.dyn_constraint &= x86_pmu.lbr_counters;
+			}
 		}
 
 		if (num > fls(x86_pmu.lbr_counters))
@@ -4943,7 +4946,7 @@ int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu)
 			goto err;
 	}
 
-	if (x86_pmu.flags & (PMU_FL_EXCL_CNTRS | PMU_FL_TFA | PMU_FL_BR_CNTR)) {
+	if (x86_pmu.flags & (PMU_FL_EXCL_CNTRS | PMU_FL_TFA | PMU_FL_DYN_CONSTRAINT)) {
 		size_t sz = X86_PMC_IDX_MAX * sizeof(struct event_constraint);
 
 		cpuc->constraint_list = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu));
@@ -6664,6 +6667,12 @@ __init int intel_pmu_init(void)
 			pr_cont(" AnyThread deprecated, ");
 	}
 
+	/*
+	 * Many features on and after V6 require dynamic constraint,
+	 * e.g., Arch PEBS, ACR.
+	 */
+	if (version >= 6)
+		x86_pmu.flags |= PMU_FL_DYN_CONSTRAINT;
 	/*
 	 * Install the hw-cache-events table:
 	 */
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index f44c3d866f24..05acd6449ceb 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -1618,7 +1618,7 @@ void __init intel_pmu_arch_lbr_init(void)
 	x86_pmu.lbr_nr = lbr_nr;
 
 	if (!!x86_pmu.lbr_counters)
-		x86_pmu.flags |= PMU_FL_BR_CNTR;
+		x86_pmu.flags |= PMU_FL_BR_CNTR | PMU_FL_DYN_CONSTRAINT;
 
 	if (x86_pmu.lbr_mispred)
 		static_branch_enable(&x86_lbr_mispred);
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 2c0ce0e9545e..f5ba1658a889 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -1042,6 +1042,7 @@ do {									\
 #define PMU_FL_MEM_LOADS_AUX	0x100 /* Require an auxiliary event for the complete memory info */
 #define PMU_FL_RETIRE_LATENCY	0x200 /* Support Retire Latency in PEBS */
 #define PMU_FL_BR_CNTR		0x400 /* Support branch counter logging */
+#define PMU_FL_DYN_CONSTRAINT	0x800 /* Needs dynamic constraint */
 
 #define EVENT_VAR(_id)  event_attr_##_id
 #define EVENT_PTR(_id) &event_attr_##_id.attr.attr
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 7f49a58b271d..54dad174ed7a 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -158,6 +158,7 @@ struct hw_perf_event {
 		struct { /* hardware */
 			u64		config;
 			u64		last_tag;
+			u64		dyn_constraint;
 			unsigned long	config_base;
 			unsigned long	event_base;
 			int		event_base_rdpmc;
-- 
cgit v1.2.3


From c9449c8506a5df5052ef4d17867699517b10b55a Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Thu, 27 Mar 2025 12:52:15 -0700
Subject: perf: Extend the bit width of the arch-specific flag

The auto counter reload feature requires an event flag to indicate an
auto counter reload group, which can only be scheduled on specific
counters that are enumerated in CPUID. However, the hw_perf_event.flags
bits have run out on X86.

Two solutions were considered to address the issue.
- Currently, 20 bits are reserved for the architecture-specific flags.
  Only the bit 31 is used for the generic flag. There is still plenty
  of space left. Reserve 8 more bits for the arch-specific flags.
- Add a new X86 specific hw_perf_event.flags1 to support more flags.

The former is implemented. Enough room is still left in the global
generic flag.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Thomas Falcon <thomas.falcon@intel.com>
Link: https://lkml.kernel.org/r/20250327195217.2683619-4-kan.liang@linux.intel.com
---
 arch/x86/events/perf_event_flags.h | 41 +++++++++++++++++++-------------------
 include/linux/perf_event.h         |  2 +-
 2 files changed, 22 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/events/perf_event_flags.h b/arch/x86/events/perf_event_flags.h
index 1d9e385649b5..70078334e4a3 100644
--- a/arch/x86/events/perf_event_flags.h
+++ b/arch/x86/events/perf_event_flags.h
@@ -2,23 +2,24 @@
 /*
  * struct hw_perf_event.flags flags
  */
-PERF_ARCH(PEBS_LDLAT,		0x00001) /* ld+ldlat data address sampling */
-PERF_ARCH(PEBS_ST,		0x00002) /* st data address sampling */
-PERF_ARCH(PEBS_ST_HSW,		0x00004) /* haswell style datala, store */
-PERF_ARCH(PEBS_LD_HSW,		0x00008) /* haswell style datala, load */
-PERF_ARCH(PEBS_NA_HSW,		0x00010) /* haswell style datala, unknown */
-PERF_ARCH(EXCL,			0x00020) /* HT exclusivity on counter */
-PERF_ARCH(DYNAMIC,		0x00040) /* dynamic alloc'd constraint */
-PERF_ARCH(PEBS_CNTR,		0x00080) /* PEBS counters snapshot */
-PERF_ARCH(EXCL_ACCT,		0x00100) /* accounted EXCL event */
-PERF_ARCH(AUTO_RELOAD,		0x00200) /* use PEBS auto-reload */
-PERF_ARCH(LARGE_PEBS,		0x00400) /* use large PEBS */
-PERF_ARCH(PEBS_VIA_PT,		0x00800) /* use PT buffer for PEBS */
-PERF_ARCH(PAIR,			0x01000) /* Large Increment per Cycle */
-PERF_ARCH(LBR_SELECT,		0x02000) /* Save/Restore MSR_LBR_SELECT */
-PERF_ARCH(TOPDOWN,		0x04000) /* Count Topdown slots/metrics events */
-PERF_ARCH(PEBS_STLAT,		0x08000) /* st+stlat data address sampling */
-PERF_ARCH(AMD_BRS,		0x10000) /* AMD Branch Sampling */
-PERF_ARCH(PEBS_LAT_HYBRID,	0x20000) /* ld and st lat for hybrid */
-PERF_ARCH(NEEDS_BRANCH_STACK,	0x40000) /* require branch stack setup */
-PERF_ARCH(BRANCH_COUNTERS,	0x80000) /* logs the counters in the extra space of each branch */
+PERF_ARCH(PEBS_LDLAT,		0x0000001) /* ld+ldlat data address sampling */
+PERF_ARCH(PEBS_ST,		0x0000002) /* st data address sampling */
+PERF_ARCH(PEBS_ST_HSW,		0x0000004) /* haswell style datala, store */
+PERF_ARCH(PEBS_LD_HSW,		0x0000008) /* haswell style datala, load */
+PERF_ARCH(PEBS_NA_HSW,		0x0000010) /* haswell style datala, unknown */
+PERF_ARCH(EXCL,			0x0000020) /* HT exclusivity on counter */
+PERF_ARCH(DYNAMIC,		0x0000040) /* dynamic alloc'd constraint */
+PERF_ARCH(PEBS_CNTR,		0x0000080) /* PEBS counters snapshot */
+PERF_ARCH(EXCL_ACCT,		0x0000100) /* accounted EXCL event */
+PERF_ARCH(AUTO_RELOAD,		0x0000200) /* use PEBS auto-reload */
+PERF_ARCH(LARGE_PEBS,		0x0000400) /* use large PEBS */
+PERF_ARCH(PEBS_VIA_PT,		0x0000800) /* use PT buffer for PEBS */
+PERF_ARCH(PAIR,			0x0001000) /* Large Increment per Cycle */
+PERF_ARCH(LBR_SELECT,		0x0002000) /* Save/Restore MSR_LBR_SELECT */
+PERF_ARCH(TOPDOWN,		0x0004000) /* Count Topdown slots/metrics events */
+PERF_ARCH(PEBS_STLAT,		0x0008000) /* st+stlat data address sampling */
+PERF_ARCH(AMD_BRS,		0x0010000) /* AMD Branch Sampling */
+PERF_ARCH(PEBS_LAT_HYBRID,	0x0020000) /* ld and st lat for hybrid */
+PERF_ARCH(NEEDS_BRANCH_STACK,	0x0040000) /* require branch stack setup */
+PERF_ARCH(BRANCH_COUNTERS,	0x0080000) /* logs the counters in the extra space of each branch */
+PERF_ARCH(ACR,			0x0100000) /* Auto counter reload */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 54dad174ed7a..5c547329cf02 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -144,7 +144,7 @@ struct hw_perf_event_extra {
  * PERF_EVENT_FLAG_ARCH bits are reserved for architecture-specific
  * usage.
  */
-#define PERF_EVENT_FLAG_ARCH			0x000fffff
+#define PERF_EVENT_FLAG_ARCH			0x0fffffff
 #define PERF_EVENT_FLAG_USER_READ_CNT		0x80000000
 
 static_assert((PERF_EVENT_FLAG_USER_READ_CNT & PERF_EVENT_FLAG_ARCH) == 0);
-- 
cgit v1.2.3


From ec980e4facef8110f6fce27e5b6344660117f01f Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Thu, 27 Mar 2025 12:52:17 -0700
Subject: perf/x86/intel: Support auto counter reload
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The relative rates among two or more events are useful for performance
analysis, e.g., a high branch miss rate may indicate a performance
issue. Usually, the samples with a relative rate that exceeds some
threshold are more useful. However, the traditional sampling takes
samples of events separately. To get the relative rates among two or
more events, a high sample rate is required, which can bring high
overhead. Many samples taken in the non-hotspot area are also dropped
(useless) in the post-process.

The auto counter reload (ACR) feature takes samples when the relative
rate of two or more events exceeds some threshold, which provides the
fine-grained information at a low cost.
To support the feature, two sets of MSRs are introduced. For a given
counter IA32_PMC_GPn_CTR/IA32_PMC_FXm_CTR, bit fields in the
IA32_PMC_GPn_CFG_B/IA32_PMC_FXm_CFG_B MSR indicate which counter(s)
can cause a reload of that counter. The reload value is stored in the
IA32_PMC_GPn_CFG_C/IA32_PMC_FXm_CFG_C.
The details can be found at Intel SDM (085), Volume 3, 21.9.11 Auto
Counter Reload.

In the hw_config(), an ACR event is specially configured, because the
cause/reloadable counter mask has to be applied to the dyn_constraint.
Besides the HW limits, e.g., no support for perf metrics, PDist, etc., a
SW limit is applied as well. ACR events in a group must be contiguous.
It facilitates the later conversion from the event idx to the counter
idx. Otherwise, the intel_pmu_acr_late_setup() has to traverse the whole
event list again to find the "cause" event.
Also, add a new flag PERF_X86_EVENT_ACR to indicate an ACR group, which
is set to the group leader.

The late setup() is also required for an ACR group. It converts the
event idx to the counter idx and saves it in hw.config1.

The ACR configuration MSRs are only updated in the enable_event().
The disable_event() doesn't clear the ACR CFG register.
Add acr_cfg_b/acr_cfg_c in the struct cpu_hw_events to cache the MSR
values. It can avoid a MSR write if the value is not changed.

Expose an acr_mask to the sysfs. The perf tool can utilize the new
format to configure the relation of events in the group. The bit
sequence of the acr_mask follows the order in which the events of the
group are enabled.

Example:

Here is a snippet of mispredict.c. Since the array holds random
numbers, the jumps are random and often mispredicted.
The misprediction rate depends on the compared value.

For the Loop1, ~11% of all branches are mispredicted.
For the Loop2, ~21% of all branches are mispredicted.

main()
{
...
        for (i = 0; i < N; i++)
                data[i] = rand() % 256;
...
        /* Loop 1 */
        for (k = 0; k < 50; k++)
                for (i = 0; i < N; i++)
                        if (data[i] >= 64)
                                sum += data[i];
...

...
        /* Loop 2 */
        for (k = 0; k < 50; k++)
                for (i = 0; i < N; i++)
                        if (data[i] >= 128)
                                sum += data[i];
...
}

Usually, code with a high branch miss rate performs badly.
To understand the branch miss rate of the code, the traditional method
samples both the branches and branch-misses events. E.g.,
perf record -e "{cpu_atom/branch-misses/ppu, cpu_atom/branch-instructions/u}"
               -c 1000000 -- ./mispredict

[ perf record: Woken up 4 times to write data ]
[ perf record: Captured and wrote 0.925 MB perf.data (5106 samples) ]
The 5106 samples are from both events and spread in both Loops.
In the post-process stage, a user can know that the Loop 2 has a 21%
branch miss rate. Then they can focus on the samples of branch-misses
events for the Loop 2.

With this patch, the user can generate the samples only when the branch
miss rate > 20%. For example,
perf record -e "{cpu_atom/branch-misses,period=200000,acr_mask=0x2/ppu,
                 cpu_atom/branch-instructions,period=1000000,acr_mask=0x3/u}"
                -- ./mispredict

(Two different periods are applied to branch-misses and
branch-instructions. The ratio is set to 20%.
If the branch-instructions is overflowed first, the branch-miss
rate < 20%. No samples should be generated. All counters should be
automatically reloaded.
If the branch-misses is overflowed first, the branch-miss rate > 20%.
A sample triggered by the branch-misses event should be
generated. Just the counter of the branch-instructions should be
automatically reloaded.

The branch-misses event should only be automatically reloaded when
the branch-instructions is overflowed. So the "cause" event is the
branch-instructions event. The acr_mask is set to 0x2, since the
event index in the group of branch-instructions is 1.

The branch-instructions event is automatically reloaded no matter which
events are overflowed. So the "cause" events are the branch-misses
and the branch-instructions event. The acr_mask should be set to 0x3.)

[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.098 MB perf.data (2498 samples) ]

 $perf report

Percent       │154:   movl    $0x0,-0x14(%rbp)
              │     ↓ jmp     1af
              │     for (i = j; i < N; i++)
              │15d:   mov     -0x10(%rbp),%eax
              │       mov     %eax,-0x18(%rbp)
              │     ↓ jmp     1a2
              │     if (data[i] >= 128)
              │165:   mov     -0x18(%rbp),%eax
              │       cltq
              │       lea     0x0(,%rax,4),%rdx
              │       mov     -0x8(%rbp),%rax
              │       add     %rdx,%rax
              │       mov     (%rax),%eax
              │    ┌──cmp     $0x7f,%eax
100.00   0.00 │    ├──jle     19e
              │    │sum += data[i];

The 2498 samples are all from the branch-misses events for the Loop 2.

The number of samples and overhead is significantly reduced without
losing any information.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Thomas Falcon <thomas.falcon@intel.com>
Link: https://lkml.kernel.org/r/20250327195217.2683619-6-kan.liang@linux.intel.com
---
 arch/x86/events/core.c           |   2 +-
 arch/x86/events/intel/core.c     | 226 ++++++++++++++++++++++++++++++++++++++-
 arch/x86/events/perf_event.h     |  10 ++
 arch/x86/include/asm/msr-index.h |   4 +
 include/linux/perf_event.h       |   1 +
 5 files changed, 240 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index a0fe51e0c00f..f53ae1fd986f 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -755,7 +755,7 @@ void x86_pmu_enable_all(int added)
 	}
 }
 
-static inline int is_x86_event(struct perf_event *event)
+int is_x86_event(struct perf_event *event)
 {
 	int i;
 
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 876678aa3895..3152a018c502 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2603,7 +2603,8 @@ static void intel_pmu_del_event(struct perf_event *event)
 		intel_pmu_lbr_del(event);
 	if (event->attr.precise_ip)
 		intel_pmu_pebs_del(event);
-	if (is_pebs_counter_event_group(event))
+	if (is_pebs_counter_event_group(event) ||
+	    is_acr_event_group(event))
 		this_cpu_ptr(&cpu_hw_events)->n_late_setup--;
 }
 
@@ -2882,6 +2883,52 @@ static void intel_pmu_enable_fixed(struct perf_event *event)
 	cpuc->fixed_ctrl_val |= bits;
 }
 
+static void intel_pmu_config_acr(int idx, u64 mask, u32 reload)
+{
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	int msr_b, msr_c, msr_idx = idx; /* idx keys the cache; msr_idx only picks the MSR offset */
+
+	if (!mask && !cpuc->acr_cfg_b[idx]) /* nothing requested and nothing cached to clear */
+		return;
+
+	if (idx < INTEL_PMC_IDX_FIXED) {
+		msr_b = MSR_IA32_PMC_V6_GP0_CFG_B;
+		msr_c = MSR_IA32_PMC_V6_GP0_CFG_C;
+	} else {
+		msr_b = MSR_IA32_PMC_V6_FX0_CFG_B;
+		msr_c = MSR_IA32_PMC_V6_FX0_CFG_C;
+		msr_idx -= INTEL_PMC_IDX_FIXED; /* offset is relative to the FX0 base; keep idx for the cache */
+	}
+
+	if (cpuc->acr_cfg_b[idx] != mask) {
+		wrmsrl(msr_b + x86_pmu.addr_offset(msr_idx, false), mask);
+		cpuc->acr_cfg_b[idx] = mask;
+	}
+	/* Only need to update the reload value when there is a valid config value. */
+	if (mask && cpuc->acr_cfg_c[idx] != reload) {
+		wrmsrl(msr_c + x86_pmu.addr_offset(msr_idx, false), reload);
+		cpuc->acr_cfg_c[idx] = reload;
+	}
+}
+
+static void intel_pmu_enable_acr(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (!is_acr_event_group(event) || !event->attr.config2) {
+		/*
+		 * The disable doesn't clear the ACR CFG register.
+		 * Check and clear the ACR CFG register.
+		 */
+		intel_pmu_config_acr(hwc->idx, 0, 0);
+		return;
+	}
+
+	intel_pmu_config_acr(hwc->idx, hwc->config1, -hwc->sample_period);
+}
+
+DEFINE_STATIC_CALL_NULL(intel_pmu_enable_acr_event, intel_pmu_enable_acr);
+
 static void intel_pmu_enable_event(struct perf_event *event)
 {
 	u64 enable_mask = ARCH_PERFMON_EVENTSEL_ENABLE;
@@ -2896,9 +2943,12 @@ static void intel_pmu_enable_event(struct perf_event *event)
 		if (branch_sample_counters(event))
 			enable_mask |= ARCH_PERFMON_EVENTSEL_BR_CNTR;
 		intel_set_masks(event, idx);
+		static_call_cond(intel_pmu_enable_acr_event)(event);
 		__x86_pmu_enable_event(hwc, enable_mask);
 		break;
 	case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1:
+		static_call_cond(intel_pmu_enable_acr_event)(event);
+		fallthrough;
 	case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END:
 		intel_pmu_enable_fixed(event);
 		break;
@@ -2916,6 +2966,31 @@ static void intel_pmu_enable_event(struct perf_event *event)
 	}
 }
 
+static void intel_pmu_acr_late_setup(struct cpu_hw_events *cpuc)
+{
+	struct perf_event *event, *leader;
+	int i, j, idx;
+
+	for (i = 0; i < cpuc->n_events; i++) {
+		leader = cpuc->event_list[i];
+		if (!is_acr_event_group(leader))
+			continue;
+
+		/* The ACR events must be contiguous: acr_mask bit N refers to event_list[i + N]. */
+		for (j = i; j < cpuc->n_events; j++) {
+			event = cpuc->event_list[j];
+			if (event->group_leader != leader->group_leader)
+				break;
+			for_each_set_bit(idx, (unsigned long *)&event->attr.config2, X86_PMC_IDX_MAX) {
+				if (WARN_ON_ONCE(i + idx >= cpuc->n_events)) /* valid slots are 0 .. n_events - 1 */
+					return;
+				__set_bit(cpuc->assign[i + idx], (unsigned long *)&event->hw.config1);
+			}
+		}
+		i = j - 1; /* resume the outer scan right after this group */
+	}
+}
+
 void intel_pmu_late_setup(void)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -2924,6 +2999,7 @@ void intel_pmu_late_setup(void)
 		return;
 
 	intel_pmu_pebs_late_setup(cpuc);
+	intel_pmu_acr_late_setup(cpuc);
 }
 
 static void intel_pmu_add_event(struct perf_event *event)
@@ -2932,7 +3008,8 @@ static void intel_pmu_add_event(struct perf_event *event)
 		intel_pmu_pebs_add(event);
 	if (intel_pmu_needs_branch_stack(event))
 		intel_pmu_lbr_add(event);
-	if (is_pebs_counter_event_group(event))
+	if (is_pebs_counter_event_group(event) ||
+	    is_acr_event_group(event))
 		this_cpu_ptr(&cpu_hw_events)->n_late_setup++;
 }
 
@@ -4087,6 +4164,39 @@ end:
 	return start;
 }
 
+static inline bool intel_pmu_has_acr(struct pmu *pmu)
+{
+	return !!hybrid(pmu, acr_cause_mask64);
+}
+
+static bool intel_pmu_is_acr_group(struct perf_event *event)
+{
+	/* The group leader has the ACR flag set */
+	if (is_acr_event_group(event))
+		return true;
+
+	/* The acr_mask is set */
+	if (event->attr.config2)
+		return true;
+
+	return false;
+}
+
+static inline void intel_pmu_set_acr_cntr_constr(struct perf_event *event,
+						 u64 *cause_mask, int *num)
+{
+	event->hw.dyn_constraint &= hybrid(event->pmu, acr_cntr_mask64);
+	*cause_mask |= event->attr.config2;
+	*num += 1;
+}
+
+static inline void intel_pmu_set_acr_caused_constr(struct perf_event *event,
+						   int idx, u64 cause_mask)
+{
+	if (test_bit(idx, (unsigned long *)&cause_mask))
+		event->hw.dyn_constraint &= hybrid(event->pmu, acr_cause_mask64);
+}
+
 static int intel_pmu_hw_config(struct perf_event *event)
 {
 	int ret = x86_pmu_hw_config(event);
@@ -4215,6 +4325,94 @@ static int intel_pmu_hw_config(struct perf_event *event)
 	    event->attr.precise_ip)
 		event->group_leader->hw.flags |= PERF_X86_EVENT_PEBS_CNTR;
 
+	if (intel_pmu_has_acr(event->pmu) && intel_pmu_is_acr_group(event)) {
+		struct perf_event *sibling, *leader = event->group_leader;
+		struct pmu *pmu = event->pmu;
+		bool has_sw_event = false;
+		int num = 0, idx = 0;
+		u64 cause_mask = 0;
+
+		/* Not support perf metrics */
+		if (is_metric_event(event))
+			return -EINVAL;
+
+		/* Not support freq mode */
+		if (event->attr.freq)
+			return -EINVAL;
+
+		/* PDist is not supported */
+		if (event->attr.config2 && event->attr.precise_ip > 2)
+			return -EINVAL;
+
+		/* The reload value cannot exceeds the max period */
+		if (event->attr.sample_period > x86_pmu.max_period)
+			return -EINVAL;
+		/*
+		 * The counter-constraints of each event cannot be finalized
+		 * unless the whole group is scanned. However, it's hard
+		 * to know whether the event is the last one of the group.
+		 * Recalculate the counter-constraints for each event when
+		 * adding a new event.
+		 *
+		 * The group is traversed twice, which may be optimized later.
+		 * In the first round,
+		 * - Find all events which do reload when other events
+		 *   overflow and set the corresponding counter-constraints
+		 * - Add all events, which can cause other events reload,
+		 *   in the cause_mask
+		 * - Error out if the number of events exceeds the HW limit
+		 * - The ACR events must be contiguous.
+		 *   Error out if there are non-X86 events between ACR events.
+		 *   This is not a HW limit, but a SW limit.
+		 *   With the assumption, the intel_pmu_acr_late_setup() can
+		 *   easily convert the event idx to counter idx without
+		 *   traversing the whole event list.
+		 */
+		if (!is_x86_event(leader))
+			return -EINVAL;
+
+		if (leader->attr.config2)
+			intel_pmu_set_acr_cntr_constr(leader, &cause_mask, &num);
+
+		if (leader->nr_siblings) {
+			for_each_sibling_event(sibling, leader) {
+				if (!is_x86_event(sibling)) {
+					has_sw_event = true;
+					continue;
+				}
+				if (!sibling->attr.config2)
+					continue;
+				if (has_sw_event)
+					return -EINVAL;
+				intel_pmu_set_acr_cntr_constr(sibling, &cause_mask, &num);
+			}
+		}
+		if (leader != event && event->attr.config2) {
+			if (has_sw_event)
+				return -EINVAL;
+			intel_pmu_set_acr_cntr_constr(event, &cause_mask, &num);
+		}
+
+		if (hweight64(cause_mask) > hweight64(hybrid(pmu, acr_cause_mask64)) ||
+		    num > hweight64(hybrid(event->pmu, acr_cntr_mask64)))
+			return -EINVAL;
+		/*
+		 * In the second round, apply the counter-constraints for
+		 * the events which can cause other events reload.
+		 */
+		intel_pmu_set_acr_caused_constr(leader, idx++, cause_mask);
+
+		if (leader->nr_siblings) {
+			for_each_sibling_event(sibling, leader)
+				intel_pmu_set_acr_caused_constr(sibling, idx++, cause_mask);
+		}
+
+		if (leader != event)
+			intel_pmu_set_acr_caused_constr(event, idx, cause_mask);
+
+		leader->hw.flags |= PERF_X86_EVENT_ACR;
+	}
+
 	if ((event->attr.type == PERF_TYPE_HARDWARE) ||
 	    (event->attr.type == PERF_TYPE_HW_CACHE))
 		return 0;
@@ -6061,6 +6259,21 @@ td_is_visible(struct kobject *kobj, struct attribute *attr, int i)
 	return attr->mode;
 }
 
+PMU_FORMAT_ATTR(acr_mask,	"config2:0-63");
+
+static struct attribute *format_acr_attrs[] = {
+	&format_attr_acr_mask.attr,
+	NULL
+};
+
+static umode_t
+acr_is_visible(struct kobject *kobj, struct attribute *attr, int i)
+{
+	struct device *dev = kobj_to_dev(kobj);
+
+	return intel_pmu_has_acr(dev_get_drvdata(dev)) ? attr->mode : 0;
+}
+
 static struct attribute_group group_events_td  = {
 	.name = "events",
 	.is_visible = td_is_visible,
@@ -6103,6 +6316,12 @@ static struct attribute_group group_format_evtsel_ext = {
 	.is_visible = evtsel_ext_is_visible,
 };
 
+static struct attribute_group group_format_acr = {
+	.name       = "format",
+	.attrs      = format_acr_attrs,
+	.is_visible = acr_is_visible,
+};
+
 static struct attribute_group group_default = {
 	.attrs      = intel_pmu_attrs,
 	.is_visible = default_is_visible,
@@ -6117,6 +6336,7 @@ static const struct attribute_group *attr_update[] = {
 	&group_format_extra,
 	&group_format_extra_skl,
 	&group_format_evtsel_ext,
+	&group_format_acr,
 	&group_default,
 	NULL,
 };
@@ -6401,6 +6621,7 @@ static const struct attribute_group *hybrid_attr_update[] = {
 	&group_caps_lbr,
 	&hybrid_group_format_extra,
 	&group_format_evtsel_ext,
+	&group_format_acr,
 	&group_default,
 	&hybrid_group_cpus,
 	NULL,
@@ -6593,6 +6814,7 @@ static __always_inline void intel_pmu_init_skt(struct pmu *pmu)
 	intel_pmu_init_grt(pmu);
 	hybrid(pmu, event_constraints) = intel_skt_event_constraints;
 	hybrid(pmu, extra_regs) = intel_cmt_extra_regs;
+	static_call_update(intel_pmu_enable_acr_event, intel_pmu_enable_acr);
 }
 
 __init int intel_pmu_init(void)
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index ab9af2e4da1b..46bbb503aca1 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -120,6 +120,11 @@ static inline bool is_pebs_counter_event_group(struct perf_event *event)
 	return event->group_leader->hw.flags & PERF_X86_EVENT_PEBS_CNTR;
 }
 
+static inline bool is_acr_event_group(struct perf_event *event)
+{
+	return event->group_leader->hw.flags & PERF_X86_EVENT_ACR;
+}
+
 struct amd_nb {
 	int nb_id;  /* NorthBridge id */
 	int refcnt; /* reference count */
@@ -287,6 +292,10 @@ struct cpu_hw_events {
 	u64			fixed_ctrl_val;
 	u64			active_fixed_ctrl_val;
 
+	/* Intel ACR configuration */
+	u64			acr_cfg_b[X86_PMC_IDX_MAX];
+	u64			acr_cfg_c[X86_PMC_IDX_MAX];
+
 	/*
 	 * Intel LBR bits
 	 */
@@ -1103,6 +1112,7 @@ static struct perf_pmu_format_hybrid_attr format_attr_hybrid_##_name = {\
 	.pmu_type	= _pmu,						\
 }
 
+int is_x86_event(struct perf_event *event);
 struct pmu *x86_get_pmu(unsigned int cpu);
 extern struct x86_pmu x86_pmu __read_mostly;
 
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index e6134ef2263d..53da787b9326 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -594,7 +594,11 @@
 /* V6 PMON MSR range */
 #define MSR_IA32_PMC_V6_GP0_CTR		0x1900
 #define MSR_IA32_PMC_V6_GP0_CFG_A	0x1901
+#define MSR_IA32_PMC_V6_GP0_CFG_B	0x1902
+#define MSR_IA32_PMC_V6_GP0_CFG_C	0x1903
 #define MSR_IA32_PMC_V6_FX0_CTR		0x1980
+#define MSR_IA32_PMC_V6_FX0_CFG_B	0x1982
+#define MSR_IA32_PMC_V6_FX0_CFG_C	0x1983
 #define MSR_IA32_PMC_V6_STEP		4
 
 /* KeyID partitioning between MKTME and TDX */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 5c547329cf02..947ad12dfdbe 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -157,6 +157,7 @@ struct hw_perf_event {
 	union {
 		struct { /* hardware */
 			u64		config;
+			u64		config1;
 			u64		last_tag;
 			u64		dyn_constraint;
 			unsigned long	config_base;
-- 
cgit v1.2.3


From f400565faa50737ac1d550d2c75128c0dad75765 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Thu, 24 Apr 2025 18:11:27 +0200
Subject: perf: Remove too early and redundant CPU hotplug handling

The CPU hotplug handlers are called twice: at the prepare and online stages.

Their role is to:

1) Enable/disable a CPU context. This is irrelevant and even buggy at
   the prepare stage because the CPU is still offline. On early
   secondary CPU up, creating an event attached to that CPU might
   silently fail because the CPU context is observed as online but the
   context installation's IPI failure is ignored.

2) Update the scope cpumasks and re-migrate the events accordingly in
   the CPU down case. This is irrelevant at the prepare stage.

3) Remove the events attached to the context of the offlining CPU. It
   even uses an (unnecessary) IPI for it. This is also irrelevant at the
   prepare stage.

Also, none of the *_PREPARE and *_STARTING architecture perf-related CPU
hotplug callbacks rely on CPUHP_PERF_PREPARE.

CPUHP_AP_PERF_ONLINE is enough and the right place to perform the work.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20250424161128.29176-4-frederic@kernel.org
---
 include/linux/cpuhotplug.h | 1 -
 kernel/cpu.c               | 5 -----
 2 files changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 1987400000b4..df366ee15456 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -60,7 +60,6 @@ enum cpuhp_state {
 	/* PREPARE section invoked on a control CPU */
 	CPUHP_OFFLINE = 0,
 	CPUHP_CREATE_THREADS,
-	CPUHP_PERF_PREPARE,
 	CPUHP_PERF_X86_PREPARE,
 	CPUHP_PERF_X86_AMD_UNCORE_PREP,
 	CPUHP_PERF_POWER,
diff --git a/kernel/cpu.c b/kernel/cpu.c
index b08bb34b1718..a59e009e0be4 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -2069,11 +2069,6 @@ static struct cpuhp_step cpuhp_hp_states[] = {
 		.teardown.single	= NULL,
 		.cant_stop		= true,
 	},
-	[CPUHP_PERF_PREPARE] = {
-		.name			= "perf:prepare",
-		.startup.single		= perf_event_init_cpu,
-		.teardown.single	= perf_event_exit_cpu,
-	},
 	[CPUHP_RANDOM_PREPARE] = {
 		.name			= "random:prepare",
 		.startup.single		= random_prepare_cpu,
-- 
cgit v1.2.3


From 18049c8cff9cc89daadc4df6975f7d9069638926 Mon Sep 17 00:00:00 2001
From: Yabin Cui <yabinc@google.com>
Date: Thu, 8 May 2025 16:26:42 -0700
Subject: perf/aux: Allocate non-contiguous AUX pages by default

perf always allocates contiguous AUX pages based on aux_watermark.
However, this contiguous allocation doesn't benefit all PMUs. For
instance, ARM SPE and TRBE operate with virtual pages, and Coresight
ETR allocates a separate buffer. For these PMUs, allocating contiguous
AUX pages unnecessarily exacerbates memory fragmentation. This
fragmentation can prevent their use on long-running devices.

This patch modifies the perf driver to be memory-friendly by default,
by allocating non-contiguous AUX pages. For PMUs requiring contiguous
pages (Intel BTS and some Intel PT), the existing
PERF_PMU_CAP_AUX_NO_SG capability can be used. For PMUs that don't
require but can benefit from contiguous pages (some Intel PT), a new
capability, PERF_PMU_CAP_AUX_PREFER_LARGE, is added to maintain their
existing behavior.

Signed-off-by: Yabin Cui <yabinc@google.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: James Clark <james.clark@linaro.org>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/r/20250508232642.148767-1-yabinc@google.com
---
 arch/x86/events/intel/pt.c  |  2 ++
 include/linux/perf_event.h  |  1 +
 kernel/events/ring_buffer.c | 29 ++++++++++++++++++++---------
 3 files changed, 23 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index fa37565f6418..25ead919fc48 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -1863,6 +1863,8 @@ static __init int pt_init(void)
 
 	if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries))
 		pt_pmu.pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG;
+	else
+		pt_pmu.pmu.capabilities = PERF_PMU_CAP_AUX_PREFER_LARGE;
 
 	pt_pmu.pmu.capabilities		|= PERF_PMU_CAP_EXCLUSIVE |
 					   PERF_PMU_CAP_ITRACE |
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 947ad12dfdbe..a96c00e2ceca 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -303,6 +303,7 @@ struct perf_event_pmu_context;
 #define PERF_PMU_CAP_AUX_OUTPUT			0x0080
 #define PERF_PMU_CAP_EXTENDED_HW_TYPE		0x0100
 #define PERF_PMU_CAP_AUX_PAUSE			0x0200
+#define PERF_PMU_CAP_AUX_PREFER_LARGE		0x0400
 
 /**
  * pmu::scope
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 5130b119d0ae..d2aef87c7e9f 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -679,7 +679,15 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
 {
 	bool overwrite = !(flags & RING_BUFFER_WRITABLE);
 	int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu);
-	int ret = -ENOMEM, max_order;
+	bool use_contiguous_pages = event->pmu->capabilities & (
+		PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_PREFER_LARGE);
+	/*
+	 * Initialize max_order to 0 for page allocation. This allocates single
+	 * pages to minimize memory fragmentation. This is overridden if the
+	 * PMU needs or prefers contiguous pages (use_contiguous_pages = true).
+	 */
+	int max_order = 0;
+	int ret = -ENOMEM;
 
 	if (!has_aux(event))
 		return -EOPNOTSUPP;
@@ -689,8 +697,8 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
 
 	if (!overwrite) {
 		/*
-		 * Watermark defaults to half the buffer, and so does the
-		 * max_order, to aid PMU drivers in double buffering.
+		 * Watermark defaults to half the buffer, to aid PMU drivers
+		 * in double buffering.
 		 */
 		if (!watermark)
 			watermark = min_t(unsigned long,
@@ -698,16 +706,19 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
 					  (unsigned long)nr_pages << (PAGE_SHIFT - 1));
 
 		/*
-		 * Use aux_watermark as the basis for chunking to
-		 * help PMU drivers honor the watermark.
+		 * If using contiguous pages, use aux_watermark as the basis
+		 * for chunking to help PMU drivers honor the watermark.
 		 */
-		max_order = get_order(watermark);
+		if (use_contiguous_pages)
+			max_order = get_order(watermark);
 	} else {
 		/*
-		 * We need to start with the max_order that fits in nr_pages,
-		 * not the other way around, hence ilog2() and not get_order.
+		 * If using contiguous pages, we need to start with the
+		 * max_order that fits in nr_pages, not the other way around,
+		 * hence ilog2() and not get_order.
 		 */
-		max_order = ilog2(nr_pages);
+		if (use_contiguous_pages)
+			max_order = ilog2(nr_pages);
 		watermark = 0;
 	}
 
-- 
cgit v1.2.3


From e7d952cc39fca34386ec9f15f68cb2eaac01b5ae Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 24 May 2025 11:23:25 +0200
Subject: perf/headers: Clean up <linux/perf_event.h> a bit

Do a bit of readability spring cleaning:

 - Fix misaligned structure member in perf_addr_filter: the new
   struct perf_addr_filter::action member was too long, but when
   it was added it was not aligned properly. Align all fields to
   the customary column 41 alignment of most of the rest of the
   header.

 - Adjust the vertical alignment of the definition of other
   structures and definitions as well, so that the 'most of' in
   the previous paragraph changes to 'all of'. ;-)

 - Prettify the assignments in perf_clear_branch_entry_bitfields()

 - Move comments from CPP definitions to outside the macro

 - Move perf_guest_info_callbacks and related defines from the front
   of the header closer to where it's used within the header.

 - Add more #endif markers for larger CPP blocks and standardize
   #if/#else/#endif blocks to the following nomenclature:

	#ifdef CONFIG_FOO
	...
	#else /* !CONFIG_FOO: */
	...
	#endif /* !CONFIG_FOO */

 - Standardize on consistently using the 'extern' storage class where
   appropriate, we had cases where method prototypes sometimes omitted
   the storage class:

	extern void perf_pmu_migrate_context(struct pmu *pmu,
					int src_cpu, int dst_cpu);
	int perf_event_read_local(struct perf_event *event, u64 *value,
				  u64 *enabled, u64 *running);
	extern u64 perf_event_read_value(struct perf_event *event,
					 u64 *enabled, u64 *running);

   Which is obviously a bit confusing and adds unnecessary noise.

 - s/__u64/u64 and similar cleanups: there's no point in using __u64
   in non-UAPI headers, and doing so only adds unnecessary visual noise.

 - Harmonize all multi-parameter function prototypes along the following
   style:

	extern struct perf_event *
	perf_event_create_kernel_counter(struct perf_event_attr *attr,
					 int cpu,
					 struct task_struct *task,
					 perf_overflow_handler_t callback,
					 void *context);
 - etc.

Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Ian Rogers <irogers@google.com>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h | 282 +++++++++++++++++++++++++--------------------
 1 file changed, 155 insertions(+), 127 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index a96c00e2ceca..52dc7cfab0e0 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -26,18 +26,9 @@
 # include <asm/local64.h>
 #endif
 
-#define PERF_GUEST_ACTIVE	0x01
-#define PERF_GUEST_USER	0x02
-
-struct perf_guest_info_callbacks {
-	unsigned int			(*state)(void);
-	unsigned long			(*get_ip)(void);
-	unsigned int			(*handle_intel_pt_intr)(void);
-};
-
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
-#include <linux/rhashtable-types.h>
-#include <asm/hw_breakpoint.h>
+# include <linux/rhashtable-types.h>
+# include <asm/hw_breakpoint.h>
 #endif
 
 #include <linux/list.h>
@@ -62,19 +53,20 @@ struct perf_guest_info_callbacks {
 #include <linux/security.h>
 #include <linux/static_call.h>
 #include <linux/lockdep.h>
+
 #include <asm/local.h>
 
 struct perf_callchain_entry {
-	__u64				nr;
-	__u64				ip[]; /* /proc/sys/kernel/perf_event_max_stack */
+	u64				nr;
+	u64				ip[]; /* /proc/sys/kernel/perf_event_max_stack */
 };
 
 struct perf_callchain_entry_ctx {
-	struct perf_callchain_entry *entry;
-	u32			    max_stack;
-	u32			    nr;
-	short			    contexts;
-	bool			    contexts_maxed;
+	struct perf_callchain_entry	*entry;
+	u32				max_stack;
+	u32				nr;
+	short				contexts;
+	bool				contexts_maxed;
 };
 
 typedef unsigned long (*perf_copy_f)(void *dst, const void *src,
@@ -121,8 +113,8 @@ static __always_inline bool perf_raw_frag_last(const struct perf_raw_frag *frag)
  * already stored in age order, the hw_idx should be 0.
  */
 struct perf_branch_stack {
-	__u64				nr;
-	__u64				hw_idx;
+	u64				nr;
+	u64				hw_idx;
 	struct perf_branch_entry	entries[];
 };
 
@@ -132,10 +124,10 @@ struct task_struct;
  * extra PMU register associated with an event
  */
 struct hw_perf_event_extra {
-	u64		config;	/* register value */
-	unsigned int	reg;	/* register address or index */
-	int		alloc;	/* extra register already allocated */
-	int		idx;	/* index in shared_regs->regs[] */
+	u64				config;	/* register value */
+	unsigned int			reg;	/* register address or index */
+	int				alloc;	/* extra register already allocated */
+	int				idx;	/* index in shared_regs->regs[] */
 };
 
 /**
@@ -144,8 +136,8 @@ struct hw_perf_event_extra {
  * PERF_EVENT_FLAG_ARCH bits are reserved for architecture-specific
  * usage.
  */
-#define PERF_EVENT_FLAG_ARCH			0x0fffffff
-#define PERF_EVENT_FLAG_USER_READ_CNT		0x80000000
+#define PERF_EVENT_FLAG_ARCH		0x0fffffff
+#define PERF_EVENT_FLAG_USER_READ_CNT	0x80000000
 
 static_assert((PERF_EVENT_FLAG_USER_READ_CNT & PERF_EVENT_FLAG_ARCH) == 0);
 
@@ -227,9 +219,14 @@ struct hw_perf_event {
 /*
  * hw_perf_event::state flags; used to track the PERF_EF_* state.
  */
-#define PERF_HES_STOPPED	0x01 /* the counter is stopped */
-#define PERF_HES_UPTODATE	0x02 /* event->count up-to-date */
-#define PERF_HES_ARCH		0x04
+
+/* the counter is stopped */
+#define PERF_HES_STOPPED		0x01
+
+/* event->count up-to-date */
+#define PERF_HES_UPTODATE		0x02
+
+#define PERF_HES_ARCH			0x04
 
 	int				state;
 
@@ -278,7 +275,7 @@ struct hw_perf_event {
 	 */
 	u64				freq_time_stamp;
 	u64				freq_count_stamp;
-#endif
+#endif /* CONFIG_PERF_EVENTS */
 };
 
 struct perf_event;
@@ -287,29 +284,33 @@ struct perf_event_pmu_context;
 /*
  * Common implementation detail of pmu::{start,commit,cancel}_txn
  */
-#define PERF_PMU_TXN_ADD  0x1		/* txn to add/schedule event on PMU */
-#define PERF_PMU_TXN_READ 0x2		/* txn to read event group from PMU */
+
+/* txn to add/schedule event on PMU */
+#define PERF_PMU_TXN_ADD		0x1
+
+/* txn to read event group from PMU */
+#define PERF_PMU_TXN_READ		0x2
 
 /**
  * pmu::capabilities flags
  */
-#define PERF_PMU_CAP_NO_INTERRUPT		0x0001
-#define PERF_PMU_CAP_NO_NMI			0x0002
-#define PERF_PMU_CAP_AUX_NO_SG			0x0004
-#define PERF_PMU_CAP_EXTENDED_REGS		0x0008
-#define PERF_PMU_CAP_EXCLUSIVE			0x0010
-#define PERF_PMU_CAP_ITRACE			0x0020
-#define PERF_PMU_CAP_NO_EXCLUDE			0x0040
-#define PERF_PMU_CAP_AUX_OUTPUT			0x0080
-#define PERF_PMU_CAP_EXTENDED_HW_TYPE		0x0100
-#define PERF_PMU_CAP_AUX_PAUSE			0x0200
-#define PERF_PMU_CAP_AUX_PREFER_LARGE		0x0400
+#define PERF_PMU_CAP_NO_INTERRUPT	0x0001
+#define PERF_PMU_CAP_NO_NMI		0x0002
+#define PERF_PMU_CAP_AUX_NO_SG		0x0004
+#define PERF_PMU_CAP_EXTENDED_REGS	0x0008
+#define PERF_PMU_CAP_EXCLUSIVE		0x0010
+#define PERF_PMU_CAP_ITRACE		0x0020
+#define PERF_PMU_CAP_NO_EXCLUDE		0x0040
+#define PERF_PMU_CAP_AUX_OUTPUT		0x0080
+#define PERF_PMU_CAP_EXTENDED_HW_TYPE	0x0100
+#define PERF_PMU_CAP_AUX_PAUSE		0x0200
+#define PERF_PMU_CAP_AUX_PREFER_LARGE	0x0400
 
 /**
  * pmu::scope
  */
 enum perf_pmu_scope {
-	PERF_PMU_SCOPE_NONE	= 0,
+	PERF_PMU_SCOPE_NONE = 0,
 	PERF_PMU_SCOPE_CORE,
 	PERF_PMU_SCOPE_DIE,
 	PERF_PMU_SCOPE_CLUSTER,
@@ -393,11 +394,21 @@ struct pmu {
 	 * Flags for ->add()/->del()/ ->start()/->stop(). There are
 	 * matching hw_perf_event::state flags.
 	 */
-#define PERF_EF_START	0x01		/* start the counter when adding    */
-#define PERF_EF_RELOAD	0x02		/* reload the counter when starting */
-#define PERF_EF_UPDATE	0x04		/* update the counter when stopping */
-#define PERF_EF_PAUSE	0x08		/* AUX area event, pause tracing */
-#define PERF_EF_RESUME	0x10		/* AUX area event, resume tracing */
+
+/* start the counter when adding    */
+#define PERF_EF_START			0x01
+
+/* reload the counter when starting */
+#define PERF_EF_RELOAD			0x02
+
+/* update the counter when stopping */
+#define PERF_EF_UPDATE			0x04
+
+/* AUX area event, pause tracing */
+#define PERF_EF_PAUSE			0x08
+
+/* AUX area event, resume tracing */
+#define PERF_EF_RESUME			0x10
 
 	/*
 	 * Adds/Removes a counter to/from the PMU, can be done inside a
@@ -596,10 +607,10 @@ enum perf_addr_filter_action_t {
  * This is a hardware-agnostic filter configuration as specified by the user.
  */
 struct perf_addr_filter {
-	struct list_head	entry;
-	struct path		path;
-	unsigned long		offset;
-	unsigned long		size;
+	struct list_head		entry;
+	struct path			path;
+	unsigned long			offset;
+	unsigned long			size;
 	enum perf_addr_filter_action_t	action;
 };
 
@@ -614,14 +625,14 @@ struct perf_addr_filter {
  * bundled together; see perf_event_addr_filters().
  */
 struct perf_addr_filters_head {
-	struct list_head	list;
-	raw_spinlock_t		lock;
-	unsigned int		nr_file_filters;
+	struct list_head		list;
+	raw_spinlock_t			lock;
+	unsigned int			nr_file_filters;
 };
 
 struct perf_addr_filter_range {
-	unsigned long		start;
-	unsigned long		size;
+	unsigned long			start;
+	unsigned long			size;
 };
 
 /**
@@ -669,24 +680,24 @@ struct swevent_hlist {
 	struct rcu_head			rcu_head;
 };
 
-#define PERF_ATTACH_CONTEXT	0x0001
-#define PERF_ATTACH_GROUP	0x0002
-#define PERF_ATTACH_TASK	0x0004
-#define PERF_ATTACH_TASK_DATA	0x0008
-#define PERF_ATTACH_GLOBAL_DATA	0x0010
-#define PERF_ATTACH_SCHED_CB	0x0020
-#define PERF_ATTACH_CHILD	0x0040
-#define PERF_ATTACH_EXCLUSIVE	0x0080
-#define PERF_ATTACH_CALLCHAIN	0x0100
-#define PERF_ATTACH_ITRACE	0x0200
+#define PERF_ATTACH_CONTEXT		0x0001
+#define PERF_ATTACH_GROUP		0x0002
+#define PERF_ATTACH_TASK		0x0004
+#define PERF_ATTACH_TASK_DATA		0x0008
+#define PERF_ATTACH_GLOBAL_DATA		0x0010
+#define PERF_ATTACH_SCHED_CB		0x0020
+#define PERF_ATTACH_CHILD		0x0040
+#define PERF_ATTACH_EXCLUSIVE		0x0080
+#define PERF_ATTACH_CALLCHAIN		0x0100
+#define PERF_ATTACH_ITRACE		0x0200
 
 struct bpf_prog;
 struct perf_cgroup;
 struct perf_buffer;
 
 struct pmu_event_list {
-	raw_spinlock_t		lock;
-	struct list_head	list;
+	raw_spinlock_t			lock;
+	struct list_head		list;
 };
 
 /*
@@ -696,12 +707,12 @@ struct pmu_event_list {
  * disabled is sufficient since it will hold-off the IPIs.
  */
 #ifdef CONFIG_PROVE_LOCKING
-#define lockdep_assert_event_ctx(event)				\
+# define lockdep_assert_event_ctx(event)			\
 	WARN_ON_ONCE(__lockdep_enabled &&			\
 		     (this_cpu_read(hardirqs_enabled) &&	\
 		      lockdep_is_held(&(event)->ctx->mutex) != LOCK_STATE_HELD))
 #else
-#define lockdep_assert_event_ctx(event)
+# define lockdep_assert_event_ctx(event)
 #endif
 
 #define for_each_sibling_event(sibling, event)			\
@@ -859,9 +870,9 @@ struct perf_event {
 #ifdef CONFIG_EVENT_TRACING
 	struct trace_event_call		*tp_event;
 	struct event_filter		*filter;
-#ifdef CONFIG_FUNCTION_TRACER
+# ifdef CONFIG_FUNCTION_TRACER
 	struct ftrace_ops               ftrace_ops;
-#endif
+# endif
 #endif
 
 #ifdef CONFIG_CGROUP_PERF
@@ -880,7 +891,7 @@ struct perf_event {
 	 * of it. event->orig_type contains original 'type' requested by
 	 * user.
 	 */
-	__u32				orig_type;
+	u32				orig_type;
 #endif /* CONFIG_PERF_EVENTS */
 };
 
@@ -945,8 +956,8 @@ static inline bool perf_pmu_ctx_is_active(struct perf_event_pmu_context *epc)
 }
 
 struct perf_event_groups {
-	struct rb_root	tree;
-	u64		index;
+	struct rb_root			tree;
+	u64				index;
 };
 
 
@@ -1189,16 +1200,18 @@ extern void perf_pmu_resched(struct pmu *pmu);
 extern int perf_event_refresh(struct perf_event *event, int refresh);
 extern void perf_event_update_userpage(struct perf_event *event);
 extern int perf_event_release_kernel(struct perf_event *event);
+
 extern struct perf_event *
 perf_event_create_kernel_counter(struct perf_event_attr *attr,
-				int cpu,
-				struct task_struct *task,
-				perf_overflow_handler_t callback,
-				void *context);
+				 int cpu,
+				 struct task_struct *task,
+				 perf_overflow_handler_t callback,
+				 void *context);
+
 extern void perf_pmu_migrate_context(struct pmu *pmu,
-				int src_cpu, int dst_cpu);
-int perf_event_read_local(struct perf_event *event, u64 *value,
-			  u64 *enabled, u64 *running);
+				     int src_cpu, int dst_cpu);
+extern int perf_event_read_local(struct perf_event *event, u64 *value,
+				 u64 *enabled, u64 *running);
 extern u64 perf_event_read_value(struct perf_event *event,
 				 u64 *enabled, u64 *running);
 
@@ -1415,14 +1428,14 @@ static inline u32 perf_sample_data_size(struct perf_sample_data *data,
  */
 static inline void perf_clear_branch_entry_bitfields(struct perf_branch_entry *br)
 {
-	br->mispred = 0;
-	br->predicted = 0;
-	br->in_tx = 0;
-	br->abort = 0;
-	br->cycles = 0;
-	br->type = 0;
-	br->spec = PERF_BR_SPEC_NA;
-	br->reserved = 0;
+	br->mispred	= 0;
+	br->predicted	= 0;
+	br->in_tx	= 0;
+	br->abort	= 0;
+	br->cycles	= 0;
+	br->type	= 0;
+	br->spec	= PERF_BR_SPEC_NA;
+	br->reserved	= 0;
 }
 
 extern void perf_output_sample(struct perf_output_handle *handle,
@@ -1611,7 +1624,17 @@ extern void perf_event_bpf_event(struct bpf_prog *prog,
 				 enum perf_bpf_event_type type,
 				 u16 flags);
 
+#define PERF_GUEST_ACTIVE		0x01
+#define PERF_GUEST_USER			0x02
+
+struct perf_guest_info_callbacks {
+	unsigned int			(*state)(void);
+	unsigned long			(*get_ip)(void);
+	unsigned int			(*handle_intel_pt_intr)(void);
+};
+
 #ifdef CONFIG_GUEST_PERF_EVENTS
+
 extern struct perf_guest_info_callbacks __rcu *perf_guest_cbs;
 
 DECLARE_STATIC_CALL(__perf_guest_state, *perf_guest_cbs->state);
@@ -1622,21 +1645,27 @@ static inline unsigned int perf_guest_state(void)
 {
 	return static_call(__perf_guest_state)();
 }
+
 static inline unsigned long perf_guest_get_ip(void)
 {
 	return static_call(__perf_guest_get_ip)();
 }
+
 static inline unsigned int perf_guest_handle_intel_pt_intr(void)
 {
 	return static_call(__perf_guest_handle_intel_pt_intr)();
 }
+
 extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs);
 extern void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs);
-#else
+
+#else /* !CONFIG_GUEST_PERF_EVENTS: */
+
 static inline unsigned int perf_guest_state(void)		 { return 0; }
 static inline unsigned long perf_guest_get_ip(void)		 { return 0; }
 static inline unsigned int perf_guest_handle_intel_pt_intr(void) { return 0; }
-#endif /* CONFIG_GUEST_PERF_EVENTS */
+
+#endif /* !CONFIG_GUEST_PERF_EVENTS */
 
 extern void perf_event_exec(void);
 extern void perf_event_comm(struct task_struct *tsk, bool exec);
@@ -1666,6 +1695,7 @@ static inline int perf_callchain_store_context(struct perf_callchain_entry_ctx *
 {
 	if (ctx->contexts < sysctl_perf_event_max_contexts_per_stack) {
 		struct perf_callchain_entry *entry = ctx->entry;
+
 		entry->ip[entry->nr++] = ip;
 		++ctx->contexts;
 		return 0;
@@ -1679,6 +1709,7 @@ static inline int perf_callchain_store(struct perf_callchain_entry_ctx *ctx, u64
 {
 	if (ctx->nr < ctx->max_stack && !ctx->contexts_maxed) {
 		struct perf_callchain_entry *entry = ctx->entry;
+
 		entry->ip[entry->nr++] = ip;
 		++ctx->nr;
 		return 0;
@@ -1705,7 +1736,7 @@ static inline int perf_is_paranoid(void)
 	return sysctl_perf_event_paranoid > -1;
 }
 
-int perf_allow_kernel(void);
+extern int perf_allow_kernel(void);
 
 static inline int perf_allow_cpu(void)
 {
@@ -1827,7 +1858,7 @@ extern int perf_output_begin_backward(struct perf_output_handle *handle,
 
 extern void perf_output_end(struct perf_output_handle *handle);
 extern unsigned int perf_output_copy(struct perf_output_handle *handle,
-			     const void *buf, unsigned int len);
+				     const void *buf, unsigned int len);
 extern unsigned int perf_output_skip(struct perf_output_handle *handle,
 				     unsigned int len);
 extern long perf_output_copy_aux(struct perf_output_handle *aux_handle,
@@ -1844,7 +1875,9 @@ extern void perf_event_task_tick(void);
 extern int perf_event_account_interrupt(struct perf_event *event);
 extern int perf_event_period(struct perf_event *event, u64 value);
 extern u64 perf_event_pause(struct perf_event *event, bool reset);
+
 #else /* !CONFIG_PERF_EVENTS: */
+
 static inline void *
 perf_aux_output_begin(struct perf_output_handle *handle,
 		      struct perf_event *event)				{ return NULL; }
@@ -1922,19 +1955,14 @@ static inline void perf_event_disable(struct perf_event *event)		{ }
 static inline int __perf_event_disable(void *info)			{ return -1; }
 static inline void perf_event_task_tick(void)				{ }
 static inline int perf_event_release_kernel(struct perf_event *event)	{ return 0; }
-static inline int perf_event_period(struct perf_event *event, u64 value)
-{
-	return -EINVAL;
-}
-static inline u64 perf_event_pause(struct perf_event *event, bool reset)
-{
-	return 0;
-}
-static inline int perf_exclude_event(struct perf_event *event, struct pt_regs *regs)
-{
-	return 0;
-}
-#endif
+static inline int
+perf_event_period(struct perf_event *event, u64 value)			{ return -EINVAL; }
+static inline u64
+perf_event_pause(struct perf_event *event, bool reset)			{ return 0; }
+static inline int
+perf_exclude_event(struct perf_event *event, struct pt_regs *regs)	{ return 0; }
+
+#endif /* !CONFIG_PERF_EVENTS */
 
 #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
 extern void perf_restore_debug_store(void);
@@ -1942,31 +1970,31 @@ extern void perf_restore_debug_store(void);
 static inline void perf_restore_debug_store(void)			{ }
 #endif
 
-#define perf_output_put(handle, x) perf_output_copy((handle), &(x), sizeof(x))
+#define perf_output_put(handle, x)	perf_output_copy((handle), &(x), sizeof(x))
 
 struct perf_pmu_events_attr {
-	struct device_attribute attr;
-	u64 id;
-	const char *event_str;
+	struct device_attribute		attr;
+	u64				id;
+	const char			*event_str;
 };
 
 struct perf_pmu_events_ht_attr {
-	struct device_attribute			attr;
-	u64					id;
-	const char				*event_str_ht;
-	const char				*event_str_noht;
+	struct device_attribute		attr;
+	u64				id;
+	const char			*event_str_ht;
+	const char			*event_str_noht;
 };
 
 struct perf_pmu_events_hybrid_attr {
-	struct device_attribute			attr;
-	u64					id;
-	const char				*event_str;
-	u64					pmu_type;
+	struct device_attribute		attr;
+	u64				id;
+	const char			*event_str;
+	u64				pmu_type;
 };
 
 struct perf_pmu_format_hybrid_attr {
-	struct device_attribute			attr;
-	u64					pmu_type;
+	struct device_attribute		attr;
+	u64				pmu_type;
 };
 
 ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
@@ -2008,11 +2036,11 @@ static struct device_attribute format_attr_##_name = __ATTR_RO(_name)
 
 /* Performance counter hotplug functions */
 #ifdef CONFIG_PERF_EVENTS
-int perf_event_init_cpu(unsigned int cpu);
-int perf_event_exit_cpu(unsigned int cpu);
+extern int perf_event_init_cpu(unsigned int cpu);
+extern int perf_event_exit_cpu(unsigned int cpu);
 #else
-#define perf_event_init_cpu	NULL
-#define perf_event_exit_cpu	NULL
+# define perf_event_init_cpu		NULL
+# define perf_event_exit_cpu		NULL
 #endif
 
 extern void arch_perf_update_userpage(struct perf_event *event,
-- 
cgit v1.2.3