summaryrefslogtreecommitdiff
path: root/arch
diff options
context:
space:
mode:
authorDapeng Mi <dapeng1.mi@linux.intel.com>2026-01-14 04:17:45 +0300
committerPeter Zijlstra <peterz@infradead.org>2026-01-15 12:04:26 +0300
commitd2bdcde9626cbea0c44a6aaa33b440c8adf81e09 (patch)
treef3866351da606d27837a3f88e576b2c8d074054d /arch
parent4e955c08d6dc76fb60cda9af955ddcebedaa7f69 (diff)
downloadlinux-d2bdcde9626cbea0c44a6aaa33b440c8adf81e09.tar.xz
perf/x86/intel: Add support for PEBS memory auxiliary info field in DMR
With the introduction of the OMR feature, the PEBS memory auxiliary info field for load and store latency events has been restructured for DMR. The memory auxiliary info field's bit[8] indicates whether a L2 cache miss occurred for a memory load or store instruction. If bit[8] is 0, it signifies no L2 cache miss, and bits[7:0] specify the exact cache data source (up to the L2 cache level). If bit[8] is 1, bits[7:0] represent the OMR encoding, indicating the specific L3 cache or memory region involved in the memory access. A significant enhancement is OMR encoding provides up to 8 fine-grained memory regions besides the cache region. A significant enhancement for OMR encoding is the ability to provide up to 8 fine-grained memory regions in addition to the cache region, offering more detailed insights into memory access regions. For detailed information on the memory auxiliary info encoding, please refer to section 16.2 "PEBS LOAD LATENCY AND STORE LATENCY FACILITY" in the ISE documentation. This patch ensures that the PEBS memory auxiliary info field is correctly interpreted and utilized in DMR. Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://patch.msgid.link/20260114011750.350569-3-dapeng1.mi@linux.intel.com
Diffstat (limited to 'arch')
-rw-r--r--arch/x86/events/intel/ds.c140
-rw-r--r--arch/x86/events/perf_event.h2
2 files changed, 142 insertions, 0 deletions
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index feb1c3cf63e4..272e652f25fc 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -34,6 +34,17 @@ struct pebs_record_32 {
*/
+union omr_encoding {
+ struct {
+ u8 omr_source : 4;
+ u8 omr_remote : 1;
+ u8 omr_hitm : 1;
+ u8 omr_snoop : 1;
+ u8 omr_promoted : 1;
+ };
+ u8 omr_full;
+};
+
union intel_x86_pebs_dse {
u64 val;
struct {
@@ -73,6 +84,18 @@ union intel_x86_pebs_dse {
unsigned int lnc_addr_blk:1;
unsigned int ld_reserved6:18;
};
+ struct {
+ unsigned int pnc_dse: 8;
+ unsigned int pnc_l2_miss:1;
+ unsigned int pnc_stlb_clean_hit:1;
+ unsigned int pnc_stlb_any_hit:1;
+ unsigned int pnc_stlb_miss:1;
+ unsigned int pnc_locked:1;
+ unsigned int pnc_data_blk:1;
+ unsigned int pnc_addr_blk:1;
+ unsigned int pnc_fb_full:1;
+ unsigned int ld_reserved8:16;
+ };
};
@@ -228,6 +251,85 @@ void __init intel_pmu_pebs_data_source_lnl(void)
__intel_pmu_pebs_data_source_cmt(data_source);
}
+/* Version for Panthercove and later */
+
+/* L2 hit */
+#define PNC_PEBS_DATA_SOURCE_MAX 16
+static u64 pnc_pebs_l2_hit_data_source[PNC_PEBS_DATA_SOURCE_MAX] = {
+ P(OP, LOAD) | P(LVL, NA) | LEVEL(NA) | P(SNOOP, NA), /* 0x00: non-cache access */
+ OP_LH | LEVEL(L0) | P(SNOOP, NONE), /* 0x01: L0 hit */
+ OP_LH | P(LVL, L1) | LEVEL(L1) | P(SNOOP, NONE), /* 0x02: L1 hit */
+ OP_LH | P(LVL, LFB) | LEVEL(LFB) | P(SNOOP, NONE), /* 0x03: L1 Miss Handling Buffer hit */
+ OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, NONE), /* 0x04: L2 Hit Clean */
+ 0, /* 0x05: Reserved */
+ 0, /* 0x06: Reserved */
+ OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, HIT), /* 0x07: L2 Hit Snoop HIT */
+ OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, HITM), /* 0x08: L2 Hit Snoop Hit Modified */
+ OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, MISS), /* 0x09: Prefetch Promotion */
+ OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, MISS), /* 0x0a: Cross Core Prefetch Promotion */
+ 0, /* 0x0b: Reserved */
+ 0, /* 0x0c: Reserved */
+ 0, /* 0x0d: Reserved */
+ 0, /* 0x0e: Reserved */
+ OP_LH | P(LVL, UNC) | LEVEL(NA) | P(SNOOP, NONE), /* 0x0f: uncached */
+};
+
+/* L2 miss */
+#define OMR_DATA_SOURCE_MAX 16
+static u64 omr_data_source[OMR_DATA_SOURCE_MAX] = {
+ P(OP, LOAD) | P(LVL, NA) | LEVEL(NA) | P(SNOOP, NA), /* 0x00: invalid */
+ 0, /* 0x01: Reserved */
+ OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, L_SHARE), /* 0x02: local CA shared cache */
+ OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, L_NON_SHARE),/* 0x03: local CA non-shared cache */
+ OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, O_IO), /* 0x04: other CA IO agent */
+ OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, O_SHARE), /* 0x05: other CA shared cache */
+ OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, O_NON_SHARE),/* 0x06: other CA non-shared cache */
+ OP_LH | LEVEL(RAM) | P(REGION, MMIO), /* 0x07: MMIO */
+ OP_LH | LEVEL(RAM) | P(REGION, MEM0), /* 0x08: Memory region 0 */
+ OP_LH | LEVEL(RAM) | P(REGION, MEM1), /* 0x09: Memory region 1 */
+ OP_LH | LEVEL(RAM) | P(REGION, MEM2), /* 0x0a: Memory region 2 */
+ OP_LH | LEVEL(RAM) | P(REGION, MEM3), /* 0x0b: Memory region 3 */
+ OP_LH | LEVEL(RAM) | P(REGION, MEM4), /* 0x0c: Memory region 4 */
+ OP_LH | LEVEL(RAM) | P(REGION, MEM5), /* 0x0d: Memory region 5 */
+ OP_LH | LEVEL(RAM) | P(REGION, MEM6), /* 0x0e: Memory region 6 */
+ OP_LH | LEVEL(RAM) | P(REGION, MEM7), /* 0x0f: Memory region 7 */
+};
+
+static u64 parse_omr_data_source(u8 dse)
+{
+ union omr_encoding omr;
+ u64 val = 0;
+
+ omr.omr_full = dse;
+ val = omr_data_source[omr.omr_source];
+ if (omr.omr_source > 0x1 && omr.omr_source < 0x7)
+ val |= omr.omr_remote ? P(LVL, REM_CCE1) : 0;
+ else if (omr.omr_source > 0x7)
+ val |= omr.omr_remote ? P(LVL, REM_RAM1) : P(LVL, LOC_RAM);
+
+ if (omr.omr_remote)
+ val |= REM;
+
+ val |= omr.omr_hitm ? P(SNOOP, HITM) : P(SNOOP, HIT);
+
+ if (omr.omr_source == 0x2) {
+ u8 snoop = omr.omr_snoop | omr.omr_promoted;
+
+ if (snoop == 0x0)
+ val |= P(SNOOP, NA);
+ else if (snoop == 0x1)
+ val |= P(SNOOP, MISS);
+ else if (snoop == 0x2)
+ val |= P(SNOOP, HIT);
+ else if (snoop == 0x3)
+ val |= P(SNOOP, NONE);
+ } else if (omr.omr_source > 0x2 && omr.omr_source < 0x7) {
+ val |= omr.omr_snoop ? P(SNOOPX, FWD) : 0;
+ }
+
+ return val;
+}
+
static u64 precise_store_data(u64 status)
{
union intel_x86_pebs_dse dse;
@@ -411,6 +513,44 @@ u64 arl_h_latency_data(struct perf_event *event, u64 status)
return lnl_latency_data(event, status);
}
+u64 pnc_latency_data(struct perf_event *event, u64 status)
+{
+ union intel_x86_pebs_dse dse;
+ union perf_mem_data_src src;
+ u64 val;
+
+ dse.val = status;
+
+ if (!dse.pnc_l2_miss)
+ val = pnc_pebs_l2_hit_data_source[dse.pnc_dse & 0xf];
+ else
+ val = parse_omr_data_source(dse.pnc_dse);
+
+ if (!val)
+ val = P(OP, LOAD) | LEVEL(NA) | P(SNOOP, NA);
+
+ if (dse.pnc_stlb_miss)
+ val |= P(TLB, MISS) | P(TLB, L2);
+ else
+ val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2);
+
+ if (dse.pnc_locked)
+ val |= P(LOCK, LOCKED);
+
+ if (dse.pnc_data_blk)
+ val |= P(BLK, DATA);
+ if (dse.pnc_addr_blk)
+ val |= P(BLK, ADDR);
+ if (!dse.pnc_data_blk && !dse.pnc_addr_blk)
+ val |= P(BLK, NA);
+
+ src.val = val;
+ if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW)
+ src.mem_op = P(OP, STORE);
+
+ return src.val;
+}
+
static u64 load_latency_data(struct perf_event *event, u64 status)
{
union intel_x86_pebs_dse dse;
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 586e3fdfe6d8..bd501c2a0f73 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -1664,6 +1664,8 @@ u64 lnl_latency_data(struct perf_event *event, u64 status);
u64 arl_h_latency_data(struct perf_event *event, u64 status);
+u64 pnc_latency_data(struct perf_event *event, u64 status);
+
extern struct event_constraint intel_core2_pebs_event_constraints[];
extern struct event_constraint intel_atom_pebs_event_constraints[];