summaryrefslogtreecommitdiff
path: root/tools/perf/util/arm-spe.c
diff options
context:
space:
mode:
Diffstat (limited to 'tools/perf/util/arm-spe.c')
-rw-r--r--tools/perf/util/arm-spe.c349
1 files changed, 330 insertions, 19 deletions
diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
index dbf13f47879c..d46e0cccac99 100644
--- a/tools/perf/util/arm-spe.c
+++ b/tools/perf/util/arm-spe.c
@@ -37,6 +37,8 @@
#include "../../arch/arm64/include/asm/cputype.h"
#define MAX_TIMESTAMP (~0ULL)
+#define is_ldst_op(op) (!!((op) & ARM_SPE_OP_LDST))
+
struct arm_spe {
struct auxtrace auxtrace;
struct auxtrace_queues queues;
@@ -101,8 +103,21 @@ struct arm_spe_queue {
struct thread *thread;
u64 period_instructions;
u32 flags;
+ struct branch_stack *last_branch;
+};
+
+struct data_source_handle {
+ const struct midr_range *midr_ranges;
+ void (*ds_synth)(const struct arm_spe_record *record,
+ union perf_mem_data_src *data_src);
};
+#define DS(range, func) \
+ { \
+ .midr_ranges = range, \
+ .ds_synth = arm_spe__synth_##func, \
+ }
+
static void arm_spe_dump(struct arm_spe *spe __maybe_unused,
unsigned char *buf, size_t len)
{
@@ -219,6 +234,17 @@ static struct arm_spe_queue *arm_spe__alloc_queue(struct arm_spe *spe,
params.get_trace = arm_spe_get_trace;
params.data = speq;
+ if (spe->synth_opts.last_branch) {
+ size_t sz = sizeof(struct branch_stack);
+
+ /* Allocate up to two entries for PBT + TGT */
+ sz += sizeof(struct branch_entry) *
+ min(spe->synth_opts.last_branch_sz, 2U);
+ speq->last_branch = zalloc(sz);
+ if (!speq->last_branch)
+ goto out_free;
+ }
+
/* create new decoder */
speq->decoder = arm_spe_decoder_new(&params);
if (!speq->decoder)
@@ -228,6 +254,7 @@ static struct arm_spe_queue *arm_spe__alloc_queue(struct arm_spe *spe,
out_free:
zfree(&speq->event_buf);
+ zfree(&speq->last_branch);
free(speq);
return NULL;
@@ -334,6 +361,88 @@ static void arm_spe_prep_sample(struct arm_spe *spe,
event->sample.header.size = sizeof(struct perf_event_header);
}
+static void arm_spe__prep_branch_stack(struct arm_spe_queue *speq)
+{
+ struct arm_spe *spe = speq->spe;
+ struct arm_spe_record *record = &speq->decoder->record;
+ struct branch_stack *bstack = speq->last_branch;
+ struct branch_flags *bs_flags;
+ unsigned int last_branch_sz = spe->synth_opts.last_branch_sz;
+ bool have_tgt = !!(speq->flags & PERF_IP_FLAG_BRANCH);
+ bool have_pbt = last_branch_sz >= (have_tgt + 1U) && record->prev_br_tgt;
+ size_t sz = sizeof(struct branch_stack) +
+ sizeof(struct branch_entry) * min(last_branch_sz, 2U) /* PBT + TGT */;
+ int i = 0;
+
+ /* Clean up branch stack */
+ memset(bstack, 0x0, sz);
+
+ if (!have_tgt && !have_pbt)
+ return;
+
+ if (have_tgt) {
+ bstack->entries[i].from = record->from_ip;
+ bstack->entries[i].to = record->to_ip;
+
+ bs_flags = &bstack->entries[i].flags;
+ bs_flags->value = 0;
+
+ if (record->op & ARM_SPE_OP_BR_CR_BL) {
+ if (record->op & ARM_SPE_OP_BR_COND)
+ bs_flags->type |= PERF_BR_COND_CALL;
+ else
+ bs_flags->type |= PERF_BR_CALL;
+ /*
+ * Indirect branch instruction without link (e.g. BR),
+ * take this case as function return.
+ */
+ } else if (record->op & ARM_SPE_OP_BR_CR_RET ||
+ record->op & ARM_SPE_OP_BR_INDIRECT) {
+ if (record->op & ARM_SPE_OP_BR_COND)
+ bs_flags->type |= PERF_BR_COND_RET;
+ else
+ bs_flags->type |= PERF_BR_RET;
+ } else if (record->op & ARM_SPE_OP_BR_CR_NON_BL_RET) {
+ if (record->op & ARM_SPE_OP_BR_COND)
+ bs_flags->type |= PERF_BR_COND;
+ else
+ bs_flags->type |= PERF_BR_UNCOND;
+ } else {
+ if (record->op & ARM_SPE_OP_BR_COND)
+ bs_flags->type |= PERF_BR_COND;
+ else
+ bs_flags->type |= PERF_BR_UNKNOWN;
+ }
+
+ if (record->type & ARM_SPE_BRANCH_MISS) {
+ bs_flags->mispred = 1;
+ bs_flags->predicted = 0;
+ } else {
+ bs_flags->mispred = 0;
+ bs_flags->predicted = 1;
+ }
+
+ if (record->type & ARM_SPE_BRANCH_NOT_TAKEN)
+ bs_flags->not_taken = 1;
+
+ if (record->type & ARM_SPE_IN_TXN)
+ bs_flags->in_tx = 1;
+
+ bs_flags->cycles = min(record->latency, 0xFFFFU);
+ i++;
+ }
+
+ if (have_pbt) {
+ bs_flags = &bstack->entries[i].flags;
+ bs_flags->type |= PERF_BR_UNKNOWN;
+ bstack->entries[i].to = record->prev_br_tgt;
+ i++;
+ }
+
+ bstack->nr = i;
+ bstack->hw_idx = -1ULL;
+}
+
static int arm_spe__inject_event(union perf_event *event, struct perf_sample *sample, u64 type)
{
event->header.size = perf_event__sample_event_size(sample, type, 0);
@@ -367,8 +476,10 @@ static int arm_spe__synth_mem_sample(struct arm_spe_queue *speq,
struct arm_spe *spe = speq->spe;
struct arm_spe_record *record = &speq->decoder->record;
union perf_event *event = speq->event_buf;
- struct perf_sample sample = { .ip = 0, };
+ struct perf_sample sample;
+ int ret;
+ perf_sample__init(&sample, /*all=*/true);
arm_spe_prep_sample(spe, speq, event, &sample);
sample.id = spe_events_id;
@@ -378,7 +489,9 @@ static int arm_spe__synth_mem_sample(struct arm_spe_queue *speq,
sample.data_src = data_src;
sample.weight = record->latency;
- return arm_spe_deliver_synth_event(spe, speq, event, &sample);
+ ret = arm_spe_deliver_synth_event(spe, speq, event, &sample);
+ perf_sample__exit(&sample);
+ return ret;
}
static int arm_spe__synth_branch_sample(struct arm_spe_queue *speq,
@@ -387,8 +500,10 @@ static int arm_spe__synth_branch_sample(struct arm_spe_queue *speq,
struct arm_spe *spe = speq->spe;
struct arm_spe_record *record = &speq->decoder->record;
union perf_event *event = speq->event_buf;
- struct perf_sample sample = { .ip = 0, };
+ struct perf_sample sample;
+ int ret;
+ perf_sample__init(&sample, /*all=*/true);
arm_spe_prep_sample(spe, speq, event, &sample);
sample.id = spe_events_id;
@@ -396,8 +511,11 @@ static int arm_spe__synth_branch_sample(struct arm_spe_queue *speq,
sample.addr = record->to_ip;
sample.weight = record->latency;
sample.flags = speq->flags;
+ sample.branch_stack = speq->last_branch;
- return arm_spe_deliver_synth_event(spe, speq, event, &sample);
+ ret = arm_spe_deliver_synth_event(spe, speq, event, &sample);
+ perf_sample__exit(&sample);
+ return ret;
}
static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq,
@@ -406,7 +524,8 @@ static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq,
struct arm_spe *spe = speq->spe;
struct arm_spe_record *record = &speq->decoder->record;
union perf_event *event = speq->event_buf;
- struct perf_sample sample = { .ip = 0, };
+ struct perf_sample sample;
+ int ret;
/*
* Handles perf instruction sampling period.
@@ -416,6 +535,7 @@ static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq,
return 0;
speq->period_instructions = 0;
+ perf_sample__init(&sample, /*all=*/true);
arm_spe_prep_sample(spe, speq, event, &sample);
sample.id = spe_events_id;
@@ -426,8 +546,11 @@ static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq,
sample.period = spe->instructions_sample_period;
sample.weight = record->latency;
sample.flags = speq->flags;
+ sample.branch_stack = speq->last_branch;
- return arm_spe_deliver_synth_event(spe, speq, event, &sample);
+ ret = arm_spe_deliver_synth_event(spe, speq, event, &sample);
+ perf_sample__exit(&sample);
+ return ret;
}
static const struct midr_range common_ds_encoding_cpus[] = {
@@ -443,6 +566,16 @@ static const struct midr_range common_ds_encoding_cpus[] = {
{},
};
+static const struct midr_range ampereone_ds_encoding_cpus[] = {
+ MIDR_ALL_VERSIONS(MIDR_AMPERE1A),
+ {},
+};
+
+static const struct midr_range hisi_hip_ds_encoding_cpus[] = {
+ MIDR_ALL_VERSIONS(MIDR_HISI_HIP12),
+ {},
+};
+
static void arm_spe__sample_flags(struct arm_spe_queue *speq)
{
const struct arm_spe_record *record = &speq->decoder->record;
@@ -453,6 +586,26 @@ static void arm_spe__sample_flags(struct arm_spe_queue *speq)
if (record->type & ARM_SPE_BRANCH_MISS)
speq->flags |= PERF_IP_FLAG_BRANCH_MISS;
+
+ if (record->type & ARM_SPE_BRANCH_NOT_TAKEN)
+ speq->flags |= PERF_IP_FLAG_NOT_TAKEN;
+
+ if (record->type & ARM_SPE_IN_TXN)
+ speq->flags |= PERF_IP_FLAG_IN_TX;
+
+ if (record->op & ARM_SPE_OP_BR_COND)
+ speq->flags |= PERF_IP_FLAG_CONDITIONAL;
+
+ if (record->op & ARM_SPE_OP_BR_CR_BL)
+ speq->flags |= PERF_IP_FLAG_CALL;
+ else if (record->op & ARM_SPE_OP_BR_CR_RET)
+ speq->flags |= PERF_IP_FLAG_RETURN;
+ /*
+ * Indirect branch instruction without link (e.g. BR),
+ * take it as a function return.
+ */
+ else if (record->op & ARM_SPE_OP_BR_INDIRECT)
+ speq->flags |= PERF_IP_FLAG_RETURN;
}
}
@@ -532,6 +685,140 @@ static void arm_spe__synth_data_source_common(const struct arm_spe_record *recor
}
}
+/*
+ * Source is IMPDEF. Here we convert the source code used on AmpereOne cores
+ * to the common (Neoverse, Cortex) to avoid duplicating the decoding code.
+ */
+static void arm_spe__synth_data_source_ampereone(const struct arm_spe_record *record,
+ union perf_mem_data_src *data_src)
+{
+ struct arm_spe_record common_record;
+
+ switch (record->source) {
+ case ARM_SPE_AMPEREONE_LOCAL_CHIP_CACHE_OR_DEVICE:
+ common_record.source = ARM_SPE_COMMON_DS_PEER_CORE;
+ break;
+ case ARM_SPE_AMPEREONE_SLC:
+ common_record.source = ARM_SPE_COMMON_DS_SYS_CACHE;
+ break;
+ case ARM_SPE_AMPEREONE_REMOTE_CHIP_CACHE:
+ common_record.source = ARM_SPE_COMMON_DS_REMOTE;
+ break;
+ case ARM_SPE_AMPEREONE_DDR:
+ common_record.source = ARM_SPE_COMMON_DS_DRAM;
+ break;
+ case ARM_SPE_AMPEREONE_L1D:
+ common_record.source = ARM_SPE_COMMON_DS_L1D;
+ break;
+ case ARM_SPE_AMPEREONE_L2D:
+ common_record.source = ARM_SPE_COMMON_DS_L2;
+ break;
+ default:
+ pr_warning_once("AmpereOne: Unknown data source (0x%x)\n",
+ record->source);
+ return;
+ }
+
+ common_record.op = record->op;
+ arm_spe__synth_data_source_common(&common_record, data_src);
+}
+
+static void arm_spe__synth_data_source_hisi_hip(const struct arm_spe_record *record,
+ union perf_mem_data_src *data_src)
+{
+ /* Use common synthesis method to handle store operations */
+ if (record->op & ARM_SPE_OP_ST) {
+ arm_spe__synth_data_source_common(record, data_src);
+ return;
+ }
+
+ switch (record->source) {
+ case ARM_SPE_HISI_HIP_PEER_CPU:
+ data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
+ data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
+ data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
+ break;
+ case ARM_SPE_HISI_HIP_PEER_CPU_HITM:
+ data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
+ data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
+ data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
+ data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
+ break;
+ case ARM_SPE_HISI_HIP_L3:
+ data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
+ data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
+ data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
+ break;
+ case ARM_SPE_HISI_HIP_L3_HITM:
+ data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
+ data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
+ data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
+ break;
+ case ARM_SPE_HISI_HIP_PEER_CLUSTER:
+ data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1 | PERF_MEM_LVL_HIT;
+ data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
+ data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
+ break;
+ case ARM_SPE_HISI_HIP_PEER_CLUSTER_HITM:
+ data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1 | PERF_MEM_LVL_HIT;
+ data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
+ data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
+ data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
+ break;
+ case ARM_SPE_HISI_HIP_REMOTE_SOCKET:
+ data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2;
+ data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
+ data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
+ data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
+ break;
+ case ARM_SPE_HISI_HIP_REMOTE_SOCKET_HITM:
+ data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2;
+ data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
+ data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
+ data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
+ data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
+ break;
+ case ARM_SPE_HISI_HIP_LOCAL_MEM:
+ data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT;
+ data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM;
+ data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
+ break;
+ case ARM_SPE_HISI_HIP_REMOTE_MEM:
+ data_src->mem_lvl = PERF_MEM_LVL_REM_RAM1 | PERF_MEM_LVL_HIT;
+ data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM;
+ data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
+ break;
+ case ARM_SPE_HISI_HIP_NC_DEV:
+ data_src->mem_lvl = PERF_MEM_LVL_IO | PERF_MEM_LVL_HIT;
+ data_src->mem_lvl_num = PERF_MEM_LVLNUM_IO;
+ data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
+ break;
+ case ARM_SPE_HISI_HIP_L2:
+ data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
+ data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
+ data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
+ break;
+ case ARM_SPE_HISI_HIP_L2_HITM:
+ data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
+ data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
+ data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
+ break;
+ case ARM_SPE_HISI_HIP_L1:
+ data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
+ data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
+ data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
+ break;
+ default:
+ break;
+ }
+}
+
+static const struct data_source_handle data_source_handles[] = {
+ DS(common_ds_encoding_cpus, data_source_common),
+ DS(ampereone_ds_encoding_cpus, data_source_ampereone),
+ DS(hisi_hip_ds_encoding_cpus, data_source_hisi_hip),
+};
+
static void arm_spe__synth_memory_level(const struct arm_spe_record *record,
union perf_mem_data_src *data_src)
{
@@ -555,12 +842,14 @@ static void arm_spe__synth_memory_level(const struct arm_spe_record *record,
data_src->mem_lvl |= PERF_MEM_LVL_REM_CCE1;
}
-static bool arm_spe__is_common_ds_encoding(struct arm_spe_queue *speq)
+static bool arm_spe__synth_ds(struct arm_spe_queue *speq,
+ const struct arm_spe_record *record,
+ union perf_mem_data_src *data_src)
{
struct arm_spe *spe = speq->spe;
- bool is_in_cpu_list;
u64 *metadata = NULL;
- u64 midr = 0;
+ u64 midr;
+ unsigned int i;
/* Metadata version 1 assumes all CPUs are the same (old behavior) */
if (spe->metadata_ver == 1) {
@@ -592,18 +881,24 @@ static bool arm_spe__is_common_ds_encoding(struct arm_spe_queue *speq)
midr = metadata[ARM_SPE_CPU_MIDR];
}
- is_in_cpu_list = is_midr_in_range_list(midr, common_ds_encoding_cpus);
- if (is_in_cpu_list)
- return true;
- else
- return false;
+ for (i = 0; i < ARRAY_SIZE(data_source_handles); i++) {
+ if (is_midr_in_range_list(midr, data_source_handles[i].midr_ranges)) {
+ data_source_handles[i].ds_synth(record, data_src);
+ return true;
+ }
+ }
+
+ return false;
}
static u64 arm_spe__synth_data_source(struct arm_spe_queue *speq,
const struct arm_spe_record *record)
{
union perf_mem_data_src data_src = { .mem_op = PERF_MEM_OP_NA };
- bool is_common = arm_spe__is_common_ds_encoding(speq);
+
+ /* Only synthesize data source for LDST operations */
+ if (!is_ldst_op(record->op))
+ return 0;
if (record->op & ARM_SPE_OP_LD)
data_src.mem_op = PERF_MEM_OP_LOAD;
@@ -612,9 +907,7 @@ static u64 arm_spe__synth_data_source(struct arm_spe_queue *speq,
else
return 0;
- if (is_common)
- arm_spe__synth_data_source_common(record, &data_src);
- else
+ if (!arm_spe__synth_ds(speq, record, &data_src))
arm_spe__synth_memory_level(record, &data_src);
if (record->type & (ARM_SPE_TLB_ACCESS | ARM_SPE_TLB_MISS)) {
@@ -687,6 +980,10 @@ static int arm_spe_sample(struct arm_spe_queue *speq)
}
}
+ if (spe->synth_opts.last_branch &&
+ (spe->sample_branch || spe->sample_instructions))
+ arm_spe__prep_branch_stack(speq);
+
if (spe->sample_branch && (record->op & ARM_SPE_OP_BRANCH_ERET)) {
err = arm_spe__synth_branch_sample(speq, spe->branch_id);
if (err)
@@ -705,7 +1002,7 @@ static int arm_spe_sample(struct arm_spe_queue *speq)
* When data_src is zero it means the record is not a memory operation,
* skip to synthesize memory sample for this case.
*/
- if (spe->sample_memory && data_src) {
+ if (spe->sample_memory && is_ldst_op(record->op)) {
err = arm_spe__synth_mem_sample(speq, spe->memory_id, data_src);
if (err)
return err;
@@ -1178,6 +1475,7 @@ static void arm_spe_free_queue(void *priv)
thread__zput(speq->thread);
arm_spe_decoder_free(speq->decoder);
zfree(&speq->event_buf);
+ zfree(&speq->last_branch);
free(speq);
}
@@ -1397,6 +1695,19 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session)
id += 1;
}
+ if (spe->synth_opts.last_branch) {
+ if (spe->synth_opts.last_branch_sz > 2)
+ pr_debug("Arm SPE supports only two bstack entries (PBT+TGT).\n");
+
+ attr.sample_type |= PERF_SAMPLE_BRANCH_STACK;
+ /*
+ * We don't use the hardware index, but the sample generation
+ * code uses the new format branch_stack with this field,
+ * so the event attributes must indicate that it's present.
+ */
+ attr.branch_sample_type |= PERF_SAMPLE_BRANCH_HW_INDEX;
+ }
+
if (spe->synth_opts.branches) {
spe->sample_branch = true;