diff options
Diffstat (limited to 'tools/perf/util/arm-spe.c')
-rw-r--r-- | tools/perf/util/arm-spe.c | 255 |
1 files changed, 249 insertions, 6 deletions
diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c index f1365ce69ba0..d46e0cccac99 100644 --- a/tools/perf/util/arm-spe.c +++ b/tools/perf/util/arm-spe.c @@ -103,6 +103,7 @@ struct arm_spe_queue { struct thread *thread; u64 period_instructions; u32 flags; + struct branch_stack *last_branch; }; struct data_source_handle { @@ -233,6 +234,17 @@ static struct arm_spe_queue *arm_spe__alloc_queue(struct arm_spe *spe, params.get_trace = arm_spe_get_trace; params.data = speq; + if (spe->synth_opts.last_branch) { + size_t sz = sizeof(struct branch_stack); + + /* Allocate up to two entries for PBT + TGT */ + sz += sizeof(struct branch_entry) * + min(spe->synth_opts.last_branch_sz, 2U); + speq->last_branch = zalloc(sz); + if (!speq->last_branch) + goto out_free; + } + /* create new decoder */ speq->decoder = arm_spe_decoder_new(¶ms); if (!speq->decoder) @@ -242,6 +254,7 @@ static struct arm_spe_queue *arm_spe__alloc_queue(struct arm_spe *spe, out_free: zfree(&speq->event_buf); + zfree(&speq->last_branch); free(speq); return NULL; @@ -348,6 +361,88 @@ static void arm_spe_prep_sample(struct arm_spe *spe, event->sample.header.size = sizeof(struct perf_event_header); } +static void arm_spe__prep_branch_stack(struct arm_spe_queue *speq) +{ + struct arm_spe *spe = speq->spe; + struct arm_spe_record *record = &speq->decoder->record; + struct branch_stack *bstack = speq->last_branch; + struct branch_flags *bs_flags; + unsigned int last_branch_sz = spe->synth_opts.last_branch_sz; + bool have_tgt = !!(speq->flags & PERF_IP_FLAG_BRANCH); + bool have_pbt = last_branch_sz >= (have_tgt + 1U) && record->prev_br_tgt; + size_t sz = sizeof(struct branch_stack) + + sizeof(struct branch_entry) * min(last_branch_sz, 2U) /* PBT + TGT */; + int i = 0; + + /* Clean up branch stack */ + memset(bstack, 0x0, sz); + + if (!have_tgt && !have_pbt) + return; + + if (have_tgt) { + bstack->entries[i].from = record->from_ip; + bstack->entries[i].to = record->to_ip; + + bs_flags = &bstack->entries[i].flags; + bs_flags->value = 0; + + if (record->op & ARM_SPE_OP_BR_CR_BL) { + if (record->op & ARM_SPE_OP_BR_COND) + bs_flags->type |= PERF_BR_COND_CALL; + else + bs_flags->type |= PERF_BR_CALL; + /* + * Indirect branch instruction without link (e.g. BR), + * take this case as function return. + */ + } else if (record->op & ARM_SPE_OP_BR_CR_RET || + record->op & ARM_SPE_OP_BR_INDIRECT) { + if (record->op & ARM_SPE_OP_BR_COND) + bs_flags->type |= PERF_BR_COND_RET; + else + bs_flags->type |= PERF_BR_RET; + } else if (record->op & ARM_SPE_OP_BR_CR_NON_BL_RET) { + if (record->op & ARM_SPE_OP_BR_COND) + bs_flags->type |= PERF_BR_COND; + else + bs_flags->type |= PERF_BR_UNCOND; + } else { + if (record->op & ARM_SPE_OP_BR_COND) + bs_flags->type |= PERF_BR_COND; + else + bs_flags->type |= PERF_BR_UNKNOWN; + } + + if (record->type & ARM_SPE_BRANCH_MISS) { + bs_flags->mispred = 1; + bs_flags->predicted = 0; + } else { + bs_flags->mispred = 0; + bs_flags->predicted = 1; + } + + if (record->type & ARM_SPE_BRANCH_NOT_TAKEN) + bs_flags->not_taken = 1; + + if (record->type & ARM_SPE_IN_TXN) + bs_flags->in_tx = 1; + + bs_flags->cycles = min(record->latency, 0xFFFFU); + i++; + } + + if (have_pbt) { + bs_flags = &bstack->entries[i].flags; + bs_flags->type |= PERF_BR_UNKNOWN; + bstack->entries[i].to = record->prev_br_tgt; + i++; + } + + bstack->nr = i; + bstack->hw_idx = -1ULL; +} + static int arm_spe__inject_event(union perf_event *event, struct perf_sample *sample, u64 type) { event->header.size = perf_event__sample_event_size(sample, type, 0); @@ -381,8 +476,10 @@ static int arm_spe__synth_mem_sample(struct arm_spe_queue *speq, struct arm_spe *spe = speq->spe; struct arm_spe_record *record = &speq->decoder->record; union perf_event *event = speq->event_buf; - struct perf_sample sample = { .ip = 0, }; + struct perf_sample sample; + int ret; + perf_sample__init(&sample, /*all=*/true); arm_spe_prep_sample(spe, speq, event, &sample); sample.id = spe_events_id; @@ -392,7 +489,9 @@ static int arm_spe__synth_mem_sample(struct arm_spe_queue *speq, sample.data_src = data_src; sample.weight = record->latency; - return arm_spe_deliver_synth_event(spe, speq, event, &sample); + ret = arm_spe_deliver_synth_event(spe, speq, event, &sample); + perf_sample__exit(&sample); + return ret; } static int arm_spe__synth_branch_sample(struct arm_spe_queue *speq, @@ -401,8 +500,10 @@ static int arm_spe__synth_branch_sample(struct arm_spe_queue *speq, struct arm_spe *spe = speq->spe; struct arm_spe_record *record = &speq->decoder->record; union perf_event *event = speq->event_buf; - struct perf_sample sample = { .ip = 0, }; + struct perf_sample sample; + int ret; + perf_sample__init(&sample, /*all=*/true); arm_spe_prep_sample(spe, speq, event, &sample); sample.id = spe_events_id; @@ -410,8 +511,11 @@ static int arm_spe__synth_branch_sample(struct arm_spe_queue *speq, sample.addr = record->to_ip; sample.weight = record->latency; sample.flags = speq->flags; + sample.branch_stack = speq->last_branch; - return arm_spe_deliver_synth_event(spe, speq, event, &sample); + ret = arm_spe_deliver_synth_event(spe, speq, event, &sample); + perf_sample__exit(&sample); + return ret; } static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq, @@ -420,7 +524,8 @@ static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq, struct arm_spe *spe = speq->spe; struct arm_spe_record *record = &speq->decoder->record; union perf_event *event = speq->event_buf; - struct perf_sample sample = { .ip = 0, }; + struct perf_sample sample; + int ret; /* * Handles perf instruction sampling period. @@ -430,6 +535,7 @@ static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq, return 0; speq->period_instructions = 0; + perf_sample__init(&sample, /*all=*/true); arm_spe_prep_sample(spe, speq, event, &sample); sample.id = spe_events_id; @@ -440,8 +546,11 @@ static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq, sample.period = spe->instructions_sample_period; sample.weight = record->latency; sample.flags = speq->flags; + sample.branch_stack = speq->last_branch; - return arm_spe_deliver_synth_event(spe, speq, event, &sample); + ret = arm_spe_deliver_synth_event(spe, speq, event, &sample); + perf_sample__exit(&sample); + return ret; } static const struct midr_range common_ds_encoding_cpus[] = { @@ -462,6 +571,11 @@ static const struct midr_range ampereone_ds_encoding_cpus[] = { {}, }; +static const struct midr_range hisi_hip_ds_encoding_cpus[] = { + MIDR_ALL_VERSIONS(MIDR_HISI_HIP12), + {}, +}; + static void arm_spe__sample_flags(struct arm_spe_queue *speq) { const struct arm_spe_record *record = &speq->decoder->record; @@ -472,6 +586,26 @@ static void arm_spe__sample_flags(struct arm_spe_queue *speq) if (record->type & ARM_SPE_BRANCH_MISS) speq->flags |= PERF_IP_FLAG_BRANCH_MISS; + + if (record->type & ARM_SPE_BRANCH_NOT_TAKEN) + speq->flags |= PERF_IP_FLAG_NOT_TAKEN; + + if (record->type & ARM_SPE_IN_TXN) + speq->flags |= PERF_IP_FLAG_IN_TX; + + if (record->op & ARM_SPE_OP_BR_COND) + speq->flags |= PERF_IP_FLAG_CONDITIONAL; + + if (record->op & ARM_SPE_OP_BR_CR_BL) + speq->flags |= PERF_IP_FLAG_CALL; + else if (record->op & ARM_SPE_OP_BR_CR_RET) + speq->flags |= PERF_IP_FLAG_RETURN; + /* + * Indirect branch instruction without link (e.g. BR), + * take it as a function return. + */ + else if (record->op & ARM_SPE_OP_BR_INDIRECT) + speq->flags |= PERF_IP_FLAG_RETURN; } } @@ -589,9 +723,100 @@ static void arm_spe__synth_data_source_ampereone(const struct arm_spe_record *re arm_spe__synth_data_source_common(&common_record, data_src); } +static void arm_spe__synth_data_source_hisi_hip(const struct arm_spe_record *record, + union perf_mem_data_src *data_src) +{ + /* Use common synthesis method to handle store operations */ + if (record->op & ARM_SPE_OP_ST) { + arm_spe__synth_data_source_common(record, data_src); + return; + } + + switch (record->source) { + case ARM_SPE_HISI_HIP_PEER_CPU: + data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2; + data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER; + break; + case ARM_SPE_HISI_HIP_PEER_CPU_HITM: + data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2; + data_src->mem_snoop = PERF_MEM_SNOOP_HITM; + data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER; + break; + case ARM_SPE_HISI_HIP_L3: + data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3; + data_src->mem_snoop = PERF_MEM_SNOOP_HIT; + break; + case ARM_SPE_HISI_HIP_L3_HITM: + data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3; + data_src->mem_snoop = PERF_MEM_SNOOP_HITM; + break; + case ARM_SPE_HISI_HIP_PEER_CLUSTER: + data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1 | PERF_MEM_LVL_HIT; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3; + data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER; + break; + case ARM_SPE_HISI_HIP_PEER_CLUSTER_HITM: + data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1 | PERF_MEM_LVL_HIT; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3; + data_src->mem_snoop = PERF_MEM_SNOOP_HITM; + data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER; + break; + case ARM_SPE_HISI_HIP_REMOTE_SOCKET: + data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE; + data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; + data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER; + break; + case ARM_SPE_HISI_HIP_REMOTE_SOCKET_HITM: + data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE; + data_src->mem_snoop = PERF_MEM_SNOOP_HITM; + data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; + data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER; + break; + case ARM_SPE_HISI_HIP_LOCAL_MEM: + data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM; + data_src->mem_snoop = PERF_MEM_SNOOP_NONE; + break; + case ARM_SPE_HISI_HIP_REMOTE_MEM: + data_src->mem_lvl = PERF_MEM_LVL_REM_RAM1 | PERF_MEM_LVL_HIT; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM; + data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; + break; + case ARM_SPE_HISI_HIP_NC_DEV: + data_src->mem_lvl = PERF_MEM_LVL_IO | PERF_MEM_LVL_HIT; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_IO; + data_src->mem_snoop = PERF_MEM_SNOOP_NONE; + break; + case ARM_SPE_HISI_HIP_L2: + data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2; + data_src->mem_snoop = PERF_MEM_SNOOP_NONE; + break; + case ARM_SPE_HISI_HIP_L2_HITM: + data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2; + data_src->mem_snoop = PERF_MEM_SNOOP_HITM; + break; + case ARM_SPE_HISI_HIP_L1: + data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1; + data_src->mem_snoop = PERF_MEM_SNOOP_NONE; + break; + default: + break; + } +} + static const struct data_source_handle data_source_handles[] = { DS(common_ds_encoding_cpus, data_source_common), DS(ampereone_ds_encoding_cpus, data_source_ampereone), + DS(hisi_hip_ds_encoding_cpus, data_source_hisi_hip), }; static void arm_spe__synth_memory_level(const struct arm_spe_record *record, @@ -755,6 +980,10 @@ static int arm_spe_sample(struct arm_spe_queue *speq) } } + if (spe->synth_opts.last_branch && + (spe->sample_branch || spe->sample_instructions)) + arm_spe__prep_branch_stack(speq); + if (spe->sample_branch && (record->op & ARM_SPE_OP_BRANCH_ERET)) { err = arm_spe__synth_branch_sample(speq, spe->branch_id); if (err) @@ -1246,6 +1475,7 @@ static void arm_spe_free_queue(void *priv) thread__zput(speq->thread); arm_spe_decoder_free(speq->decoder); zfree(&speq->event_buf); + zfree(&speq->last_branch); free(speq); } @@ -1465,6 +1695,19 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session) id += 1; } + if (spe->synth_opts.last_branch) { + if (spe->synth_opts.last_branch_sz > 2) + pr_debug("Arm SPE supports only two bstack entries (PBT+TGT).\n"); + + attr.sample_type |= PERF_SAMPLE_BRANCH_STACK; + /* + * We don't use the hardware index, but the sample generation + * code uses the new format branch_stack with this field, + * so the event attributes must indicate that it's present. + */ + attr.branch_sample_type |= PERF_SAMPLE_BRANCH_HW_INDEX; + } + if (spe->synth_opts.branches) { spe->sample_branch = true; |