author		Linus Torvalds <torvalds@linux-foundation.org>	2020-03-31 02:40:08 +0300
committer	Linus Torvalds <torvalds@linux-foundation.org>	2020-03-31 02:40:08 +0300
commit		9b82f05f869a823d43ea4186f5f732f2924d3693 (patch)
tree		6aaa625789d7d345d0694ebe20276f0b42e5a149 /tools
parent		4b9fd8a829a1eec7442e38afff21d610604de56a (diff)
parent		629b3df7ecb01fddfdf71cb5d3c563d143117c33 (diff)
download	linux-9b82f05f869a823d43ea4186f5f732f2924d3693.tar.xz
Merge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull perf updates from Ingo Molnar:
"The main changes in this cycle were:
Kernel side changes:
- A couple of x86/cpu cleanups and changes were grandfathered in due
to patch dependencies. These clean up the set of CPU model/family
matching macros with a consistent namespace and C99 initializer
style.
- A bunch of updates to various low level PMU drivers:
* AMD Family 19h L3 uncore PMU
* Intel Tiger Lake uncore support
* misc fixes to LBR TOS sampling
- optprobe fixes
- perf/cgroup: optimize cgroup event sched-in processing
- misc cleanups and fixes
Tooling side changes are to:
- perf {annotate,expr,record,report,stat,test}
- perl scripting
- libapi, libperf and libtraceevent
- vendor events on Intel and S390, ARM cs-etm
- Intel PT updates
- Documentation changes and updates to core facilities
- misc cleanups, fixes and other enhancements"
* 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (89 commits)
cpufreq/intel_pstate: Fix wrong macro conversion
x86/cpu: Cleanup the now unused CPU match macros
hwrng: via_rng: Convert to new X86 CPU match macros
crypto: Convert to new CPU match macros
ASoC: Intel: Convert to new X86 CPU match macros
powercap/intel_rapl: Convert to new X86 CPU match macros
PCI: intel-mid: Convert to new X86 CPU match macros
mmc: sdhci-acpi: Convert to new X86 CPU match macros
intel_idle: Convert to new X86 CPU match macros
extcon: axp288: Convert to new X86 CPU match macros
thermal: Convert to new X86 CPU match macros
hwmon: Convert to new X86 CPU match macros
platform/x86: Convert to new CPU match macros
EDAC: Convert to new X86 CPU match macros
cpufreq: Convert to new X86 CPU match macros
ACPI: Convert to new X86 CPU match macros
x86/platform: Convert to new CPU match macros
x86/kernel: Convert to new CPU match macros
x86/kvm: Convert to new CPU match macros
x86/perf/events: Convert to new CPU match macros
...
Diffstat (limited to 'tools')
67 files changed, 2088 insertions, 1500 deletions
diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
index 377d794d3105..397cfd65b3fe 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -181,6 +181,8 @@ enum perf_branch_sample_type_shift {
 	PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT	= 16, /* save branch type */
 
+	PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT	= 17, /* save low level index of raw branch records */
+
 	PERF_SAMPLE_BRANCH_MAX_SHIFT		/* non-ABI */
 };
@@ -208,6 +210,8 @@ enum perf_branch_sample_type {
 	PERF_SAMPLE_BRANCH_TYPE_SAVE	= 1U << PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT,
 
+	PERF_SAMPLE_BRANCH_HW_INDEX	= 1U << PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT,
+
 	PERF_SAMPLE_BRANCH_MAX		= 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT,
 };
@@ -853,7 +857,9 @@ enum perf_event_type {
 	 *	char			data[size];}&& PERF_SAMPLE_RAW
 	 *
 	 *	{ u64			nr;
-	 *	  { u64 from, to, flags } lbr[nr];} && PERF_SAMPLE_BRANCH_STACK
+	 *	{ u64 hw_idx; } && PERF_SAMPLE_BRANCH_HW_INDEX
+	 *	  { u64 from, to, flags } lbr[nr];
+	 *	} && PERF_SAMPLE_BRANCH_STACK
 	 *
 	 *	{ u64			abi; # enum perf_sample_regs_abi
 	 *	  u64			regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER
diff --git a/tools/lib/api/fs/Build b/tools/lib/api/fs/Build
index f4ed9629ae85..0f75b28654de 100644
--- a/tools/lib/api/fs/Build
+++ b/tools/lib/api/fs/Build
@@ -1,2 +1,3 @@
 libapi-y += fs.o
 libapi-y += tracing_path.o
+libapi-y += cgroup.o
diff --git a/tools/lib/api/fs/cgroup.c b/tools/lib/api/fs/cgroup.c
new file mode 100644
index 000000000000..889a6eb4aaca
--- /dev/null
+++ b/tools/lib/api/fs/cgroup.c
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/stringify.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "fs.h"
+
+int cgroupfs_find_mountpoint(char *buf, size_t maxlen, const char *subsys)
+{
+	FILE *fp;
+	char mountpoint[PATH_MAX + 1], tokens[PATH_MAX + 1], type[PATH_MAX + 1];
+	char path_v1[PATH_MAX + 1], path_v2[PATH_MAX + 2], *path;
+	char *token, *saved_ptr = NULL;
+
+	fp = fopen("/proc/mounts", "r");
+	if (!fp)
+		return -1;
+
+	/*
+	 * in order to handle split hierarchy, we need to scan /proc/mounts
+	 * and inspect every cgroupfs mount point to find one that has
+	 * perf_event subsystem
+	 */
+	path_v1[0] = '\0';
+	path_v2[0] = '\0';
+
+	while (fscanf(fp, "%*s %"__stringify(PATH_MAX)"s %"__stringify(PATH_MAX)"s %"
+				__stringify(PATH_MAX)"s %*d %*d\n",
+				mountpoint, type, tokens) == 3) {
+
+		if (!path_v1[0] && !strcmp(type, "cgroup")) {
+
+			token = strtok_r(tokens, ",", &saved_ptr);
+
+			while (token != NULL) {
+				if (subsys && !strcmp(token, subsys)) {
+					strcpy(path_v1, mountpoint);
+					break;
+				}
+				token = strtok_r(NULL, ",", &saved_ptr);
+			}
+		}
+
+		if (!path_v2[0] && !strcmp(type, "cgroup2"))
+			strcpy(path_v2, mountpoint);
+
+		if (path_v1[0] && path_v2[0])
+			break;
+	}
+	fclose(fp);
+
+	if (path_v1[0])
+		path = path_v1;
+	else if (path_v2[0])
+		path = path_v2;
+	else
+		return -1;
+
+	if (strlen(path) < maxlen) {
+		strcpy(buf, path);
+		return 0;
+	}
+	return -1;
+}
diff --git a/tools/lib/api/fs/fs.h b/tools/lib/api/fs/fs.h
index 92d03b8396b1..936edb95e1f3 100644
--- a/tools/lib/api/fs/fs.h
+++ b/tools/lib/api/fs/fs.h
@@ -28,6 +28,8 @@ FS(bpf_fs)
 
 #undef FS
 
+int cgroupfs_find_mountpoint(char *buf, size_t maxlen, const char *subsys);
+
 int filename__read_int(const char *filename, int *value);
 int filename__read_ull(const char *filename, unsigned long long *value);
 int filename__read_xll(const char *filename, unsigned long long *value);
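The new cgroupfs_find_mountpoint() helper above scans /proc/mounts, preferring a cgroup v1 hierarchy that carries the requested subsystem and falling back to the unified cgroup2 mount; it returns 0 on success and -1 on failure. A minimal caller might look like the following sketch, which is illustrative only (not part of the patch) and assumes just the prototype added to fs.h above:

	/* sketch: locate the perf_event cgroup hierarchy */
	#include <limits.h>
	#include <stdio.h>
	#include "fs.h"	/* declares cgroupfs_find_mountpoint() */

	int main(void)
	{
		char mnt[PATH_MAX + 1];

		if (cgroupfs_find_mountpoint(mnt, sizeof(mnt), "perf_event")) {
			fprintf(stderr, "no cgroup mount point found\n");
			return 1;
		}
		printf("perf_event cgroup at: %s\n", mnt);
		return 0;
	}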
diff --git a/tools/lib/perf/Documentation/examples/counting.c b/tools/lib/perf/Documentation/examples/counting.c
new file mode 100644
index 000000000000..6085693571ef
--- /dev/null
+++ b/tools/lib/perf/Documentation/examples/counting.c
@@ -0,0 +1,83 @@
+#include <linux/perf_event.h>
+#include <perf/evlist.h>
+#include <perf/evsel.h>
+#include <perf/cpumap.h>
+#include <perf/threadmap.h>
+#include <perf/mmap.h>
+#include <perf/core.h>
+#include <perf/event.h>
+#include <stdio.h>
+#include <unistd.h>
+
+static int libperf_print(enum libperf_print_level level,
+			 const char *fmt, va_list ap)
+{
+	return vfprintf(stderr, fmt, ap);
+}
+
+int main(int argc, char **argv)
+{
+	int count = 100000, err = 0;
+	struct perf_evlist *evlist;
+	struct perf_evsel *evsel;
+	struct perf_thread_map *threads;
+	struct perf_counts_values counts;
+
+	struct perf_event_attr attr1 = {
+		.type        = PERF_TYPE_SOFTWARE,
+		.config      = PERF_COUNT_SW_CPU_CLOCK,
+		.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED|PERF_FORMAT_TOTAL_TIME_RUNNING,
+		.disabled    = 1,
+	};
+	struct perf_event_attr attr2 = {
+		.type        = PERF_TYPE_SOFTWARE,
+		.config      = PERF_COUNT_SW_TASK_CLOCK,
+		.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED|PERF_FORMAT_TOTAL_TIME_RUNNING,
+		.disabled    = 1,
+	};
+
+	libperf_init(libperf_print);
+	threads = perf_thread_map__new_dummy();
+	if (!threads) {
+		fprintf(stderr, "failed to create threads\n");
+		return -1;
+	}
+	perf_thread_map__set_pid(threads, 0, 0);
+	evlist = perf_evlist__new();
+	if (!evlist) {
+		fprintf(stderr, "failed to create evlist\n");
+		goto out_threads;
+	}
+	evsel = perf_evsel__new(&attr1);
+	if (!evsel) {
+		fprintf(stderr, "failed to create evsel1\n");
+		goto out_evlist;
+	}
+	perf_evlist__add(evlist, evsel);
+	evsel = perf_evsel__new(&attr2);
+	if (!evsel) {
+		fprintf(stderr, "failed to create evsel2\n");
+		goto out_evlist;
+	}
+	perf_evlist__add(evlist, evsel);
+	perf_evlist__set_maps(evlist, NULL, threads);
+	err = perf_evlist__open(evlist);
+	if (err) {
+		fprintf(stderr, "failed to open evsel\n");
+		goto out_evlist;
+	}
+	perf_evlist__enable(evlist);
+	while (count--);
+	perf_evlist__disable(evlist);
+	perf_evlist__for_each_evsel(evlist, evsel) {
+		perf_evsel__read(evsel, 0, 0, &counts);
+		fprintf(stdout, "count %llu, enabled %llu, run %llu\n",
+			counts.val, counts.ena, counts.run);
+	}
+	perf_evlist__close(evlist);
+out_evlist:
+	perf_evlist__delete(evlist);
+out_threads:
+	perf_thread_map__put(threads);
+	return err;
+}
diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c
index beaa8b8c08ff..e1bd2a93c6db 100644
--- a/tools/lib/traceevent/event-parse.c
+++ b/tools/lib/traceevent/event-parse.c
@@ -5541,7 +5541,7 @@ static void print_event_time(struct tep_handle *tep, struct trace_seq *s,
 	if (p10 > 1 && p10 < time)
 		trace_seq_printf(s, "%5llu.%0*llu", time / p10, prec, time % p10);
 	else
-		trace_seq_printf(s, "%12llu\n", time);
+		trace_seq_printf(s, "%12llu", time);
 }
 
 struct print_event_type {
diff --git a/tools/perf/Documentation/Makefile b/tools/perf/Documentation/Makefile
index adc5a7e44b98..31824d5269cc 100644
--- a/tools/perf/Documentation/Makefile
+++ b/tools/perf/Documentation/Makefile
@@ -295,7 +295,10 @@ $(OUTPUT)%.1 $(OUTPUT)%.5 $(OUTPUT)%.7 : $(OUTPUT)%.xml
 $(OUTPUT)%.xml : %.txt
 	$(QUIET_ASCIIDOC)$(RM) $@+ $@ && \
 	$(ASCIIDOC) -b docbook -d manpage \
-		$(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) -o $@+ $< && \
+		$(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) \
+		-aperf_date=$(shell git log -1 --pretty="format:%cd" \
+				--date=short $<) \
+		-o $@+ $< && \
 	mv $@+ $@
XSLT = docbook.xsl diff --git a/tools/perf/Documentation/intel-pt.txt b/tools/perf/Documentation/intel-pt.txt index 2cf2d9e9d0da..fd9241a1b987 100644 --- a/tools/perf/Documentation/intel-pt.txt +++ b/tools/perf/Documentation/intel-pt.txt @@ -1,991 +1 @@ -Intel Processor Trace -===================== - -Overview -======== - -Intel Processor Trace (Intel PT) is an extension of Intel Architecture that -collects information about software execution such as control flow, execution -modes and timings and formats it into highly compressed binary packets. -Technical details are documented in the Intel 64 and IA-32 Architectures -Software Developer Manuals, Chapter 36 Intel Processor Trace. - -Intel PT is first supported in Intel Core M and 5th generation Intel Core -processors that are based on the Intel micro-architecture code name Broadwell. - -Trace data is collected by 'perf record' and stored within the perf.data file. -See below for options to 'perf record'. - -Trace data must be 'decoded' which involves walking the object code and matching -the trace data packets. For example a TNT packet only tells whether a -conditional branch was taken or not taken, so to make use of that packet the -decoder must know precisely which instruction was being executed. - -Decoding is done on-the-fly. The decoder outputs samples in the same format as -samples output by perf hardware events, for example as though the "instructions" -or "branches" events had been recorded. Presently 3 tools support this: -'perf script', 'perf report' and 'perf inject'. See below for more information -on using those tools. - -The main distinguishing feature of Intel PT is that the decoder can determine -the exact flow of software execution. Intel PT can be used to understand why -and how did software get to a certain point, or behave a certain way. The -software does not have to be recompiled, so Intel PT works with debug or release -builds, however the executed images are needed - which makes use in JIT-compiled -environments, or with self-modified code, a challenge. Also symbols need to be -provided to make sense of addresses. - -A limitation of Intel PT is that it produces huge amounts of trace data -(hundreds of megabytes per second per core) which takes a long time to decode, -for example two or three orders of magnitude longer than it took to collect. -Another limitation is the performance impact of tracing, something that will -vary depending on the use-case and architecture. - - -Quickstart -========== - -It is important to start small. That is because it is easy to capture vastly -more data than can possibly be processed. - -The simplest thing to do with Intel PT is userspace profiling of small programs. -Data is captured with 'perf record' e.g. to trace 'ls' userspace-only: - - perf record -e intel_pt//u ls - -And profiled with 'perf report' e.g. - - perf report - -To also trace kernel space presents a problem, namely kernel self-modifying -code. A fairly good kernel image is available in /proc/kcore but to get an -accurate image a copy of /proc/kcore needs to be made under the same conditions -as the data capture. A script perf-with-kcore can do that, but beware that the -script makes use of 'sudo' to copy /proc/kcore. If you have perf installed -locally from the source tree you can do: - - ~/libexec/perf-core/perf-with-kcore record pt_ls -e intel_pt// -- ls - -which will create a directory named 'pt_ls' and put the perf.data file and -copies of /proc/kcore, /proc/kallsyms and /proc/modules into it. 
Then to use -'perf report' becomes: - - ~/libexec/perf-core/perf-with-kcore report pt_ls - -Because samples are synthesized after-the-fact, the sampling period can be -selected for reporting. e.g. sample every microsecond - - ~/libexec/perf-core/perf-with-kcore report pt_ls --itrace=i1usge - -See the sections below for more information about the --itrace option. - -Beware the smaller the period, the more samples that are produced, and the -longer it takes to process them. - -Also note that the coarseness of Intel PT timing information will start to -distort the statistical value of the sampling as the sampling period becomes -smaller. - -To represent software control flow, "branches" samples are produced. By default -a branch sample is synthesized for every single branch. To get an idea what -data is available you can use the 'perf script' tool with all itrace sampling -options, which will list all the samples. - - perf record -e intel_pt//u ls - perf script --itrace=ibxwpe - -An interesting field that is not printed by default is 'flags' which can be -displayed as follows: - - perf script --itrace=ibxwpe -F+flags - -The flags are "bcrosyiABEx" which stand for branch, call, return, conditional, -system, asynchronous, interrupt, transaction abort, trace begin, trace end, and -in transaction, respectively. - -Another interesting field that is not printed by default is 'ipc' which can be -displayed as follows: - - perf script --itrace=be -F+ipc - -There are two ways that instructions-per-cycle (IPC) can be calculated depending -on the recording. - -If the 'cyc' config term (see config terms section below) was used, then IPC is -calculated using the cycle count from CYC packets, otherwise MTC packets are -used - refer to the 'mtc' config term. When MTC is used, however, the values -are less accurate because the timing is less accurate. - -Because Intel PT does not update the cycle count on every branch or instruction, -the values will often be zero. When there are values, they will be the number -of instructions and number of cycles since the last update, and thus represent -the average IPC since the last IPC for that event type. Note IPC for "branches" -events is calculated separately from IPC for "instructions" events. - -Also note that the IPC instruction count may or may not include the current -instruction. If the cycle count is associated with an asynchronous branch -(e.g. page fault or interrupt), then the instruction count does not include the -current instruction, otherwise it does. That is consistent with whether or not -that instruction has retired when the cycle count is updated. - -Another note, in the case of "branches" events, non-taken branches are not -presently sampled, so IPC values for them do not appear e.g. a CYC packet with a -TNT packet that starts with a non-taken branch. To see every possible IPC -value, "instructions" events can be used e.g. --itrace=i0ns - -While it is possible to create scripts to analyze the data, an alternative -approach is available to export the data to a sqlite or postgresql database. -Refer to script export-to-sqlite.py or export-to-postgresql.py for more details, -and to script exported-sql-viewer.py for an example of using the database. - -There is also script intel-pt-events.py which provides an example of how to -unpack the raw data for power events and PTWRITE. - -As mentioned above, it is easy to capture too much data. One way to limit the -data captured is to use 'snapshot' mode which is explained further below. 
-Refer to 'new snapshot option' and 'Intel PT modes of operation' further below. - -Another problem that will be experienced is decoder errors. They can be caused -by inability to access the executed image, self-modified or JIT-ed code, or the -inability to match side-band information (such as context switches and mmaps) -which results in the decoder not knowing what code was executed. - -There is also the problem of perf not being able to copy the data fast enough, -resulting in data lost because the buffer was full. See 'Buffer handling' below -for more details. - - -perf record -=========== - -new event ---------- - -The Intel PT kernel driver creates a new PMU for Intel PT. PMU events are -selected by providing the PMU name followed by the "config" separated by slashes. -An enhancement has been made to allow default "config" e.g. the option - - -e intel_pt// - -will use a default config value. Currently that is the same as - - -e intel_pt/tsc,noretcomp=0/ - -which is the same as - - -e intel_pt/tsc=1,noretcomp=0/ - -Note there are now new config terms - see section 'config terms' further below. - -The config terms are listed in /sys/devices/intel_pt/format. They are bit -fields within the config member of the struct perf_event_attr which is -passed to the kernel by the perf_event_open system call. They correspond to bit -fields in the IA32_RTIT_CTL MSR. Here is a list of them and their definitions: - - $ grep -H . /sys/bus/event_source/devices/intel_pt/format/* - /sys/bus/event_source/devices/intel_pt/format/cyc:config:1 - /sys/bus/event_source/devices/intel_pt/format/cyc_thresh:config:19-22 - /sys/bus/event_source/devices/intel_pt/format/mtc:config:9 - /sys/bus/event_source/devices/intel_pt/format/mtc_period:config:14-17 - /sys/bus/event_source/devices/intel_pt/format/noretcomp:config:11 - /sys/bus/event_source/devices/intel_pt/format/psb_period:config:24-27 - /sys/bus/event_source/devices/intel_pt/format/tsc:config:10 - -Note that the default config must be overridden for each term i.e. - - -e intel_pt/noretcomp=0/ - -is the same as: - - -e intel_pt/tsc=1,noretcomp=0/ - -So, to disable TSC packets use: - - -e intel_pt/tsc=0/ - -It is also possible to specify the config value explicitly: - - -e intel_pt/config=0x400/ - -Note that, as with all events, the event is suffixed with event modifiers: - - u userspace - k kernel - h hypervisor - G guest - H host - p precise ip - -'h', 'G' and 'H' are for virtualization which is not supported by Intel PT. -'p' is also not relevant to Intel PT. So only options 'u' and 'k' are -meaningful for Intel PT. - -perf_event_attr is displayed if the -vv option is used e.g. - - ------------------------------------------------------------ - perf_event_attr: - type 6 - size 112 - config 0x400 - { sample_period, sample_freq } 1 - sample_type IP|TID|TIME|CPU|IDENTIFIER - read_format ID - disabled 1 - inherit 1 - exclude_kernel 1 - exclude_hv 1 - enable_on_exec 1 - sample_id_all 1 - ------------------------------------------------------------ - sys_perf_event_open: pid 31104 cpu 0 group_fd -1 flags 0x8 - sys_perf_event_open: pid 31104 cpu 1 group_fd -1 flags 0x8 - sys_perf_event_open: pid 31104 cpu 2 group_fd -1 flags 0x8 - sys_perf_event_open: pid 31104 cpu 3 group_fd -1 flags 0x8 - ------------------------------------------------------------ - - -config terms ------------- - -The June 2015 version of Intel 64 and IA-32 Architectures Software Developer -Manuals, Chapter 36 Intel Processor Trace, defined new Intel PT features. 
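As an aside before the terms are described: since the sysfs format listing above gives each term's bit position within attr.config, a tool can compose the config value directly. The sketch below is illustrative only; the macro names are invented for the example and are not part of perf:

	/* sketch: compose an intel_pt attr.config from the format bit fields */
	#include <stdint.h>
	#include <stdio.h>

	#define PT_CYC			(1ULL << 1)	/* cyc:config:1 */
	#define PT_MTC			(1ULL << 9)	/* mtc:config:9 */
	#define PT_TSC			(1ULL << 10)	/* tsc:config:10 */
	#define PT_NORETCOMP		(1ULL << 11)	/* noretcomp:config:11 */
	#define PT_MTC_PERIOD(p)	((uint64_t)(p) << 14)	/* mtc_period:config:14-17 */
	#define PT_PSB_PERIOD(p)	((uint64_t)(p) << 24)	/* psb_period:config:24-27 */

	int main(void)
	{
		/* tsc=1 alone gives 1 << 10 = 0x400, matching -e intel_pt/config=0x400/ */
		uint64_t config = PT_TSC | PT_MTC | PT_MTC_PERIOD(3);

		printf("attr.config = %#llx\n", (unsigned long long)config);
		return 0;
	}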
-Some of the features are reflect in new config terms. All the config terms are -described below. - -tsc Always supported. Produces TSC timestamp packets to provide - timing information. In some cases it is possible to decode - without timing information, for example a per-thread context - that does not overlap executable memory maps. - - The default config selects tsc (i.e. tsc=1). - -noretcomp Always supported. Disables "return compression" so a TIP packet - is produced when a function returns. Causes more packets to be - produced but might make decoding more reliable. - - The default config does not select noretcomp (i.e. noretcomp=0). - -psb_period Allows the frequency of PSB packets to be specified. - - The PSB packet is a synchronization packet that provides a - starting point for decoding or recovery from errors. - - Support for psb_period is indicated by: - - /sys/bus/event_source/devices/intel_pt/caps/psb_cyc - - which contains "1" if the feature is supported and "0" - otherwise. - - Valid values are given by: - - /sys/bus/event_source/devices/intel_pt/caps/psb_periods - - which contains a hexadecimal value, the bits of which represent - valid values e.g. bit 2 set means value 2 is valid. - - The psb_period value is converted to the approximate number of - trace bytes between PSB packets as: - - 2 ^ (value + 11) - - e.g. value 3 means 16KiB bytes between PSBs - - If an invalid value is entered, the error message - will give a list of valid values e.g. - - $ perf record -e intel_pt/psb_period=15/u uname - Invalid psb_period for intel_pt. Valid values are: 0-5 - - If MTC packets are selected, the default config selects a value - of 3 (i.e. psb_period=3) or the nearest lower value that is - supported (0 is always supported). Otherwise the default is 0. - - If decoding is expected to be reliable and the buffer is large - then a large PSB period can be used. - - Because a TSC packet is produced with PSB, the PSB period can - also affect the granularity to timing information in the absence - of MTC or CYC. - -mtc Produces MTC timing packets. - - MTC packets provide finer grain timestamp information than TSC - packets. MTC packets record time using the hardware crystal - clock (CTC) which is related to TSC packets using a TMA packet. - - Support for this feature is indicated by: - - /sys/bus/event_source/devices/intel_pt/caps/mtc - - which contains "1" if the feature is supported and - "0" otherwise. - - The frequency of MTC packets can also be specified - see - mtc_period below. - -mtc_period Specifies how frequently MTC packets are produced - see mtc - above for how to determine if MTC packets are supported. - - Valid values are given by: - - /sys/bus/event_source/devices/intel_pt/caps/mtc_periods - - which contains a hexadecimal value, the bits of which represent - valid values e.g. bit 2 set means value 2 is valid. - - The mtc_period value is converted to the MTC frequency as: - - CTC-frequency / (2 ^ value) - - e.g. value 3 means one eighth of CTC-frequency - - Where CTC is the hardware crystal clock, the frequency of which - can be related to TSC via values provided in cpuid leaf 0x15. - - If an invalid value is entered, the error message - will give a list of valid values e.g. - - $ perf record -e intel_pt/mtc_period=15/u uname - Invalid mtc_period for intel_pt. Valid values are: 0,3,6,9 - - The default value is 3 or the nearest lower value - that is supported (0 is always supported). - -cyc Produces CYC timing packets. 
- - CYC packets provide even finer grain timestamp information than - MTC and TSC packets. A CYC packet contains the number of CPU - cycles since the last CYC packet. Unlike MTC and TSC packets, - CYC packets are only sent when another packet is also sent. - - Support for this feature is indicated by: - - /sys/bus/event_source/devices/intel_pt/caps/psb_cyc - - which contains "1" if the feature is supported and - "0" otherwise. - - The number of CYC packets produced can be reduced by specifying - a threshold - see cyc_thresh below. - -cyc_thresh Specifies how frequently CYC packets are produced - see cyc - above for how to determine if CYC packets are supported. - - Valid cyc_thresh values are given by: - - /sys/bus/event_source/devices/intel_pt/caps/cycle_thresholds - - which contains a hexadecimal value, the bits of which represent - valid values e.g. bit 2 set means value 2 is valid. - - The cyc_thresh value represents the minimum number of CPU cycles - that must have passed before a CYC packet can be sent. The - number of CPU cycles is: - - 2 ^ (value - 1) - - e.g. value 4 means 8 CPU cycles must pass before a CYC packet - can be sent. Note a CYC packet is still only sent when another - packet is sent, not at, e.g. every 8 CPU cycles. - - If an invalid value is entered, the error message - will give a list of valid values e.g. - - $ perf record -e intel_pt/cyc,cyc_thresh=15/u uname - Invalid cyc_thresh for intel_pt. Valid values are: 0-12 - - CYC packets are not requested by default. - -pt Specifies pass-through which enables the 'branch' config term. - - The default config selects 'pt' if it is available, so a user will - never need to specify this term. - -branch Enable branch tracing. Branch tracing is enabled by default so to - disable branch tracing use 'branch=0'. - - The default config selects 'branch' if it is available. - -ptw Enable PTWRITE packets which are produced when a ptwrite instruction - is executed. - - Support for this feature is indicated by: - - /sys/bus/event_source/devices/intel_pt/caps/ptwrite - - which contains "1" if the feature is supported and - "0" otherwise. - -fup_on_ptw Enable a FUP packet to follow the PTWRITE packet. The FUP packet - provides the address of the ptwrite instruction. In the absence of - fup_on_ptw, the decoder will use the address of the previous branch - if branch tracing is enabled, otherwise the address will be zero. - Note that fup_on_ptw will work even when branch tracing is disabled. - -pwr_evt Enable power events. The power events provide information about - changes to the CPU C-state. - - Support for this feature is indicated by: - - /sys/bus/event_source/devices/intel_pt/caps/power_event_trace - - which contains "1" if the feature is supported and - "0" otherwise. - - -AUX area sampling option ------------------------- - -To select Intel PT "sampling" the AUX area sampling option can be used: - - --aux-sample - -Optionally it can be followed by the sample size in bytes e.g. - - --aux-sample=8192 - -In addition, the Intel PT event to sample must be defined e.g. - - -e intel_pt//u - -Samples on other events will be created containing Intel PT data e.g. the -following will create Intel PT samples on the branch-misses event, note the -events must be grouped using {}: - - perf record --aux-sample -e '{intel_pt//u,branch-misses:u}' - -An alternative to '--aux-sample' is to add the config term 'aux-sample-size' to -events. In this case, the grouping is implied e.g. 
- - perf record -e intel_pt//u -e branch-misses/aux-sample-size=8192/u - -is the same as: - - perf record -e '{intel_pt//u,branch-misses/aux-sample-size=8192/u}' - -but allows for also using an address filter e.g.: - - perf record -e intel_pt//u --filter 'filter * @/bin/ls' -e branch-misses/aux-sample-size=8192/u -- ls - -It is important to select a sample size that is big enough to contain at least -one PSB packet. If not a warning will be displayed: - - Intel PT sample size (%zu) may be too small for PSB period (%zu) - -The calculation used for that is: if sample_size <= psb_period + 256 display the -warning. When sampling is used, psb_period defaults to 0 (2KiB). - -The default sample size is 4KiB. - -The sample size is passed in aux_sample_size in struct perf_event_attr. The -sample size is limited by the maximum event size which is 64KiB. It is -difficult to know how big the event might be without the trace sample attached, -but the tool validates that the sample size is not greater than 60KiB. - - -new snapshot option -------------------- - -The difference between full trace and snapshot from the kernel's perspective is -that in full trace we don't overwrite trace data that the user hasn't collected -yet (and indicated that by advancing aux_tail), whereas in snapshot mode we let -the trace run and overwrite older data in the buffer so that whenever something -interesting happens, we can stop it and grab a snapshot of what was going on -around that interesting moment. - -To select snapshot mode a new option has been added: - - -S - -Optionally it can be followed by the snapshot size e.g. - - -S0x100000 - -The default snapshot size is the auxtrace mmap size. If neither auxtrace mmap size -nor snapshot size is specified, then the default is 4MiB for privileged users -(or if /proc/sys/kernel/perf_event_paranoid < 0), 128KiB for unprivileged users. -If an unprivileged user does not specify mmap pages, the mmap pages will be -reduced as described in the 'new auxtrace mmap size option' section below. - -The snapshot size is displayed if the option -vv is used e.g. - - Intel PT snapshot size: %zu - - -new auxtrace mmap size option ---------------------------- - -Intel PT buffer size is specified by an addition to the -m option e.g. - - -m,16 - -selects a buffer size of 16 pages i.e. 64KiB. - -Note that the existing functionality of -m is unchanged. The auxtrace mmap size -is specified by the optional addition of a comma and the value. - -The default auxtrace mmap size for Intel PT is 4MiB/page_size for privileged users -(or if /proc/sys/kernel/perf_event_paranoid < 0), 128KiB for unprivileged users. -If an unprivileged user does not specify mmap pages, the mmap pages will be -reduced from the default 512KiB/page_size to 256KiB/page_size, otherwise the -user is likely to get an error as they exceed their mlock limit (Max locked -memory as shown in /proc/self/limits). Note that perf does not count the first -512KiB (actually /proc/sys/kernel/perf_event_mlock_kb minus 1 page) per cpu -against the mlock limit so an unprivileged user is allowed 512KiB per cpu plus -their mlock limit (which defaults to 64KiB but is not multiplied by the number -of cpus). - -In full-trace mode, powers of two are allowed for buffer size, with a minimum -size of 2 pages. In snapshot mode or sampling mode, it is the same but the -minimum size is 1 page. - -The mmap size and auxtrace mmap size are displayed if the -vv option is used e.g. 
- - mmap length 528384 - auxtrace mmap length 4198400 - - -Intel PT modes of operation ---------------------------- - -Intel PT can be used in 2 modes: - full-trace mode - sample mode - snapshot mode - -Full-trace mode traces continuously e.g. - - perf record -e intel_pt//u uname - -Sample mode attaches a Intel PT sample to other events e.g. - - perf record --aux-sample -e intel_pt//u -e branch-misses:u - -Snapshot mode captures the available data when a signal is sent e.g. - - perf record -v -e intel_pt//u -S ./loopy 1000000000 & - [1] 11435 - kill -USR2 11435 - Recording AUX area tracing snapshot - -Note that the signal sent is SIGUSR2. -Note that "Recording AUX area tracing snapshot" is displayed because the -v -option is used. - -The 2 modes cannot be used together. - - -Buffer handling ---------------- - -There may be buffer limitations (i.e. single ToPa entry) which means that actual -buffer sizes are limited to powers of 2 up to 4MiB (MAX_ORDER). In order to -provide other sizes, and in particular an arbitrarily large size, multiple -buffers are logically concatenated. However an interrupt must be used to switch -between buffers. That has two potential problems: - a) the interrupt may not be handled in time so that the current buffer - becomes full and some trace data is lost. - b) the interrupts may slow the system and affect the performance - results. - -If trace data is lost, the driver sets 'truncated' in the PERF_RECORD_AUX event -which the tools report as an error. - -In full-trace mode, the driver waits for data to be copied out before allowing -the (logical) buffer to wrap-around. If data is not copied out quickly enough, -again 'truncated' is set in the PERF_RECORD_AUX event. If the driver has to -wait, the intel_pt event gets disabled. Because it is difficult to know when -that happens, perf tools always re-enable the intel_pt event after copying out -data. - - -Intel PT and build ids ----------------------- - -By default "perf record" post-processes the event stream to find all build ids -for executables for all addresses sampled. Deliberately, Intel PT is not -decoded for that purpose (it would take too long). Instead the build ids for -all executables encountered (due to mmap, comm or task events) are included -in the perf.data file. - -To see buildids included in the perf.data file use the command: - - perf buildid-list - -If the perf.data file contains Intel PT data, that is the same as: - - perf buildid-list --with-hits - - -Snapshot mode and event disabling ---------------------------------- - -In order to make a snapshot, the intel_pt event is disabled using an IOCTL, -namely PERF_EVENT_IOC_DISABLE. However doing that can also disable the -collection of side-band information. In order to prevent that, a dummy -software event has been introduced that permits tracking events (like mmaps) to -continue to be recorded while intel_pt is disabled. That is important to ensure -there is complete side-band information to allow the decoding of subsequent -snapshots. - -A test has been created for that. To find the test: - - perf test list - ... - 23: Test using a dummy software event to keep tracking - -To run the test: - - perf test 23 - 23: Test using a dummy software event to keep tracking : Ok - - -perf record modes (nothing new here) ------------------------------------- - -perf record essentially operates in one of three modes: - per thread - per cpu - workload only - -"per thread" mode is selected by -t or by --per-thread (with -p or -u or just a -workload). 
-"per cpu" is selected by -C or -a. -"workload only" mode is selected by not using the other options but providing a -command to run (i.e. the workload). - -In per-thread mode an exact list of threads is traced. There is no inheritance. -Each thread has its own event buffer. - -In per-cpu mode all processes (or processes from the selected cgroup i.e. -G -option, or processes selected with -p or -u) are traced. Each cpu has its own -buffer. Inheritance is allowed. - -In workload-only mode, the workload is traced but with per-cpu buffers. -Inheritance is allowed. Note that you can now trace a workload in per-thread -mode by using the --per-thread option. - - -Privileged vs non-privileged users ----------------------------------- - -Unless /proc/sys/kernel/perf_event_paranoid is set to -1, unprivileged users -have memory limits imposed upon them. That affects what buffer sizes they can -have as outlined above. - -The v4.2 kernel introduced support for a context switch metadata event, -PERF_RECORD_SWITCH, which allows unprivileged users to see when their processes -are scheduled out and in, just not by whom, which is left for the -PERF_RECORD_SWITCH_CPU_WIDE, that is only accessible in system wide context, -which in turn requires CAP_SYS_ADMIN. - -Please see the 45ac1403f564 ("perf: Add PERF_RECORD_SWITCH to indicate context -switches") commit, that introduces these metadata events for further info. - -When working with kernels < v4.2, the following considerations must be taken, -as the sched:sched_switch tracepoints will be used to receive such information: - -Unless /proc/sys/kernel/perf_event_paranoid is set to -1, unprivileged users are -not permitted to use tracepoints which means there is insufficient side-band -information to decode Intel PT in per-cpu mode, and potentially workload-only -mode too if the workload creates new processes. - -Note also, that to use tracepoints, read-access to debugfs is required. So if -debugfs is not mounted or the user does not have read-access, it will again not -be possible to decode Intel PT in per-cpu mode. - - -sched_switch tracepoint ------------------------ - -The sched_switch tracepoint is used to provide side-band data for Intel PT -decoding in kernels where the PERF_RECORD_SWITCH metadata event isn't -available. - -The sched_switch events are automatically added. e.g. 
the second event shown -below: - - $ perf record -vv -e intel_pt//u uname - ------------------------------------------------------------ - perf_event_attr: - type 6 - size 112 - config 0x400 - { sample_period, sample_freq } 1 - sample_type IP|TID|TIME|CPU|IDENTIFIER - read_format ID - disabled 1 - inherit 1 - exclude_kernel 1 - exclude_hv 1 - enable_on_exec 1 - sample_id_all 1 - ------------------------------------------------------------ - sys_perf_event_open: pid 31104 cpu 0 group_fd -1 flags 0x8 - sys_perf_event_open: pid 31104 cpu 1 group_fd -1 flags 0x8 - sys_perf_event_open: pid 31104 cpu 2 group_fd -1 flags 0x8 - sys_perf_event_open: pid 31104 cpu 3 group_fd -1 flags 0x8 - ------------------------------------------------------------ - perf_event_attr: - type 2 - size 112 - config 0x108 - { sample_period, sample_freq } 1 - sample_type IP|TID|TIME|CPU|PERIOD|RAW|IDENTIFIER - read_format ID - inherit 1 - sample_id_all 1 - exclude_guest 1 - ------------------------------------------------------------ - sys_perf_event_open: pid -1 cpu 0 group_fd -1 flags 0x8 - sys_perf_event_open: pid -1 cpu 1 group_fd -1 flags 0x8 - sys_perf_event_open: pid -1 cpu 2 group_fd -1 flags 0x8 - sys_perf_event_open: pid -1 cpu 3 group_fd -1 flags 0x8 - ------------------------------------------------------------ - perf_event_attr: - type 1 - size 112 - config 0x9 - { sample_period, sample_freq } 1 - sample_type IP|TID|TIME|IDENTIFIER - read_format ID - disabled 1 - inherit 1 - exclude_kernel 1 - exclude_hv 1 - mmap 1 - comm 1 - enable_on_exec 1 - task 1 - sample_id_all 1 - mmap2 1 - comm_exec 1 - ------------------------------------------------------------ - sys_perf_event_open: pid 31104 cpu 0 group_fd -1 flags 0x8 - sys_perf_event_open: pid 31104 cpu 1 group_fd -1 flags 0x8 - sys_perf_event_open: pid 31104 cpu 2 group_fd -1 flags 0x8 - sys_perf_event_open: pid 31104 cpu 3 group_fd -1 flags 0x8 - mmap size 528384B - AUX area mmap length 4194304 - perf event ring buffer mmapped per cpu - Synthesizing auxtrace information - Linux - [ perf record: Woken up 1 times to write data ] - [ perf record: Captured and wrote 0.042 MB perf.data ] - -Note, the sched_switch event is only added if the user is permitted to use it -and only in per-cpu mode. - -Note also, the sched_switch event is only added if TSC packets are requested. -That is because, in the absence of timing information, the sched_switch events -cannot be matched against the Intel PT trace. - - -perf script -=========== - -By default, perf script will decode trace data found in the perf.data file. -This can be further controlled by new option --itrace. - - -New --itrace option -------------------- - -Having no option is the same as - - --itrace - -which, in turn, is the same as - - --itrace=cepwx - -The letters are: - - i synthesize "instructions" events - b synthesize "branches" events - x synthesize "transactions" events - w synthesize "ptwrite" events - p synthesize "power" events - c synthesize branches events (calls only) - r synthesize branches events (returns only) - e synthesize tracing error events - d create a debug log - g synthesize a call chain (use with i or x) - l synthesize last branch entries (use with i or x) - s skip initial number of events - -"Instructions" events look like they were recorded by "perf record -e -instructions". - -"Branches" events look like they were recorded by "perf record -e branches". "c" -and "r" can be combined to get calls and returns. - -"Transactions" events correspond to the start or end of transactions. 
The -'flags' field can be used in perf script to determine whether the event is a -tranasaction start, commit or abort. - -Note that "instructions", "branches" and "transactions" events depend on code -flow packets which can be disabled by using the config term "branch=0". Refer -to the config terms section above. - -"ptwrite" events record the payload of the ptwrite instruction and whether -"fup_on_ptw" was used. "ptwrite" events depend on PTWRITE packets which are -recorded only if the "ptw" config term was used. Refer to the config terms -section above. perf script "synth" field displays "ptwrite" information like -this: "ip: 0 payload: 0x123456789abcdef0" where "ip" is 1 if "fup_on_ptw" was -used. - -"Power" events correspond to power event packets and CBR (core-to-bus ratio) -packets. While CBR packets are always recorded when tracing is enabled, power -event packets are recorded only if the "pwr_evt" config term was used. Refer to -the config terms section above. The power events record information about -C-state changes, whereas CBR is indicative of CPU frequency. perf script -"event,synth" fields display information like this: - cbr: cbr: 22 freq: 2189 MHz (200%) - mwait: hints: 0x60 extensions: 0x1 - pwre: hw: 0 cstate: 2 sub-cstate: 0 - exstop: ip: 1 - pwrx: deepest cstate: 2 last cstate: 2 wake reason: 0x4 -Where: - "cbr" includes the frequency and the percentage of maximum non-turbo - "mwait" shows mwait hints and extensions - "pwre" shows C-state transitions (to a C-state deeper than C0) and - whether initiated by hardware - "exstop" indicates execution stopped and whether the IP was recorded - exactly, - "pwrx" indicates return to C0 -For more details refer to the Intel 64 and IA-32 Architectures Software -Developer Manuals. - -Error events show where the decoder lost the trace. Error events -are quite important. Users must know if what they are seeing is a complete -picture or not. - -The "d" option will cause the creation of a file "intel_pt.log" containing all -decoded packets and instructions. Note that this option slows down the decoder -and that the resulting file may be very large. - -In addition, the period of the "instructions" event can be specified. e.g. - - --itrace=i10us - -sets the period to 10us i.e. one instruction sample is synthesized for each 10 -microseconds of trace. Alternatives to "us" are "ms" (milliseconds), -"ns" (nanoseconds), "t" (TSC ticks) or "i" (instructions). - -"ms", "us" and "ns" are converted to TSC ticks. - -The timing information included with Intel PT does not give the time of every -instruction. Consequently, for the purpose of sampling, the decoder estimates -the time since the last timing packet based on 1 tick per instruction. The time -on the sample is *not* adjusted and reflects the last known value of TSC. - -For Intel PT, the default period is 100us. - -Setting it to a zero period means "as often as possible". - -In the case of Intel PT that is the same as a period of 1 and a unit of -'instructions' (i.e. --itrace=i1i). - -Also the call chain size (default 16, max. 1024) for instructions or -transactions events can be specified. e.g. - - --itrace=ig32 - --itrace=xg32 - -Also the number of last branch entries (default 64, max. 1024) for instructions or -transactions events can be specified. e.g. - - --itrace=il10 - --itrace=xl10 - -Note that last branch entries are cleared for each sample, so there is no overlap -from one sample to the next. - -To disable trace decoding entirely, use the option --no-itrace. 
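As a rough illustration of the --itrace grammar described above (an event letter, an optional period with unit, then optional 'g' and 'l' sizes), an option string could be assembled as in the sketch below; this is not perf's actual parser, and combining the pieces this way is an assumption based on the examples above:

	/* sketch: compose an --itrace option such as --itrace=i10usg32 */
	#include <stdio.h>

	int main(void)
	{
		char spec[64];
		unsigned int period_us = 10;	/* 'i' period of 10 microseconds */
		unsigned int chain = 32;	/* 'g' call chain size */

		snprintf(spec, sizeof(spec), "--itrace=i%uusg%u", period_us, chain);
		puts(spec);	/* prints: --itrace=i10usg32 */
		return 0;
	}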
- -It is also possible to skip events generated (instructions, branches, transactions) -at the beginning. This is useful to ignore initialization code. - - --itrace=i0nss1000000 - -skips the first million instructions. - -dump option ------------ - -perf script has an option (-D) to "dump" the events i.e. display the binary -data. - -When -D is used, Intel PT packets are displayed. The packet decoder does not -pay attention to PSB packets, but just decodes the bytes - so the packets seen -by the actual decoder may not be identical in places where the data is corrupt. -One example of that would be when the buffer-switching interrupt has been too -slow, and the buffer has been filled completely. In that case, the last packet -in the buffer might be truncated and immediately followed by a PSB as the trace -continues in the next buffer. - -To disable the display of Intel PT packets, combine the -D option with ---no-itrace. - - -perf report -=========== - -By default, perf report will decode trace data found in the perf.data file. -This can be further controlled by new option --itrace exactly the same as -perf script, with the exception that the default is --itrace=igxe. - - -perf inject -=========== - -perf inject also accepts the --itrace option in which case tracing data is -removed and replaced with the synthesized events. e.g. - - perf inject --itrace -i perf.data -o perf.data.new - -Below is an example of using Intel PT with autofdo. It requires autofdo -(https://github.com/google/autofdo) and gcc version 5. The bubble -sort example is from the AutoFDO tutorial (https://gcc.gnu.org/wiki/AutoFDO/Tutorial) -amended to take the number of elements as a parameter. - - $ gcc-5 -O3 sort.c -o sort_optimized - $ ./sort_optimized 30000 - Bubble sorting array of 30000 elements - 2254 ms - - $ cat ~/.perfconfig - [intel-pt] - mispred-all = on - - $ perf record -e intel_pt//u ./sort 3000 - Bubble sorting array of 3000 elements - 58 ms - [ perf record: Woken up 2 times to write data ] - [ perf record: Captured and wrote 3.939 MB perf.data ] - $ perf inject -i perf.data -o inj --itrace=i100usle --strip - $ ./create_gcov --binary=./sort --profile=inj --gcov=sort.gcov -gcov_version=1 - $ gcc-5 -O3 -fauto-profile=sort.gcov sort.c -o sort_autofdo - $ ./sort_autofdo 30000 - Bubble sorting array of 30000 elements - 2155 ms - -Note there is currently no advantage to using Intel PT instead of LBR, but -that may change in the future if greater use is made of the data. - - -PEBS via Intel PT -================= - -Some hardware has the feature to redirect PEBS records to the Intel PT trace. -Recording is selected by using the aux-output config term e.g. - - perf record -c 10000 -e '{intel_pt/branch=0/,cycles/aux-output/ppp}' uname - -Note that currently, software only supports redirecting at most one PEBS event. - -To display PEBS events from the Intel PT trace, use the itrace 'o' option e.g. 
- - perf script --itrace=oe +Documentation for support for Intel Processor Trace within perf tools' has moved to file perf-intel-pt.txt diff --git a/tools/perf/Documentation/perf-inject.txt b/tools/perf/Documentation/perf-inject.txt index a64d6588470e..70969ea73e01 100644 --- a/tools/perf/Documentation/perf-inject.txt +++ b/tools/perf/Documentation/perf-inject.txt @@ -66,4 +66,5 @@ include::itrace.txt[] SEE ALSO -------- -linkperf:perf-record[1], linkperf:perf-report[1], linkperf:perf-archive[1] +linkperf:perf-record[1], linkperf:perf-report[1], linkperf:perf-archive[1], +linkperf:perf-intel-pt[1] diff --git a/tools/perf/Documentation/perf-intel-pt.txt b/tools/perf/Documentation/perf-intel-pt.txt new file mode 100644 index 000000000000..456fdcbf26ac --- /dev/null +++ b/tools/perf/Documentation/perf-intel-pt.txt @@ -0,0 +1,1007 @@ +perf-intel-pt(1) +================ + +NAME +---- +perf-intel-pt - Support for Intel Processor Trace within perf tools + +SYNOPSIS +-------- +[verse] +'perf record' -e intel_pt// + +DESCRIPTION +----------- + +Intel Processor Trace (Intel PT) is an extension of Intel Architecture that +collects information about software execution such as control flow, execution +modes and timings and formats it into highly compressed binary packets. +Technical details are documented in the Intel 64 and IA-32 Architectures +Software Developer Manuals, Chapter 36 Intel Processor Trace. + +Intel PT is first supported in Intel Core M and 5th generation Intel Core +processors that are based on the Intel micro-architecture code name Broadwell. + +Trace data is collected by 'perf record' and stored within the perf.data file. +See below for options to 'perf record'. + +Trace data must be 'decoded' which involves walking the object code and matching +the trace data packets. For example a TNT packet only tells whether a +conditional branch was taken or not taken, so to make use of that packet the +decoder must know precisely which instruction was being executed. + +Decoding is done on-the-fly. The decoder outputs samples in the same format as +samples output by perf hardware events, for example as though the "instructions" +or "branches" events had been recorded. Presently 3 tools support this: +'perf script', 'perf report' and 'perf inject'. See below for more information +on using those tools. + +The main distinguishing feature of Intel PT is that the decoder can determine +the exact flow of software execution. Intel PT can be used to understand why +and how did software get to a certain point, or behave a certain way. The +software does not have to be recompiled, so Intel PT works with debug or release +builds, however the executed images are needed - which makes use in JIT-compiled +environments, or with self-modified code, a challenge. Also symbols need to be +provided to make sense of addresses. + +A limitation of Intel PT is that it produces huge amounts of trace data +(hundreds of megabytes per second per core) which takes a long time to decode, +for example two or three orders of magnitude longer than it took to collect. +Another limitation is the performance impact of tracing, something that will +vary depending on the use-case and architecture. + + +Quickstart +---------- + +It is important to start small. That is because it is easy to capture vastly +more data than can possibly be processed. + +The simplest thing to do with Intel PT is userspace profiling of small programs. +Data is captured with 'perf record' e.g. 
to trace 'ls' userspace-only: + + perf record -e intel_pt//u ls + +And profiled with 'perf report' e.g. + + perf report + +To also trace kernel space presents a problem, namely kernel self-modifying +code. A fairly good kernel image is available in /proc/kcore but to get an +accurate image a copy of /proc/kcore needs to be made under the same conditions +as the data capture. A script perf-with-kcore can do that, but beware that the +script makes use of 'sudo' to copy /proc/kcore. If you have perf installed +locally from the source tree you can do: + + ~/libexec/perf-core/perf-with-kcore record pt_ls -e intel_pt// -- ls + +which will create a directory named 'pt_ls' and put the perf.data file and +copies of /proc/kcore, /proc/kallsyms and /proc/modules into it. Then to use +'perf report' becomes: + + ~/libexec/perf-core/perf-with-kcore report pt_ls + +Because samples are synthesized after-the-fact, the sampling period can be +selected for reporting. e.g. sample every microsecond + + ~/libexec/perf-core/perf-with-kcore report pt_ls --itrace=i1usge + +See the sections below for more information about the --itrace option. + +Beware the smaller the period, the more samples that are produced, and the +longer it takes to process them. + +Also note that the coarseness of Intel PT timing information will start to +distort the statistical value of the sampling as the sampling period becomes +smaller. + +To represent software control flow, "branches" samples are produced. By default +a branch sample is synthesized for every single branch. To get an idea what +data is available you can use the 'perf script' tool with all itrace sampling +options, which will list all the samples. + + perf record -e intel_pt//u ls + perf script --itrace=ibxwpe + +An interesting field that is not printed by default is 'flags' which can be +displayed as follows: + + perf script --itrace=ibxwpe -F+flags + +The flags are "bcrosyiABEx" which stand for branch, call, return, conditional, +system, asynchronous, interrupt, transaction abort, trace begin, trace end, and +in transaction, respectively. + +Another interesting field that is not printed by default is 'ipc' which can be +displayed as follows: + + perf script --itrace=be -F+ipc + +There are two ways that instructions-per-cycle (IPC) can be calculated depending +on the recording. + +If the 'cyc' config term (see config terms section below) was used, then IPC is +calculated using the cycle count from CYC packets, otherwise MTC packets are +used - refer to the 'mtc' config term. When MTC is used, however, the values +are less accurate because the timing is less accurate. + +Because Intel PT does not update the cycle count on every branch or instruction, +the values will often be zero. When there are values, they will be the number +of instructions and number of cycles since the last update, and thus represent +the average IPC since the last IPC for that event type. Note IPC for "branches" +events is calculated separately from IPC for "instructions" events. + +Also note that the IPC instruction count may or may not include the current +instruction. If the cycle count is associated with an asynchronous branch +(e.g. page fault or interrupt), then the instruction count does not include the +current instruction, otherwise it does. That is consistent with whether or not +that instruction has retired when the cycle count is updated. + +Another note, in the case of "branches" events, non-taken branches are not +presently sampled, so IPC values for them do not appear e.g. 
a CYC packet with a +TNT packet that starts with a non-taken branch. To see every possible IPC +value, "instructions" events can be used e.g. --itrace=i0ns + +While it is possible to create scripts to analyze the data, an alternative +approach is available to export the data to a sqlite or postgresql database. +Refer to script export-to-sqlite.py or export-to-postgresql.py for more details, +and to script exported-sql-viewer.py for an example of using the database. + +There is also script intel-pt-events.py which provides an example of how to +unpack the raw data for power events and PTWRITE. + +As mentioned above, it is easy to capture too much data. One way to limit the +data captured is to use 'snapshot' mode which is explained further below. +Refer to 'new snapshot option' and 'Intel PT modes of operation' further below. + +Another problem that will be experienced is decoder errors. They can be caused +by inability to access the executed image, self-modified or JIT-ed code, or the +inability to match side-band information (such as context switches and mmaps) +which results in the decoder not knowing what code was executed. + +There is also the problem of perf not being able to copy the data fast enough, +resulting in data lost because the buffer was full. See 'Buffer handling' below +for more details. + + +perf record +----------- + +new event +~~~~~~~~~ + +The Intel PT kernel driver creates a new PMU for Intel PT. PMU events are +selected by providing the PMU name followed by the "config" separated by slashes. +An enhancement has been made to allow default "config" e.g. the option + + -e intel_pt// + +will use a default config value. Currently that is the same as + + -e intel_pt/tsc,noretcomp=0/ + +which is the same as + + -e intel_pt/tsc=1,noretcomp=0/ + +Note there are now new config terms - see section 'config terms' further below. + +The config terms are listed in /sys/devices/intel_pt/format. They are bit +fields within the config member of the struct perf_event_attr which is +passed to the kernel by the perf_event_open system call. They correspond to bit +fields in the IA32_RTIT_CTL MSR. Here is a list of them and their definitions: + + $ grep -H . /sys/bus/event_source/devices/intel_pt/format/* + /sys/bus/event_source/devices/intel_pt/format/cyc:config:1 + /sys/bus/event_source/devices/intel_pt/format/cyc_thresh:config:19-22 + /sys/bus/event_source/devices/intel_pt/format/mtc:config:9 + /sys/bus/event_source/devices/intel_pt/format/mtc_period:config:14-17 + /sys/bus/event_source/devices/intel_pt/format/noretcomp:config:11 + /sys/bus/event_source/devices/intel_pt/format/psb_period:config:24-27 + /sys/bus/event_source/devices/intel_pt/format/tsc:config:10 + +Note that the default config must be overridden for each term i.e. + + -e intel_pt/noretcomp=0/ + +is the same as: + + -e intel_pt/tsc=1,noretcomp=0/ + +So, to disable TSC packets use: + + -e intel_pt/tsc=0/ + +It is also possible to specify the config value explicitly: + + -e intel_pt/config=0x400/ + +Note that, as with all events, the event is suffixed with event modifiers: + + u userspace + k kernel + h hypervisor + G guest + H host + p precise ip + +'h', 'G' and 'H' are for virtualization which is not supported by Intel PT. +'p' is also not relevant to Intel PT. So only options 'u' and 'k' are +meaningful for Intel PT. + +perf_event_attr is displayed if the -vv option is used e.g. 
+
+perf_event_attr is displayed if the -vv option is used e.g.
+
+ ------------------------------------------------------------
+ perf_event_attr:
+  type                             6
+  size                             112
+  config                           0x400
+  { sample_period, sample_freq }   1
+  sample_type                      IP|TID|TIME|CPU|IDENTIFIER
+  read_format                      ID
+  disabled                         1
+  inherit                          1
+  exclude_kernel                   1
+  exclude_hv                       1
+  enable_on_exec                   1
+  sample_id_all                    1
+ ------------------------------------------------------------
+ sys_perf_event_open: pid 31104  cpu 0  group_fd -1  flags 0x8
+ sys_perf_event_open: pid 31104  cpu 1  group_fd -1  flags 0x8
+ sys_perf_event_open: pid 31104  cpu 2  group_fd -1  flags 0x8
+ sys_perf_event_open: pid 31104  cpu 3  group_fd -1  flags 0x8
+ ------------------------------------------------------------
+
+
+config terms
+~~~~~~~~~~~~
+
+The June 2015 version of Intel 64 and IA-32 Architectures Software Developer
+Manuals, Chapter 36 Intel Processor Trace, defined new Intel PT features.
+Some of the features are reflected in new config terms. All the config terms
+are described below.
+
+tsc		Always supported. Produces TSC timestamp packets to provide
+		timing information. In some cases it is possible to decode
+		without timing information, for example a per-thread context
+		that does not overlap executable memory maps.
+
+		The default config selects tsc (i.e. tsc=1).
+
+noretcomp	Always supported. Disables "return compression" so a TIP packet
+		is produced when a function returns. Causes more packets to be
+		produced but might make decoding more reliable.
+
+		The default config does not select noretcomp (i.e. noretcomp=0).
+
+psb_period	Allows the frequency of PSB packets to be specified.
+
+		The PSB packet is a synchronization packet that provides a
+		starting point for decoding or recovery from errors.
+
+		Support for psb_period is indicated by:
+
+			/sys/bus/event_source/devices/intel_pt/caps/psb_cyc
+
+		which contains "1" if the feature is supported and "0"
+		otherwise.
+
+		Valid values are given by:
+
+			/sys/bus/event_source/devices/intel_pt/caps/psb_periods
+
+		which contains a hexadecimal value, the bits of which represent
+		valid values e.g. bit 2 set means value 2 is valid.
+
+		The psb_period value is converted to the approximate number of
+		trace bytes between PSB packets as:
+
+			2 ^ (value + 11)
+
+		e.g. value 3 means 16KiB between PSBs
+
+		If an invalid value is entered, the error message
+		will give a list of valid values e.g.
+
+			$ perf record -e intel_pt/psb_period=15/u uname
+			Invalid psb_period for intel_pt. Valid values are: 0-5
+
+		If MTC packets are selected, the default config selects a value
+		of 3 (i.e. psb_period=3) or the nearest lower value that is
+		supported (0 is always supported). Otherwise the default is 0.
+
+		If decoding is expected to be reliable and the buffer is large
+		then a large PSB period can be used.
+
+		Because a TSC packet is produced with PSB, the PSB period can
+		also affect the granularity of timing information in the
+		absence of MTC or CYC.
+
+mtc		Produces MTC timing packets.
+
+		MTC packets provide finer grain timestamp information than TSC
+		packets. MTC packets record time using the hardware crystal
+		clock (CTC) which is related to TSC packets using a TMA packet.
+
+		Support for this feature is indicated by:
+
+			/sys/bus/event_source/devices/intel_pt/caps/mtc
+
+		which contains "1" if the feature is supported and
+		"0" otherwise.
+
+		The frequency of MTC packets can also be specified - see
+		mtc_period below.
+
+mtc_period	Specifies how frequently MTC packets are produced - see mtc
+		above for how to determine if MTC packets are supported.
+ + Valid values are given by: + + /sys/bus/event_source/devices/intel_pt/caps/mtc_periods + + which contains a hexadecimal value, the bits of which represent + valid values e.g. bit 2 set means value 2 is valid. + + The mtc_period value is converted to the MTC frequency as: + + CTC-frequency / (2 ^ value) + + e.g. value 3 means one eighth of CTC-frequency + + Where CTC is the hardware crystal clock, the frequency of which + can be related to TSC via values provided in cpuid leaf 0x15. + + If an invalid value is entered, the error message + will give a list of valid values e.g. + + $ perf record -e intel_pt/mtc_period=15/u uname + Invalid mtc_period for intel_pt. Valid values are: 0,3,6,9 + + The default value is 3 or the nearest lower value + that is supported (0 is always supported). + +cyc Produces CYC timing packets. + + CYC packets provide even finer grain timestamp information than + MTC and TSC packets. A CYC packet contains the number of CPU + cycles since the last CYC packet. Unlike MTC and TSC packets, + CYC packets are only sent when another packet is also sent. + + Support for this feature is indicated by: + + /sys/bus/event_source/devices/intel_pt/caps/psb_cyc + + which contains "1" if the feature is supported and + "0" otherwise. + + The number of CYC packets produced can be reduced by specifying + a threshold - see cyc_thresh below. + +cyc_thresh Specifies how frequently CYC packets are produced - see cyc + above for how to determine if CYC packets are supported. + + Valid cyc_thresh values are given by: + + /sys/bus/event_source/devices/intel_pt/caps/cycle_thresholds + + which contains a hexadecimal value, the bits of which represent + valid values e.g. bit 2 set means value 2 is valid. + + The cyc_thresh value represents the minimum number of CPU cycles + that must have passed before a CYC packet can be sent. The + number of CPU cycles is: + + 2 ^ (value - 1) + + e.g. value 4 means 8 CPU cycles must pass before a CYC packet + can be sent. Note a CYC packet is still only sent when another + packet is sent, not at, e.g. every 8 CPU cycles. + + If an invalid value is entered, the error message + will give a list of valid values e.g. + + $ perf record -e intel_pt/cyc,cyc_thresh=15/u uname + Invalid cyc_thresh for intel_pt. Valid values are: 0-12 + + CYC packets are not requested by default. + +pt Specifies pass-through which enables the 'branch' config term. + + The default config selects 'pt' if it is available, so a user will + never need to specify this term. + +branch Enable branch tracing. Branch tracing is enabled by default so to + disable branch tracing use 'branch=0'. + + The default config selects 'branch' if it is available. + +ptw Enable PTWRITE packets which are produced when a ptwrite instruction + is executed. + + Support for this feature is indicated by: + + /sys/bus/event_source/devices/intel_pt/caps/ptwrite + + which contains "1" if the feature is supported and + "0" otherwise. + +fup_on_ptw Enable a FUP packet to follow the PTWRITE packet. The FUP packet + provides the address of the ptwrite instruction. In the absence of + fup_on_ptw, the decoder will use the address of the previous branch + if branch tracing is enabled, otherwise the address will be zero. + Note that fup_on_ptw will work even when branch tracing is disabled. + +pwr_evt Enable power events. The power events provide information about + changes to the CPU C-state. 
+ + Support for this feature is indicated by: + + /sys/bus/event_source/devices/intel_pt/caps/power_event_trace + + which contains "1" if the feature is supported and + "0" otherwise. + + +AUX area sampling option +~~~~~~~~~~~~~~~~~~~~~~~~ + +To select Intel PT "sampling" the AUX area sampling option can be used: + + --aux-sample + +Optionally it can be followed by the sample size in bytes e.g. + + --aux-sample=8192 + +In addition, the Intel PT event to sample must be defined e.g. + + -e intel_pt//u + +Samples on other events will be created containing Intel PT data e.g. the +following will create Intel PT samples on the branch-misses event, note the +events must be grouped using {}: + + perf record --aux-sample -e '{intel_pt//u,branch-misses:u}' + +An alternative to '--aux-sample' is to add the config term 'aux-sample-size' to +events. In this case, the grouping is implied e.g. + + perf record -e intel_pt//u -e branch-misses/aux-sample-size=8192/u + +is the same as: + + perf record -e '{intel_pt//u,branch-misses/aux-sample-size=8192/u}' + +but allows for also using an address filter e.g.: + + perf record -e intel_pt//u --filter 'filter * @/bin/ls' -e branch-misses/aux-sample-size=8192/u -- ls + +It is important to select a sample size that is big enough to contain at least +one PSB packet. If not a warning will be displayed: + + Intel PT sample size (%zu) may be too small for PSB period (%zu) + +The calculation used for that is: if sample_size <= psb_period + 256 display the +warning. When sampling is used, psb_period defaults to 0 (2KiB). + +The default sample size is 4KiB. + +The sample size is passed in aux_sample_size in struct perf_event_attr. The +sample size is limited by the maximum event size which is 64KiB. It is +difficult to know how big the event might be without the trace sample attached, +but the tool validates that the sample size is not greater than 60KiB. + + +new snapshot option +~~~~~~~~~~~~~~~~~~~ + +The difference between full trace and snapshot from the kernel's perspective is +that in full trace we don't overwrite trace data that the user hasn't collected +yet (and indicated that by advancing aux_tail), whereas in snapshot mode we let +the trace run and overwrite older data in the buffer so that whenever something +interesting happens, we can stop it and grab a snapshot of what was going on +around that interesting moment. + +To select snapshot mode a new option has been added: + + -S + +Optionally it can be followed by the snapshot size e.g. + + -S0x100000 + +The default snapshot size is the auxtrace mmap size. If neither auxtrace mmap size +nor snapshot size is specified, then the default is 4MiB for privileged users +(or if /proc/sys/kernel/perf_event_paranoid < 0), 128KiB for unprivileged users. +If an unprivileged user does not specify mmap pages, the mmap pages will be +reduced as described in the 'new auxtrace mmap size option' section below. + +The snapshot size is displayed if the option -vv is used e.g. + + Intel PT snapshot size: %zu + + +new auxtrace mmap size option +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Intel PT buffer size is specified by an addition to the -m option e.g. + + -m,16 + +selects a buffer size of 16 pages i.e. 64KiB. + +Note that the existing functionality of -m is unchanged. The auxtrace mmap size +is specified by the optional addition of a comma and the value. + +The default auxtrace mmap size for Intel PT is 4MiB/page_size for privileged users +(or if /proc/sys/kernel/perf_event_paranoid < 0), 128KiB for unprivileged users. 
+If an unprivileged user does not specify mmap pages, the mmap pages will be
+reduced from the default 512KiB/page_size to 256KiB/page_size, otherwise the
+user is likely to get an error as they exceed their mlock limit (Max locked
+memory as shown in /proc/self/limits). Note that perf does not count the first
+512KiB (actually /proc/sys/kernel/perf_event_mlock_kb minus 1 page) per cpu
+against the mlock limit so an unprivileged user is allowed 512KiB per cpu plus
+their mlock limit (which defaults to 64KiB but is not multiplied by the number
+of cpus).
+
+In full-trace mode, powers of two are allowed for buffer size, with a minimum
+size of 2 pages. In snapshot mode or sampling mode, it is the same but the
+minimum size is 1 page.
+
+The mmap size and auxtrace mmap size are displayed if the -vv option is used e.g.
+
+	mmap length 528384
+	auxtrace mmap length 4198400
+
+
+Intel PT modes of operation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Intel PT can be used in 3 modes:
+	full-trace mode
+	sample mode
+	snapshot mode
+
+Full-trace mode traces continuously e.g.
+
+	perf record -e intel_pt//u uname
+
+Sample mode attaches an Intel PT sample to other events e.g.
+
+	perf record --aux-sample -e intel_pt//u -e branch-misses:u
+
+Snapshot mode captures the available data when a signal is sent e.g.
+
+	perf record -v -e intel_pt//u -S ./loopy 1000000000 &
+	[1] 11435
+	kill -USR2 11435
+	Recording AUX area tracing snapshot
+
+Note that the signal sent is SIGUSR2.
+Note that "Recording AUX area tracing snapshot" is displayed because the -v
+option is used.
+
+The 3 modes cannot be used together.
+
+
+Buffer handling
+~~~~~~~~~~~~~~~
+
+There may be buffer limitations (i.e. single ToPa entry) which means that actual
+buffer sizes are limited to powers of 2 up to 4MiB (MAX_ORDER). In order to
+provide other sizes, and in particular an arbitrarily large size, multiple
+buffers are logically concatenated. However an interrupt must be used to switch
+between buffers. That has two potential problems:
+	a) the interrupt may not be handled in time so that the current buffer
+	becomes full and some trace data is lost.
+	b) the interrupts may slow the system and affect the performance
+	results.
+
+If trace data is lost, the driver sets 'truncated' in the PERF_RECORD_AUX event
+which the tools report as an error.
+
+In full-trace mode, the driver waits for data to be copied out before allowing
+the (logical) buffer to wrap-around. If data is not copied out quickly enough,
+again 'truncated' is set in the PERF_RECORD_AUX event. If the driver has to
+wait, the intel_pt event gets disabled. Because it is difficult to know when
+that happens, perf tools always re-enable the intel_pt event after copying out
+data.
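+
+The PERF_RECORD_AUX record layout is part of the perf_event_open(2) ABI. As a
+rough illustration (a sketch, not code from the perf tools), a ring-buffer
+consumer could check the 'truncated' flag like this:
+
+	#include <linux/perf_event.h>
+	#include <stdint.h>
+	#include <stdio.h>
+
+	/* PERF_RECORD_AUX layout per the perf_event_open(2) ABI */
+	struct aux_event {
+		struct perf_event_header header;
+		uint64_t aux_offset;
+		uint64_t aux_size;
+		uint64_t flags;
+	};
+
+	static void check_aux_event(const struct aux_event *ev)
+	{
+		if (ev->header.type == PERF_RECORD_AUX &&
+		    (ev->flags & PERF_AUX_FLAG_TRUNCATED))
+			fprintf(stderr, "AUX data truncated: trace data lost\n");
+	}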
+
+
+Intel PT and build ids
+~~~~~~~~~~~~~~~~~~~~~~
+
+By default "perf record" post-processes the event stream to find all build ids
+for executables for all addresses sampled. Deliberately, Intel PT is not
+decoded for that purpose (it would take too long). Instead the build ids for
+all executables encountered (due to mmap, comm or task events) are included
+in the perf.data file.
+
+To see buildids included in the perf.data file use the command:
+
+	perf buildid-list
+
+If the perf.data file contains Intel PT data, that is the same as:
+
+	perf buildid-list --with-hits
+
+
+Snapshot mode and event disabling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In order to make a snapshot, the intel_pt event is disabled using an IOCTL,
+namely PERF_EVENT_IOC_DISABLE. However doing that can also disable the
+collection of side-band information. In order to prevent that, a dummy
+software event has been introduced that permits tracking events (like mmaps) to
+continue to be recorded while intel_pt is disabled. That is important to ensure
+there is complete side-band information to allow the decoding of subsequent
+snapshots.
+
+A test has been created for that. To find the test:
+
+	perf test list
+	...
+	23: Test using a dummy software event to keep tracking
+
+To run the test:
+
+	perf test 23
+	23: Test using a dummy software event to keep tracking : Ok
+
+
+perf record modes (nothing new here)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+perf record essentially operates in one of three modes:
+	per thread
+	per cpu
+	workload only
+
+"per thread" mode is selected by -t or by --per-thread (with -p or -u or just a
+workload).
+"per cpu" is selected by -C or -a.
+"workload only" mode is selected by not using the other options but providing a
+command to run (i.e. the workload).
+
+In per-thread mode an exact list of threads is traced. There is no inheritance.
+Each thread has its own event buffer.
+
+In per-cpu mode all processes (or processes from the selected cgroup i.e. -G
+option, or processes selected with -p or -u) are traced. Each cpu has its own
+buffer. Inheritance is allowed.
+
+In workload-only mode, the workload is traced but with per-cpu buffers.
+Inheritance is allowed. Note that you can now trace a workload in per-thread
+mode by using the --per-thread option.
+
+
+Privileged vs non-privileged users
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Unless /proc/sys/kernel/perf_event_paranoid is set to -1, unprivileged users
+have memory limits imposed upon them. That affects what buffer sizes they can
+have as outlined above.
+
+The v4.2 kernel introduced support for a context switch metadata event,
+PERF_RECORD_SWITCH, which allows unprivileged users to see when their processes
+are scheduled out and in, just not by whom. That information is provided by
+PERF_RECORD_SWITCH_CPU_WIDE, which is only accessible in system wide context
+and in turn requires CAP_SYS_ADMIN.
+
+Please see the 45ac1403f564 ("perf: Add PERF_RECORD_SWITCH to indicate context
+switches") commit, which introduces these metadata events, for further info.
+
+When working with kernels < v4.2, the following considerations apply, as the
+sched:sched_switch tracepoint will be used to receive such information:
+
+Unless /proc/sys/kernel/perf_event_paranoid is set to -1, unprivileged users are
+not permitted to use tracepoints which means there is insufficient side-band
+information to decode Intel PT in per-cpu mode, and potentially workload-only
+mode too if the workload creates new processes.
+
+Note also that, to use tracepoints, read-access to debugfs is required. So if
+debugfs is not mounted or the user does not have read-access, it will again not
+be possible to decode Intel PT in per-cpu mode.
+
+
+sched_switch tracepoint
+~~~~~~~~~~~~~~~~~~~~~~~
+
+The sched_switch tracepoint is used to provide side-band data for Intel PT
+decoding in kernels where the PERF_RECORD_SWITCH metadata event isn't
+available.
+
+The sched_switch events are automatically added. e.g.
the second event shown +below: + + $ perf record -vv -e intel_pt//u uname + ------------------------------------------------------------ + perf_event_attr: + type 6 + size 112 + config 0x400 + { sample_period, sample_freq } 1 + sample_type IP|TID|TIME|CPU|IDENTIFIER + read_format ID + disabled 1 + inherit 1 + exclude_kernel 1 + exclude_hv 1 + enable_on_exec 1 + sample_id_all 1 + ------------------------------------------------------------ + sys_perf_event_open: pid 31104 cpu 0 group_fd -1 flags 0x8 + sys_perf_event_open: pid 31104 cpu 1 group_fd -1 flags 0x8 + sys_perf_event_open: pid 31104 cpu 2 group_fd -1 flags 0x8 + sys_perf_event_open: pid 31104 cpu 3 group_fd -1 flags 0x8 + ------------------------------------------------------------ + perf_event_attr: + type 2 + size 112 + config 0x108 + { sample_period, sample_freq } 1 + sample_type IP|TID|TIME|CPU|PERIOD|RAW|IDENTIFIER + read_format ID + inherit 1 + sample_id_all 1 + exclude_guest 1 + ------------------------------------------------------------ + sys_perf_event_open: pid -1 cpu 0 group_fd -1 flags 0x8 + sys_perf_event_open: pid -1 cpu 1 group_fd -1 flags 0x8 + sys_perf_event_open: pid -1 cpu 2 group_fd -1 flags 0x8 + sys_perf_event_open: pid -1 cpu 3 group_fd -1 flags 0x8 + ------------------------------------------------------------ + perf_event_attr: + type 1 + size 112 + config 0x9 + { sample_period, sample_freq } 1 + sample_type IP|TID|TIME|IDENTIFIER + read_format ID + disabled 1 + inherit 1 + exclude_kernel 1 + exclude_hv 1 + mmap 1 + comm 1 + enable_on_exec 1 + task 1 + sample_id_all 1 + mmap2 1 + comm_exec 1 + ------------------------------------------------------------ + sys_perf_event_open: pid 31104 cpu 0 group_fd -1 flags 0x8 + sys_perf_event_open: pid 31104 cpu 1 group_fd -1 flags 0x8 + sys_perf_event_open: pid 31104 cpu 2 group_fd -1 flags 0x8 + sys_perf_event_open: pid 31104 cpu 3 group_fd -1 flags 0x8 + mmap size 528384B + AUX area mmap length 4194304 + perf event ring buffer mmapped per cpu + Synthesizing auxtrace information + Linux + [ perf record: Woken up 1 times to write data ] + [ perf record: Captured and wrote 0.042 MB perf.data ] + +Note, the sched_switch event is only added if the user is permitted to use it +and only in per-cpu mode. + +Note also, the sched_switch event is only added if TSC packets are requested. +That is because, in the absence of timing information, the sched_switch events +cannot be matched against the Intel PT trace. + + +perf script +----------- + +By default, perf script will decode trace data found in the perf.data file. +This can be further controlled by new option --itrace. + + +New --itrace option +~~~~~~~~~~~~~~~~~~~ + +Having no option is the same as + + --itrace + +which, in turn, is the same as + + --itrace=cepwx + +The letters are: + + i synthesize "instructions" events + b synthesize "branches" events + x synthesize "transactions" events + w synthesize "ptwrite" events + p synthesize "power" events + c synthesize branches events (calls only) + r synthesize branches events (returns only) + e synthesize tracing error events + d create a debug log + g synthesize a call chain (use with i or x) + l synthesize last branch entries (use with i or x) + s skip initial number of events + +"Instructions" events look like they were recorded by "perf record -e +instructions". + +"Branches" events look like they were recorded by "perf record -e branches". "c" +and "r" can be combined to get calls and returns. + +"Transactions" events correspond to the start or end of transactions. 
The
+'flags' field can be used in perf script to determine whether the event is a
+transaction start, commit or abort.
+
+Note that "instructions", "branches" and "transactions" events depend on code
+flow packets which can be disabled by using the config term "branch=0". Refer
+to the config terms section above.
+
+"ptwrite" events record the payload of the ptwrite instruction and whether
+"fup_on_ptw" was used. "ptwrite" events depend on PTWRITE packets which are
+recorded only if the "ptw" config term was used. Refer to the config terms
+section above. perf script "synth" field displays "ptwrite" information like
+this: "ip: 0 payload: 0x123456789abcdef0" where "ip" is 1 if "fup_on_ptw" was
+used.
+
+"Power" events correspond to power event packets and CBR (core-to-bus ratio)
+packets. While CBR packets are always recorded when tracing is enabled, power
+event packets are recorded only if the "pwr_evt" config term was used. Refer to
+the config terms section above. The power events record information about
+C-state changes, whereas CBR is indicative of CPU frequency. perf script
+"event,synth" fields display information like this:
+	cbr:	cbr: 22 freq: 2189 MHz (200%)
+	mwait:	hints: 0x60 extensions: 0x1
+	pwre:	hw: 0 cstate: 2 sub-cstate: 0
+	exstop:	ip: 1
+	pwrx:	deepest cstate: 2 last cstate: 2 wake reason: 0x4
+Where:
+	"cbr" includes the frequency and the percentage of maximum non-turbo
+	"mwait" shows mwait hints and extensions
+	"pwre" shows C-state transitions (to a C-state deeper than C0) and
+	whether initiated by hardware
+	"exstop" indicates execution stopped and whether the IP was recorded
+	exactly
+	"pwrx" indicates return to C0
+For more details refer to the Intel 64 and IA-32 Architectures Software
+Developer Manuals.
+
+Error events show where the decoder lost the trace. Error events are quite
+important: users must know whether what they are seeing is a complete picture
+or not.
+
+The "d" option will cause the creation of a file "intel_pt.log" containing all
+decoded packets and instructions. Note that this option slows down the decoder
+and that the resulting file may be very large.
+
+In addition, the period of the "instructions" event can be specified e.g.
+
+	--itrace=i10us
+
+sets the period to 10us i.e. one instruction sample is synthesized for each 10
+microseconds of trace. Alternatives to "us" are "ms" (milliseconds),
+"ns" (nanoseconds), "t" (TSC ticks) or "i" (instructions).
+
+"ms", "us" and "ns" are converted to TSC ticks.
+
+The timing information included with Intel PT does not give the time of every
+instruction. Consequently, for the purpose of sampling, the decoder estimates
+the time since the last timing packet based on 1 tick per instruction. The time
+on the sample is *not* adjusted and reflects the last known value of TSC.
+
+For Intel PT, the default period is 100us.
+
+Setting it to a zero period means "as often as possible".
+
+In the case of Intel PT that is the same as a period of 1 and a unit of
+'instructions' (i.e. --itrace=i1i).
+
+Also the call chain size (default 16, max. 1024) for instructions or
+transactions events can be specified e.g.
+
+	--itrace=ig32
+	--itrace=xg32
+
+Also the number of last branch entries (default 64, max. 1024) for instructions
+or transactions events can be specified e.g.
+
+	--itrace=il10
+	--itrace=xl10
+
+Note that last branch entries are cleared for each sample, so there is no
+overlap from one sample to the next.
+
+To disable trace decoding entirely, use the option --no-itrace.
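+
+Note that the above options can be combined in a single --itrace string. For
+example (the values here are illustrative only):
+
+	--itrace=i100usg32l16
+
+synthesizes an instruction sample every 100 microseconds of trace, each with a
+call chain of up to 32 entries and up to 16 last branch entries.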
+ +It is also possible to skip events generated (instructions, branches, transactions) +at the beginning. This is useful to ignore initialization code. + + --itrace=i0nss1000000 + +skips the first million instructions. + +dump option +~~~~~~~~~~~ + +perf script has an option (-D) to "dump" the events i.e. display the binary +data. + +When -D is used, Intel PT packets are displayed. The packet decoder does not +pay attention to PSB packets, but just decodes the bytes - so the packets seen +by the actual decoder may not be identical in places where the data is corrupt. +One example of that would be when the buffer-switching interrupt has been too +slow, and the buffer has been filled completely. In that case, the last packet +in the buffer might be truncated and immediately followed by a PSB as the trace +continues in the next buffer. + +To disable the display of Intel PT packets, combine the -D option with +--no-itrace. + + +perf report +----------- + +By default, perf report will decode trace data found in the perf.data file. +This can be further controlled by new option --itrace exactly the same as +perf script, with the exception that the default is --itrace=igxe. + + +perf inject +----------- + +perf inject also accepts the --itrace option in which case tracing data is +removed and replaced with the synthesized events. e.g. + + perf inject --itrace -i perf.data -o perf.data.new + +Below is an example of using Intel PT with autofdo. It requires autofdo +(https://github.com/google/autofdo) and gcc version 5. The bubble +sort example is from the AutoFDO tutorial (https://gcc.gnu.org/wiki/AutoFDO/Tutorial) +amended to take the number of elements as a parameter. + + $ gcc-5 -O3 sort.c -o sort_optimized + $ ./sort_optimized 30000 + Bubble sorting array of 30000 elements + 2254 ms + + $ cat ~/.perfconfig + [intel-pt] + mispred-all = on + + $ perf record -e intel_pt//u ./sort 3000 + Bubble sorting array of 3000 elements + 58 ms + [ perf record: Woken up 2 times to write data ] + [ perf record: Captured and wrote 3.939 MB perf.data ] + $ perf inject -i perf.data -o inj --itrace=i100usle --strip + $ ./create_gcov --binary=./sort --profile=inj --gcov=sort.gcov -gcov_version=1 + $ gcc-5 -O3 -fauto-profile=sort.gcov sort.c -o sort_autofdo + $ ./sort_autofdo 30000 + Bubble sorting array of 30000 elements + 2155 ms + +Note there is currently no advantage to using Intel PT instead of LBR, but +that may change in the future if greater use is made of the data. + + +PEBS via Intel PT +----------------- + +Some hardware has the feature to redirect PEBS records to the Intel PT trace. +Recording is selected by using the aux-output config term e.g. + + perf record -c 10000 -e '{intel_pt/branch=0/,cycles/aux-output/ppp}' uname + +Note that currently, software only supports redirecting at most one PEBS event. + +To display PEBS events from the Intel PT trace, use the itrace 'o' option e.g. 
+
+	perf script --itrace=oe
+
+
+SEE ALSO
+--------
+
+linkperf:perf-record[1], linkperf:perf-script[1], linkperf:perf-report[1],
+linkperf:perf-inject[1]
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index b23a4012a606..7f4db7592467 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -589,4 +589,4 @@ appended unit character - B/K/M/G
 SEE ALSO
 --------
-linkperf:perf-stat[1], linkperf:perf-list[1]
+linkperf:perf-stat[1], linkperf:perf-list[1], linkperf:perf-intel-pt[1]
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index db61f16ffa56..bd0a029d4c08 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -546,4 +546,5 @@ include::callchain-overhead-calculation.txt[]
 SEE ALSO
 --------
-linkperf:perf-stat[1], linkperf:perf-annotate[1], linkperf:perf-record[1]
+linkperf:perf-stat[1], linkperf:perf-annotate[1], linkperf:perf-record[1],
+linkperf:perf-intel-pt[1]
diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
index 2599b057e47b..db6a36aac47e 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -429,4 +429,4 @@ include::itrace.txt[]
 SEE ALSO
 --------
 linkperf:perf-record[1], linkperf:perf-script-perl[1],
-linkperf:perf-script-python[1]
+linkperf:perf-script-python[1], linkperf:perf-intel-pt[1]
diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
index 9431b8066fb4..4d56586b2fb9 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -334,6 +334,15 @@ Configure all used events to run in kernel space.
 --all-user::
 Configure all used events to run in user space.
+--percore-show-thread::
+The event modifier "percore" sums up the event counts for all hardware
+threads in a core and shows the counts per core.
+
+This option, used together with the event modifier "percore", also sums up
+the event counts for all hardware threads in a core, but shows the sum counts
+per hardware thread. This is essentially a replacement for the 'any' bit and
+is convenient for post processing.
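+
+For example (an illustrative invocation; event 0x3c, UnHalted Core Cycles on
+many Intel CPUs, is used here only as a placeholder):
+
+  # perf stat -e cpu/event=0x3c,percore=1/ -a -A --percore-show-thread -- sleep 1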
+ EXAMPLES -------- diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c index c03c36fde7e2..5e697cd2224a 100644 --- a/tools/perf/builtin-diff.c +++ b/tools/perf/builtin-diff.c @@ -572,29 +572,12 @@ static void init_block_hist(struct block_hist *bh) bh->valid = true; } -static int block_pair_cmp(struct hist_entry *a, struct hist_entry *b) -{ - struct block_info *bi_a = a->block_info; - struct block_info *bi_b = b->block_info; - int cmp; - - if (!bi_a->sym || !bi_b->sym) - return -1; - - cmp = strcmp(bi_a->sym->name, bi_b->sym->name); - - if ((!cmp) && (bi_a->start == bi_b->start) && (bi_a->end == bi_b->end)) - return 0; - - return -1; -} - static struct hist_entry *get_block_pair(struct hist_entry *he, struct hists *hists_pair) { struct rb_root_cached *root = hists_pair->entries_in; struct rb_node *next = rb_first_cached(root); - int cmp; + int64_t cmp; while (next != NULL) { struct hist_entry *he_pair = rb_entry(next, struct hist_entry, @@ -602,7 +585,7 @@ static struct hist_entry *get_block_pair(struct hist_entry *he, next = rb_next(&he_pair->rb_node_in); - cmp = block_pair_cmp(he_pair, he); + cmp = __block_info__cmp(he_pair, he); if (!cmp) return he_pair; } diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 72a12b69f120..5f4045df76f4 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -104,6 +104,7 @@ struct report { bool symbol_ipc; bool total_cycles_mode; struct block_report *block_reports; + int nr_block_reports; }; static int report__config(const char *var, const char *value, void *cb) @@ -185,24 +186,23 @@ static int hist_iter__branch_callback(struct hist_entry_iter *iter, { struct hist_entry *he = iter->he; struct report *rep = arg; - struct branch_info *bi; + struct branch_info *bi = he->branch_info; struct perf_sample *sample = iter->sample; struct evsel *evsel = iter->evsel; int err; + branch_type_count(&rep->brtype_stat, &bi->flags, + bi->from.addr, bi->to.addr); + if (!ui__has_annotation() && !rep->symbol_ipc) return 0; - bi = he->branch_info; err = addr_map_symbol__inc_samples(&bi->from, sample, evsel); if (err) goto out; err = addr_map_symbol__inc_samples(&bi->to, sample, evsel); - branch_type_count(&rep->brtype_stat, &bi->flags, - bi->from.addr, bi->to.addr); - out: return err; } @@ -966,8 +966,19 @@ static int __cmd_report(struct report *rep) report__output_resort(rep); if (rep->total_cycles_mode) { + int block_hpps[6] = { + PERF_HPP_REPORT__BLOCK_TOTAL_CYCLES_PCT, + PERF_HPP_REPORT__BLOCK_LBR_CYCLES, + PERF_HPP_REPORT__BLOCK_CYCLES_PCT, + PERF_HPP_REPORT__BLOCK_AVG_CYCLES, + PERF_HPP_REPORT__BLOCK_RANGE, + PERF_HPP_REPORT__BLOCK_DSO, + }; + rep->block_reports = block_info__create_report(session->evlist, - rep->total_cycles); + rep->total_cycles, + block_hpps, 6, + &rep->nr_block_reports); if (!rep->block_reports) return -1; } @@ -1551,8 +1562,11 @@ error: zfree(&report.ptime_range); } - if (report.block_reports) - zfree(&report.block_reports); + if (report.block_reports) { + block_info__free_report(report.block_reports, + report.nr_block_reports); + report.block_reports = NULL; + } zstd_fini(&(session->zstd_data)); perf_session__delete(session); diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index e2406b291c1c..656b347f6dd8 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -735,6 +735,7 @@ static int perf_sample__fprintf_brstack(struct perf_sample *sample, struct perf_event_attr *attr, FILE *fp) { struct branch_stack *br = sample->branch_stack; + struct 
branch_entry *entries = perf_sample__branch_entries(sample); struct addr_location alf, alt; u64 i, from, to; int printed = 0; @@ -743,8 +744,8 @@ static int perf_sample__fprintf_brstack(struct perf_sample *sample, return 0; for (i = 0; i < br->nr; i++) { - from = br->entries[i].from; - to = br->entries[i].to; + from = entries[i].from; + to = entries[i].to; if (PRINT_FIELD(DSO)) { memset(&alf, 0, sizeof(alf)); @@ -768,10 +769,10 @@ static int perf_sample__fprintf_brstack(struct perf_sample *sample, } printed += fprintf(fp, "/%c/%c/%c/%d ", - mispred_str( br->entries + i), - br->entries[i].flags.in_tx? 'X' : '-', - br->entries[i].flags.abort? 'A' : '-', - br->entries[i].flags.cycles); + mispred_str(entries + i), + entries[i].flags.in_tx ? 'X' : '-', + entries[i].flags.abort ? 'A' : '-', + entries[i].flags.cycles); } return printed; @@ -782,6 +783,7 @@ static int perf_sample__fprintf_brstacksym(struct perf_sample *sample, struct perf_event_attr *attr, FILE *fp) { struct branch_stack *br = sample->branch_stack; + struct branch_entry *entries = perf_sample__branch_entries(sample); struct addr_location alf, alt; u64 i, from, to; int printed = 0; @@ -793,8 +795,8 @@ static int perf_sample__fprintf_brstacksym(struct perf_sample *sample, memset(&alf, 0, sizeof(alf)); memset(&alt, 0, sizeof(alt)); - from = br->entries[i].from; - to = br->entries[i].to; + from = entries[i].from; + to = entries[i].to; thread__find_symbol_fb(thread, sample->cpumode, from, &alf); thread__find_symbol_fb(thread, sample->cpumode, to, &alt); @@ -813,10 +815,10 @@ static int perf_sample__fprintf_brstacksym(struct perf_sample *sample, printed += fprintf(fp, ")"); } printed += fprintf(fp, "/%c/%c/%c/%d ", - mispred_str( br->entries + i), - br->entries[i].flags.in_tx? 'X' : '-', - br->entries[i].flags.abort? 'A' : '-', - br->entries[i].flags.cycles); + mispred_str(entries + i), + entries[i].flags.in_tx ? 'X' : '-', + entries[i].flags.abort ? 'A' : '-', + entries[i].flags.cycles); } return printed; @@ -827,6 +829,7 @@ static int perf_sample__fprintf_brstackoff(struct perf_sample *sample, struct perf_event_attr *attr, FILE *fp) { struct branch_stack *br = sample->branch_stack; + struct branch_entry *entries = perf_sample__branch_entries(sample); struct addr_location alf, alt; u64 i, from, to; int printed = 0; @@ -838,8 +841,8 @@ static int perf_sample__fprintf_brstackoff(struct perf_sample *sample, memset(&alf, 0, sizeof(alf)); memset(&alt, 0, sizeof(alt)); - from = br->entries[i].from; - to = br->entries[i].to; + from = entries[i].from; + to = entries[i].to; if (thread__find_map_fb(thread, sample->cpumode, from, &alf) && !alf.map->dso->adjust_symbols) @@ -862,10 +865,10 @@ static int perf_sample__fprintf_brstackoff(struct perf_sample *sample, printed += fprintf(fp, ")"); } printed += fprintf(fp, "/%c/%c/%c/%d ", - mispred_str(br->entries + i), - br->entries[i].flags.in_tx ? 'X' : '-', - br->entries[i].flags.abort ? 'A' : '-', - br->entries[i].flags.cycles); + mispred_str(entries + i), + entries[i].flags.in_tx ? 'X' : '-', + entries[i].flags.abort ? 
'A' : '-', + entries[i].flags.cycles); } return printed; @@ -1053,6 +1056,7 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, struct machine *machine, FILE *fp) { struct branch_stack *br = sample->branch_stack; + struct branch_entry *entries = perf_sample__branch_entries(sample); u64 start, end; int i, insn, len, nr, ilen, printed = 0; struct perf_insn x; @@ -1073,31 +1077,31 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, printed += fprintf(fp, "%c", '\n'); /* Handle first from jump, of which we don't know the entry. */ - len = grab_bb(buffer, br->entries[nr-1].from, - br->entries[nr-1].from, + len = grab_bb(buffer, entries[nr-1].from, + entries[nr-1].from, machine, thread, &x.is64bit, &x.cpumode, false); if (len > 0) { - printed += ip__fprintf_sym(br->entries[nr - 1].from, thread, + printed += ip__fprintf_sym(entries[nr - 1].from, thread, x.cpumode, x.cpu, &lastsym, attr, fp); - printed += ip__fprintf_jump(br->entries[nr - 1].from, &br->entries[nr - 1], + printed += ip__fprintf_jump(entries[nr - 1].from, &entries[nr - 1], &x, buffer, len, 0, fp, &total_cycles); if (PRINT_FIELD(SRCCODE)) - printed += print_srccode(thread, x.cpumode, br->entries[nr - 1].from); + printed += print_srccode(thread, x.cpumode, entries[nr - 1].from); } /* Print all blocks */ for (i = nr - 2; i >= 0; i--) { - if (br->entries[i].from || br->entries[i].to) + if (entries[i].from || entries[i].to) pr_debug("%d: %" PRIx64 "-%" PRIx64 "\n", i, - br->entries[i].from, - br->entries[i].to); - start = br->entries[i + 1].to; - end = br->entries[i].from; + entries[i].from, + entries[i].to); + start = entries[i + 1].to; + end = entries[i].from; len = grab_bb(buffer, start, end, machine, thread, &x.is64bit, &x.cpumode, false); /* Patch up missing kernel transfers due to ring filters */ if (len == -ENXIO && i > 0) { - end = br->entries[--i].from; + end = entries[--i].from; pr_debug("\tpatching up to %" PRIx64 "-%" PRIx64 "\n", start, end); len = grab_bb(buffer, start, end, machine, thread, &x.is64bit, &x.cpumode, false); } @@ -1110,7 +1114,7 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, printed += ip__fprintf_sym(ip, thread, x.cpumode, x.cpu, &lastsym, attr, fp); if (ip == end) { - printed += ip__fprintf_jump(ip, &br->entries[i], &x, buffer + off, len - off, ++insn, fp, + printed += ip__fprintf_jump(ip, &entries[i], &x, buffer + off, len - off, ++insn, fp, &total_cycles); if (PRINT_FIELD(SRCCODE)) printed += print_srccode(thread, x.cpumode, ip); @@ -1134,9 +1138,9 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, * Hit the branch? In this case we are already done, and the target * has not been executed yet. */ - if (br->entries[0].from == sample->ip) + if (entries[0].from == sample->ip) goto out; - if (br->entries[0].flags.abort) + if (entries[0].flags.abort) goto out; /* @@ -1147,7 +1151,7 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, * between final branch and sample. When this happens just * continue walking after the last TO until we hit a branch. */ - start = br->entries[0].to; + start = entries[0].to; end = sample->ip; if (end < start) { /* Missing jump. 
Scan 128 bytes for the next branch */
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index a098c2ebf4ea..ec053dc1e35c 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -929,6 +929,10 @@ static struct option stat_options[] = {
 	OPT_BOOLEAN_FLAG(0, "all-user", &stat_config.all_user,
 			 "Configure all used events to run in user space.",
 			 PARSE_OPT_EXCLUSIVE),
+	OPT_BOOLEAN(0, "percore-show-thread", &stat_config.percore_show_thread,
+		    "Use with 'percore' event qualifier to show the event "
+		    "counts of one hardware thread by summing up the counts of "
+		    "all hardware threads in the same physical core"),
 	OPT_END()
 };
diff --git a/tools/perf/pmu-events/arch/s390/cf_z15/crypto6.json b/tools/perf/pmu-events/arch/s390/cf_z15/crypto6.json
index 5e36bc2468d0..c998e4f1d1d2 100644
--- a/tools/perf/pmu-events/arch/s390/cf_z15/crypto6.json
+++ b/tools/perf/pmu-events/arch/s390/cf_z15/crypto6.json
@@ -4,27 +4,27 @@
 	"EventCode": "80",
 	"EventName": "ECC_FUNCTION_COUNT",
 	"BriefDescription": "ECC Function Count",
-	"PublicDescription": "Long ECC function Count"
+	"PublicDescription": "This counter counts the total number of the elliptic-curve cryptography (ECC) functions issued by the CPU."
 },
 {
 	"Unit": "CPU-M-CF",
 	"EventCode": "81",
 	"EventName": "ECC_CYCLES_COUNT",
 	"BriefDescription": "ECC Cycles Count",
-	"PublicDescription": "Long ECC Function cycles count"
+	"PublicDescription": "This counter counts the total number of CPU cycles when the ECC coprocessor is busy performing the elliptic-curve cryptography (ECC) functions issued by the CPU."
 },
 {
 	"Unit": "CPU-M-CF",
 	"EventCode": "82",
 	"EventName": "ECC_BLOCKED_FUNCTION_COUNT",
 	"BriefDescription": "Ecc Blocked Function Count",
-	"PublicDescription": "Long ECC blocked function count"
+	"PublicDescription": "This counter counts the total number of the elliptic-curve cryptography (ECC) functions that are issued by the CPU and are blocked because the ECC coprocessor is busy performing a function issued by another CPU."
 },
 {
 	"Unit": "CPU-M-CF",
 	"EventCode": "83",
 	"EventName": "ECC_BLOCKED_CYCLES_COUNT",
 	"BriefDescription": "ECC Blocked Cycles Count",
-	"PublicDescription": "Long ECC blocked cycles count"
+	"PublicDescription": "This counter counts the total number of CPU cycles blocked for the elliptic-curve cryptography (ECC) functions issued by the CPU because the ECC coprocessor is busy performing a function issued by another CPU."
}, ] diff --git a/tools/perf/pmu-events/arch/s390/cf_z15/extended.json b/tools/perf/pmu-events/arch/s390/cf_z15/extended.json index 89e070727e1b..2df2e231e9ee 100644 --- a/tools/perf/pmu-events/arch/s390/cf_z15/extended.json +++ b/tools/perf/pmu-events/arch/s390/cf_z15/extended.json @@ -25,7 +25,7 @@ "EventCode": "131", "EventName": "DTLB2_HPAGE_WRITES", "BriefDescription": "DTLB2 One-Megabyte Page Writes", - "PublicDescription": "A translation entry was written into the Combined Region and Segment Table Entry array in the Level-2 TLB for a one-megabyte page or a Last Host Translation was done" + "PublicDescription": "A translation entry was written into the Combined Region and Segment Table Entry array in the Level-2 TLB for a one-megabyte page" }, { "Unit": "CPU-M-CF", @@ -358,6 +358,34 @@ }, { "Unit": "CPU-M-CF", + "EventCode": "247", + "EventName": "DFLT_ACCESS", + "BriefDescription": "Cycles CPU spent obtaining access to Deflate unit", + "PublicDescription": "Cycles CPU spent obtaining access to Deflate unit" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "252", + "EventName": "DFLT_CYCLES", + "BriefDescription": "Cycles CPU is using Deflate unit", + "PublicDescription": "Cycles CPU is using Deflate unit" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "264", + "EventName": "DFLT_CC", + "BriefDescription": "Increments by one for every DEFLATE CONVERSION CALL instruction executed", + "PublicDescription": "Increments by one for every DEFLATE CONVERSION CALL instruction executed" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "265", + "EventName": "DFLT_CCERROR", + "BriefDescription": "Increments by one for every DEFLATE CONVERSION CALL instruction executed that ended in Condition Codes 0, 1 or 2", + "PublicDescription": "Increments by one for every DEFLATE CONVERSION CALL instruction executed that ended in Condition Codes 0, 1 or 2" + }, + { + "Unit": "CPU-M-CF", "EventCode": "448", "EventName": "MT_DIAG_CYCLES_ONE_THR_ACTIVE", "BriefDescription": "Cycle count with one thread active", diff --git a/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json b/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json index f94653229dd4..a728c6e5119b 100644 --- a/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json @@ -215,7 +215,8 @@ "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", "MetricExpr": "( ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING ) / ( 2 * cycles )", "MetricGroup": "TLB", - "MetricName": "Page_Walks_Utilization" + "MetricName": "Page_Walks_Utilization", + "MetricConstraint": "NO_NMI_WATCHDOG" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", diff --git a/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json b/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json index e7feb60f9fa9..f97e8316ad2f 100644 --- a/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json +++ b/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json @@ -215,7 +215,8 @@ "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", "MetricExpr": "( ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING ) / ( 2 * cycles )", "MetricGroup": "TLB", - "MetricName": "Page_Walks_Utilization" 
+ "MetricName": "Page_Walks_Utilization", + "MetricConstraint": "NO_NMI_WATCHDOG" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", diff --git a/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json b/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json index 21d7a0c2c2e8..35f5db1786f7 100644 --- a/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json @@ -215,7 +215,8 @@ "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", "MetricExpr": "( ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING ) / ( 2 * cycles )", "MetricGroup": "TLB", - "MetricName": "Page_Walks_Utilization" + "MetricName": "Page_Walks_Utilization", + "MetricConstraint": "NO_NMI_WATCHDOG" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", diff --git a/tools/perf/pmu-events/jevents.c b/tools/perf/pmu-events/jevents.c index 27b4da80f751..3c4236a5bad8 100644 --- a/tools/perf/pmu-events/jevents.c +++ b/tools/perf/pmu-events/jevents.c @@ -323,7 +323,7 @@ static int print_events_table_entry(void *data, char *name, char *event, char *pmu, char *unit, char *perpkg, char *metric_expr, char *metric_name, char *metric_group, - char *deprecated) + char *deprecated, char *metric_constraint) { struct perf_entry_data *pd = data; FILE *outfp = pd->outfp; @@ -357,6 +357,8 @@ static int print_events_table_entry(void *data, char *name, char *event, fprintf(outfp, "\t.metric_group = \"%s\",\n", metric_group); if (deprecated) fprintf(outfp, "\t.deprecated = \"%s\",\n", deprecated); + if (metric_constraint) + fprintf(outfp, "\t.metric_constraint = \"%s\",\n", metric_constraint); fprintf(outfp, "},\n"); return 0; @@ -375,6 +377,7 @@ struct event_struct { char *metric_name; char *metric_group; char *deprecated; + char *metric_constraint; }; #define ADD_EVENT_FIELD(field) do { if (field) { \ @@ -422,7 +425,7 @@ static int save_arch_std_events(void *data, char *name, char *event, char *desc, char *long_desc, char *pmu, char *unit, char *perpkg, char *metric_expr, char *metric_name, char *metric_group, - char *deprecated) + char *deprecated, char *metric_constraint) { struct event_struct *es; @@ -486,7 +489,7 @@ try_fixup(const char *fn, char *arch_std, char **event, char **desc, char **name, char **long_desc, char **pmu, char **filter, char **perpkg, char **unit, char **metric_expr, char **metric_name, char **metric_group, unsigned long long eventcode, - char **deprecated) + char **deprecated, char **metric_constraint) { /* try to find matching event from arch standard values */ struct event_struct *es; @@ -515,7 +518,7 @@ int json_events(const char *fn, char *pmu, char *unit, char *perpkg, char *metric_expr, char *metric_name, char *metric_group, - char *deprecated), + char *deprecated, char *metric_constraint), void *data) { int err; @@ -545,6 +548,7 @@ int json_events(const char *fn, char *metric_name = NULL; char *metric_group = NULL; char *deprecated = NULL; + char *metric_constraint = NULL; char *arch_std = NULL; unsigned long long eventcode = 0; struct msrmap *msr = NULL; @@ -629,6 +633,8 @@ int json_events(const char *fn, addfield(map, &metric_name, "", "", val); } else if (json_streq(map, field, "MetricGroup")) { addfield(map, &metric_group, "", "", val); + } else if 
(json_streq(map, field, "MetricConstraint")) { + addfield(map, &metric_constraint, "", "", val); } else if (json_streq(map, field, "MetricExpr")) { addfield(map, &metric_expr, "", "", val); for (s = metric_expr; *s; s++) @@ -670,13 +676,13 @@ int json_events(const char *fn, &long_desc, &pmu, &filter, &perpkg, &unit, &metric_expr, &metric_name, &metric_group, eventcode, - &deprecated); + &deprecated, &metric_constraint); if (err) goto free_strings; } err = func(data, name, real_event(name, event), desc, long_desc, pmu, unit, perpkg, metric_expr, metric_name, - metric_group, deprecated); + metric_group, deprecated, metric_constraint); free_strings: free(event); free(desc); @@ -691,6 +697,7 @@ free_strings: free(metric_expr); free(metric_name); free(metric_group); + free(metric_constraint); free(arch_std); if (err) diff --git a/tools/perf/pmu-events/jevents.h b/tools/perf/pmu-events/jevents.h index 5cda49a42143..2afc8304529e 100644 --- a/tools/perf/pmu-events/jevents.h +++ b/tools/perf/pmu-events/jevents.h @@ -8,7 +8,7 @@ int json_events(const char *fn, char *pmu, char *unit, char *perpkg, char *metric_expr, char *metric_name, char *metric_group, - char *deprecated), + char *deprecated, char *metric_constraint), void *data); char *get_cpu_str(void); diff --git a/tools/perf/pmu-events/pmu-events.h b/tools/perf/pmu-events/pmu-events.h index caeb577d36c9..53e76d5d5b37 100644 --- a/tools/perf/pmu-events/pmu-events.h +++ b/tools/perf/pmu-events/pmu-events.h @@ -18,6 +18,7 @@ struct pmu_event { const char *metric_name; const char *metric_group; const char *deprecated; + const char *metric_constraint; }; /* diff --git a/tools/perf/scripts/perl/check-perf-trace.pl b/tools/perf/scripts/perl/check-perf-trace.pl index 4e7076c20616..d307ce8fd6ed 100644 --- a/tools/perf/scripts/perl/check-perf-trace.pl +++ b/tools/perf/scripts/perl/check-perf-trace.pl @@ -28,7 +28,7 @@ sub trace_end sub irq::softirq_entry { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, + $common_pid, $common_comm, $common_callchain, $vec) = @_; print_header($event_name, $common_cpu, $common_secs, $common_nsecs, @@ -43,7 +43,7 @@ sub irq::softirq_entry sub kmem::kmalloc { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, + $common_pid, $common_comm, $common_callchain, $call_site, $ptr, $bytes_req, $bytes_alloc, $gfp_flags) = @_; @@ -92,7 +92,7 @@ sub print_unhandled sub trace_unhandled { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm) = @_; + $common_pid, $common_comm, $common_callchain) = @_; $unhandled{$event_name}++; } diff --git a/tools/perf/scripts/perl/failed-syscalls.pl b/tools/perf/scripts/perl/failed-syscalls.pl index 55e7ae4c5c88..05954a8f363a 100644 --- a/tools/perf/scripts/perl/failed-syscalls.pl +++ b/tools/perf/scripts/perl/failed-syscalls.pl @@ -18,7 +18,7 @@ my %failed_syscalls; sub raw_syscalls::sys_exit { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, + $common_pid, $common_comm, $common_callchain, $id, $ret) = @_; if ($ret < 0) { diff --git a/tools/perf/scripts/perl/rw-by-file.pl b/tools/perf/scripts/perl/rw-by-file.pl index 168fa5e94b44..92a750b8552b 100644 --- a/tools/perf/scripts/perl/rw-by-file.pl +++ b/tools/perf/scripts/perl/rw-by-file.pl @@ -28,7 +28,7 @@ my %writes; sub syscalls::sys_enter_read { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, $nr, $fd, $buf, $count) 
= @_; + $common_pid, $common_comm, $common_callchain, $nr, $fd, $buf, $count) = @_; if ($common_comm eq $for_comm) { $reads{$fd}{bytes_requested} += $count; @@ -39,7 +39,7 @@ sub syscalls::sys_enter_read sub syscalls::sys_enter_write { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, $nr, $fd, $buf, $count) = @_; + $common_pid, $common_comm, $common_callchain, $nr, $fd, $buf, $count) = @_; if ($common_comm eq $for_comm) { $writes{$fd}{bytes_written} += $count; @@ -98,7 +98,7 @@ sub print_unhandled sub trace_unhandled { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm) = @_; + $common_pid, $common_comm, $common_callchain) = @_; $unhandled{$event_name}++; } diff --git a/tools/perf/scripts/perl/rw-by-pid.pl b/tools/perf/scripts/perl/rw-by-pid.pl index 495698250b2f..d789fe39caab 100644 --- a/tools/perf/scripts/perl/rw-by-pid.pl +++ b/tools/perf/scripts/perl/rw-by-pid.pl @@ -24,7 +24,7 @@ my %writes; sub syscalls::sys_exit_read { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, + $common_pid, $common_comm, $common_callchain, $nr, $ret) = @_; if ($ret > 0) { @@ -40,7 +40,7 @@ sub syscalls::sys_exit_read sub syscalls::sys_enter_read { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, + $common_pid, $common_comm, $common_callchain, $nr, $fd, $buf, $count) = @_; $reads{$common_pid}{bytes_requested} += $count; @@ -51,7 +51,7 @@ sub syscalls::sys_enter_read sub syscalls::sys_exit_write { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, + $common_pid, $common_comm, $common_callchain, $nr, $ret) = @_; if ($ret <= 0) { @@ -62,7 +62,7 @@ sub syscalls::sys_exit_write sub syscalls::sys_enter_write { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, + $common_pid, $common_comm, $common_callchain, $nr, $fd, $buf, $count) = @_; $writes{$common_pid}{bytes_written} += $count; @@ -178,7 +178,7 @@ sub print_unhandled sub trace_unhandled { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm) = @_; + $common_pid, $common_comm, $common_callchain) = @_; $unhandled{$event_name}++; } diff --git a/tools/perf/scripts/perl/rwtop.pl b/tools/perf/scripts/perl/rwtop.pl index 6473442568a2..eba4df67af6b 100644 --- a/tools/perf/scripts/perl/rwtop.pl +++ b/tools/perf/scripts/perl/rwtop.pl @@ -35,7 +35,7 @@ if (!$interval) { sub syscalls::sys_exit_read { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, + $common_pid, $common_comm, $common_callchain, $nr, $ret) = @_; print_check(); @@ -53,7 +53,7 @@ sub syscalls::sys_exit_read sub syscalls::sys_enter_read { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, + $common_pid, $common_comm, $common_callchain, $nr, $fd, $buf, $count) = @_; print_check(); @@ -66,7 +66,7 @@ sub syscalls::sys_enter_read sub syscalls::sys_exit_write { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, + $common_pid, $common_comm, $common_callchain, $nr, $ret) = @_; print_check(); @@ -79,7 +79,7 @@ sub syscalls::sys_exit_write sub syscalls::sys_enter_write { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, + $common_pid, $common_comm, $common_callchain, $nr, $fd, $buf, $count) = @_; 
print_check(); @@ -197,7 +197,7 @@ sub print_unhandled sub trace_unhandled { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm) = @_; + $common_pid, $common_comm, $common_callchain) = @_; $unhandled{$event_name}++; } diff --git a/tools/perf/scripts/perl/wakeup-latency.pl b/tools/perf/scripts/perl/wakeup-latency.pl index efcfec5e347a..53444ff4ec7f 100644 --- a/tools/perf/scripts/perl/wakeup-latency.pl +++ b/tools/perf/scripts/perl/wakeup-latency.pl @@ -28,7 +28,7 @@ my $total_wakeups = 0; sub sched::sched_switch { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, + $common_pid, $common_comm, $common_callchain, $prev_comm, $prev_pid, $prev_prio, $prev_state, $next_comm, $next_pid, $next_prio) = @_; @@ -51,7 +51,7 @@ sub sched::sched_switch sub sched::sched_wakeup { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, + $common_pid, $common_comm, $common_callchain, $comm, $pid, $prio, $success, $target_cpu) = @_; $last_wakeup{$target_cpu}{ts} = nsecs($common_secs, $common_nsecs); @@ -101,7 +101,7 @@ sub print_unhandled sub trace_unhandled { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm) = @_; + $common_pid, $common_comm, $common_callchain) = @_; $unhandled{$event_name}++; } diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c index 5f05db75cdd8..54d9516c9839 100644 --- a/tools/perf/tests/builtin-test.c +++ b/tools/perf/tests/builtin-test.c @@ -543,8 +543,11 @@ static int run_shell_tests(int argc, const char *argv[], int i, int width) return -1; dir = opendir(st.dir); - if (!dir) + if (!dir) { + pr_err("failed to open shell test directory: %s\n", + st.dir); return -1; + } for_each_shell_test(dir, st.dir, ent) { int curr = i++; diff --git a/tools/perf/tests/expr.c b/tools/perf/tests/expr.c index 87843af4c118..28313e59d6f6 100644 --- a/tools/perf/tests/expr.c +++ b/tools/perf/tests/expr.c @@ -10,7 +10,7 @@ static int test(struct parse_ctx *ctx, const char *e, double val2) { double val; - if (expr__parse(&val, ctx, &e)) + if (expr__parse(&val, ctx, e)) TEST_ASSERT_VAL("parse test failed", 0); TEST_ASSERT_VAL("unexpected value", val == val2); return 0; @@ -44,12 +44,12 @@ int test__expr(struct test *t __maybe_unused, int subtest __maybe_unused) return ret; p = "FOO/0"; - ret = expr__parse(&val, &ctx, &p); - TEST_ASSERT_VAL("division by zero", ret == 1); + ret = expr__parse(&val, &ctx, p); + TEST_ASSERT_VAL("division by zero", ret == -1); p = "BAR/"; - ret = expr__parse(&val, &ctx, &p); - TEST_ASSERT_VAL("missing operand", ret == 1); + ret = expr__parse(&val, &ctx, p); + TEST_ASSERT_VAL("missing operand", ret == -1); TEST_ASSERT_VAL("find other", expr__find_other("FOO + BAR + BAZ + BOZO", "FOO", &other, &num_other) == 0); diff --git a/tools/perf/tests/sample-parsing.c b/tools/perf/tests/sample-parsing.c index 2762e1155238..14239e472187 100644 --- a/tools/perf/tests/sample-parsing.c +++ b/tools/perf/tests/sample-parsing.c @@ -99,6 +99,7 @@ static bool samples_same(const struct perf_sample *s1, if (type & PERF_SAMPLE_BRANCH_STACK) { COMP(branch_stack->nr); + COMP(branch_stack->hw_idx); for (i = 0; i < s1->branch_stack->nr; i++) MCOMP(branch_stack->entries[i]); } @@ -186,7 +187,7 @@ static int do_test(u64 sample_type, u64 sample_regs, u64 read_format) u64 data[64]; } branch_stack = { /* 1 branch_entry */ - .data = {1, 211, 212, 213}, + .data = {1, -1ULL, 211, 212, 213}, }; u64 regs[64]; 
const u64 raw_data[] = {0x123456780a0b0c0dULL, 0x1102030405060708ULL}; @@ -208,6 +209,7 @@ static int do_test(u64 sample_type, u64 sample_regs, u64 read_format) .transaction = 112, .raw_data = (void *)raw_data, .callchain = &callchain.callchain, + .no_hw_idx = false, .branch_stack = &branch_stack.branch_stack, .user_regs = { .abi = PERF_SAMPLE_REGS_ABI_64, @@ -244,6 +246,9 @@ static int do_test(u64 sample_type, u64 sample_regs, u64 read_format) if (sample_type & PERF_SAMPLE_REGS_INTR) evsel.core.attr.sample_regs_intr = sample_regs; + if (sample_type & PERF_SAMPLE_BRANCH_STACK) + evsel.core.attr.branch_sample_type |= PERF_SAMPLE_BRANCH_HW_INDEX; + for (i = 0; i < sizeof(regs); i++) *(i + (u8 *)regs) = i & 0xfe; diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 07da6c790b63..c0cf8dff694e 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -121,7 +121,9 @@ perf-y += mem-events.o perf-y += vsprintf.o perf-y += units.o perf-y += time-utils.o +perf-y += expr-flex.o perf-y += expr-bison.o +perf-y += expr.o perf-y += branch.o perf-y += mem2node.o @@ -189,9 +191,13 @@ $(OUTPUT)util/parse-events-bison.c: util/parse-events.y $(call rule_mkdir) $(Q)$(call echo-cmd,bison)$(BISON) -v util/parse-events.y -d $(PARSER_DEBUG_BISON) -o $@ -p parse_events_ +$(OUTPUT)util/expr-flex.c: util/expr.l $(OUTPUT)util/expr-bison.c + $(call rule_mkdir) + $(Q)$(call echo-cmd,flex)$(FLEX) -o $@ --header-file=$(OUTPUT)util/expr-flex.h $(PARSER_DEBUG_FLEX) util/expr.l + $(OUTPUT)util/expr-bison.c: util/expr.y $(call rule_mkdir) - $(Q)$(call echo-cmd,bison)$(BISON) -v util/expr.y -d $(PARSER_DEBUG_BISON) -o $@ -p expr__ + $(Q)$(call echo-cmd,bison)$(BISON) -v util/expr.y -d $(PARSER_DEBUG_BISON) -o $@ -p expr_ $(OUTPUT)util/pmu-flex.c: util/pmu.l $(OUTPUT)util/pmu-bison.c $(call rule_mkdir) @@ -203,12 +209,14 @@ $(OUTPUT)util/pmu-bison.c: util/pmu.y CFLAGS_parse-events-flex.o += -w CFLAGS_pmu-flex.o += -w +CFLAGS_expr-flex.o += -w CFLAGS_parse-events-bison.o += -DYYENABLE_NLS=0 -w CFLAGS_pmu-bison.o += -DYYENABLE_NLS=0 -DYYLTYPE_IS_TRIVIAL=0 -w CFLAGS_expr-bison.o += -DYYENABLE_NLS=0 -DYYLTYPE_IS_TRIVIAL=0 -w $(OUTPUT)util/parse-events.o: $(OUTPUT)util/parse-events-flex.c $(OUTPUT)util/parse-events-bison.c $(OUTPUT)util/pmu.o: $(OUTPUT)util/pmu-flex.c $(OUTPUT)util/pmu-bison.c +$(OUTPUT)util/expr.o: $(OUTPUT)util/expr-flex.c $(OUTPUT)util/expr-bison.c CFLAGS_bitmap.o += -Wno-unused-parameter -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))" CFLAGS_find_bit.o += -Wno-unused-parameter -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))" @@ -216,6 +224,7 @@ CFLAGS_rbtree.o += -Wno-unused-parameter -DETC_PERFCONFIG="BUILD_STR($(ET CFLAGS_libstring.o += -Wno-unused-parameter -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))" CFLAGS_hweight.o += -Wno-unused-parameter -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))" CFLAGS_parse-events.o += -Wno-redundant-decls +CFLAGS_expr.o += -Wno-redundant-decls CFLAGS_header.o += -include $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)util/kallsyms.o: ../lib/symbol/kallsyms.c FORCE diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 0ea95be84b3b..f1ea0d61eb5b 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -2611,8 +2611,6 @@ void annotation__mark_jump_targets(struct annotation *notes, struct symbol *sym) if (++al->jump_sources > notes->max_jump_sources) notes->max_jump_sources = al->jump_sources; - - ++notes->nr_jumps; } } diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 001258601a37..07c775938d46 
100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -279,7 +279,6 @@ struct annotation { struct annotation_options *options; struct annotation_line **offsets; int nr_events; - int nr_jumps; int max_jump_sources; int nr_entries; int nr_asm_entries; diff --git a/tools/perf/util/block-info.c b/tools/perf/util/block-info.c index fbbb6d640dad..423ec69bda6c 100644 --- a/tools/perf/util/block-info.c +++ b/tools/perf/util/block-info.c @@ -65,8 +65,7 @@ struct block_info *block_info__new(void) return bi; } -int64_t block_info__cmp(struct perf_hpp_fmt *fmt __maybe_unused, - struct hist_entry *left, struct hist_entry *right) +int64_t __block_info__cmp(struct hist_entry *left, struct hist_entry *right) { struct block_info *bi_l = left->block_info; struct block_info *bi_r = right->block_info; @@ -74,30 +73,27 @@ int64_t block_info__cmp(struct perf_hpp_fmt *fmt __maybe_unused, if (!bi_l->sym || !bi_r->sym) { if (!bi_l->sym && !bi_r->sym) - return 0; + return -1; else if (!bi_l->sym) return -1; else return 1; } - if (bi_l->sym == bi_r->sym) { - if (bi_l->start == bi_r->start) { - if (bi_l->end == bi_r->end) - return 0; - else - return (int64_t)(bi_r->end - bi_l->end); - } else - return (int64_t)(bi_r->start - bi_l->start); - } else { - cmp = strcmp(bi_l->sym->name, bi_r->sym->name); + cmp = strcmp(bi_l->sym->name, bi_r->sym->name); + if (cmp) return cmp; - } - if (bi_l->sym->start != bi_r->sym->start) - return (int64_t)(bi_r->sym->start - bi_l->sym->start); + if (bi_l->start != bi_r->start) + return (int64_t)(bi_r->start - bi_l->start); - return (int64_t)(bi_r->sym->end - bi_l->sym->end); + return (int64_t)(bi_r->end - bi_l->end); +} + +int64_t block_info__cmp(struct perf_hpp_fmt *fmt __maybe_unused, + struct hist_entry *left, struct hist_entry *right) +{ + return __block_info__cmp(left, right); } static void init_block_info(struct block_info *bi, struct symbol *sym, @@ -185,6 +181,17 @@ static int block_column_width(struct perf_hpp_fmt *fmt, return block_fmt->width; } +static int color_pct(struct perf_hpp *hpp, int width, double pct) +{ +#ifdef HAVE_SLANG_SUPPORT + if (use_browser) { + return __hpp__slsmg_color_printf(hpp, "%*.2f%%", + width - 1, pct); + } +#endif + return hpp_color_scnprintf(hpp, "%*.2f%%", width - 1, pct); +} + static int block_total_cycles_pct_entry(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp, struct hist_entry *he) @@ -192,14 +199,11 @@ static int block_total_cycles_pct_entry(struct perf_hpp_fmt *fmt, struct block_fmt *block_fmt = container_of(fmt, struct block_fmt, fmt); struct block_info *bi = he->block_info; double ratio = 0.0; - char buf[16]; if (block_fmt->total_cycles) ratio = (double)bi->cycles / (double)block_fmt->total_cycles; - sprintf(buf, "%.2f%%", 100.0 * ratio); - - return scnprintf(hpp->buf, hpp->size, "%*s", block_fmt->width, buf); + return color_pct(hpp, block_fmt->width, 100.0 * ratio); } static int64_t block_total_cycles_pct_sort(struct perf_hpp_fmt *fmt, @@ -252,16 +256,13 @@ static int block_cycles_pct_entry(struct perf_hpp_fmt *fmt, struct block_info *bi = he->block_info; double ratio = 0.0; u64 avg; - char buf[16]; if (block_fmt->block_cycles && bi->num_aggr) { avg = bi->cycles_aggr / bi->num_aggr; ratio = (double)avg / (double)block_fmt->block_cycles; } - sprintf(buf, "%.2f%%", 100.0 * ratio); - - return scnprintf(hpp->buf, hpp->size, "%*s", block_fmt->width, buf); + return color_pct(hpp, block_fmt->width, 100.0 * ratio); } static int block_avg_cycles_entry(struct perf_hpp_fmt *fmt, @@ -349,7 +350,7 @@ static void 
hpp_register(struct block_fmt *block_fmt, int idx, switch (idx) { case PERF_HPP_REPORT__BLOCK_TOTAL_CYCLES_PCT: - fmt->entry = block_total_cycles_pct_entry; + fmt->color = block_total_cycles_pct_entry; fmt->cmp = block_info__cmp; fmt->sort = block_total_cycles_pct_sort; break; @@ -357,7 +358,7 @@ static void hpp_register(struct block_fmt *block_fmt, int idx, fmt->entry = block_cycles_lbr_entry; break; case PERF_HPP_REPORT__BLOCK_CYCLES_PCT: - fmt->entry = block_cycles_pct_entry; + fmt->color = block_cycles_pct_entry; break; case PERF_HPP_REPORT__BLOCK_AVG_CYCLES: fmt->entry = block_avg_cycles_entry; @@ -377,33 +378,41 @@ static void hpp_register(struct block_fmt *block_fmt, int idx, } static void register_block_columns(struct perf_hpp_list *hpp_list, - struct block_fmt *block_fmts) + struct block_fmt *block_fmts, + int *block_hpps, int nr_hpps) { - for (int i = 0; i < PERF_HPP_REPORT__BLOCK_MAX_INDEX; i++) - hpp_register(&block_fmts[i], i, hpp_list); + for (int i = 0; i < nr_hpps; i++) + hpp_register(&block_fmts[i], block_hpps[i], hpp_list); } -static void init_block_hist(struct block_hist *bh, struct block_fmt *block_fmts) +static void init_block_hist(struct block_hist *bh, struct block_fmt *block_fmts, + int *block_hpps, int nr_hpps) { __hists__init(&bh->block_hists, &bh->block_list); perf_hpp_list__init(&bh->block_list); bh->block_list.nr_header_lines = 1; - register_block_columns(&bh->block_list, block_fmts); + register_block_columns(&bh->block_list, block_fmts, + block_hpps, nr_hpps); - perf_hpp_list__register_sort_field(&bh->block_list, - &block_fmts[PERF_HPP_REPORT__BLOCK_TOTAL_CYCLES_PCT].fmt); + /* Sort by the first fmt */ + perf_hpp_list__register_sort_field(&bh->block_list, &block_fmts[0].fmt); } -static void process_block_report(struct hists *hists, - struct block_report *block_report, - u64 total_cycles) +static int process_block_report(struct hists *hists, + struct block_report *block_report, + u64 total_cycles, int *block_hpps, + int nr_hpps) { struct rb_node *next = rb_first_cached(&hists->entries); struct block_hist *bh = &block_report->hist; struct hist_entry *he; - init_block_hist(bh, block_report->fmts); + if (nr_hpps > PERF_HPP_REPORT__BLOCK_MAX_INDEX) + return -1; + + block_report->nr_fmts = nr_hpps; + init_block_hist(bh, block_report->fmts, block_hpps, nr_hpps); while (next) { he = rb_entry(next, struct hist_entry, rb_node); @@ -412,16 +421,19 @@ static void process_block_report(struct hists *hists, next = rb_next(&he->rb_node); } - for (int i = 0; i < PERF_HPP_REPORT__BLOCK_MAX_INDEX; i++) { + for (int i = 0; i < nr_hpps; i++) { block_report->fmts[i].total_cycles = total_cycles; block_report->fmts[i].block_cycles = block_report->cycles; } hists__output_resort(&bh->block_hists, NULL); + return 0; } struct block_report *block_info__create_report(struct evlist *evlist, - u64 total_cycles) + u64 total_cycles, + int *block_hpps, int nr_hpps, + int *nr_reps) { struct block_report *block_reports; int nr_hists = evlist->core.nr_entries, i = 0; @@ -434,13 +446,23 @@ struct block_report *block_info__create_report(struct evlist *evlist, evlist__for_each_entry(evlist, pos) { struct hists *hists = evsel__hists(pos); - process_block_report(hists, &block_reports[i], total_cycles); + process_block_report(hists, &block_reports[i], total_cycles, + block_hpps, nr_hpps); i++; } + *nr_reps = nr_hists; return block_reports; } +void block_info__free_report(struct block_report *reps, int nr_reps) +{ + for (int i = 0; i < nr_reps; i++) + hists__delete_entries(&reps[i].hist.block_hists); + + 
free(reps); +} + int report__browse_block_hists(struct block_hist *bh, float min_percent, struct evsel *evsel, struct perf_env *env, struct annotation_options *annotation_opts) @@ -452,13 +474,11 @@ int report__browse_block_hists(struct block_hist *bh, float min_percent, symbol_conf.report_individual_block = true; hists__fprintf(&bh->block_hists, true, 0, 0, min_percent, stdout, true); - hists__delete_entries(&bh->block_hists); return 0; case 1: symbol_conf.report_individual_block = true; ret = block_hists_tui_browse(bh, evsel, min_percent, env, annotation_opts); - hists__delete_entries(&bh->block_hists); return ret; default: return -1; diff --git a/tools/perf/util/block-info.h b/tools/perf/util/block-info.h index bef0d75e9819..42e9dcc4cf0a 100644 --- a/tools/perf/util/block-info.h +++ b/tools/perf/util/block-info.h @@ -45,6 +45,7 @@ struct block_report { struct block_hist hist; u64 cycles; struct block_fmt fmts[PERF_HPP_REPORT__BLOCK_MAX_INDEX]; + int nr_fmts; }; struct block_hist; @@ -61,6 +62,8 @@ static inline void __block_info__zput(struct block_info **bi) #define block_info__zput(bi) __block_info__zput(&bi) +int64_t __block_info__cmp(struct hist_entry *left, struct hist_entry *right); + int64_t block_info__cmp(struct perf_hpp_fmt *fmt __maybe_unused, struct hist_entry *left, struct hist_entry *right); @@ -68,7 +71,11 @@ int block_info__process_sym(struct hist_entry *he, struct block_hist *bh, u64 *block_cycles_aggr, u64 total_cycles); struct block_report *block_info__create_report(struct evlist *evlist, - u64 total_cycles); + u64 total_cycles, + int *block_hpps, int nr_hpps, + int *nr_reps); + +void block_info__free_report(struct block_report *reps, int nr_reps); int report__browse_block_hists(struct block_hist *bh, float min_percent, struct evsel *evsel, struct perf_env *env, diff --git a/tools/perf/util/branch.h b/tools/perf/util/branch.h index 88e00d268f6f..154a05cd03af 100644 --- a/tools/perf/util/branch.h +++ b/tools/perf/util/branch.h @@ -12,6 +12,7 @@ #include <linux/stddef.h> #include <linux/perf_event.h> #include <linux/types.h> +#include "event.h" struct branch_flags { u64 mispred:1; @@ -39,9 +40,30 @@ struct branch_entry { struct branch_stack { u64 nr; + u64 hw_idx; struct branch_entry entries[0]; }; +/* + * The hw_idx is only available when PERF_SAMPLE_BRANCH_HW_INDEX is applied. + * Otherwise, the output format of a sample with branch stack is + * struct branch_stack { + * u64 nr; + * struct branch_entry entries[0]; + * } + * Check whether the hw_idx is available, + * and return the corresponding pointer of entries[0]. 
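The comment above describes the two possible payload layouts; the helper defined just below hides the difference, and the rest of this diff (machine.c, session.c, hist.c, the python scripting engine) converts callers to use it instead of dereferencing bs->entries directly. A minimal consumer sketch, assuming perf's internal sample types from this series and its usual headers:

/* Sketch only: walk branches without knowing whether hw_idx is present */
static void print_branches(struct perf_sample *sample)
{
	struct branch_stack *bs = sample->branch_stack;
	struct branch_entry *entries = perf_sample__branch_entries(sample);
	u64 i;

	for (i = 0; i < bs->nr; i++)
		printf("%#" PRIx64 " -> %#" PRIx64 "\n",
		       entries[i].from, entries[i].to);
}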
+ */ +static inline struct branch_entry *perf_sample__branch_entries(struct perf_sample *sample) +{ + u64 *entry = (u64 *)sample->branch_stack; + + entry++; + if (sample->no_hw_idx) + return (struct branch_entry *)entry; + return (struct branch_entry *)(++entry); +} + struct branch_type_stat { bool branch_to; u64 counts[PERF_BR_MAX]; diff --git a/tools/perf/util/cgroup.c b/tools/perf/util/cgroup.c index 4881d4af3381..5bc9d3b01bd9 100644 --- a/tools/perf/util/cgroup.c +++ b/tools/perf/util/cgroup.c @@ -3,75 +3,16 @@ #include "evsel.h" #include "cgroup.h" #include "evlist.h" -#include <linux/stringify.h> #include <linux/zalloc.h> #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> #include <stdlib.h> #include <string.h> +#include <api/fs/fs.h> int nr_cgroups; -static int -cgroupfs_find_mountpoint(char *buf, size_t maxlen) -{ - FILE *fp; - char mountpoint[PATH_MAX + 1], tokens[PATH_MAX + 1], type[PATH_MAX + 1]; - char path_v1[PATH_MAX + 1], path_v2[PATH_MAX + 2], *path; - char *token, *saved_ptr = NULL; - - fp = fopen("/proc/mounts", "r"); - if (!fp) - return -1; - - /* - * in order to handle split hierarchy, we need to scan /proc/mounts - * and inspect every cgroupfs mount point to find one that has - * perf_event subsystem - */ - path_v1[0] = '\0'; - path_v2[0] = '\0'; - - while (fscanf(fp, "%*s %"__stringify(PATH_MAX)"s %"__stringify(PATH_MAX)"s %" - __stringify(PATH_MAX)"s %*d %*d\n", - mountpoint, type, tokens) == 3) { - - if (!path_v1[0] && !strcmp(type, "cgroup")) { - - token = strtok_r(tokens, ",", &saved_ptr); - - while (token != NULL) { - if (!strcmp(token, "perf_event")) { - strcpy(path_v1, mountpoint); - break; - } - token = strtok_r(NULL, ",", &saved_ptr); - } - } - - if (!path_v2[0] && !strcmp(type, "cgroup2")) - strcpy(path_v2, mountpoint); - - if (path_v1[0] && path_v2[0]) - break; - } - fclose(fp); - - if (path_v1[0]) - path = path_v1; - else if (path_v2[0]) - path = path_v2; - else - return -1; - - if (strlen(path) < maxlen) { - strcpy(buf, path); - return 0; - } - return -1; -} - static int open_cgroup(const char *name) { char path[PATH_MAX + 1]; @@ -79,7 +20,7 @@ static int open_cgroup(const char *name) int fd; - if (cgroupfs_find_mountpoint(mnt, PATH_MAX + 1)) + if (cgroupfs_find_mountpoint(mnt, PATH_MAX + 1, "perf_event")) return -1; scnprintf(path, PATH_MAX, "%s/%s", mnt, name); diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 5471045ebf5c..62d2f9b9ce1b 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -363,6 +363,23 @@ struct cs_etm_packet_queue return NULL; } +static void cs_etm__packet_swap(struct cs_etm_auxtrace *etm, + struct cs_etm_traceid_queue *tidq) +{ + struct cs_etm_packet *tmp; + + if (etm->sample_branches || etm->synth_opts.last_branch || + etm->sample_instructions) { + /* + * Swap PACKET with PREV_PACKET: PACKET becomes PREV_PACKET for + * the next incoming packet. 
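Returning briefly to the cgroup refactor above: mountpoint discovery now lives in libapi with the subsystem name as a parameter, so it is reusable outside util/cgroup.c. A minimal caller sketch (assumes the <api/fs/fs.h> header added in this series; error handling kept to the bare minimum):

#include <api/fs/fs.h>
#include <limits.h>
#include <stdio.h>

int main(void)
{
	char mnt[PATH_MAX + 1];

	/* Prefers a cgroup v1 mount carrying the perf_event controller,
	 * falling back to a cgroup2 mount if no v1 match is found. */
	if (cgroupfs_find_mountpoint(mnt, sizeof(mnt), "perf_event"))
		return 1;

	printf("perf_event cgroup at: %s\n", mnt);
	return 0;
}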
+ */ + tmp = tidq->packet; + tidq->packet = tidq->prev_packet; + tidq->prev_packet = tmp; + } +} + static void cs_etm__packet_dump(const char *pkt_string) { const char *color = PERF_COLOR_BLUE; @@ -945,7 +962,7 @@ static inline u64 cs_etm__instr_addr(struct cs_etm_queue *etmq, if (packet->isa == CS_ETM_ISA_T32) { u64 addr = packet->start_addr; - while (offset > 0) { + while (offset) { addr += cs_etm__t32_instr_size(etmq, trace_chan_id, addr); offset--; @@ -1134,10 +1151,8 @@ static int cs_etm__synth_instruction_sample(struct cs_etm_queue *etmq, cs_etm__copy_insn(etmq, tidq->trace_chan_id, tidq->packet, &sample); - if (etm->synth_opts.last_branch) { - cs_etm__copy_last_branch_rb(etmq, tidq); + if (etm->synth_opts.last_branch) sample.branch_stack = tidq->last_branch; - } if (etm->synth_opts.inject) { ret = cs_etm__inject_event(event, &sample, @@ -1153,9 +1168,6 @@ static int cs_etm__synth_instruction_sample(struct cs_etm_queue *etmq, "CS ETM Trace: failed to deliver instruction event, error %d\n", ret); - if (etm->synth_opts.last_branch) - cs_etm__reset_last_branch_rb(tidq); - return ret; } @@ -1172,6 +1184,7 @@ static int cs_etm__synth_branch_sample(struct cs_etm_queue *etmq, union perf_event *event = tidq->event_buf; struct dummy_branch_stack { u64 nr; + u64 hw_idx; struct branch_entry entries; } dummy_bs; u64 ip; @@ -1202,6 +1215,7 @@ static int cs_etm__synth_branch_sample(struct cs_etm_queue *etmq, if (etm->synth_opts.last_branch) { dummy_bs = (struct dummy_branch_stack){ .nr = 1, + .hw_idx = -1ULL, .entries = { .from = sample.ip, .to = sample.addr, @@ -1340,12 +1354,14 @@ static int cs_etm__sample(struct cs_etm_queue *etmq, struct cs_etm_traceid_queue *tidq) { struct cs_etm_auxtrace *etm = etmq->etm; - struct cs_etm_packet *tmp; int ret; u8 trace_chan_id = tidq->trace_chan_id; - u64 instrs_executed = tidq->packet->instr_count; + u64 instrs_prev; + + /* Get instructions remainder from previous packet */ + instrs_prev = tidq->period_instructions; - tidq->period_instructions += instrs_executed; + tidq->period_instructions += tidq->packet->instr_count; /* * Record a branch when the last instruction in @@ -1363,26 +1379,80 @@ static int cs_etm__sample(struct cs_etm_queue *etmq, * TODO: allow period to be defined in cycles and clock time */ - /* Get number of instructions executed after the sample point */ - u64 instrs_over = tidq->period_instructions - - etm->instructions_sample_period; + /* + * The diagram below demonstrates the instruction sample + * generation flow: + * + * Instrs Instrs Instrs Instrs + * Sample(n) Sample(n+1) Sample(n+2) Sample(n+3) + * | | | | + * V V V V + * -------------------------------------------------- + * ^ ^ + * | | + * Period Period + * instructions(Pi) instructions(Pi') + * + * | | + * \---------------- -----------------/ + * V + * tidq->packet->instr_count + * + * Instrs Sample(n...) are the synthesised samples occurring + * every etm->instructions_sample_period instructions - as + * defined on the perf command line. Sample(n) is the + * last sample before the current etm packet; samples n+1 to n+3 + * are generated from the current etm packet. + * + * tidq->packet->instr_count represents the number of + * instructions in the current etm packet. + * + * Period instructions (Pi) contains the number of + * instructions executed after the sample point(n) from the + * previous etm packet. This will always be less than + * etm->instructions_sample_period.
+ * + * When generating new samples, instructions from two parts are + * combined: the tail of the old packet and the head of the + * incoming packet together produce + * sample(n+1); sample(n+2) and sample(n+3) each consume a full + * sample period of instructions. After sample(n+3), the remaining + * instructions are left for later packets and are assigned + * to tidq->period_instructions for the next round of calculation. + */ /* - * Calculate the address of the sampled instruction (-1 as - * sample is reported as though instruction has just been - * executed, but PC has not advanced to next instruction) + * Get the initial offset into the current packet instructions; + * entry conditions ensure that instrs_prev is less than + * etm->instructions_sample_period. */ - u64 offset = (instrs_executed - instrs_over - 1); - u64 addr = cs_etm__instr_addr(etmq, trace_chan_id, - tidq->packet, offset); + u64 offset = etm->instructions_sample_period - instrs_prev; + u64 addr; - ret = cs_etm__synth_instruction_sample( - etmq, tidq, addr, etm->instructions_sample_period); - if (ret) - return ret; + /* Prepare last branches for instruction sample */ + if (etm->synth_opts.last_branch) + cs_etm__copy_last_branch_rb(etmq, tidq); - /* Carry remaining instructions into next sample period */ - tidq->period_instructions = instrs_over; + while (tidq->period_instructions >= + etm->instructions_sample_period) { + /* + * Calculate the address of the sampled instruction (-1 + * as sample is reported as though instruction has just + * been executed, but PC has not advanced to next + * instruction) + */ + addr = cs_etm__instr_addr(etmq, trace_chan_id, + tidq->packet, offset - 1); + ret = cs_etm__synth_instruction_sample( + etmq, tidq, addr, + etm->instructions_sample_period); + if (ret) + return ret; + + offset += etm->instructions_sample_period; + tidq->period_instructions -= + etm->instructions_sample_period; + } } if (etm->sample_branches) { @@ -1404,15 +1474,7 @@ static int cs_etm__sample(struct cs_etm_queue *etmq, } } - if (etm->sample_branches || etm->synth_opts.last_branch) { - /* - * Swap PACKET with PREV_PACKET: PACKET becomes PREV_PACKET for - * the next incoming packet.
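To make the arithmetic above concrete, a worked example with hypothetical numbers: a sample period of 10000, a 2000-instruction remainder carried in from the previous packet, and a current packet of 25000 instructions. The standalone program below mirrors the sampling loop introduced above (names shortened; illustrative only):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint64_t period = 10000;	/* etm->instructions_sample_period */
	uint64_t instrs_prev = 2000;	/* remainder from the previous packet */
	uint64_t period_instructions = instrs_prev + 25000; /* += instr_count */
	uint64_t offset = period - instrs_prev;

	while (period_instructions >= period) {
		/* prints 7999 and 17999: the sampled instruction is offset - 1 */
		printf("sample at packet offset %" PRIu64 "\n", offset - 1);
		offset += period;
		period_instructions -= period;
	}
	/* prints 7000, the remainder carried to the next packet */
	printf("remainder %" PRIu64 "\n", period_instructions);
	return 0;
}

Two samples are synthesised from this packet, and 7000 instructions remain in tidq->period_instructions for the next round.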
- */ - tmp = tidq->packet; - tidq->packet = tidq->prev_packet; - tidq->prev_packet = tmp; - } + cs_etm__packet_swap(etm, tidq); + + /* Reset last branches after flush the trace */ + if (etm->synth_opts.last_branch) + cs_etm__reset_last_branch_rb(tidq); return err; } @@ -1505,11 +1567,16 @@ static int cs_etm__end_block(struct cs_etm_queue *etmq, */ if (etmq->etm->synth_opts.last_branch && tidq->prev_packet->sample_type == CS_ETM_RANGE) { + u64 addr; + + /* Prepare last branches for instruction sample */ + cs_etm__copy_last_branch_rb(etmq, tidq); + /* * Use the address of the end of the last reported execution * range. */ - u64 addr = cs_etm__last_executed_instr(tidq->prev_packet); + addr = cs_etm__last_executed_instr(tidq->prev_packet); err = cs_etm__synth_instruction_sample( etmq, tidq, addr, diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 85223159737c..3cda40a2fafc 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -139,6 +139,7 @@ struct perf_sample { u16 insn_len; u8 cpumode; u16 misc; + bool no_hw_idx; /* No hw_idx collected in branch_stack */ char insn[MAX_INSN]; void *raw_data; struct ip_callchain *callchain; diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index c8dc4450884c..816d930d774e 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -712,7 +712,8 @@ static void __perf_evsel__config_callchain(struct evsel *evsel, attr->branch_sample_type = PERF_SAMPLE_BRANCH_USER | PERF_SAMPLE_BRANCH_CALL_STACK | PERF_SAMPLE_BRANCH_NO_CYCLES | - PERF_SAMPLE_BRANCH_NO_FLAGS; + PERF_SAMPLE_BRANCH_NO_FLAGS | + PERF_SAMPLE_BRANCH_HW_INDEX; } } else pr_warning("Cannot use LBR callstack with branch stack. " @@ -763,7 +764,8 @@ perf_evsel__reset_callgraph(struct evsel *evsel, if (param->record_mode == CALLCHAIN_LBR) { perf_evsel__reset_sample_bit(evsel, BRANCH_STACK); attr->branch_sample_type &= ~(PERF_SAMPLE_BRANCH_USER | - PERF_SAMPLE_BRANCH_CALL_STACK); + PERF_SAMPLE_BRANCH_CALL_STACK | + PERF_SAMPLE_BRANCH_HW_INDEX); } if (param->record_mode == CALLCHAIN_DWARF) { perf_evsel__reset_sample_bit(evsel, REGS_USER); @@ -1673,6 +1675,8 @@ fallback_missing_features: evsel->core.attr.ksymbol = 0; if (perf_missing_features.bpf) evsel->core.attr.bpf_event = 0; + if (perf_missing_features.branch_hw_idx) + evsel->core.attr.branch_sample_type &= ~PERF_SAMPLE_BRANCH_HW_INDEX; retry_sample_id: if (perf_missing_features.sample_id_all) evsel->core.attr.sample_id_all = 0; @@ -1784,7 +1788,12 @@ try_fallback: * Must probe features in the order they were added to the * perf_event_attr interface. 
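The evsel.c hunk that follows wires the new bit into perf's missing-feature probing: if the running kernel rejects PERF_SAMPLE_BRANCH_HW_INDEX, the flag is cleared and the open is retried. A simplified, syscall-level illustration of that probe-and-fallback pattern (this is the general idea, not perf's actual code path, and it assumes a perf_event.h new enough to define the flag):

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <errno.h>

static int open_counter(struct perf_event_attr *attr)
{
	int fd = syscall(__NR_perf_event_open, attr, 0, -1, -1, 0);

	/* Older kernels reject unknown branch_sample_type bits with EINVAL. */
	if (fd < 0 && errno == EINVAL &&
	    (attr->branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX)) {
		attr->branch_sample_type &= ~PERF_SAMPLE_BRANCH_HW_INDEX;
		fd = syscall(__NR_perf_event_open, attr, 0, -1, -1, 0);
	}
	return fd;
}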
*/ - if (!perf_missing_features.aux_output && evsel->core.attr.aux_output) { + if (!perf_missing_features.branch_hw_idx && + (evsel->core.attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX)) { + perf_missing_features.branch_hw_idx = true; + pr_debug2("switching off branch HW index support\n"); + goto fallback_missing_features; + } else if (!perf_missing_features.aux_output && evsel->core.attr.aux_output) { perf_missing_features.aux_output = true; pr_debug2_peo("Kernel has no attr.aux_output support, bailing out\n"); goto out_close; @@ -2169,7 +2178,12 @@ int perf_evsel__parse_sample(struct evsel *evsel, union perf_event *event, if (data->branch_stack->nr > max_branch_nr) return -EFAULT; + sz = data->branch_stack->nr * sizeof(struct branch_entry); + if (perf_evsel__has_branch_hw_idx(evsel)) + sz += sizeof(u64); + else + data->no_hw_idx = true; OVERFLOW_CHECK(array, sz, max_size); array = (void *)array + sz; } diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index dc14f4a823cd..33804740e2ca 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -119,6 +119,7 @@ struct perf_missing_features { bool ksymbol; bool bpf; bool aux_output; + bool branch_hw_idx; }; extern struct perf_missing_features perf_missing_features; @@ -389,6 +390,11 @@ static inline bool perf_evsel__has_branch_callstack(const struct evsel *evsel) return evsel->core.attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK; } +static inline bool perf_evsel__has_branch_hw_idx(const struct evsel *evsel) +{ + return evsel->core.attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX; +} + static inline bool evsel__has_callchain(const struct evsel *evsel) { return (evsel->core.attr.sample_type & PERF_SAMPLE_CALLCHAIN) != 0; diff --git a/tools/perf/util/expr.c b/tools/perf/util/expr.c new file mode 100644 index 000000000000..fd192ddf93c1 --- /dev/null +++ b/tools/perf/util/expr.c @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <stdbool.h> +#include <assert.h> +#include "expr.h" +#include "expr-bison.h" +#define YY_EXTRA_TYPE int +#include "expr-flex.h" + +#ifdef PARSER_DEBUG +extern int expr_debug; +#endif + +/* Caller must make sure id is allocated */ +void expr__add_id(struct parse_ctx *ctx, const char *name, double val) +{ + int idx; + + assert(ctx->num_ids < MAX_PARSE_ID); + idx = ctx->num_ids++; + ctx->ids[idx].name = name; + ctx->ids[idx].val = val; +} + +void expr__ctx_init(struct parse_ctx *ctx) +{ + ctx->num_ids = 0; +} + +static int +__expr__parse(double *val, struct parse_ctx *ctx, const char *expr, + int start) +{ + YY_BUFFER_STATE buffer; + void *scanner; + int ret; + + ret = expr_lex_init_extra(start, &scanner); + if (ret) + return ret; + + buffer = expr__scan_string(expr, scanner); + +#ifdef PARSER_DEBUG + expr_debug = 1; +#endif + + ret = expr_parse(val, ctx, scanner); + + expr__flush_buffer(buffer, scanner); + expr__delete_buffer(buffer, scanner); + expr_lex_destroy(scanner); + return ret; +} + +int expr__parse(double *final_val, struct parse_ctx *ctx, const char *expr) +{ + return __expr__parse(final_val, ctx, expr, EXPR_PARSE) ? 
-1 : 0; +} + +static bool +already_seen(const char *val, const char *one, const char **other, + int num_other) +{ + int i; + + if (one && !strcasecmp(one, val)) + return true; + for (i = 0; i < num_other; i++) + if (!strcasecmp(other[i], val)) + return true; + return false; +} + +int expr__find_other(const char *expr, const char *one, const char ***other, + int *num_other) +{ + int err, i = 0, j = 0; + struct parse_ctx ctx; + + expr__ctx_init(&ctx); + err = __expr__parse(NULL, &ctx, expr, EXPR_OTHER); + if (err) + return -1; + + *other = malloc((ctx.num_ids + 1) * sizeof(char *)); + if (!*other) + return -ENOMEM; + + for (i = 0, j = 0; i < ctx.num_ids; i++) { + const char *str = ctx.ids[i].name; + + if (already_seen(str, one, *other, j)) + continue; + + str = strdup(str); + if (!str) + goto out; + (*other)[j++] = str; + } + (*other)[j] = NULL; + +out: + if (i != ctx.num_ids) { + while (--j) + free((char *) (*other)[i]); + free(*other); + err = -1; + } + + *num_other = j; + return err; +} diff --git a/tools/perf/util/expr.h b/tools/perf/util/expr.h index 046160831f90..9377538f4097 100644 --- a/tools/perf/util/expr.h +++ b/tools/perf/util/expr.h @@ -2,7 +2,7 @@ #ifndef PARSE_CTX_H #define PARSE_CTX_H 1 -#define EXPR_MAX_OTHER 15 +#define EXPR_MAX_OTHER 20 #define MAX_PARSE_ID EXPR_MAX_OTHER struct parse_id { @@ -17,10 +17,8 @@ struct parse_ctx { void expr__ctx_init(struct parse_ctx *ctx); void expr__add_id(struct parse_ctx *ctx, const char *id, double val); -#ifndef IN_EXPR_Y -int expr__parse(double *final_val, struct parse_ctx *ctx, const char **pp); -#endif -int expr__find_other(const char *p, const char *one, const char ***other, +int expr__parse(double *final_val, struct parse_ctx *ctx, const char *expr); +int expr__find_other(const char *expr, const char *one, const char ***other, int *num_other); #endif diff --git a/tools/perf/util/expr.l b/tools/perf/util/expr.l new file mode 100644 index 000000000000..eaad29243c23 --- /dev/null +++ b/tools/perf/util/expr.l @@ -0,0 +1,114 @@ +%option prefix="expr_" +%option reentrant +%option bison-bridge + +%{ +#include <linux/compiler.h> +#include "expr.h" +#include "expr-bison.h" + +char *expr_get_text(yyscan_t yyscanner); +YYSTYPE *expr_get_lval(yyscan_t yyscanner); + +static int __value(YYSTYPE *yylval, char *str, int base, int token) +{ + u64 num; + + errno = 0; + num = strtoull(str, NULL, base); + if (errno) + return EXPR_ERROR; + + yylval->num = num; + return token; +} + +static int value(yyscan_t scanner, int base) +{ + YYSTYPE *yylval = expr_get_lval(scanner); + char *text = expr_get_text(scanner); + + return __value(yylval, text, base, NUMBER); +} + +/* + * Allow @ instead of / to be able to specify pmu/event/ without + * conflicts with normal division. 
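With the move to a generated, reentrant scanner, expr__parse() takes the expression string directly and returns 0 on success and -1 on error, which is exactly what the updated tests/expr.c earlier in this diff exercises. A minimal caller sketch against the util/expr.h API (values illustrative):

#include <stdio.h>
#include "expr.h"	/* tools/perf/util/expr.h */

int main(void)
{
	struct parse_ctx ctx;
	double val;

	expr__ctx_init(&ctx);
	expr__add_id(&ctx, "FOO", 2.0);	/* caller keeps the id string alive */
	expr__add_id(&ctx, "BAR", 3.0);

	if (expr__parse(&val, &ctx, "FOO + BAR * 2") == 0)
		printf("= %f\n", val);	/* prints 8.000000 */
	else
		fprintf(stderr, "parse error\n");
	return 0;
}

The normalize() helper defined next is what lets metric expressions write pmu@event@ where a literal / would be ambiguous with division: the scanner rewrites @ to / before the identifier reaches the parser.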
+ */ +static char *normalize(char *str) +{ + char *ret = str; + char *dst = str; + + while (*str) { + if (*str == '@') + *dst++ = '/'; + else if (*str == '\\') + *dst++ = *++str; + else + *dst++ = *str; + str++; + } + + *dst = 0x0; + return ret; +} + +static int str(yyscan_t scanner, int token) +{ + YYSTYPE *yylval = expr_get_lval(scanner); + char *text = expr_get_text(scanner); + + yylval->str = normalize(strdup(text)); + if (!yylval->str) + return EXPR_ERROR; + + yylval->str = normalize(yylval->str); + return token; +} +%} + +number [0-9]+ + +sch [-,=] +spec \\{sch} +sym [0-9a-zA-Z_\.:@]+ +symbol {spec}*{sym}*{spec}*{sym}* + +%% + { + int start_token; + + start_token = expr_get_extra(yyscanner); + + if (start_token) { + expr_set_extra(NULL, yyscanner); + return start_token; + } + } + +max { return MAX; } +min { return MIN; } +if { return IF; } +else { return ELSE; } +#smt_on { return SMT_ON; } +{number} { return value(yyscanner, 10); } +{symbol} { return str(yyscanner, ID); } +"|" { return '|'; } +"^" { return '^'; } +"&" { return '&'; } +"-" { return '-'; } +"+" { return '+'; } +"*" { return '*'; } +"/" { return '/'; } +"%" { return '%'; } +"(" { return '('; } +")" { return ')'; } +"," { return ','; } +. { } +%% + +int expr_wrap(void *scanner __maybe_unused) +{ + return 1; +} diff --git a/tools/perf/util/expr.y b/tools/perf/util/expr.y index 7d226241f1d7..4720cbe79357 100644 --- a/tools/perf/util/expr.y +++ b/tools/perf/util/expr.y @@ -1,31 +1,32 @@ /* Simple expression parser */ %{ +#define YYDEBUG 1 +#include <stdio.h> #include "util.h" #include "util/debug.h" #include <stdlib.h> // strtod() #define IN_EXPR_Y 1 #include "expr.h" #include "smt.h" -#include <assert.h> #include <string.h> -#define MAXIDLEN 256 %} %define api.pure full %parse-param { double *final_val } %parse-param { struct parse_ctx *ctx } -%parse-param { const char **pp } -%lex-param { const char **pp } +%parse-param {void *scanner} +%lex-param {void* scanner} %union { - double num; - char id[MAXIDLEN+1]; + double num; + char *str; } +%token EXPR_PARSE EXPR_OTHER EXPR_ERROR %token <num> NUMBER -%token <id> ID +%token <str> ID %token MIN MAX IF ELSE SMT_ON %left MIN MAX IF %left '|' @@ -37,11 +38,9 @@ %type <num> expr if_expr %{ -static int expr__lex(YYSTYPE *res, const char **pp); - -static void expr__error(double *final_val __maybe_unused, +static void expr_error(double *final_val __maybe_unused, struct parse_ctx *ctx __maybe_unused, - const char **pp __maybe_unused, + void *scanner, const char *s) { pr_debug("%s\n", s); @@ -63,6 +62,27 @@ static int lookup_id(struct parse_ctx *ctx, char *id, double *val) %} %% +start: +EXPR_PARSE all_expr +| +EXPR_OTHER all_other + +all_other: all_other other +| + +other: ID +{ + if (ctx->num_ids + 1 >= EXPR_MAX_OTHER) { + pr_err("failed: way too many variables"); + YYABORT; + } + + ctx->ids[ctx->num_ids++].name = $1; +} +| +MIN | MAX | IF | ELSE | SMT_ON | NUMBER | '|' | '^' | '&' | '-' | '+' | '*' | '/' | '%' | '(' | ')' + + all_expr: if_expr { *final_val = $1; } ; @@ -93,146 +113,3 @@ expr: NUMBER ; %% - -static int expr__symbol(YYSTYPE *res, const char *p, const char **pp) -{ - char *dst = res->id; - const char *s = p; - - if (*p == '#') - *dst++ = *p++; - - while (isalnum(*p) || *p == '_' || *p == '.' || *p == ':' || *p == '@' || *p == '\\') { - if (p - s >= MAXIDLEN) - return -1; - /* - * Allow @ instead of / to be able to specify pmu/event/ without - * conflicts with normal division. 
- */ - if (*p == '@') - *dst++ = '/'; - else if (*p == '\\') - *dst++ = *++p; - else - *dst++ = *p; - p++; - } - *dst = 0; - *pp = p; - dst = res->id; - switch (dst[0]) { - case 'm': - if (!strcmp(dst, "min")) - return MIN; - if (!strcmp(dst, "max")) - return MAX; - break; - case 'i': - if (!strcmp(dst, "if")) - return IF; - break; - case 'e': - if (!strcmp(dst, "else")) - return ELSE; - break; - case '#': - if (!strcasecmp(dst, "#smt_on")) - return SMT_ON; - break; - } - return ID; -} - -static int expr__lex(YYSTYPE *res, const char **pp) -{ - int tok; - const char *s; - const char *p = *pp; - - while (isspace(*p)) - p++; - s = p; - switch (*p++) { - case '#': - case 'a' ... 'z': - case 'A' ... 'Z': - return expr__symbol(res, p - 1, pp); - case '0' ... '9': case '.': - res->num = strtod(s, (char **)&p); - tok = NUMBER; - break; - default: - tok = *s; - break; - } - *pp = p; - return tok; -} - -/* Caller must make sure id is allocated */ -void expr__add_id(struct parse_ctx *ctx, const char *name, double val) -{ - int idx; - assert(ctx->num_ids < MAX_PARSE_ID); - idx = ctx->num_ids++; - ctx->ids[idx].name = name; - ctx->ids[idx].val = val; -} - -void expr__ctx_init(struct parse_ctx *ctx) -{ - ctx->num_ids = 0; -} - -static bool already_seen(const char *val, const char *one, const char **other, - int num_other) -{ - int i; - - if (one && !strcasecmp(one, val)) - return true; - for (i = 0; i < num_other; i++) - if (!strcasecmp(other[i], val)) - return true; - return false; -} - -int expr__find_other(const char *p, const char *one, const char ***other, - int *num_otherp) -{ - const char *orig = p; - int err = -1; - int num_other; - - *other = malloc((EXPR_MAX_OTHER + 1) * sizeof(char *)); - if (!*other) - return -1; - - num_other = 0; - for (;;) { - YYSTYPE val; - int tok = expr__lex(&val, &p); - if (tok == 0) { - err = 0; - break; - } - if (tok == ID && !already_seen(val.id, one, *other, num_other)) { - if (num_other >= EXPR_MAX_OTHER - 1) { - pr_debug("Too many extra events in %s\n", orig); - break; - } - (*other)[num_other] = strdup(val.id); - if (!(*other)[num_other]) - return -1; - num_other++; - } - } - (*other)[num_other] = NULL; - *num_otherp = num_other; - if (err) { - *num_otherp = 0; - free(*other); - *other = NULL; - } - return err; -} diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 4246e7447e54..acbd046bf95c 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -1590,6 +1590,40 @@ static void free_event_desc(struct evsel *events) free(events); } +static bool perf_attr_check(struct perf_event_attr *attr) +{ + if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) { + pr_warning("Reserved bits are set unexpectedly. " + "Please update perf tool.\n"); + return false; + } + + if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) { + pr_warning("Unknown sample type (0x%llx) is detected. " + "Please update perf tool.\n", + attr->sample_type); + return false; + } + + if (attr->read_format & ~(PERF_FORMAT_MAX-1)) { + pr_warning("Unknown read format (0x%llx) is detected. " + "Please update perf tool.\n", + attr->read_format); + return false; + } + + if ((attr->sample_type & PERF_SAMPLE_BRANCH_STACK) && + (attr->branch_sample_type & ~(PERF_SAMPLE_BRANCH_MAX-1))) { + pr_warning("Unknown branch sample type (0x%llx) is detected. 
" + "Please update perf tool.\n", + attr->branch_sample_type); + + return false; + } + + return true; +} + static struct evsel *read_event_desc(struct feat_fd *ff) { struct evsel *evsel, *events = NULL; @@ -1634,6 +1668,9 @@ static struct evsel *read_event_desc(struct feat_fd *ff) memcpy(&evsel->core.attr, buf, msz); + if (!perf_attr_check(&evsel->core.attr)) + goto error; + if (do_read_u32(ff, &nr)) goto error; diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index ca5a8f4d007e..e74a5acf66d9 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -2584,9 +2584,10 @@ void hist__account_cycles(struct branch_stack *bs, struct addr_location *al, u64 *total_cycles) { struct branch_info *bi; + struct branch_entry *entries = perf_sample__branch_entries(sample); /* If we have branch cycles always annotate them. */ - if (bs && bs->nr && bs->entries[0].flags.cycles) { + if (bs && bs->nr && entries[0].flags.cycles) { int i; bi = sample__resolve_bstack(sample, al); diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index 33cf8928cf05..23c8289c2472 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -1295,6 +1295,7 @@ static int intel_pt_synth_branch_sample(struct intel_pt_queue *ptq) struct perf_sample sample = { .ip = 0, }; struct dummy_branch_stack { u64 nr; + u64 hw_idx; struct branch_entry entries; } dummy_bs; @@ -1316,6 +1317,7 @@ static int intel_pt_synth_branch_sample(struct intel_pt_queue *ptq) if (pt->synth_opts.last_branch && sort__mode == SORT_MODE__BRANCH) { dummy_bs = (struct dummy_branch_stack){ .nr = 1, + .hw_idx = -1ULL, .entries = { .from = sample.ip, .to = sample.addr, diff --git a/tools/perf/util/llvm-utils.c b/tools/perf/util/llvm-utils.c index b5af680fc667..dbdffb6673fe 100644 --- a/tools/perf/util/llvm-utils.c +++ b/tools/perf/util/llvm-utils.c @@ -265,6 +265,8 @@ static int detect_kbuild_dir(char **kbuild_dir) return -ENOMEM; return 0; } + pr_debug("%s: Couldn't find \"%s\", missing kernel-devel package?.\n", + __func__, autoconf_path); free(autoconf_path); return -ENOENT; } diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index fb5c2cd44d30..fd14f1489802 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -2081,15 +2081,16 @@ struct branch_info *sample__resolve_bstack(struct perf_sample *sample, { unsigned int i; const struct branch_stack *bs = sample->branch_stack; + struct branch_entry *entries = perf_sample__branch_entries(sample); struct branch_info *bi = calloc(bs->nr, sizeof(struct branch_info)); if (!bi) return NULL; for (i = 0; i < bs->nr; i++) { - ip__resolve_ams(al->thread, &bi[i].to, bs->entries[i].to); - ip__resolve_ams(al->thread, &bi[i].from, bs->entries[i].from); - bi[i].flags = bs->entries[i].flags; + ip__resolve_ams(al->thread, &bi[i].to, entries[i].to); + ip__resolve_ams(al->thread, &bi[i].from, entries[i].from); + bi[i].flags = entries[i].flags; } return bi; } @@ -2185,6 +2186,7 @@ static int resolve_lbr_callchain_sample(struct thread *thread, /* LBR only affects the user callchain */ if (i != chain_nr) { struct branch_stack *lbr_stack = sample->branch_stack; + struct branch_entry *entries = perf_sample__branch_entries(sample); int lbr_nr = lbr_stack->nr, j, k; bool branch; struct branch_flags *flags; @@ -2210,31 +2212,29 @@ static int resolve_lbr_callchain_sample(struct thread *thread, ip = chain->ips[j]; else if (j > i + 1) { k = j - i - 2; - ip = lbr_stack->entries[k].from; + ip = entries[k].from; branch = true; - flags = &lbr_stack->entries[k].flags; 
+ flags = &entries[k].flags; } else { - ip = lbr_stack->entries[0].to; + ip = entries[0].to; branch = true; - flags = &lbr_stack->entries[0].flags; - branch_from = - lbr_stack->entries[0].from; + flags = &entries[0].flags; + branch_from = entries[0].from; } } else { if (j < lbr_nr) { k = lbr_nr - j - 1; - ip = lbr_stack->entries[k].from; + ip = entries[k].from; branch = true; - flags = &lbr_stack->entries[k].flags; + flags = &entries[k].flags; } else if (j > lbr_nr) ip = chain->ips[i + 1 - (j - lbr_nr)]; else { - ip = lbr_stack->entries[0].to; + ip = entries[0].to; branch = true; - flags = &lbr_stack->entries[0].flags; - branch_from = - lbr_stack->entries[0].from; + flags = &entries[0].flags; + branch_from = entries[0].from; } } @@ -2281,6 +2281,7 @@ static int thread__resolve_callchain_sample(struct thread *thread, int max_stack) { struct branch_stack *branch = sample->branch_stack; + struct branch_entry *entries = perf_sample__branch_entries(sample); struct ip_callchain *chain = sample->callchain; int chain_nr = 0; u8 cpumode = PERF_RECORD_MISC_USER; @@ -2328,7 +2329,7 @@ static int thread__resolve_callchain_sample(struct thread *thread, for (i = 0; i < nr; i++) { if (callchain_param.order == ORDER_CALLEE) { - be[i] = branch->entries[i]; + be[i] = entries[i]; if (chain == NULL) continue; @@ -2347,7 +2348,7 @@ static int thread__resolve_callchain_sample(struct thread *thread, be[i].from >= chain->ips[first_call] - 8) first_call++; } else - be[i] = branch->entries[branch->nr - i - 1]; + be[i] = entries[branch->nr - i - 1]; } memset(iter, 0, sizeof(struct iterations) * nr); diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index b342f744b1fc..53d96611e6a6 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -44,8 +44,8 @@ static inline int is_no_dso_memory(const char *filename) static inline int is_android_lib(const char *filename) { - return !strncmp(filename, "/data/app-lib", 13) || - !strncmp(filename, "/system/lib", 11); + return strstarts(filename, "/data/app-lib/") || + strstarts(filename, "/system/lib/"); } static inline bool replace_android_lib(const char *filename, char *newfilename) @@ -65,7 +65,7 @@ static inline bool replace_android_lib(const char *filename, char *newfilename) app_abi_length = strlen(app_abi); - if (!strncmp(filename, "/data/app-lib", 13)) { + if (strstarts(filename, "/data/app-lib/")) { char *apk_path; if (!app_abi_length) @@ -89,7 +89,7 @@ static inline bool replace_android_lib(const char *filename, char *newfilename) return true; } - if (!strncmp(filename, "/system/lib/", 12)) { + if (strstarts(filename, "/system/lib/")) { char *ndk, *app; const char *arch; size_t ndk_length; diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c index 02aee946b6c1..c3a8c701609a 100644 --- a/tools/perf/util/metricgroup.c +++ b/tools/perf/util/metricgroup.c @@ -22,6 +22,8 @@ #include <linux/string.h> #include <linux/zalloc.h> #include <subcmd/parse-options.h> +#include <api/fs/fs.h> +#include "util.h" struct metric_event *metricgroup__lookup(struct rblist *metric_events, struct evsel *evsel, @@ -399,13 +401,85 @@ void metricgroup__print(bool metrics, bool metricgroups, char *filter, strlist__delete(metriclist); } +static void metricgroup__add_metric_weak_group(struct strbuf *events, + const char **ids, + int idnum) +{ + bool no_group = false; + int i; + + for (i = 0; i < idnum; i++) { + pr_debug("found event %s\n", ids[i]); + /* + * Duration time maps to a software event and can make + * groups not count. 
Always use it outside a + * group. + */ + if (!strcmp(ids[i], "duration_time")) { + if (i > 0) + strbuf_addf(events, "}:W,"); + strbuf_addf(events, "duration_time"); + no_group = true; + continue; + } + strbuf_addf(events, "%s%s", + i == 0 || no_group ? "{" : ",", + ids[i]); + no_group = false; + } + if (!no_group) + strbuf_addf(events, "}:W"); +} + +static void metricgroup__add_metric_non_group(struct strbuf *events, + const char **ids, + int idnum) +{ + int i; + + for (i = 0; i < idnum; i++) + strbuf_addf(events, ",%s", ids[i]); +} + +static void metricgroup___watchdog_constraint_hint(const char *name, bool foot) +{ + static bool violate_nmi_constraint; + + if (!foot) { + pr_warning("Splitting metric group %s into standalone metrics.\n", name); + violate_nmi_constraint = true; + return; + } + + if (!violate_nmi_constraint) + return; + + pr_warning("Try disabling the NMI watchdog to comply with the NO_NMI_WATCHDOG metric constraint:\n" + " echo 0 > /proc/sys/kernel/nmi_watchdog\n" + " perf stat ...\n" + " echo 1 > /proc/sys/kernel/nmi_watchdog\n"); +} + +static bool metricgroup__has_constraint(struct pmu_event *pe) +{ + if (!pe->metric_constraint) + return false; + + if (!strcmp(pe->metric_constraint, "NO_NMI_WATCHDOG") && + sysctl__nmi_watchdog_enabled()) { + metricgroup___watchdog_constraint_hint(pe->metric_name, false); + return true; + } + + return false; +} + static int metricgroup__add_metric(const char *metric, struct strbuf *events, struct list_head *group_list) { struct pmu_events_map *map = perf_pmu__find_map(NULL); struct pmu_event *pe; - int ret = -EINVAL; - int i, j; + int i, ret = -EINVAL; if (!map) return 0; @@ -422,7 +496,6 @@ static int metricgroup__add_metric(const char *metric, struct strbuf *events, const char **ids; int idnum; struct egroup *eg; - bool no_group = false; pr_debug("metric expr %s for %s\n", pe->metric_expr, pe->metric_name); @@ -431,27 +504,11 @@ static int metricgroup__add_metric(const char *metric, struct strbuf *events, continue; if (events->len > 0) strbuf_addf(events, ","); - for (j = 0; j < idnum; j++) { - pr_debug("found event %s\n", ids[j]); - /* - * Duration time maps to a software event and can make - * groups not count. Always use it outside a - * group. - */ - if (!strcmp(ids[j], "duration_time")) { - if (j > 0) - strbuf_addf(events, "}:W,"); - strbuf_addf(events, "duration_time"); - no_group = true; - continue; - } - strbuf_addf(events, "%s%s", - j == 0 || no_group ?
"{" : ",", - ids[j]); - no_group = false; - } - if (!no_group) - strbuf_addf(events, "}:W"); + + if (metricgroup__has_constraint(pe)) + metricgroup__add_metric_non_group(events, ids, idnum); + else + metricgroup__add_metric_weak_group(events, ids, idnum); eg = malloc(sizeof(struct egroup)); if (!eg) { @@ -493,6 +550,10 @@ static int metricgroup__add_metric_list(const char *list, struct strbuf *events, } } free(nlist); + + if (!ret) + metricgroup___watchdog_constraint_hint(NULL, true); + return ret; } diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index 3b664fa673a6..ab7108d22428 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -98,20 +98,29 @@ static int perf_mmap__aio_bind(struct mmap *map, int idx, int cpu, int affinity) { void *data; size_t mmap_len; - unsigned long node_mask; + unsigned long *node_mask; + unsigned long node_index; + int err = 0; if (affinity != PERF_AFFINITY_SYS && cpu__max_node() > 1) { data = map->aio.data[idx]; mmap_len = mmap__mmap_len(map); - node_mask = 1UL << cpu__get_node(cpu); - if (mbind(data, mmap_len, MPOL_BIND, &node_mask, 1, 0)) { - pr_err("Failed to bind [%p-%p] AIO buffer to node %d: error %m\n", - data, data + mmap_len, cpu__get_node(cpu)); + node_index = cpu__get_node(cpu); + node_mask = bitmap_alloc(node_index + 1); + if (!node_mask) { + pr_err("Failed to allocate node mask for mbind: error %m\n"); return -1; } + set_bit(node_index, node_mask); + if (mbind(data, mmap_len, MPOL_BIND, node_mask, node_index + 1 + 1, 0)) { + pr_err("Failed to bind [%p-%p] AIO buffer to node %lu: error %m\n", + data, data + mmap_len, node_index); + err = -1; + } + bitmap_free(node_mask); } - return 0; + return err; } #else /* !HAVE_LIBNUMA_SUPPORT */ static int perf_mmap__aio_alloc(struct mmap *map, int idx) diff --git a/tools/perf/util/perf_event_attr_fprintf.c b/tools/perf/util/perf_event_attr_fprintf.c index 651203126c71..355d3458d4e6 100644 --- a/tools/perf/util/perf_event_attr_fprintf.c +++ b/tools/perf/util/perf_event_attr_fprintf.c @@ -50,6 +50,7 @@ static void __p_branch_sample_type(char *buf, size_t size, u64 value) bit_name(ABORT_TX), bit_name(IN_TX), bit_name(NO_TX), bit_name(COND), bit_name(CALL_STACK), bit_name(IND_JUMP), bit_name(CALL), bit_name(NO_FLAGS), bit_name(NO_CYCLES), + bit_name(HW_INDEX), { .name = NULL, } }; #undef bit_name diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index 80ca5d0ab7fe..8c1b27cd8b99 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -464,6 +464,7 @@ static PyObject *python_process_brstack(struct perf_sample *sample, struct thread *thread) { struct branch_stack *br = sample->branch_stack; + struct branch_entry *entries = perf_sample__branch_entries(sample); PyObject *pylist; u64 i; @@ -484,28 +485,28 @@ static PyObject *python_process_brstack(struct perf_sample *sample, Py_FatalError("couldn't create Python dictionary"); pydict_set_item_string_decref(pyelem, "from", - PyLong_FromUnsignedLongLong(br->entries[i].from)); + PyLong_FromUnsignedLongLong(entries[i].from)); pydict_set_item_string_decref(pyelem, "to", - PyLong_FromUnsignedLongLong(br->entries[i].to)); + PyLong_FromUnsignedLongLong(entries[i].to)); pydict_set_item_string_decref(pyelem, "mispred", - PyBool_FromLong(br->entries[i].flags.mispred)); + PyBool_FromLong(entries[i].flags.mispred)); pydict_set_item_string_decref(pyelem, "predicted", - 
PyBool_FromLong(br->entries[i].flags.predicted)); + PyBool_FromLong(entries[i].flags.predicted)); pydict_set_item_string_decref(pyelem, "in_tx", - PyBool_FromLong(br->entries[i].flags.in_tx)); + PyBool_FromLong(entries[i].flags.in_tx)); pydict_set_item_string_decref(pyelem, "abort", - PyBool_FromLong(br->entries[i].flags.abort)); + PyBool_FromLong(entries[i].flags.abort)); pydict_set_item_string_decref(pyelem, "cycles", - PyLong_FromUnsignedLongLong(br->entries[i].flags.cycles)); + PyLong_FromUnsignedLongLong(entries[i].flags.cycles)); thread__find_map_fb(thread, sample->cpumode, - br->entries[i].from, &al); + entries[i].from, &al); dsoname = get_dsoname(al.map); pydict_set_item_string_decref(pyelem, "from_dsoname", _PyUnicode_FromString(dsoname)); thread__find_map_fb(thread, sample->cpumode, - br->entries[i].to, &al); + entries[i].to, &al); dsoname = get_dsoname(al.map); pydict_set_item_string_decref(pyelem, "to_dsoname", _PyUnicode_FromString(dsoname)); @@ -561,6 +562,7 @@ static PyObject *python_process_brstacksym(struct perf_sample *sample, struct thread *thread) { struct branch_stack *br = sample->branch_stack; + struct branch_entry *entries = perf_sample__branch_entries(sample); PyObject *pylist; u64 i; char bf[512]; @@ -581,22 +583,22 @@ static PyObject *python_process_brstacksym(struct perf_sample *sample, Py_FatalError("couldn't create Python dictionary"); thread__find_symbol_fb(thread, sample->cpumode, - br->entries[i].from, &al); + entries[i].from, &al); get_symoff(al.sym, &al, true, bf, sizeof(bf)); pydict_set_item_string_decref(pyelem, "from", _PyUnicode_FromString(bf)); thread__find_symbol_fb(thread, sample->cpumode, - br->entries[i].to, &al); + entries[i].to, &al); get_symoff(al.sym, &al, true, bf, sizeof(bf)); pydict_set_item_string_decref(pyelem, "to", _PyUnicode_FromString(bf)); - get_br_mspred(&br->entries[i].flags, bf, sizeof(bf)); + get_br_mspred(&entries[i].flags, bf, sizeof(bf)); pydict_set_item_string_decref(pyelem, "pred", _PyUnicode_FromString(bf)); - if (br->entries[i].flags.in_tx) { + if (entries[i].flags.in_tx) { pydict_set_item_string_decref(pyelem, "in_tx", _PyUnicode_FromString("X")); } else { @@ -604,7 +606,7 @@ static PyObject *python_process_brstacksym(struct perf_sample *sample, _PyUnicode_FromString("-")); } - if (br->entries[i].flags.abort) { + if (entries[i].flags.abort) { pydict_set_item_string_decref(pyelem, "abort", _PyUnicode_FromString("A")); } else { diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index d0d7d25b23e3..055b00abd56d 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1007,6 +1007,7 @@ static void callchain__lbr_callstack_printf(struct perf_sample *sample) { struct ip_callchain *callchain = sample->callchain; struct branch_stack *lbr_stack = sample->branch_stack; + struct branch_entry *entries = perf_sample__branch_entries(sample); u64 kernel_callchain_nr = callchain->nr; unsigned int i; @@ -1043,10 +1044,10 @@ static void callchain__lbr_callstack_printf(struct perf_sample *sample) i, callchain->ips[i]); printf("..... %2d: %016" PRIx64 "\n", - (int)(kernel_callchain_nr), lbr_stack->entries[0].to); + (int)(kernel_callchain_nr), entries[0].to); for (i = 0; i < lbr_stack->nr; i++) printf("..... 
%2d: %016" PRIx64 "\n", - (int)(i + kernel_callchain_nr + 1), lbr_stack->entries[i].from); + (int)(i + kernel_callchain_nr + 1), entries[i].from); } } @@ -1068,6 +1069,7 @@ static void callchain__printf(struct evsel *evsel, static void branch_stack__printf(struct perf_sample *sample, bool callstack) { + struct branch_entry *entries = perf_sample__branch_entries(sample); uint64_t i; printf("%s: nr:%" PRIu64 "\n", @@ -1075,7 +1077,7 @@ static void branch_stack__printf(struct perf_sample *sample, bool callstack) sample->branch_stack->nr); for (i = 0; i < sample->branch_stack->nr; i++) { - struct branch_entry *e = &sample->branch_stack->entries[i]; + struct branch_entry *e = &entries[i]; if (!callstack) { printf("..... %2"PRIu64": %016" PRIx64 " -> %016" PRIx64 " %hu cycles %s%s%s%s %x\n", diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c index bc31fccc0057..76c6052b12e2 100644 --- a/tools/perf/util/stat-display.c +++ b/tools/perf/util/stat-display.c @@ -16,6 +16,7 @@ #include <linux/ctype.h> #include "cgroup.h" #include <api/fs/fs.h> +#include "util.h" #define CNTR_NOT_SUPPORTED "<not supported>" #define CNTR_NOT_COUNTED "<not counted>" @@ -110,7 +111,7 @@ static void aggr_printout(struct perf_stat_config *config, config->csv_sep); break; case AGGR_NONE: - if (evsel->percore) { + if (evsel->percore && !config->percore_show_thread) { fprintf(config->output, "S%d-D%d-C%*d%s", cpu_map__id_to_socket(id), cpu_map__id_to_die(id), @@ -628,7 +629,7 @@ static void aggr_cb(struct perf_stat_config *config, static void print_counter_aggrdata(struct perf_stat_config *config, struct evsel *counter, int s, char *prefix, bool metric_only, - bool *first) + bool *first, int cpu) { struct aggr_data ad; FILE *output = config->output; @@ -654,7 +655,7 @@ static void print_counter_aggrdata(struct perf_stat_config *config, fprintf(output, "%s", prefix); uval = val * counter->scale; - printout(config, id, nr, counter, uval, prefix, + printout(config, cpu != -1 ? cpu : id, nr, counter, uval, prefix, run, ena, 1.0, &rt_stat); if (!metric_only) fputc('\n', output); @@ -687,7 +688,7 @@ static void print_aggr(struct perf_stat_config *config, evlist__for_each_entry(evlist, counter) { print_counter_aggrdata(config, counter, s, prefix, metric_only, - &first); + &first, -1); } if (metric_only) fputc('\n', output); @@ -1097,7 +1098,6 @@ static void print_footer(struct perf_stat_config *config) { double avg = avg_stats(config->walltime_nsecs_stats) / NSEC_PER_SEC; FILE *output = config->output; - int n; if (!config->null_run) fprintf(output, "\n"); @@ -1131,9 +1131,7 @@ static void print_footer(struct perf_stat_config *config) } fprintf(output, "\n\n"); - if (config->print_free_counters_hint && - sysctl__read_int("kernel/nmi_watchdog", &n) >= 0 && - n > 0) + if (config->print_free_counters_hint && sysctl__nmi_watchdog_enabled()) fprintf(output, "Some events weren't counted. Try disabling the NMI watchdog:\n" " echo 0 > /proc/sys/kernel/nmi_watchdog\n" @@ -1146,6 +1144,26 @@ static void print_footer(struct perf_stat_config *config) "the same PMU. 
diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c
index bc31fccc0057..76c6052b12e2 100644
--- a/tools/perf/util/stat-display.c
+++ b/tools/perf/util/stat-display.c
@@ -16,6 +16,7 @@
 #include <linux/ctype.h>
 #include "cgroup.h"
 #include <api/fs/fs.h>
+#include "util.h"

 #define CNTR_NOT_SUPPORTED	"<not supported>"
 #define CNTR_NOT_COUNTED	"<not counted>"
@@ -110,7 +111,7 @@ static void aggr_printout(struct perf_stat_config *config,
 			config->csv_sep);
 		break;
 	case AGGR_NONE:
-		if (evsel->percore) {
+		if (evsel->percore && !config->percore_show_thread) {
 			fprintf(config->output, "S%d-D%d-C%*d%s",
 				cpu_map__id_to_socket(id),
 				cpu_map__id_to_die(id),
@@ -628,7 +629,7 @@ static void aggr_cb(struct perf_stat_config *config,
 static void print_counter_aggrdata(struct perf_stat_config *config,
 				   struct evsel *counter, int s,
 				   char *prefix, bool metric_only,
-				   bool *first)
+				   bool *first, int cpu)
 {
 	struct aggr_data ad;
 	FILE *output = config->output;
@@ -654,7 +655,7 @@ static void print_counter_aggrdata(struct perf_stat_config *config,
 		fprintf(output, "%s", prefix);

 	uval = val * counter->scale;
-	printout(config, id, nr, counter, uval, prefix,
+	printout(config, cpu != -1 ? cpu : id, nr, counter, uval, prefix,
 		 run, ena, 1.0, &rt_stat);
 	if (!metric_only)
 		fputc('\n', output);
@@ -687,7 +688,7 @@ static void print_aggr(struct perf_stat_config *config,
 		evlist__for_each_entry(evlist, counter) {
 			print_counter_aggrdata(config, counter, s,
 					       prefix, metric_only,
-					       &first);
+					       &first, -1);
 		}
 		if (metric_only)
 			fputc('\n', output);
@@ -1097,7 +1098,6 @@ static void print_footer(struct perf_stat_config *config)
 {
 	double avg = avg_stats(config->walltime_nsecs_stats) / NSEC_PER_SEC;
 	FILE *output = config->output;
-	int n;

 	if (!config->null_run)
 		fprintf(output, "\n");
@@ -1131,9 +1131,7 @@ static void print_footer(struct perf_stat_config *config)
 	}
 	fprintf(output, "\n\n");

-	if (config->print_free_counters_hint &&
-	    sysctl__read_int("kernel/nmi_watchdog", &n) >= 0 &&
-	    n > 0)
+	if (config->print_free_counters_hint && sysctl__nmi_watchdog_enabled())
 		fprintf(output,
 "Some events weren't counted. Try disabling the NMI watchdog:\n"
 "	echo 0 > /proc/sys/kernel/nmi_watchdog\n"
@@ -1146,6 +1144,26 @@ static void print_footer(struct perf_stat_config *config)
 			"the same PMU. Try reorganizing the group.\n");
 }

+static void print_percore_thread(struct perf_stat_config *config,
+				 struct evsel *counter, char *prefix)
+{
+	int s, s2, id;
+	bool first = true;
+
+	for (int i = 0; i < perf_evsel__nr_cpus(counter); i++) {
+		s2 = config->aggr_get_id(config, evsel__cpus(counter), i);
+		for (s = 0; s < config->aggr_map->nr; s++) {
+			id = config->aggr_map->map[s];
+			if (s2 == id)
+				break;
+		}
+
+		print_counter_aggrdata(config, counter, s,
+				       prefix, false,
+				       &first, i);
+	}
+}
+
 static void print_percore(struct perf_stat_config *config,
 			  struct evsel *counter, char *prefix)
 {
@@ -1157,13 +1175,16 @@ static void print_percore(struct perf_stat_config *config,
 	if (!(config->aggr_map || config->aggr_get_id))
 		return;

+	if (config->percore_show_thread)
+		return print_percore_thread(config, counter, prefix);
+
 	for (s = 0; s < config->aggr_map->nr; s++) {
 		if (prefix && metric_only)
 			fprintf(output, "%s", prefix);

 		print_counter_aggrdata(config, counter, s,
 				       prefix, metric_only,
-				       &first);
+				       &first, -1);
 	}

 	if (metric_only)
diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c
index 90d23cc3c8d4..0fd713d3674f 100644
--- a/tools/perf/util/stat-shadow.c
+++ b/tools/perf/util/stat-shadow.c
@@ -777,9 +777,7 @@ static void generic_metric(struct perf_stat_config *config,
 	}

 	if (!metric_events[i]) {
-		const char *p = metric_expr;
-
-		if (expr__parse(&ratio, &pctx, &p) == 0) {
+		if (expr__parse(&ratio, &pctx, metric_expr) == 0) {
 			char *unit;
 			char metric_bf[64];
diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
index fb990efa54a8..b4fdfaa7f2c0 100644
--- a/tools/perf/util/stat.h
+++ b/tools/perf/util/stat.h
@@ -109,6 +109,7 @@ struct perf_stat_config {
 	bool			 walltime_run_table;
 	bool			 all_kernel;
 	bool			 all_user;
+	bool			 percore_show_thread;
 	FILE			*output;
 	unsigned int		 interval;
 	unsigned int		 timeout;
diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c
index c423298fe62d..3f28af39f9c6 100644
--- a/tools/perf/util/synthetic-events.c
+++ b/tools/perf/util/synthetic-events.c
@@ -345,6 +345,7 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool,
 			continue;

 		event->mmap2.ino = (u64)ino;
+		event->mmap2.ino_generation = 0;

 		/*
 		 * Just like the kernel, see __perf_event_mmap in kernel/perf_event.c
@@ -1183,7 +1184,8 @@ size_t perf_event__sample_event_size(const struct perf_sample *sample, u64 type,

 	if (type & PERF_SAMPLE_BRANCH_STACK) {
 		sz = sample->branch_stack->nr * sizeof(struct branch_entry);
-		sz += sizeof(u64);
+		/* nr, hw_idx */
+		sz += 2 * sizeof(u64);
 		result += sz;
 	}

@@ -1344,7 +1346,8 @@ int perf_event__synthesize_sample(union perf_event *event, u64 type, u64 read_fo

 	if (type & PERF_SAMPLE_BRANCH_STACK) {
 		sz = sample->branch_stack->nr * sizeof(struct branch_entry);
-		sz += sizeof(u64);
+		/* nr, hw_idx */
+		sz += 2 * sizeof(u64);
 		memcpy(array, sample->branch_stack, sz);
 		array = (void *)array + sz;
 	}
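[Note] The two sizing fixes in synthetic-events.c account for the tool-side branch stack now always carrying an hw_idx word between nr and the entries, so a serialized stack occupies two u64 header words plus nr records. A small self-contained check of that arithmetic; the entry struct is illustrative, though the real struct branch_entry also packs from, to and one flags word into 24 bytes on common ABIs:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t u64;

struct branch_entry_sketch {
	u64 from;
	u64 to;
	u64 flags;		/* a bitfield in the real definition */
};

static u64 branch_stack_payload_size(u64 nr)
{
	/* nr, hw_idx */
	return 2 * sizeof(u64) + nr * sizeof(struct branch_entry_sketch);
}

int main(void)
{
	/* a 32-entry LBR stack: 16 + 32 * 24 = 784 bytes */
	printf("%llu\n", (unsigned long long)branch_stack_payload_size(32));
	return 0;
}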
diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c
index 969ae560dad9..d707c9624dd9 100644
--- a/tools/perf/util/util.c
+++ b/tools/perf/util/util.c
@@ -55,6 +55,24 @@ int sysctl__max_stack(void)
 	return sysctl_perf_event_max_stack;
 }

+bool sysctl__nmi_watchdog_enabled(void)
+{
+	static bool cached;
+	static bool nmi_watchdog;
+	int value;
+
+	if (cached)
+		return nmi_watchdog;
+
+	if (sysctl__read_int("kernel/nmi_watchdog", &value) < 0)
+		return false;
+
+	nmi_watchdog = (value > 0) ? true : false;
+	cached = true;
+
+	return nmi_watchdog;
+}
+
 bool test_attr__enabled;

 bool perf_host  = true;
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
index 9969b8b46f7c..f486fdd3a538 100644
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@@ -29,6 +29,8 @@ size_t hex_width(u64 v);

 int sysctl__max_stack(void);

+bool sysctl__nmi_watchdog_enabled(void);
+
 int fetch_kernel_version(unsigned int *puint,
 			 char *str, size_t str_sz);
 #define KVER_VERSION(x)	(((x) >> 16) & 0xff)
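[Note] A sketch of how a caller might use the helper declared above; linking against perf's util.o is assumed, and maybe_print_free_counters_hint() is a hypothetical wrapper, not perf code. The design choice worth noting: the sysctl is read once and cached, so a watchdog toggled mid-run goes unnoticed, which is acceptable for a one-shot end-of-run hint.

#include <stdbool.h>
#include <stdio.h>

bool sysctl__nmi_watchdog_enabled(void);	/* from tools/perf/util/util.h */

static void maybe_print_free_counters_hint(FILE *out)
{
	if (!sysctl__nmi_watchdog_enabled())
		return;

	/* Same hint perf stat prints in its footer when counters ran short. */
	fprintf(out,
		"Some events weren't counted. Try disabling the NMI watchdog:\n"
		"	echo 0 > /proc/sys/kernel/nmi_watchdog\n");
}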